diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java index ccc2abef0..10a25fdc3 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java @@ -1,8 +1,10 @@ + package eu.dnetlib.maven.plugin.properties; import java.io.File; import java.util.ArrayList; import java.util.List; + import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang.StringUtils; import org.apache.maven.plugin.AbstractMojo; @@ -17,55 +19,58 @@ import org.apache.maven.plugin.MojoFailureException; */ public class GenerateOoziePropertiesMojo extends AbstractMojo { - public static final String PROPERTY_NAME_WF_SOURCE_DIR = "workflow.source.dir"; - public static final String PROPERTY_NAME_SANDBOX_NAME = "sandboxName"; + public static final String PROPERTY_NAME_WF_SOURCE_DIR = "workflow.source.dir"; + public static final String PROPERTY_NAME_SANDBOX_NAME = "sandboxName"; - private final String[] limiters = {"dhp", "dnetlib", "eu"}; + private final String[] limiters = { + "dhp", "dnetlib", "eu" + }; - @Override - public void execute() throws MojoExecutionException, MojoFailureException { - if (System.getProperties().containsKey(PROPERTY_NAME_WF_SOURCE_DIR) - && !System.getProperties().containsKey(PROPERTY_NAME_SANDBOX_NAME)) { - String generatedSandboxName = - generateSandboxName(System.getProperties().getProperty(PROPERTY_NAME_WF_SOURCE_DIR)); - if (generatedSandboxName != null) { - System.getProperties().setProperty(PROPERTY_NAME_SANDBOX_NAME, generatedSandboxName); - } else { - System.out.println( - "unable to generate sandbox name from path: " - + System.getProperties().getProperty(PROPERTY_NAME_WF_SOURCE_DIR)); - } - } - } + @Override + public void execute() throws MojoExecutionException, MojoFailureException { + if (System.getProperties().containsKey(PROPERTY_NAME_WF_SOURCE_DIR) + && !System.getProperties().containsKey(PROPERTY_NAME_SANDBOX_NAME)) { + String generatedSandboxName = generateSandboxName( + System.getProperties().getProperty(PROPERTY_NAME_WF_SOURCE_DIR)); + if (generatedSandboxName != null) { + System.getProperties().setProperty(PROPERTY_NAME_SANDBOX_NAME, generatedSandboxName); + } else { + System.out + .println( + "unable to generate sandbox name from path: " + + System.getProperties().getProperty(PROPERTY_NAME_WF_SOURCE_DIR)); + } + } + } - /** - * Generates sandbox name from workflow source directory. - * - * @param wfSourceDir - * @return generated sandbox name - */ - private String generateSandboxName(String wfSourceDir) { - // utilize all dir names until finding one of the limiters - List sandboxNameParts = new ArrayList(); - String[] tokens = StringUtils.split(wfSourceDir, File.separatorChar); - ArrayUtils.reverse(tokens); - if (tokens.length > 0) { - for (String token : tokens) { - for (String limiter : limiters) { - if (limiter.equals(token)) { - return sandboxNameParts.size() > 0 - ? 
StringUtils.join(sandboxNameParts.toArray()) - : null; - } - } - if (sandboxNameParts.size() > 0) { - sandboxNameParts.add(0, File.separator); - } - sandboxNameParts.add(0, token); - } - return StringUtils.join(sandboxNameParts.toArray()); - } else { - return null; - } - } + /** + * Generates sandbox name from workflow source directory. + * + * @param wfSourceDir + * @return generated sandbox name + */ + private String generateSandboxName(String wfSourceDir) { + // utilize all dir names until finding one of the limiters + List sandboxNameParts = new ArrayList(); + String[] tokens = StringUtils.split(wfSourceDir, File.separatorChar); + ArrayUtils.reverse(tokens); + if (tokens.length > 0) { + for (String token : tokens) { + for (String limiter : limiters) { + if (limiter.equals(token)) { + return sandboxNameParts.size() > 0 + ? StringUtils.join(sandboxNameParts.toArray()) + : null; + } + } + if (sandboxNameParts.size() > 0) { + sandboxNameParts.add(0, File.separator); + } + sandboxNameParts.add(0, token); + } + return StringUtils.join(sandboxNameParts.toArray()); + } else { + return null; + } + } } diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java index 5e0166e4f..c1c567f95 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java @@ -9,9 +9,9 @@ * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ + package eu.dnetlib.maven.plugin.properties; -import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -24,6 +24,7 @@ import java.util.List; import java.util.Map.Entry; import java.util.Properties; import java.util.Set; + import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; @@ -35,6 +36,8 @@ import org.springframework.core.io.DefaultResourceLoader; import org.springframework.core.io.Resource; import org.springframework.core.io.ResourceLoader; +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; + /** * Writes project properties for the keys listed in specified properties files. 
Based on: * http://site.kuali.org/maven/plugins/properties-maven-plugin/1.3.2/write-project-properties-mojo.html @@ -44,401 +47,401 @@ import org.springframework.core.io.ResourceLoader; */ public class WritePredefinedProjectProperties extends AbstractMojo { - private static final String CR = "\r"; - private static final String LF = "\n"; - private static final String TAB = "\t"; - protected static final String PROPERTY_PREFIX_ENV = "env."; - private static final String ENCODING_UTF8 = "utf8"; + private static final String CR = "\r"; + private static final String LF = "\n"; + private static final String TAB = "\t"; + protected static final String PROPERTY_PREFIX_ENV = "env."; + private static final String ENCODING_UTF8 = "utf8"; - /** @parameter property="properties.includePropertyKeysFromFiles" */ - private String[] includePropertyKeysFromFiles; + /** @parameter property="properties.includePropertyKeysFromFiles" */ + private String[] includePropertyKeysFromFiles; - /** - * @parameter default-value="${project}" - * @required - * @readonly - */ - protected MavenProject project; + /** + * @parameter default-value="${project}" + * @required + * @readonly + */ + protected MavenProject project; - /** - * The file that properties will be written to - * - * @parameter property="properties.outputFile" - * default-value="${project.build.directory}/properties/project.properties"; - * @required - */ - protected File outputFile; + /** + * The file that properties will be written to + * + * @parameter property="properties.outputFile" + * default-value="${project.build.directory}/properties/project.properties"; + * @required + */ + protected File outputFile; - /** - * If true, the plugin will silently ignore any non-existent properties files, and the build will - * continue - * - * @parameter property="properties.quiet" default-value="true" - */ - private boolean quiet; + /** + * If true, the plugin will silently ignore any non-existent properties files, and the build will continue + * + * @parameter property="properties.quiet" default-value="true" + */ + private boolean quiet; - /** - * Comma separated list of characters to escape when writing property values. cr=carriage return, - * lf=linefeed, tab=tab. Any other values are taken literally. - * - * @parameter default-value="cr,lf,tab" property="properties.escapeChars" - */ - private String escapeChars; + /** + * Comma separated list of characters to escape when writing property values. cr=carriage return, lf=linefeed, + * tab=tab. Any other values are taken literally. + * + * @parameter default-value="cr,lf,tab" property="properties.escapeChars" + */ + private String escapeChars; - /** - * If true, the plugin will include system properties when writing the properties file. System - * properties override both environment variables and project properties. - * - * @parameter default-value="false" property="properties.includeSystemProperties" - */ - private boolean includeSystemProperties; + /** + * If true, the plugin will include system properties when writing the properties file. System properties override + * both environment variables and project properties. + * + * @parameter default-value="false" property="properties.includeSystemProperties" + */ + private boolean includeSystemProperties; - /** - * If true, the plugin will include environment variables when writing the properties file. - * Environment variables are prefixed with "env". Environment variables override project - * properties. 
- * - * @parameter default-value="false" property="properties.includeEnvironmentVariables" - */ - private boolean includeEnvironmentVariables; + /** + * If true, the plugin will include environment variables when writing the properties file. Environment variables + * are prefixed with "env". Environment variables override project properties. + * + * @parameter default-value="false" property="properties.includeEnvironmentVariables" + */ + private boolean includeEnvironmentVariables; - /** - * Comma separated set of properties to exclude when writing the properties file - * - * @parameter property="properties.exclude" - */ - private String exclude; + /** + * Comma separated set of properties to exclude when writing the properties file + * + * @parameter property="properties.exclude" + */ + private String exclude; - /** - * Comma separated set of properties to write to the properties file. If provided, only the - * properties matching those supplied here will be written to the properties file. - * - * @parameter property="properties.include" - */ - private String include; + /** + * Comma separated set of properties to write to the properties file. If provided, only the properties matching + * those supplied here will be written to the properties file. + * + * @parameter property="properties.include" + */ + private String include; - /* - * (non-Javadoc) - * @see org.apache.maven.plugin.AbstractMojo#execute() - */ - @Override - @SuppressFBWarnings({"NP_UNWRITTEN_FIELD", "UWF_UNWRITTEN_FIELD"}) - public void execute() throws MojoExecutionException, MojoFailureException { - Properties properties = new Properties(); - // Add project properties - properties.putAll(project.getProperties()); - if (includeEnvironmentVariables) { - // Add environment variables, overriding any existing properties with the same key - properties.putAll(getEnvironmentVariables()); - } - if (includeSystemProperties) { - // Add system properties, overriding any existing properties with the same key - properties.putAll(System.getProperties()); - } + /* + * (non-Javadoc) + * @see org.apache.maven.plugin.AbstractMojo#execute() + */ + @Override + @SuppressFBWarnings({ + "NP_UNWRITTEN_FIELD", "UWF_UNWRITTEN_FIELD" + }) + public void execute() throws MojoExecutionException, MojoFailureException { + Properties properties = new Properties(); + // Add project properties + properties.putAll(project.getProperties()); + if (includeEnvironmentVariables) { + // Add environment variables, overriding any existing properties with the same key + properties.putAll(getEnvironmentVariables()); + } + if (includeSystemProperties) { + // Add system properties, overriding any existing properties with the same key + properties.putAll(System.getProperties()); + } - // Remove properties as appropriate - trim(properties, exclude, include); + // Remove properties as appropriate + trim(properties, exclude, include); - String comment = "# " + new Date() + "\n"; - List escapeTokens = getEscapeChars(escapeChars); + String comment = "# " + new Date() + "\n"; + List escapeTokens = getEscapeChars(escapeChars); - getLog().info("Creating " + outputFile); - writeProperties(outputFile, comment, properties, escapeTokens); - } + getLog().info("Creating " + outputFile); + writeProperties(outputFile, comment, properties, escapeTokens); + } - /** - * Provides environment variables. 
- * - * @return environment variables - */ - protected static Properties getEnvironmentVariables() { - Properties props = new Properties(); - for (Entry<String, String> entry : System.getenv().entrySet()) { - props.setProperty(PROPERTY_PREFIX_ENV + entry.getKey(), entry.getValue()); - } - return props; - } + /** + * Provides environment variables. + * + * @return environment variables + */ + protected static Properties getEnvironmentVariables() { + Properties props = new Properties(); + for (Entry<String, String> entry : System.getenv().entrySet()) { + props.setProperty(PROPERTY_PREFIX_ENV + entry.getKey(), entry.getValue()); + } + return props; + } - /** - * Removes properties which should not be written. - * - * @param properties - * @param omitCSV - * @param includeCSV - * @throws MojoExecutionException - */ - protected void trim(Properties properties, String omitCSV, String includeCSV) - throws MojoExecutionException { - List<String> omitKeys = getListFromCSV(omitCSV); - for (String key : omitKeys) { - properties.remove(key); - } + /** + * Removes properties which should not be written. + * + * @param properties + * @param omitCSV + * @param includeCSV + * @throws MojoExecutionException + */ + protected void trim(Properties properties, String omitCSV, String includeCSV) + throws MojoExecutionException { + List<String> omitKeys = getListFromCSV(omitCSV); + for (String key : omitKeys) { + properties.remove(key); + } - List<String> includeKeys = getListFromCSV(includeCSV); - // mh: including keys from predefined properties - if (includePropertyKeysFromFiles != null && includePropertyKeysFromFiles.length > 0) { - for (String currentIncludeLoc : includePropertyKeysFromFiles) { - if (validate(currentIncludeLoc)) { - Properties p = getProperties(currentIncludeLoc); - for (String key : p.stringPropertyNames()) { - includeKeys.add(key); - } - } - } - } - if (includeKeys != null && !includeKeys.isEmpty()) { - // removing only when include keys provided - Set<String> keys = properties.stringPropertyNames(); - for (String key : keys) { - if (!includeKeys.contains(key)) { - properties.remove(key); - } - } - } - } + List<String> includeKeys = getListFromCSV(includeCSV); + // mh: including keys from predefined properties + if (includePropertyKeysFromFiles != null && includePropertyKeysFromFiles.length > 0) { + for (String currentIncludeLoc : includePropertyKeysFromFiles) { + if (validate(currentIncludeLoc)) { + Properties p = getProperties(currentIncludeLoc); + for (String key : p.stringPropertyNames()) { + includeKeys.add(key); + } + } + } + } + if (includeKeys != null && !includeKeys.isEmpty()) { + // removing only when include keys provided + Set<String> keys = properties.stringPropertyNames(); + for (String key : keys) { + if (!includeKeys.contains(key)) { + properties.remove(key); + } + } + } + } - /** - * Checks whether file exists. - * - * @param location - * @return true when exists, false otherwise. - */ - protected boolean exists(String location) { - if (StringUtils.isBlank(location)) { - return false; - } - File file = new File(location); - if (file.exists()) { - return true; - } - ResourceLoader loader = new DefaultResourceLoader(); - Resource resource = loader.getResource(location); - return resource.exists(); - } + /** + * Checks whether file exists. + * + * @param location + * @return true when exists, false otherwise.
+ */ + protected boolean exists(String location) { + if (StringUtils.isBlank(location)) { + return false; + } + File file = new File(location); + if (file.exists()) { + return true; + } + ResourceLoader loader = new DefaultResourceLoader(); + Resource resource = loader.getResource(location); + return resource.exists(); + } - /** - * Validates resource location. - * - * @param location - * @return true when valid, false otherwise - * @throws MojoExecutionException - */ - protected boolean validate(String location) throws MojoExecutionException { - boolean exists = exists(location); - if (exists) { - return true; - } - if (quiet) { - getLog().info("Ignoring non-existent properties file '" + location + "'"); - return false; - } else { - throw new MojoExecutionException("Non-existent properties file '" + location + "'"); - } - } + /** + * Validates resource location. + * + * @param location + * @return true when valid, false otherwise + * @throws MojoExecutionException + */ + protected boolean validate(String location) throws MojoExecutionException { + boolean exists = exists(location); + if (exists) { + return true; + } + if (quiet) { + getLog().info("Ignoring non-existent properties file '" + location + "'"); + return false; + } else { + throw new MojoExecutionException("Non-existent properties file '" + location + "'"); + } + } - /** - * Provides input stream. - * - * @param location - * @return input stream - * @throws IOException - */ - protected InputStream getInputStream(String location) throws IOException { - File file = new File(location); - if (file.exists()) { - return new FileInputStream(location); - } - ResourceLoader loader = new DefaultResourceLoader(); - Resource resource = loader.getResource(location); - return resource.getInputStream(); - } + /** + * Provides input stream. + * + * @param location + * @return input stream + * @throws IOException + */ + protected InputStream getInputStream(String location) throws IOException { + File file = new File(location); + if (file.exists()) { + return new FileInputStream(location); + } + ResourceLoader loader = new DefaultResourceLoader(); + Resource resource = loader.getResource(location); + return resource.getInputStream(); + } - /** - * Creates properties for given location. - * - * @param location - * @return properties for given location - * @throws MojoExecutionException - */ - protected Properties getProperties(String location) throws MojoExecutionException { - InputStream in = null; - try { - Properties properties = new Properties(); - in = getInputStream(location); - if (location.toLowerCase().endsWith(".xml")) { - properties.loadFromXML(in); - } else { - properties.load(in); - } - return properties; - } catch (IOException e) { - throw new MojoExecutionException("Error reading properties file " + location, e); - } finally { - IOUtils.closeQuietly(in); - } - } + /** + * Creates properties for given location. 
+ * + * @param location + * @return properties for given location + * @throws MojoExecutionException + */ + protected Properties getProperties(String location) throws MojoExecutionException { + InputStream in = null; + try { + Properties properties = new Properties(); + in = getInputStream(location); + if (location.toLowerCase().endsWith(".xml")) { + properties.loadFromXML(in); + } else { + properties.load(in); + } + return properties; + } catch (IOException e) { + throw new MojoExecutionException("Error reading properties file " + location, e); + } finally { + IOUtils.closeQuietly(in); + } + } - /** - * Provides escape characters. - * - * @param escapeChars - * @return escape characters - */ - protected List<String> getEscapeChars(String escapeChars) { - List<String> tokens = getListFromCSV(escapeChars); - List<String> realTokens = new ArrayList<String>(); - for (String token : tokens) { - String realToken = getRealToken(token); - realTokens.add(realToken); - } - return realTokens; - } + /** + * Provides escape characters. + * + * @param escapeChars + * @return escape characters + */ + protected List<String> getEscapeChars(String escapeChars) { + List<String> tokens = getListFromCSV(escapeChars); + List<String> realTokens = new ArrayList<String>(); + for (String token : tokens) { + String realToken = getRealToken(token); + realTokens.add(realToken); + } + return realTokens; + } - /** - * Provides real token. - * - * @param token - * @return real token - */ - protected String getRealToken(String token) { - if (token.equalsIgnoreCase("CR")) { - return CR; - } else if (token.equalsIgnoreCase("LF")) { - return LF; - } else if (token.equalsIgnoreCase("TAB")) { - return TAB; - } else { - return token; - } - } + /** + * Provides real token. + * + * @param token + * @return real token + */ + protected String getRealToken(String token) { + if (token.equalsIgnoreCase("CR")) { + return CR; + } else if (token.equalsIgnoreCase("LF")) { + return LF; + } else if (token.equalsIgnoreCase("TAB")) { + return TAB; + } else { + return token; + } + } - /** - * Returns content. - * - * @param comment - * @param properties - * @param escapeTokens - * @return content - */ - protected String getContent(String comment, Properties properties, List<String> escapeTokens) { - List<String> names = new ArrayList<String>(properties.stringPropertyNames()); - Collections.sort(names); - StringBuilder sb = new StringBuilder(); - if (!StringUtils.isBlank(comment)) { - sb.append(comment); - } - for (String name : names) { - String value = properties.getProperty(name); - String escapedValue = escape(value, escapeTokens); - sb.append(name + "=" + escapedValue + "\n"); - } - return sb.toString(); - } + /** + * Returns content. + * + * @param comment + * @param properties + * @param escapeTokens + * @return content + */ + protected String getContent(String comment, Properties properties, List<String> escapeTokens) { + List<String> names = new ArrayList<String>(properties.stringPropertyNames()); + Collections.sort(names); + StringBuilder sb = new StringBuilder(); + if (!StringUtils.isBlank(comment)) { + sb.append(comment); + } + for (String name : names) { + String value = properties.getProperty(name); + String escapedValue = escape(value, escapeTokens); + sb.append(name + "=" + escapedValue + "\n"); + } + return sb.toString(); + } - /** - * Writes properties to given file.
- * - * @param file - * @param comment - * @param properties - * @param escapeTokens - * @throws MojoExecutionException - */ - protected void writeProperties( - File file, String comment, Properties properties, List<String> escapeTokens) - throws MojoExecutionException { - try { - String content = getContent(comment, properties, escapeTokens); - FileUtils.writeStringToFile(file, content, ENCODING_UTF8); - } catch (IOException e) { - throw new MojoExecutionException("Error creating properties file", e); - } - } + /** + * Writes properties to given file. + * + * @param file + * @param comment + * @param properties + * @param escapeTokens + * @throws MojoExecutionException + */ + protected void writeProperties( + File file, String comment, Properties properties, List<String> escapeTokens) + throws MojoExecutionException { + try { + String content = getContent(comment, properties, escapeTokens); + FileUtils.writeStringToFile(file, content, ENCODING_UTF8); + } catch (IOException e) { + throw new MojoExecutionException("Error creating properties file", e); + } + } - /** - * Escapes characters. - * - * @param s - * @param escapeChars - * @return - */ - protected String escape(String s, List<String> escapeChars) { - String result = s; - for (String escapeChar : escapeChars) { - result = result.replace(escapeChar, getReplacementToken(escapeChar)); - } - return result; - } + /** + * Escapes characters. + * + * @param s + * @param escapeChars + * @return + */ + protected String escape(String s, List<String> escapeChars) { + String result = s; + for (String escapeChar : escapeChars) { + result = result.replace(escapeChar, getReplacementToken(escapeChar)); + } + return result; + } - /** - * Provides replacement token. - * - * @param escapeChar - * @return replacement token - */ - protected String getReplacementToken(String escapeChar) { - if (escapeChar.equals(CR)) { - return "\\r"; - } else if (escapeChar.equals(LF)) { - return "\\n"; - } else if (escapeChar.equals(TAB)) { - return "\\t"; - } else { - return "\\" + escapeChar; - } - } + /** + * Provides replacement token. + * + * @param escapeChar + * @return replacement token + */ + protected String getReplacementToken(String escapeChar) { + if (escapeChar.equals(CR)) { + return "\\r"; + } else if (escapeChar.equals(LF)) { + return "\\n"; + } else if (escapeChar.equals(TAB)) { + return "\\t"; + } else { + return "\\" + escapeChar; + } + } - /** - * Returns list from csv. - * - * @param csv - * @return list of values generated from CSV - */ - protected static final List<String> getListFromCSV(String csv) { - if (StringUtils.isBlank(csv)) { - return new ArrayList<String>(); - } - List<String> list = new ArrayList<String>(); - String[] tokens = StringUtils.split(csv, ","); - for (String token : tokens) { - list.add(token.trim()); - } - return list; - } + /** + * Returns list from csv.
+ * + * @param csv + * @return list of values generated from CSV + */ + protected static final List<String> getListFromCSV(String csv) { + if (StringUtils.isBlank(csv)) { + return new ArrayList<String>(); + } + List<String> list = new ArrayList<String>(); + String[] tokens = StringUtils.split(csv, ","); + for (String token : tokens) { + list.add(token.trim()); + } + return list; + } - public void setIncludeSystemProperties(boolean includeSystemProperties) { - this.includeSystemProperties = includeSystemProperties; - } + public void setIncludeSystemProperties(boolean includeSystemProperties) { + this.includeSystemProperties = includeSystemProperties; + } - public void setEscapeChars(String escapeChars) { - this.escapeChars = escapeChars; - } + public void setEscapeChars(String escapeChars) { + this.escapeChars = escapeChars; + } - public void setIncludeEnvironmentVariables(boolean includeEnvironmentVariables) { - this.includeEnvironmentVariables = includeEnvironmentVariables; - } + public void setIncludeEnvironmentVariables(boolean includeEnvironmentVariables) { + this.includeEnvironmentVariables = includeEnvironmentVariables; + } - public void setExclude(String exclude) { - this.exclude = exclude; - } + public void setExclude(String exclude) { + this.exclude = exclude; + } - public void setInclude(String include) { - this.include = include; - } + public void setInclude(String include) { + this.include = include; + } - public void setQuiet(boolean quiet) { - this.quiet = quiet; - } + public void setQuiet(boolean quiet) { + this.quiet = quiet; + } - /** - * Sets property files for which keys properties should be included. - * - * @param includePropertyKeysFromFiles - */ - public void setIncludePropertyKeysFromFiles(String[] includePropertyKeysFromFiles) { - if (includePropertyKeysFromFiles != null) { - this.includePropertyKeysFromFiles = - Arrays.copyOf(includePropertyKeysFromFiles, includePropertyKeysFromFiles.length); - } - } + /** + * Sets property files for which keys properties should be included.
+ * + * @param includePropertyKeysFromFiles + */ + public void setIncludePropertyKeysFromFiles(String[] includePropertyKeysFromFiles) { + if (includePropertyKeysFromFiles != null) { + this.includePropertyKeysFromFiles = Arrays + .copyOf(includePropertyKeysFromFiles, includePropertyKeysFromFiles.length); + } + } } diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java index 3a0d5fcc7..b8075ba5d 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.maven.plugin.properties; import static eu.dnetlib.maven.plugin.properties.GenerateOoziePropertiesMojo.PROPERTY_NAME_SANDBOX_NAME; @@ -10,87 +11,87 @@ import org.junit.jupiter.api.Test; /** @author mhorst, claudio.atzori */ public class GenerateOoziePropertiesMojoTest { - private GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo(); + private GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo(); - @BeforeEach - public void clearSystemProperties() { - System.clearProperty(PROPERTY_NAME_SANDBOX_NAME); - System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR); - } + @BeforeEach + public void clearSystemProperties() { + System.clearProperty(PROPERTY_NAME_SANDBOX_NAME); + System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR); + } - @Test - public void testExecuteEmpty() throws Exception { - // execute - mojo.execute(); + @Test + public void testExecuteEmpty() throws Exception { + // execute + mojo.execute(); - // assert - assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); - } + // assert + assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } - @Test - public void testExecuteSandboxNameAlreadySet() throws Exception { - // given - String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers"; - String sandboxName = "originalSandboxName"; - System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); - System.setProperty(PROPERTY_NAME_SANDBOX_NAME, sandboxName); + @Test + public void testExecuteSandboxNameAlreadySet() throws Exception { + // given + String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers"; + String sandboxName = "originalSandboxName"; + System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); + System.setProperty(PROPERTY_NAME_SANDBOX_NAME, sandboxName); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertEquals(sandboxName, System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); - } + // assert + assertEquals(sandboxName, System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } - @Test - public void testExecuteEmptyWorkflowSourceDir() throws Exception { - // given - String workflowSourceDir = ""; - System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); + @Test + public void testExecuteEmptyWorkflowSourceDir() throws Exception { + // given + String workflowSourceDir = ""; + System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); - } + // assert + assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } - @Test - public void 
testExecuteNullSandboxNameGenerated() throws Exception { - // given - String workflowSourceDir = "eu/dnetlib/dhp/"; - System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); + @Test + public void testExecuteNullSandboxNameGenerated() throws Exception { + // given + String workflowSourceDir = "eu/dnetlib/dhp/"; + System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); - } + // assert + assertNull(System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } - @Test - public void testExecute() throws Exception { - // given - String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers"; - System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); + @Test + public void testExecute() throws Exception { + // given + String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers"; + System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); - } + // assert + assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } - @Test - public void testExecuteWithoutRoot() throws Exception { - // given - String workflowSourceDir = "wf/transformers"; - System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); + @Test + public void testExecuteWithoutRoot() throws Exception { + // given + String workflowSourceDir = "wf/transformers"; + System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); - } + // assert + assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME)); + } } diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java index 1b247198b..e0b2eff37 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.maven.plugin.properties; import static eu.dnetlib.maven.plugin.properties.WritePredefinedProjectProperties.PROPERTY_PREFIX_ENV; @@ -7,6 +8,7 @@ import static org.mockito.Mockito.lenient; import java.io.*; import java.util.Properties; + import org.apache.maven.plugin.MojoExecutionException; import org.apache.maven.project.MavenProject; import org.junit.jupiter.api.*; @@ -20,337 +22,353 @@ import org.mockito.junit.jupiter.MockitoExtension; @ExtendWith(MockitoExtension.class) public class WritePredefinedProjectPropertiesTest { - @Mock private MavenProject mavenProject; + @Mock + private MavenProject mavenProject; - private WritePredefinedProjectProperties mojo; + private WritePredefinedProjectProperties mojo; - @BeforeEach - public void init(@TempDir File testFolder) { - MockitoAnnotations.initMocks(this); - mojo = new WritePredefinedProjectProperties(); - mojo.outputFile = getPropertiesFileLocation(testFolder); - mojo.project = mavenProject; - lenient().doReturn(new 
Properties()).when(mavenProject).getProperties(); - } + @BeforeEach + public void init(@TempDir File testFolder) { + MockitoAnnotations.initMocks(this); + mojo = new WritePredefinedProjectProperties(); + mojo.outputFile = getPropertiesFileLocation(testFolder); + mojo.project = mavenProject; + lenient().doReturn(new Properties()).when(mavenProject).getProperties(); + } - // ----------------------------------- TESTS --------------------------------------------- + // ----------------------------------- TESTS --------------------------------------------- - @Test - public void testExecuteEmpty() throws Exception { - // execute - mojo.execute(); + @Test + public void testExecuteEmpty() throws Exception { + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile()); - assertEquals(0, storedProperties.size()); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile()); + assertEquals(0, storedProperties.size()); + } - @Test - public void testExecuteWithProjectProperties() throws Exception { - // given - String key = "projectPropertyKey"; - String value = "projectPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - doReturn(projectProperties).when(mavenProject).getProperties(); + @Test + public void testExecuteWithProjectProperties() throws Exception { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + doReturn(projectProperties).when(mavenProject).getProperties(); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile()); - assertEquals(1, storedProperties.size()); - assertTrue(storedProperties.containsKey(key)); - assertEquals(value, storedProperties.getProperty(key)); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(mojo.outputFile.getParentFile()); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(key)); + assertEquals(value, storedProperties.getProperty(key)); + } - @Test() - public void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) { - // given - String key = "projectPropertyKey"; - String value = "projectPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - doReturn(projectProperties).when(mavenProject).getProperties(); - mojo.outputFile = testFolder; + @Test() + public void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + doReturn(projectProperties).when(mavenProject).getProperties(); + mojo.outputFile = testFolder; - // execute - Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); - } + // execute + Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); + } - @Test - public void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception { - // given - String key = "projectPropertyKey"; - String value = 
"projectPropertyValue"; - String excludedKey = "excludedPropertyKey"; - String excludedValue = "excludedPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - projectProperties.setProperty(excludedKey, excludedValue); - doReturn(projectProperties).when(mavenProject).getProperties(); - mojo.setExclude(excludedKey); + @Test + public void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String excludedKey = "excludedPropertyKey"; + String excludedValue = "excludedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(excludedKey, excludedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); + mojo.setExclude(excludedKey); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - assertEquals(1, storedProperties.size()); - assertTrue(storedProperties.containsKey(key)); - assertEquals(value, storedProperties.getProperty(key)); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(key)); + assertEquals(value, storedProperties.getProperty(key)); + } - @Test - public void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception { - // given - String key = "projectPropertyKey"; - String value = "projectPropertyValue"; - String includedKey = "includedPropertyKey"; - String includedValue = "includedPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - projectProperties.setProperty(includedKey, includedValue); - doReturn(projectProperties).when(mavenProject).getProperties(); - mojo.setInclude(includedKey); + @Test + public void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); + mojo.setInclude(includedKey); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - assertEquals(1, storedProperties.size()); - assertTrue(storedProperties.containsKey(includedKey)); - assertEquals(includedValue, storedProperties.getProperty(includedKey)); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(includedKey)); + assertEquals(includedValue, storedProperties.getProperty(includedKey)); + } - @Test - public void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception { - // given - String key = "projectPropertyKey"; - String value = "projectPropertyValue"; - String includedKey = "includedPropertyKey"; - String includedValue = 
"includedPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - projectProperties.setProperty(includedKey, includedValue); - doReturn(projectProperties).when(mavenProject).getProperties(); + @Test + public void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); - File includedPropertiesFile = new File(testFolder, "included.properties"); - Properties includedProperties = new Properties(); - includedProperties.setProperty(includedKey, "irrelevantValue"); - includedProperties.store(new FileWriter(includedPropertiesFile), null); + File includedPropertiesFile = new File(testFolder, "included.properties"); + Properties includedProperties = new Properties(); + includedProperties.setProperty(includedKey, "irrelevantValue"); + includedProperties.store(new FileWriter(includedPropertiesFile), null); - mojo.setIncludePropertyKeysFromFiles(new String[] {includedPropertiesFile.getAbsolutePath()}); + mojo.setIncludePropertyKeysFromFiles(new String[] { + includedPropertiesFile.getAbsolutePath() + }); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - assertEquals(1, storedProperties.size()); - assertTrue(storedProperties.containsKey(includedKey)); - assertEquals(includedValue, storedProperties.getProperty(includedKey)); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(includedKey)); + assertEquals(includedValue, storedProperties.getProperty(includedKey)); + } - @Test - public void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder) - throws Exception { - // given - String key = "projectPropertyKey"; - String value = "projectPropertyValue"; - String includedKey = "includedPropertyKey"; - String includedValue = "includedPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - projectProperties.setProperty(includedKey, includedValue); - doReturn(projectProperties).when(mavenProject).getProperties(); + @Test + public void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder) + throws Exception { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); - mojo.setIncludePropertyKeysFromFiles( - new String[] {"/eu/dnetlib/maven/plugin/properties/included.properties"}); + mojo + .setIncludePropertyKeysFromFiles( + new String[] { + "/eu/dnetlib/maven/plugin/properties/included.properties" + }); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - 
assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - assertEquals(1, storedProperties.size()); - assertTrue(storedProperties.containsKey(includedKey)); - assertEquals(includedValue, storedProperties.getProperty(includedKey)); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(includedKey)); + assertEquals(includedValue, storedProperties.getProperty(includedKey)); + } - @Test - public void testExecuteIncludingPropertyKeysFromBlankLocation() { - // given - String key = "projectPropertyKey"; - String value = "projectPropertyValue"; - String includedKey = "includedPropertyKey"; - String includedValue = "includedPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - projectProperties.setProperty(includedKey, includedValue); - doReturn(projectProperties).when(mavenProject).getProperties(); + @Test + public void testExecuteIncludingPropertyKeysFromBlankLocation() { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); - mojo.setIncludePropertyKeysFromFiles(new String[] {""}); + mojo.setIncludePropertyKeysFromFiles(new String[] { + "" + }); - // execute - Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); - } + // execute + Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); + } - @Test - public void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder) - throws Exception { - // given - String key = "projectPropertyKey"; - String value = "projectPropertyValue"; - String includedKey = "includedPropertyKey"; - String includedValue = "includedPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - projectProperties.setProperty(includedKey, includedValue); - doReturn(projectProperties).when(mavenProject).getProperties(); + @Test + public void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder) + throws Exception { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); - File includedPropertiesFile = new File(testFolder, "included.xml"); - Properties includedProperties = new Properties(); - includedProperties.setProperty(includedKey, "irrelevantValue"); - includedProperties.storeToXML(new FileOutputStream(includedPropertiesFile), null); + File includedPropertiesFile = new File(testFolder, "included.xml"); + Properties includedProperties = new Properties(); + includedProperties.setProperty(includedKey, "irrelevantValue"); + includedProperties.storeToXML(new FileOutputStream(includedPropertiesFile), null); - mojo.setIncludePropertyKeysFromFiles(new String[] {includedPropertiesFile.getAbsolutePath()}); + 
mojo.setIncludePropertyKeysFromFiles(new String[] { + includedPropertiesFile.getAbsolutePath() + }); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - assertEquals(1, storedProperties.size()); - assertTrue(storedProperties.containsKey(includedKey)); - assertEquals(includedValue, storedProperties.getProperty(includedKey)); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(1, storedProperties.size()); + assertTrue(storedProperties.containsKey(includedKey)); + assertEquals(includedValue, storedProperties.getProperty(includedKey)); + } - @Test - public void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder) - throws Exception { - // given - String key = "projectPropertyKey"; - String value = "projectPropertyValue"; - String includedKey = "includedPropertyKey"; - String includedValue = "includedPropertyValue"; - Properties projectProperties = new Properties(); - projectProperties.setProperty(key, value); - projectProperties.setProperty(includedKey, includedValue); - doReturn(projectProperties).when(mavenProject).getProperties(); + @Test + public void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder) + throws Exception { + // given + String key = "projectPropertyKey"; + String value = "projectPropertyValue"; + String includedKey = "includedPropertyKey"; + String includedValue = "includedPropertyValue"; + Properties projectProperties = new Properties(); + projectProperties.setProperty(key, value); + projectProperties.setProperty(includedKey, includedValue); + doReturn(projectProperties).when(mavenProject).getProperties(); - File includedPropertiesFile = new File(testFolder, "included.xml"); - Properties includedProperties = new Properties(); - includedProperties.setProperty(includedKey, "irrelevantValue"); - includedProperties.store(new FileOutputStream(includedPropertiesFile), null); + File includedPropertiesFile = new File(testFolder, "included.xml"); + Properties includedProperties = new Properties(); + includedProperties.setProperty(includedKey, "irrelevantValue"); + includedProperties.store(new FileOutputStream(includedPropertiesFile), null); - mojo.setIncludePropertyKeysFromFiles(new String[] {includedPropertiesFile.getAbsolutePath()}); + mojo.setIncludePropertyKeysFromFiles(new String[] { + includedPropertiesFile.getAbsolutePath() + }); - // execute - Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); - } + // execute + Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); + } - @Test - public void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception { - // given - mojo.setQuiet(true); - mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"}); + @Test + public void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception { + // given + mojo.setQuiet(true); + mojo.setIncludePropertyKeysFromFiles(new String[] { + "invalid location" + }); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - assertEquals(0, storedProperties.size()); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertEquals(0, storedProperties.size()); + } - @Test - 
public void testExecuteIncludingPropertyKeysFromInvalidFile() { - // given - mojo.setIncludePropertyKeysFromFiles(new String[] {"invalid location"}); + @Test + public void testExecuteIncludingPropertyKeysFromInvalidFile() { + // given + mojo.setIncludePropertyKeysFromFiles(new String[] { + "invalid location" + }); - // execute - Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); - } + // execute + Assertions.assertThrows(MojoExecutionException.class, () -> mojo.execute()); + } - @Test - public void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception { - // given - mojo.setIncludeEnvironmentVariables(true); + @Test + public void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception { + // given + mojo.setIncludeEnvironmentVariables(true); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - assertTrue(storedProperties.size() > 0); - for (Object currentKey : storedProperties.keySet()) { - assertTrue(((String) currentKey).startsWith(PROPERTY_PREFIX_ENV)); - } - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertTrue(storedProperties.size() > 0); + for (Object currentKey : storedProperties.keySet()) { + assertTrue(((String) currentKey).startsWith(PROPERTY_PREFIX_ENV)); + } + } - @Test - public void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception { - // given - String key = "systemPropertyKey"; - String value = "systemPropertyValue"; - System.setProperty(key, value); - mojo.setIncludeSystemProperties(true); + @Test + public void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception { + // given + String key = "systemPropertyKey"; + String value = "systemPropertyValue"; + System.setProperty(key, value); + mojo.setIncludeSystemProperties(true); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - assertTrue(storedProperties.size() > 0); - assertTrue(storedProperties.containsKey(key)); - assertEquals(value, storedProperties.getProperty(key)); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertTrue(storedProperties.size() > 0); + assertTrue(storedProperties.containsKey(key)); + assertEquals(value, storedProperties.getProperty(key)); + } - @Test - public void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder) - throws Exception { - // given - String key = "systemPropertyKey "; - String value = "systemPropertyValue"; - System.setProperty(key, value); - mojo.setIncludeSystemProperties(true); - String escapeChars = "cr,lf,tab,|"; - mojo.setEscapeChars(escapeChars); + @Test + public void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder) + throws Exception { + // given + String key = "systemPropertyKey "; + String value = "systemPropertyValue"; + System.setProperty(key, value); + mojo.setIncludeSystemProperties(true); + String escapeChars = "cr,lf,tab,|"; + mojo.setEscapeChars(escapeChars); - // execute - mojo.execute(); + // execute + mojo.execute(); - // assert - assertTrue(mojo.outputFile.exists()); - Properties storedProperties = getStoredProperties(testFolder); - assertTrue(storedProperties.size() > 0); - 
assertFalse(storedProperties.containsKey(key)); - assertTrue(storedProperties.containsKey(key.trim())); - assertEquals(value, storedProperties.getProperty(key.trim())); - } + // assert + assertTrue(mojo.outputFile.exists()); + Properties storedProperties = getStoredProperties(testFolder); + assertTrue(storedProperties.size() > 0); + assertFalse(storedProperties.containsKey(key)); + assertTrue(storedProperties.containsKey(key.trim())); + assertEquals(value, storedProperties.getProperty(key.trim())); + } - // ----------------------------------- PRIVATE ------------------------------------------- + // ----------------------------------- PRIVATE ------------------------------------------- - private File getPropertiesFileLocation(File testFolder) { - return new File(testFolder, "test.properties"); - } + private File getPropertiesFileLocation(File testFolder) { - return new File(testFolder, "test.properties"); + } - private Properties getStoredProperties(File testFolder) - throws FileNotFoundException, IOException { - Properties properties = new Properties(); - properties.load(new FileInputStream(getPropertiesFileLocation(testFolder))); - return properties; - } + private Properties getStoredProperties(File testFolder) + throws FileNotFoundException, IOException { + Properties properties = new Properties(); + properties.load(new FileInputStream(getPropertiesFileLocation(testFolder))); + return properties; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java b/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java index 1a7c2a6ef..bfd70e8c6 100644 --- a/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java +++ b/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java @@ -1,3 +1,4 @@ + package eu.dnetlib.collector.worker.model; import java.util.HashMap; @@ -5,43 +6,43 @@ import java.util.Map; public class ApiDescriptor { - private String id; + private String id; - private String baseUrl; + private String baseUrl; - private String protocol; + private String protocol; - private Map<String, String> params = new HashMap<>(); + private Map<String, String> params = new HashMap<>(); - public String getBaseUrl() { - return baseUrl; - } + public String getBaseUrl() { + return baseUrl; + } - public void setBaseUrl(final String baseUrl) { - this.baseUrl = baseUrl; - } + public void setBaseUrl(final String baseUrl) { + this.baseUrl = baseUrl; + } - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(final String id) { - this.id = id; - } + public void setId(final String id) { + this.id = id; + } - public Map<String, String> getParams() { - return params; - } + public Map<String, String> getParams() { + return params; + } - public void setParams(final HashMap<String, String> params) { - this.params = params; - } + public void setParams(final HashMap<String, String> params) { + this.params = params; + } - public String getProtocol() { - return protocol; - } + public String getProtocol() { + return protocol; + } - public void setProtocol(final String protocol) { - this.protocol = protocol; - } + public void setProtocol(final String protocol) { + this.protocol = protocol; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java index f076bd188..68fc024af 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java +++
b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java @@ -1,7 +1,9 @@ + package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; import java.util.UUID; + import javax.persistence.Column; import javax.persistence.Entity; import javax.persistence.Id; @@ -11,107 +13,107 @@ import javax.persistence.Table; @Table(name = "mdstores") public class MDStore implements Serializable { - /** */ - private static final long serialVersionUID = 3160530489149700055L; + /** */ + private static final long serialVersionUID = 3160530489149700055L; - @Id - @Column(name = "id") - private String id; + @Id + @Column(name = "id") + private String id; - @Column(name = "format") - private String format; + @Column(name = "format") + private String format; - @Column(name = "layout") - private String layout; + @Column(name = "layout") + private String layout; - @Column(name = "interpretation") - private String interpretation; + @Column(name = "interpretation") + private String interpretation; - @Column(name = "datasource_name") - private String datasourceName; + @Column(name = "datasource_name") + private String datasourceName; - @Column(name = "datasource_id") - private String datasourceId; + @Column(name = "datasource_id") + private String datasourceId; - @Column(name = "api_id") - private String apiId; + @Column(name = "api_id") + private String apiId; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(final String id) { - this.id = id; - } + public void setId(final String id) { + this.id = id; + } - public String getFormat() { - return format; - } + public String getFormat() { + return format; + } - public void setFormat(final String format) { - this.format = format; - } + public void setFormat(final String format) { + this.format = format; + } - public String getLayout() { - return layout; - } + public String getLayout() { + return layout; + } - public void setLayout(final String layout) { - this.layout = layout; - } + public void setLayout(final String layout) { + this.layout = layout; + } - public String getInterpretation() { - return interpretation; - } + public String getInterpretation() { + return interpretation; + } - public void setInterpretation(final String interpretation) { - this.interpretation = interpretation; - } + public void setInterpretation(final String interpretation) { + this.interpretation = interpretation; + } - public String getDatasourceName() { - return datasourceName; - } + public String getDatasourceName() { + return datasourceName; + } - public void setDatasourceName(final String datasourceName) { - this.datasourceName = datasourceName; - } + public void setDatasourceName(final String datasourceName) { + this.datasourceName = datasourceName; + } - public String getDatasourceId() { - return datasourceId; - } + public String getDatasourceId() { + return datasourceId; + } - public void setDatasourceId(final String datasourceId) { - this.datasourceId = datasourceId; - } + public void setDatasourceId(final String datasourceId) { + this.datasourceId = datasourceId; + } - public String getApiId() { - return apiId; - } + public String getApiId() { + return apiId; + } - public void setApiId(final String apiId) { - this.apiId = apiId; - } + public void setApiId(final String apiId) { + this.apiId = apiId; + } - public static MDStore newInstance( - final String format, final String layout, final String interpretation) { - return newInstance(format, layout, interpretation, null, null, null); 
- } + public static MDStore newInstance( + final String format, final String layout, final String interpretation) { + return newInstance(format, layout, interpretation, null, null, null); + } - public static MDStore newInstance( - final String format, - final String layout, - final String interpretation, - final String dsName, - final String dsId, - final String apiId) { - final MDStore md = new MDStore(); - md.setId("md-" + UUID.randomUUID()); - md.setFormat(format); - md.setLayout(layout); - md.setInterpretation(interpretation); - md.setDatasourceName(dsName); - md.setDatasourceId(dsId); - md.setApiId(apiId); - return md; - } + public static MDStore newInstance( + final String format, + final String layout, + final String interpretation, + final String dsName, + final String dsId, + final String apiId) { + final MDStore md = new MDStore(); + md.setId("md-" + UUID.randomUUID()); + md.setFormat(format); + md.setLayout(layout); + md.setInterpretation(interpretation); + md.setDatasourceName(dsName); + md.setDatasourceId(dsId); + md.setApiId(apiId); + return md; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java index 0f8f04322..f74ab39be 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java @@ -1,6 +1,8 @@ + package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; + import javax.persistence.Column; import javax.persistence.Entity; import javax.persistence.Id; @@ -10,40 +12,40 @@ import javax.persistence.Table; @Table(name = "mdstore_current_versions") public class MDStoreCurrentVersion implements Serializable { - /** */ - private static final long serialVersionUID = -4757725888593745773L; + /** */ + private static final long serialVersionUID = -4757725888593745773L; - @Id - @Column(name = "mdstore") - private String mdstore; + @Id + @Column(name = "mdstore") + private String mdstore; - @Column(name = "current_version") - private String currentVersion; + @Column(name = "current_version") + private String currentVersion; - public String getMdstore() { - return mdstore; - } + public String getMdstore() { + return mdstore; + } - public void setMdstore(final String mdstore) { - this.mdstore = mdstore; - } + public void setMdstore(final String mdstore) { + this.mdstore = mdstore; + } - public String getCurrentVersion() { - return currentVersion; - } + public String getCurrentVersion() { + return currentVersion; + } - public void setCurrentVersion(final String currentVersion) { - this.currentVersion = currentVersion; - } + public void setCurrentVersion(final String currentVersion) { + this.currentVersion = currentVersion; + } - public static MDStoreCurrentVersion newInstance(final String mdId, final String versionId) { - final MDStoreCurrentVersion cv = new MDStoreCurrentVersion(); - cv.setMdstore(mdId); - cv.setCurrentVersion(versionId); - return cv; - } + public static MDStoreCurrentVersion newInstance(final String mdId, final String versionId) { + final MDStoreCurrentVersion cv = new MDStoreCurrentVersion(); + cv.setMdstore(mdId); + cv.setCurrentVersion(versionId); + return cv; + } - public static MDStoreCurrentVersion newInstance(final MDStoreVersion v) { - return newInstance(v.getMdstore(), v.getId()); - } + public static MDStoreCurrentVersion 
newInstance(final MDStoreVersion v) { + return newInstance(v.getMdstore(), v.getId()); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java index ca784b2fb..7ef24f191 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java @@ -1,7 +1,9 @@ + package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; import java.util.Date; + import javax.persistence.Column; import javax.persistence.Entity; import javax.persistence.Id; @@ -13,85 +15,85 @@ import javax.persistence.TemporalType; @Table(name = "mdstore_versions") public class MDStoreVersion implements Serializable { - /** */ - private static final long serialVersionUID = -4763494442274298339L; + /** */ + private static final long serialVersionUID = -4763494442274298339L; - @Id - @Column(name = "id") - private String id; + @Id + @Column(name = "id") + private String id; - @Column(name = "mdstore") - private String mdstore; + @Column(name = "mdstore") + private String mdstore; - @Column(name = "writing") - private boolean writing; + @Column(name = "writing") + private boolean writing; - @Column(name = "readcount") - private int readCount = 0; + @Column(name = "readcount") + private int readCount = 0; - @Column(name = "lastupdate") - @Temporal(TemporalType.TIMESTAMP) - private Date lastUpdate; + @Column(name = "lastupdate") + @Temporal(TemporalType.TIMESTAMP) + private Date lastUpdate; - @Column(name = "size") - private long size = 0; + @Column(name = "size") + private long size = 0; - public static MDStoreVersion newInstance(final String mdId, final boolean writing) { - final MDStoreVersion t = new MDStoreVersion(); - t.setId(mdId + "-" + new Date().getTime()); - t.setMdstore(mdId); - t.setLastUpdate(null); - t.setWriting(writing); - t.setReadCount(0); - t.setSize(0); - return t; - } + public static MDStoreVersion newInstance(final String mdId, final boolean writing) { + final MDStoreVersion t = new MDStoreVersion(); + t.setId(mdId + "-" + new Date().getTime()); + t.setMdstore(mdId); + t.setLastUpdate(null); + t.setWriting(writing); + t.setReadCount(0); + t.setSize(0); + return t; + } - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(final String id) { - this.id = id; - } + public void setId(final String id) { + this.id = id; + } - public String getMdstore() { - return mdstore; - } + public String getMdstore() { + return mdstore; + } - public void setMdstore(final String mdstore) { - this.mdstore = mdstore; - } + public void setMdstore(final String mdstore) { + this.mdstore = mdstore; + } - public boolean isWriting() { - return writing; - } + public boolean isWriting() { + return writing; + } - public void setWriting(final boolean writing) { - this.writing = writing; - } + public void setWriting(final boolean writing) { + this.writing = writing; + } - public int getReadCount() { - return readCount; - } + public int getReadCount() { + return readCount; + } - public void setReadCount(final int readCount) { - this.readCount = readCount; - } + public void setReadCount(final int readCount) { + this.readCount = readCount; + } - public Date getLastUpdate() { - return lastUpdate; - } + public Date getLastUpdate() { + return lastUpdate; + } - public void setLastUpdate(final Date lastUpdate) { 
- this.lastUpdate = lastUpdate; - } + public void setLastUpdate(final Date lastUpdate) { + this.lastUpdate = lastUpdate; + } - public long getSize() { - return size; - } + public long getSize() { + return size; + } - public void setSize(final long size) { - this.size = size; - } + public void setSize(final long size) { + this.size = size; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java index 9225a4876..438359241 100644 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java +++ b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java @@ -1,7 +1,9 @@ + package eu.dnetlib.data.mdstore.manager.common.model; import java.io.Serializable; import java.util.Date; + import javax.persistence.Column; import javax.persistence.Entity; import javax.persistence.Id; @@ -13,129 +15,129 @@ import javax.persistence.TemporalType; @Table(name = "mdstores_with_info") public class MDStoreWithInfo implements Serializable { - /** */ - private static final long serialVersionUID = -8445784770687571492L; + /** */ + private static final long serialVersionUID = -8445784770687571492L; - @Id - @Column(name = "id") - private String id; + @Id + @Column(name = "id") + private String id; - @Column(name = "format") - private String format; + @Column(name = "format") + private String format; - @Column(name = "layout") - private String layout; + @Column(name = "layout") + private String layout; - @Column(name = "interpretation") - private String interpretation; + @Column(name = "interpretation") + private String interpretation; - @Column(name = "datasource_name") - private String datasourceName; + @Column(name = "datasource_name") + private String datasourceName; - @Column(name = "datasource_id") - private String datasourceId; + @Column(name = "datasource_id") + private String datasourceId; - @Column(name = "api_id") - private String apiId; + @Column(name = "api_id") + private String apiId; - @Column(name = "current_version") - private String currentVersion; + @Column(name = "current_version") + private String currentVersion; - @Column(name = "lastupdate") - @Temporal(TemporalType.TIMESTAMP) - private Date lastUpdate; + @Column(name = "lastupdate") + @Temporal(TemporalType.TIMESTAMP) + private Date lastUpdate; - @Column(name = "size") - private long size = 0; + @Column(name = "size") + private long size = 0; - @Column(name = "n_versions") - private long numberOfVersions = 0; + @Column(name = "n_versions") + private long numberOfVersions = 0; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(final String id) { - this.id = id; - } + public void setId(final String id) { + this.id = id; + } - public String getFormat() { - return format; - } + public String getFormat() { + return format; + } - public void setFormat(final String format) { - this.format = format; - } + public void setFormat(final String format) { + this.format = format; + } - public String getLayout() { - return layout; - } + public String getLayout() { + return layout; + } - public void setLayout(final String layout) { - this.layout = layout; - } + public void setLayout(final String layout) { + this.layout = layout; + } - public String getInterpretation() { - return interpretation; - } + public String getInterpretation() { + return interpretation; + } - public void 
setInterpretation(final String interpretation) { - this.interpretation = interpretation; - } + public void setInterpretation(final String interpretation) { + this.interpretation = interpretation; + } - public String getDatasourceName() { - return datasourceName; - } + public String getDatasourceName() { + return datasourceName; + } - public void setDatasourceName(final String datasourceName) { - this.datasourceName = datasourceName; - } + public void setDatasourceName(final String datasourceName) { + this.datasourceName = datasourceName; + } - public String getDatasourceId() { - return datasourceId; - } + public String getDatasourceId() { + return datasourceId; + } - public void setDatasourceId(final String datasourceId) { - this.datasourceId = datasourceId; - } + public void setDatasourceId(final String datasourceId) { + this.datasourceId = datasourceId; + } - public String getApiId() { - return apiId; - } + public String getApiId() { + return apiId; + } - public void setApiId(final String apiId) { - this.apiId = apiId; - } + public void setApiId(final String apiId) { + this.apiId = apiId; + } - public String getCurrentVersion() { - return currentVersion; - } + public String getCurrentVersion() { + return currentVersion; + } - public void setCurrentVersion(final String currentVersion) { - this.currentVersion = currentVersion; - } + public void setCurrentVersion(final String currentVersion) { + this.currentVersion = currentVersion; + } - public Date getLastUpdate() { - return lastUpdate; - } + public Date getLastUpdate() { + return lastUpdate; + } - public void setLastUpdate(final Date lastUpdate) { - this.lastUpdate = lastUpdate; - } + public void setLastUpdate(final Date lastUpdate) { + this.lastUpdate = lastUpdate; + } - public long getSize() { - return size; - } + public long getSize() { + return size; + } - public void setSize(final long size) { - this.size = size; - } + public void setSize(final long size) { + this.size = size; + } - public long getNumberOfVersions() { - return numberOfVersions; - } + public long getNumberOfVersions() { + return numberOfVersions; + } - public void setNumberOfVersions(final long numberOfVersions) { - this.numberOfVersions = numberOfVersions; - } + public void setNumberOfVersions(final long numberOfVersions) { + this.numberOfVersions = numberOfVersions; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java index d98874bf3..e65b4bb0b 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java @@ -1,6 +1,6 @@ + package eu.dnetlib.dhp.application; -import com.fasterxml.jackson.databind.ObjectMapper; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.Serializable; @@ -8,87 +8,91 @@ import java.io.StringWriter; import java.util.*; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; + import org.apache.commons.cli.*; import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.IOUtils; +import com.fasterxml.jackson.databind.ObjectMapper; + public class ArgumentApplicationParser implements Serializable { - private final Options options = new Options(); - private final Map objectMap = new HashMap<>(); + private final Options options = new Options(); + private final Map objectMap = new HashMap<>(); - private final List 
compressedValues = new ArrayList<>(); + private final List compressedValues = new ArrayList<>(); - public ArgumentApplicationParser(final String json_configuration) throws Exception { - final ObjectMapper mapper = new ObjectMapper(); - final OptionsParameter[] configuration = - mapper.readValue(json_configuration, OptionsParameter[].class); - createOptionMap(configuration); - } + public ArgumentApplicationParser(final String json_configuration) throws Exception { + final ObjectMapper mapper = new ObjectMapper(); + final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class); + createOptionMap(configuration); + } - public ArgumentApplicationParser(final OptionsParameter[] configuration) { - createOptionMap(configuration); - } + public ArgumentApplicationParser(final OptionsParameter[] configuration) { + createOptionMap(configuration); + } - private void createOptionMap(final OptionsParameter[] configuration) { + private void createOptionMap(final OptionsParameter[] configuration) { - Arrays.stream(configuration) - .map( - conf -> { - final Option o = new Option(conf.getParamName(), true, conf.getParamDescription()); - o.setLongOpt(conf.getParamLongName()); - o.setRequired(conf.isParamRequired()); - if (conf.isCompressed()) { - compressedValues.add(conf.getParamLongName()); - } - return o; - }) - .forEach(options::addOption); + Arrays + .stream(configuration) + .map( + conf -> { + final Option o = new Option(conf.getParamName(), true, conf.getParamDescription()); + o.setLongOpt(conf.getParamLongName()); + o.setRequired(conf.isParamRequired()); + if (conf.isCompressed()) { + compressedValues.add(conf.getParamLongName()); + } + return o; + }) + .forEach(options::addOption); - // HelpFormatter formatter = new HelpFormatter(); - // formatter.printHelp("myapp", null, options, null, true); + // HelpFormatter formatter = new HelpFormatter(); + // formatter.printHelp("myapp", null, options, null, true); - } + } - public static String decompressValue(final String abstractCompressed) { - try { - byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes()); - GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray)); - final StringWriter stringWriter = new StringWriter(); - IOUtils.copy(gis, stringWriter); - return stringWriter.toString(); - } catch (Throwable e) { - System.out.println("Wrong value to decompress:" + abstractCompressed); - throw new RuntimeException(e); - } - } + public static String decompressValue(final String abstractCompressed) { + try { + byte[] byteArray = Base64.decodeBase64(abstractCompressed.getBytes()); + GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(byteArray)); + final StringWriter stringWriter = new StringWriter(); + IOUtils.copy(gis, stringWriter); + return stringWriter.toString(); + } catch (Throwable e) { + System.out.println("Wrong value to decompress:" + abstractCompressed); + throw new RuntimeException(e); + } + } - public static String compressArgument(final String value) throws Exception { - ByteArrayOutputStream out = new ByteArrayOutputStream(); - GZIPOutputStream gzip = new GZIPOutputStream(out); - gzip.write(value.getBytes()); - gzip.close(); - return java.util.Base64.getEncoder().encodeToString(out.toByteArray()); - } + public static String compressArgument(final String value) throws Exception { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + GZIPOutputStream gzip = new GZIPOutputStream(out); + gzip.write(value.getBytes()); + gzip.close(); + return 
java.util.Base64.getEncoder().encodeToString(out.toByteArray()); + } - public void parseArgument(final String[] args) throws Exception { - CommandLineParser parser = new BasicParser(); - CommandLine cmd = parser.parse(options, args); - Arrays.stream(cmd.getOptions()) - .forEach( - it -> - objectMap.put( - it.getLongOpt(), - compressedValues.contains(it.getLongOpt()) - ? decompressValue(it.getValue()) - : it.getValue())); - } + public void parseArgument(final String[] args) throws Exception { + CommandLineParser parser = new BasicParser(); + CommandLine cmd = parser.parse(options, args); + Arrays + .stream(cmd.getOptions()) + .forEach( + it -> objectMap + .put( + it.getLongOpt(), + compressedValues.contains(it.getLongOpt()) + ? decompressValue(it.getValue()) + : it.getValue())); + } - public String get(final String key) { - return objectMap.get(key); - } + public String get(final String key) { + return objectMap.get(key); + } - public Map getObjectMap() { - return objectMap; - } + public Map getObjectMap() { + return objectMap; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java b/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java index 03227d316..7004112e4 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java @@ -1,36 +1,38 @@ + package eu.dnetlib.dhp.application; public class OptionsParameter { - private String paramName; - private String paramLongName; - private String paramDescription; - private boolean paramRequired; - private boolean compressed; + private String paramName; + private String paramLongName; + private String paramDescription; + private boolean paramRequired; + private boolean compressed; - public OptionsParameter() {} + public OptionsParameter() { + } - public String getParamName() { - return paramName; - } + public String getParamName() { + return paramName; + } - public String getParamLongName() { - return paramLongName; - } + public String getParamLongName() { + return paramLongName; + } - public String getParamDescription() { - return paramDescription; - } + public String getParamDescription() { + return paramDescription; + } - public boolean isParamRequired() { - return paramRequired; - } + public boolean isParamRequired() { + return paramRequired; + } - public boolean isCompressed() { - return compressed; - } + public boolean isCompressed() { + return compressed; + } - public void setCompressed(boolean compressed) { - this.compressed = compressed; - } + public void setCompressed(boolean compressed) { + this.compressed = compressed; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java index 4b0e1506e..e793e3f29 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/FunctionalInterfaceSupport.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.common; import java.io.Serializable; @@ -6,46 +7,48 @@ import java.util.function.Supplier; /** Provides serializable and throwing extensions to standard functional interfaces. */ public class FunctionalInterfaceSupport { - private FunctionalInterfaceSupport() {} + private FunctionalInterfaceSupport() { + } - /** - * Serializable supplier of any kind of objects. To be used withing spark processing pipelines - * when supplying functions externally. 
- * - * @param - */ - @FunctionalInterface - public interface SerializableSupplier extends Supplier, Serializable {} + /** + * Serializable supplier of any kind of objects. To be used withing spark processing pipelines when supplying + * functions externally. + * + * @param + */ + @FunctionalInterface + public interface SerializableSupplier extends Supplier, Serializable { + } - /** - * Extension of consumer accepting functions throwing an exception. - * - * @param - * @param - */ - @FunctionalInterface - public interface ThrowingConsumer { - void accept(T t) throws E; - } + /** + * Extension of consumer accepting functions throwing an exception. + * + * @param + * @param + */ + @FunctionalInterface + public interface ThrowingConsumer { + void accept(T t) throws E; + } - /** - * Extension of supplier accepting functions throwing an exception. - * - * @param - * @param - */ - @FunctionalInterface - public interface ThrowingSupplier { - T get() throws E; - } + /** + * Extension of supplier accepting functions throwing an exception. + * + * @param + * @param + */ + @FunctionalInterface + public interface ThrowingSupplier { + T get() throws E; + } - /** - * Extension of runnable accepting functions throwing an exception. - * - * @param - */ - @FunctionalInterface - public interface ThrowingRunnable { - void run() throws E; - } + /** + * Extension of runnable accepting functions throwing an exception. + * + * @param + */ + @FunctionalInterface + public interface ThrowingRunnable { + void run() throws E; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java index 1e5c264d1..0b2cd571f 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.common; import static eu.dnetlib.dhp.common.ThrowingSupport.rethrowAsRuntimeException; @@ -5,6 +6,7 @@ import static eu.dnetlib.dhp.common.ThrowingSupport.rethrowAsRuntimeException; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -14,58 +16,59 @@ import org.slf4j.LoggerFactory; /** HDFS utility methods. */ public class HdfsSupport { - private static final Logger logger = LoggerFactory.getLogger(HdfsSupport.class); + private static final Logger logger = LoggerFactory.getLogger(HdfsSupport.class); - private HdfsSupport() {} + private HdfsSupport() { + } - /** - * Checks a path (file or dir) exists on HDFS. - * - * @param path Path to be checked - * @param configuration Configuration of hadoop env - */ - public static boolean exists(String path, Configuration configuration) { - logger.info("Removing path: {}", path); - return rethrowAsRuntimeException( - () -> { - Path f = new Path(path); - FileSystem fileSystem = FileSystem.get(configuration); - return fileSystem.exists(f); - }); - } + /** + * Checks a path (file or dir) exists on HDFS. + * + * @param path Path to be checked + * @param configuration Configuration of hadoop env + */ + public static boolean exists(String path, Configuration configuration) { + logger.info("Removing path: {}", path); + return rethrowAsRuntimeException( + () -> { + Path f = new Path(path); + FileSystem fileSystem = FileSystem.get(configuration); + return fileSystem.exists(f); + }); + } - /** - * Removes a path (file or dir) from HDFS. 
- * - * @param path Path to be removed - * @param configuration Configuration of hadoop env - */ - public static void remove(String path, Configuration configuration) { - logger.info("Removing path: {}", path); - rethrowAsRuntimeException( - () -> { - Path f = new Path(path); - FileSystem fileSystem = FileSystem.get(configuration); - if (fileSystem.exists(f)) { - fileSystem.delete(f, true); - } - }); - } + /** + * Removes a path (file or dir) from HDFS. + * + * @param path Path to be removed + * @param configuration Configuration of hadoop env + */ + public static void remove(String path, Configuration configuration) { + logger.info("Removing path: {}", path); + rethrowAsRuntimeException( + () -> { + Path f = new Path(path); + FileSystem fileSystem = FileSystem.get(configuration); + if (fileSystem.exists(f)) { + fileSystem.delete(f, true); + } + }); + } - /** - * Lists hadoop files located below path or alternatively lists subdirs under path. - * - * @param path Path to be listed for hadoop files - * @param configuration Configuration of hadoop env - * @return List with string locations of hadoop files - */ - public static List listFiles(String path, Configuration configuration) { - logger.info("Listing files in path: {}", path); - return rethrowAsRuntimeException( - () -> - Arrays.stream(FileSystem.get(configuration).listStatus(new Path(path))) - .filter(FileStatus::isDirectory) - .map(x -> x.getPath().toString()) - .collect(Collectors.toList())); - } + /** + * Lists hadoop files located below path or alternatively lists subdirs under path. + * + * @param path Path to be listed for hadoop files + * @param configuration Configuration of hadoop env + * @return List with string locations of hadoop files + */ + public static List listFiles(String path, Configuration configuration) { + logger.info("Listing files in path: {}", path); + return rethrowAsRuntimeException( + () -> Arrays + .stream(FileSystem.get(configuration).listStatus(new Path(path))) + .filter(FileStatus::isDirectory) + .map(x -> x.getPath().toString()) + .collect(Collectors.toList())); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/SparkSessionSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/SparkSessionSupport.java index 433f64ecd..03cc94961 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/SparkSessionSupport.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/SparkSessionSupport.java @@ -1,74 +1,75 @@ + package eu.dnetlib.dhp.common; -import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer; import java.util.Objects; import java.util.function.Function; + import org.apache.spark.SparkConf; import org.apache.spark.sql.SparkSession; +import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer; + /** SparkSession utility methods. */ public class SparkSessionSupport { - private SparkSessionSupport() {} + private SparkSessionSupport() { + } - /** - * Runs a given function using SparkSession created using default builder and supplied SparkConf. - * Stops SparkSession when SparkSession is managed. Allows to reuse SparkSession created - * externally. 
- * - * @param conf SparkConf instance - * @param isSparkSessionManaged When true will stop SparkSession - * @param fn Consumer to be applied to constructed SparkSession - */ - public static void runWithSparkSession( - SparkConf conf, Boolean isSparkSessionManaged, ThrowingConsumer fn) { - runWithSparkSession( - c -> SparkSession.builder().config(c).getOrCreate(), conf, isSparkSessionManaged, fn); - } + /** + * Runs a given function using SparkSession created using default builder and supplied SparkConf. Stops SparkSession + * when SparkSession is managed. Allows to reuse SparkSession created externally. + * + * @param conf SparkConf instance + * @param isSparkSessionManaged When true will stop SparkSession + * @param fn Consumer to be applied to constructed SparkSession + */ + public static void runWithSparkSession( + SparkConf conf, Boolean isSparkSessionManaged, ThrowingConsumer fn) { + runWithSparkSession( + c -> SparkSession.builder().config(c).getOrCreate(), conf, isSparkSessionManaged, fn); + } - /** - * Runs a given function using SparkSession created with hive support and using default builder - * and supplied SparkConf. Stops SparkSession when SparkSession is managed. Allows to reuse - * SparkSession created externally. - * - * @param conf SparkConf instance - * @param isSparkSessionManaged When true will stop SparkSession - * @param fn Consumer to be applied to constructed SparkSession - */ - public static void runWithSparkHiveSession( - SparkConf conf, Boolean isSparkSessionManaged, ThrowingConsumer fn) { - runWithSparkSession( - c -> SparkSession.builder().config(c).enableHiveSupport().getOrCreate(), - conf, - isSparkSessionManaged, - fn); - } + /** + * Runs a given function using SparkSession created with hive support and using default builder and supplied + * SparkConf. Stops SparkSession when SparkSession is managed. Allows to reuse SparkSession created externally. + * + * @param conf SparkConf instance + * @param isSparkSessionManaged When true will stop SparkSession + * @param fn Consumer to be applied to constructed SparkSession + */ + public static void runWithSparkHiveSession( + SparkConf conf, Boolean isSparkSessionManaged, ThrowingConsumer fn) { + runWithSparkSession( + c -> SparkSession.builder().config(c).enableHiveSupport().getOrCreate(), + conf, + isSparkSessionManaged, + fn); + } - /** - * Runs a given function using SparkSession created using supplied builder and supplied SparkConf. - * Stops SparkSession when SparkSession is managed. Allows to reuse SparkSession created - * externally. - * - * @param sparkSessionBuilder Builder of SparkSession - * @param conf SparkConf instance - * @param isSparkSessionManaged When true will stop SparkSession - * @param fn Consumer to be applied to constructed SparkSession - */ - public static void runWithSparkSession( - Function sparkSessionBuilder, - SparkConf conf, - Boolean isSparkSessionManaged, - ThrowingConsumer fn) { - SparkSession spark = null; - try { - spark = sparkSessionBuilder.apply(conf); - fn.accept(spark); - } catch (Exception e) { - throw new RuntimeException(e); - } finally { - if (Objects.nonNull(spark) && isSparkSessionManaged) { - spark.stop(); - } - } - } + /** + * Runs a given function using SparkSession created using supplied builder and supplied SparkConf. Stops + * SparkSession when SparkSession is managed. Allows to reuse SparkSession created externally. 
+ * + * @param sparkSessionBuilder Builder of SparkSession + * @param conf SparkConf instance + * @param isSparkSessionManaged When true will stop SparkSession + * @param fn Consumer to be applied to constructed SparkSession + */ + public static void runWithSparkSession( + Function sparkSessionBuilder, + SparkConf conf, + Boolean isSparkSessionManaged, + ThrowingConsumer fn) { + SparkSession spark = null; + try { + spark = sparkSessionBuilder.apply(conf); + fn.accept(spark); + } catch (Exception e) { + throw new RuntimeException(e); + } finally { + if (Objects.nonNull(spark) && isSparkSessionManaged) { + spark.stop(); + } + } + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/ThrowingSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/ThrowingSupport.java index 54342a46a..f3f59b2a2 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/ThrowingSupport.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/ThrowingSupport.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.common; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingRunnable; @@ -6,69 +7,70 @@ import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingSupplier; /** Exception handling utility methods. */ public class ThrowingSupport { - private ThrowingSupport() {} + private ThrowingSupport() { + } - /** - * Executes given runnable and rethrows any exceptions as RuntimeException. - * - * @param fn Runnable to be executed - * @param Type of exception thrown - */ - public static void rethrowAsRuntimeException(ThrowingRunnable fn) { - try { - fn.run(); - } catch (Exception e) { - throw new RuntimeException(e); - } - } + /** + * Executes given runnable and rethrows any exceptions as RuntimeException. + * + * @param fn Runnable to be executed + * @param Type of exception thrown + */ + public static void rethrowAsRuntimeException(ThrowingRunnable fn) { + try { + fn.run(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } - /** - * Executes given runnable and rethrows any exceptions as RuntimeException with custom message. - * - * @param fn Runnable to be executed - * @param msg Message to be set for rethrown exception - * @param Type of exception thrown - */ - public static void rethrowAsRuntimeException( - ThrowingRunnable fn, String msg) { - try { - fn.run(); - } catch (Exception e) { - throw new RuntimeException(msg, e); - } - } + /** + * Executes given runnable and rethrows any exceptions as RuntimeException with custom message. + * + * @param fn Runnable to be executed + * @param msg Message to be set for rethrown exception + * @param Type of exception thrown + */ + public static void rethrowAsRuntimeException( + ThrowingRunnable fn, String msg) { + try { + fn.run(); + } catch (Exception e) { + throw new RuntimeException(msg, e); + } + } - /** - * Executes given supplier and rethrows any exceptions as RuntimeException. - * - * @param fn Supplier to be executed - * @param Type of returned value - * @param Type of exception thrown - * @return Result of supplier execution - */ - public static T rethrowAsRuntimeException(ThrowingSupplier fn) { - try { - return fn.get(); - } catch (Exception e) { - throw new RuntimeException(e); - } - } + /** + * Executes given supplier and rethrows any exceptions as RuntimeException. 
+ * + * @param fn Supplier to be executed + * @param Type of returned value + * @param Type of exception thrown + * @return Result of supplier execution + */ + public static T rethrowAsRuntimeException(ThrowingSupplier fn) { + try { + return fn.get(); + } catch (Exception e) { + throw new RuntimeException(e); + } + } - /** - * Executes given supplier and rethrows any exceptions as RuntimeException with custom message. - * - * @param fn Supplier to be executed - * @param msg Message to be set for rethrown exception - * @param Type of returned value - * @param Type of exception thrown - * @return Result of supplier execution - */ - public static T rethrowAsRuntimeException( - ThrowingSupplier fn, String msg) { - try { - return fn.get(); - } catch (Exception e) { - throw new RuntimeException(msg, e); - } - } + /** + * Executes given supplier and rethrows any exceptions as RuntimeException with custom message. + * + * @param fn Supplier to be executed + * @param msg Message to be set for rethrown exception + * @param Type of returned value + * @param Type of exception thrown + * @return Result of supplier execution + */ + public static T rethrowAsRuntimeException( + ThrowingSupplier fn, String msg) { + try { + return fn.get(); + } catch (Exception e) { + throw new RuntimeException(msg, e); + } + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java index 56d7217ff..ce65e710f 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java @@ -1,120 +1,121 @@ + package eu.dnetlib.dhp.model.mdstore; -import eu.dnetlib.dhp.utils.DHPUtils; import java.io.Serializable; +import eu.dnetlib.dhp.utils.DHPUtils; + /** This class models a record inside the new Metadata store collection on HDFS * */ public class MetadataRecord implements Serializable { - /** The D-Net Identifier associated to the record */ - private String id; + /** The D-Net Identifier associated to the record */ + private String id; - /** The original Identifier of the record */ - private String originalId; + /** The original Identifier of the record */ + private String originalId; - /** The encoding of the record, should be JSON or XML */ - private String encoding; + /** The encoding of the record, should be JSON or XML */ + private String encoding; - /** - * The information about the provenance of the record see @{@link Provenance} for the model of - * this information - */ - private Provenance provenance; + /** + * The information about the provenance of the record see @{@link Provenance} for the model of this information + */ + private Provenance provenance; - /** The content of the metadata */ - private String body; + /** The content of the metadata */ + private String body; - /** the date when the record has been stored */ - private long dateOfCollection; + /** the date when the record has been stored */ + private long dateOfCollection; - /** the date when the record has been stored */ - private long dateOfTransformation; + /** the date when the record has been stored */ + private long dateOfTransformation; - public MetadataRecord() { - this.dateOfCollection = System.currentTimeMillis(); - } + public MetadataRecord() { + this.dateOfCollection = System.currentTimeMillis(); + } - public MetadataRecord( - String originalId, - String encoding, - Provenance provenance, - String body, - long dateOfCollection) { + public 
MetadataRecord( + String originalId, + String encoding, + Provenance provenance, + String body, + long dateOfCollection) { - this.originalId = originalId; - this.encoding = encoding; - this.provenance = provenance; - this.body = body; - this.dateOfCollection = dateOfCollection; - this.id = DHPUtils.generateIdentifier(originalId, this.provenance.getNsPrefix()); - } + this.originalId = originalId; + this.encoding = encoding; + this.provenance = provenance; + this.body = body; + this.dateOfCollection = dateOfCollection; + this.id = DHPUtils.generateIdentifier(originalId, this.provenance.getNsPrefix()); + } - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public String getOriginalId() { - return originalId; - } + public String getOriginalId() { + return originalId; + } - public void setOriginalId(String originalId) { - this.originalId = originalId; - } + public void setOriginalId(String originalId) { + this.originalId = originalId; + } - public String getEncoding() { - return encoding; - } + public String getEncoding() { + return encoding; + } - public void setEncoding(String encoding) { - this.encoding = encoding; - } + public void setEncoding(String encoding) { + this.encoding = encoding; + } - public Provenance getProvenance() { - return provenance; - } + public Provenance getProvenance() { + return provenance; + } - public void setProvenance(Provenance provenance) { - this.provenance = provenance; - } + public void setProvenance(Provenance provenance) { + this.provenance = provenance; + } - public String getBody() { - return body; - } + public String getBody() { + return body; + } - public void setBody(String body) { - this.body = body; - } + public void setBody(String body) { + this.body = body; + } - public long getDateOfCollection() { - return dateOfCollection; - } + public long getDateOfCollection() { + return dateOfCollection; + } - public void setDateOfCollection(long dateOfCollection) { - this.dateOfCollection = dateOfCollection; - } + public void setDateOfCollection(long dateOfCollection) { + this.dateOfCollection = dateOfCollection; + } - public long getDateOfTransformation() { - return dateOfTransformation; - } + public long getDateOfTransformation() { + return dateOfTransformation; + } - public void setDateOfTransformation(long dateOfTransformation) { - this.dateOfTransformation = dateOfTransformation; - } + public void setDateOfTransformation(long dateOfTransformation) { + this.dateOfTransformation = dateOfTransformation; + } - @Override - public boolean equals(Object o) { - if (!(o instanceof MetadataRecord)) { - return false; - } - return ((MetadataRecord) o).getId().equalsIgnoreCase(id); - } + @Override + public boolean equals(Object o) { + if (!(o instanceof MetadataRecord)) { + return false; + } + return ((MetadataRecord) o).getId().equalsIgnoreCase(id); + } - @Override - public int hashCode() { - return id.hashCode(); - } + @Override + public int hashCode() { + return id.hashCode(); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java index 90897c5c4..556535022 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java @@ -1,49 +1,52 @@ + package eu.dnetlib.dhp.model.mdstore; import java.io.Serializable; /** * @author Sandro 
La Bruzzo
- * Provenace class models the provenance of the record in the metadataStore It contains the
- * identifier and the name of the datasource that gives the record
+ *
+ * Provenace class models the provenance of the record in the metadataStore It contains the identifier and the + * name of the datasource that gives the record */ public class Provenance implements Serializable { - private String datasourceId; + private String datasourceId; - private String datasourceName; + private String datasourceName; - private String nsPrefix; + private String nsPrefix; - public Provenance() {} + public Provenance() { + } - public Provenance(String datasourceId, String datasourceName, String nsPrefix) { - this.datasourceId = datasourceId; - this.datasourceName = datasourceName; - this.nsPrefix = nsPrefix; - } + public Provenance(String datasourceId, String datasourceName, String nsPrefix) { + this.datasourceId = datasourceId; + this.datasourceName = datasourceName; + this.nsPrefix = nsPrefix; + } - public String getDatasourceId() { - return datasourceId; - } + public String getDatasourceId() { + return datasourceId; + } - public void setDatasourceId(String datasourceId) { - this.datasourceId = datasourceId; - } + public void setDatasourceId(String datasourceId) { + this.datasourceId = datasourceId; + } - public String getDatasourceName() { - return datasourceName; - } + public String getDatasourceName() { + return datasourceName; + } - public void setDatasourceName(String datasourceName) { - this.datasourceName = datasourceName; - } + public void setDatasourceName(String datasourceName) { + this.datasourceName = datasourceName; + } - public String getNsPrefix() { - return nsPrefix; - } + public String getNsPrefix() { + return nsPrefix; + } - public void setNsPrefix(String nsPrefix) { - this.nsPrefix = nsPrefix; - } + public void setNsPrefix(String nsPrefix) { + this.nsPrefix = nsPrefix; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdException.java b/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdException.java index 3576dc92b..22945309c 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdException.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdException.java @@ -1,12 +1,13 @@ + package eu.dnetlib.dhp.parser.utility; public class VtdException extends Exception { - public VtdException(final Exception e) { - super(e); - } + public VtdException(final Exception e) { + super(e); + } - public VtdException(final Throwable e) { - super(e); - } + public VtdException(final Throwable e) { + super(e); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java b/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java index a12662d1f..9ac0a0bf7 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java @@ -1,105 +1,110 @@ + package eu.dnetlib.dhp.parser.utility; -import com.ximpleware.AutoPilot; -import com.ximpleware.VTDNav; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import com.ximpleware.AutoPilot; +import com.ximpleware.VTDNav; + /** Created by sandro on 9/29/16. 
*/ public class VtdUtilityParser { - public static List getTextValuesWithAttributes( - final AutoPilot ap, final VTDNav vn, final String xpath, final List attributes) - throws VtdException { - final List results = new ArrayList<>(); - try { - ap.selectXPath(xpath); + public static List getTextValuesWithAttributes( + final AutoPilot ap, final VTDNav vn, final String xpath, final List attributes) + throws VtdException { + final List results = new ArrayList<>(); + try { + ap.selectXPath(xpath); - while (ap.evalXPath() != -1) { - final Node currentNode = new Node(); - int t = vn.getText(); - if (t >= 0) { - currentNode.setTextValue(vn.toNormalizedString(t)); - } - currentNode.setAttributes(getAttributes(vn, attributes)); - results.add(currentNode); - } - return results; - } catch (Exception e) { - throw new VtdException(e); - } - } + while (ap.evalXPath() != -1) { + final Node currentNode = new Node(); + int t = vn.getText(); + if (t >= 0) { + currentNode.setTextValue(vn.toNormalizedString(t)); + } + currentNode.setAttributes(getAttributes(vn, attributes)); + results.add(currentNode); + } + return results; + } catch (Exception e) { + throw new VtdException(e); + } + } - private static Map getAttributes(final VTDNav vn, final List attributes) { - final Map currentAttributes = new HashMap<>(); - if (attributes != null) { + private static Map getAttributes(final VTDNav vn, final List attributes) { + final Map currentAttributes = new HashMap<>(); + if (attributes != null) { - attributes.forEach( - attributeKey -> { - try { - int attr = vn.getAttrVal(attributeKey); - if (attr > -1) { - currentAttributes.put(attributeKey, vn.toNormalizedString(attr)); - } - } catch (Throwable e) { - throw new RuntimeException(e); - } - }); - } - return currentAttributes; - } + attributes + .forEach( + attributeKey -> { + try { + int attr = vn.getAttrVal(attributeKey); + if (attr > -1) { + currentAttributes.put(attributeKey, vn.toNormalizedString(attr)); + } + } catch (Throwable e) { + throw new RuntimeException(e); + } + }); + } + return currentAttributes; + } - public static List getTextValue(final AutoPilot ap, final VTDNav vn, final String xpath) - throws VtdException { - List results = new ArrayList<>(); - try { - ap.selectXPath(xpath); - while (ap.evalXPath() != -1) { - int t = vn.getText(); - if (t > -1) results.add(vn.toNormalizedString(t)); - } - return results; - } catch (Exception e) { - throw new VtdException(e); - } - } + public static List getTextValue(final AutoPilot ap, final VTDNav vn, final String xpath) + throws VtdException { + List results = new ArrayList<>(); + try { + ap.selectXPath(xpath); + while (ap.evalXPath() != -1) { + int t = vn.getText(); + if (t > -1) + results.add(vn.toNormalizedString(t)); + } + return results; + } catch (Exception e) { + throw new VtdException(e); + } + } - public static String getSingleValue(final AutoPilot ap, final VTDNav nav, final String xpath) - throws VtdException { - try { - ap.selectXPath(xpath); - while (ap.evalXPath() != -1) { - int it = nav.getText(); - if (it > -1) return nav.toNormalizedString(it); - } - return null; - } catch (Exception e) { - throw new VtdException(e); - } - } + public static String getSingleValue(final AutoPilot ap, final VTDNav nav, final String xpath) + throws VtdException { + try { + ap.selectXPath(xpath); + while (ap.evalXPath() != -1) { + int it = nav.getText(); + if (it > -1) + return nav.toNormalizedString(it); + } + return null; + } catch (Exception e) { + throw new VtdException(e); + } + } - public static class Node { + 
public static class Node { - private String textValue; + private String textValue; - private Map attributes; + private Map attributes; - public String getTextValue() { - return textValue; - } + public String getTextValue() { + return textValue; + } - public void setTextValue(final String textValue) { - this.textValue = textValue; - } + public void setTextValue(final String textValue) { + this.textValue = textValue; + } - public Map getAttributes() { - return attributes; - } + public Map getAttributes() { + return attributes; + } - public void setAttributes(final Map attributes) { - this.attributes = attributes; - } - } + public void setAttributes(final Map attributes) { + this.attributes = attributes; + } + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java index f5800cdaf..18e489a21 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java @@ -1,70 +1,75 @@ + package eu.dnetlib.dhp.utils; -import com.jayway.jsonpath.JsonPath; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; -import net.minidev.json.JSONArray; + import org.apache.commons.codec.binary.Base64; import org.apache.commons.codec.binary.Base64OutputStream; import org.apache.commons.codec.binary.Hex; +import com.jayway.jsonpath.JsonPath; + +import net.minidev.json.JSONArray; + public class DHPUtils { - public static String md5(final String s) { - try { - final MessageDigest md = MessageDigest.getInstance("MD5"); - md.update(s.getBytes("UTF-8")); - return new String(Hex.encodeHex(md.digest())); - } catch (final Exception e) { - System.err.println("Error creating id"); - return null; - } - } + public static String md5(final String s) { + try { + final MessageDigest md = MessageDigest.getInstance("MD5"); + md.update(s.getBytes("UTF-8")); + return new String(Hex.encodeHex(md.digest())); + } catch (final Exception e) { + System.err.println("Error creating id"); + return null; + } + } - public static String generateIdentifier(final String originalId, final String nsPrefix) { - return String.format("%s::%s", nsPrefix, DHPUtils.md5(originalId)); - } + public static String generateIdentifier(final String originalId, final String nsPrefix) { + return String.format("%s::%s", nsPrefix, DHPUtils.md5(originalId)); + } - public static String compressString(final String input) { - try (ByteArrayOutputStream out = new ByteArrayOutputStream(); - Base64OutputStream b64os = new Base64OutputStream(out)) { - GZIPOutputStream gzip = new GZIPOutputStream(b64os); - gzip.write(input.getBytes(StandardCharsets.UTF_8)); - gzip.close(); - return out.toString(); - } catch (Throwable e) { - return null; - } - } + public static String compressString(final String input) { + try (ByteArrayOutputStream out = new ByteArrayOutputStream(); + Base64OutputStream b64os = new Base64OutputStream(out)) { + GZIPOutputStream gzip = new GZIPOutputStream(b64os); + gzip.write(input.getBytes(StandardCharsets.UTF_8)); + gzip.close(); + return out.toString(); + } catch (Throwable e) { + return null; + } + } - public static String decompressString(final String input) { - byte[] byteArray = Base64.decodeBase64(input.getBytes()); - int len; - try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream((byteArray))); - 
ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length)) { - byte[] buffer = new byte[1024]; - while ((len = gis.read(buffer)) != -1) { - bos.write(buffer, 0, len); - } - return bos.toString(); - } catch (Exception e) { - return null; - } - } + public static String decompressString(final String input) { + byte[] byteArray = Base64.decodeBase64(input.getBytes()); + int len; + try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream((byteArray))); + ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length)) { + byte[] buffer = new byte[1024]; + while ((len = gis.read(buffer)) != -1) { + bos.write(buffer, 0, len); + } + return bos.toString(); + } catch (Exception e) { + return null; + } + } - public static String getJPathString(final String jsonPath, final String json) { - try { - Object o = JsonPath.read(json, jsonPath); - if (o instanceof String) return (String) o; - if (o instanceof JSONArray && ((JSONArray) o).size() > 0) - return (String) ((JSONArray) o).get(0); - return o.toString(); - } catch (Exception e) { - return ""; - } - } + public static String getJPathString(final String jsonPath, final String json) { + try { + Object o = JsonPath.read(json, jsonPath); + if (o instanceof String) + return (String) o; + if (o instanceof JSONArray && ((JSONArray) o).size() > 0) + return (String) ((JSONArray) o).get(0); + return o.toString(); + } catch (Exception e) { + return ""; + } + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java index b6f3f111a..97fe4b9d8 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java @@ -1,24 +1,26 @@ + package eu.dnetlib.dhp.utils; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.cxf.jaxws.JaxWsProxyFactoryBean; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + public class ISLookupClientFactory { - private static final Log log = LogFactory.getLog(ISLookupClientFactory.class); + private static final Log log = LogFactory.getLog(ISLookupClientFactory.class); - public static ISLookUpService getLookUpService(final String isLookupUrl) { - return getServiceStub(ISLookUpService.class, isLookupUrl); - } + public static ISLookUpService getLookUpService(final String isLookupUrl) { + return getServiceStub(ISLookUpService.class, isLookupUrl); + } - @SuppressWarnings("unchecked") - private static T getServiceStub(final Class clazz, final String endpoint) { - log.info(String.format("creating %s stub from %s", clazz.getName(), endpoint)); - final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean(); - jaxWsProxyFactory.setServiceClass(clazz); - jaxWsProxyFactory.setAddress(endpoint); - return (T) jaxWsProxyFactory.create(); - } + @SuppressWarnings("unchecked") + private static T getServiceStub(final Class clazz, final String endpoint) { + log.info(String.format("creating %s stub from %s", clazz.getName(), endpoint)); + final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean(); + jaxWsProxyFactory.setServiceClass(clazz); + jaxWsProxyFactory.setAddress(endpoint); + return (T) jaxWsProxyFactory.create(); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java 
b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java index 57bd130cb..9b00b908c 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.utils.saxon; import net.sf.saxon.expr.XPathContext; @@ -9,25 +10,24 @@ import net.sf.saxon.trans.XPathException; public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinition { - public static String DEFAULT_SAXON_EXT_NS_URI = - "http://www.d-net.research-infrastructures.eu/saxon-extension"; + public static String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension"; - public abstract String getName(); + public abstract String getName(); - public abstract Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException; + public abstract Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException; - @Override - public StructuredQName getFunctionQName() { - return new StructuredQName("dnet", DEFAULT_SAXON_EXT_NS_URI, getName()); - } + @Override + public StructuredQName getFunctionQName() { + return new StructuredQName("dnet", DEFAULT_SAXON_EXT_NS_URI, getName()); + } - @Override - public ExtensionFunctionCall makeCallExpression() { - return new ExtensionFunctionCall() { - @Override - public Sequence call(XPathContext context, Sequence[] arguments) throws XPathException { - return doCall(context, arguments); - } - }; - } + @Override + public ExtensionFunctionCall makeCallExpression() { + return new ExtensionFunctionCall() { + @Override + public Sequence call(XPathContext context, Sequence[] arguments) throws XPathException { + return doCall(context, arguments); + } + }; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java index 38ecb6377..c7e311b02 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java @@ -1,9 +1,11 @@ + package eu.dnetlib.dhp.utils.saxon; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.GregorianCalendar; + import net.sf.saxon.expr.XPathContext; import net.sf.saxon.om.Item; import net.sf.saxon.om.Sequence; @@ -13,55 +15,59 @@ import net.sf.saxon.value.StringValue; public class ExtractYear extends AbstractExtensionFunction { - private static final String[] dateFormats = {"yyyy-MM-dd", "yyyy/MM/dd"}; + private static final String[] dateFormats = { + "yyyy-MM-dd", "yyyy/MM/dd" + }; - @Override - public String getName() { - return "extractYear"; - } + @Override + public String getName() { + return "extractYear"; + } - @Override - public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { - if (arguments == null | arguments.length == 0) { - return new StringValue(""); - } - final Item item = arguments[0].head(); - if (item == null) { - return new StringValue(""); - } - return new StringValue(_year(item.getStringValue())); - } + @Override + public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { + if (arguments == null | arguments.length == 0) { + return new StringValue(""); + } + final Item item = arguments[0].head(); + if (item == null) { + return new StringValue(""); + } + return new 
StringValue(_year(item.getStringValue())); + } - @Override - public int getMinimumNumberOfArguments() { - return 0; - } + @Override + public int getMinimumNumberOfArguments() { + return 0; + } - @Override - public int getMaximumNumberOfArguments() { - return 1; - } + @Override + public int getMaximumNumberOfArguments() { + return 1; + } - @Override - public SequenceType[] getArgumentTypes() { - return new SequenceType[] {SequenceType.OPTIONAL_ITEM}; - } + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { + SequenceType.OPTIONAL_ITEM + }; + } - @Override - public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { - return SequenceType.SINGLE_STRING; - } + @Override + public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { + return SequenceType.SINGLE_STRING; + } - private String _year(String s) { - Calendar c = new GregorianCalendar(); - for (String format : dateFormats) { - try { - c.setTime(new SimpleDateFormat(format).parse(s)); - String year = String.valueOf(c.get(Calendar.YEAR)); - return year; - } catch (ParseException e) { - } - } - return ""; - } + private String _year(String s) { + Calendar c = new GregorianCalendar(); + for (String format : dateFormats) { + try { + c.setTime(new SimpleDateFormat(format).parse(s)); + String year = String.valueOf(c.get(Calendar.YEAR)); + return year; + } catch (ParseException e) { + } + } + return ""; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java index def4fdfc7..4a719909a 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java @@ -1,8 +1,10 @@ + package eu.dnetlib.dhp.utils.saxon; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; + import net.sf.saxon.expr.XPathContext; import net.sf.saxon.om.Sequence; import net.sf.saxon.trans.XPathException; @@ -11,57 +13,59 @@ import net.sf.saxon.value.StringValue; public class NormalizeDate extends AbstractExtensionFunction { - private static final String[] normalizeDateFormats = { - "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy" - }; + private static final String[] normalizeDateFormats = { + "yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "yyyy/MM/dd", "yyyy" + }; - private static final String normalizeOutFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'"); + private static final String normalizeOutFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'"); - @Override - public String getName() { - return "normalizeDate"; - } + @Override + public String getName() { + return "normalizeDate"; + } - @Override - public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { - if (arguments == null | arguments.length == 0) { - return new StringValue(""); - } - String s = arguments[0].head().getStringValue(); - return new StringValue(_year(s)); - } + @Override + public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { + if (arguments == null | arguments.length == 0) { + return new StringValue(""); + } + String s = arguments[0].head().getStringValue(); + return new StringValue(_year(s)); + } - @Override - public int getMinimumNumberOfArguments() { - return 0; - } + @Override + public int getMinimumNumberOfArguments() { + return 0; + } - @Override - public int getMaximumNumberOfArguments() { - return 1; - } + @Override + public int 
getMaximumNumberOfArguments() { + return 1; + } - @Override - public SequenceType[] getArgumentTypes() { - return new SequenceType[] {SequenceType.OPTIONAL_ITEM}; - } + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { + SequenceType.OPTIONAL_ITEM + }; + } - @Override - public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { - return SequenceType.SINGLE_STRING; - } + @Override + public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { + return SequenceType.SINGLE_STRING; + } - private String _year(String s) { - final String date = s != null ? s.trim() : ""; + private String _year(String s) { + final String date = s != null ? s.trim() : ""; - for (String format : normalizeDateFormats) { - try { - Date parse = new SimpleDateFormat(format).parse(date); - String res = new SimpleDateFormat(normalizeOutFormat).format(parse); - return res; - } catch (ParseException e) { - } - } - return ""; - } + for (String format : normalizeDateFormats) { + try { + Date parse = new SimpleDateFormat(format).parse(date); + String res = new SimpleDateFormat(normalizeOutFormat).format(parse); + return res; + } catch (ParseException e) { + } + } + return ""; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java index 73159c617..46ecafd0a 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java @@ -1,59 +1,63 @@ + package eu.dnetlib.dhp.utils.saxon; +import org.apache.commons.lang3.StringUtils; + import net.sf.saxon.expr.XPathContext; import net.sf.saxon.om.Item; import net.sf.saxon.om.Sequence; import net.sf.saxon.trans.XPathException; import net.sf.saxon.value.SequenceType; import net.sf.saxon.value.StringValue; -import org.apache.commons.lang3.StringUtils; public class PickFirst extends AbstractExtensionFunction { - @Override - public String getName() { - return "pickFirst"; - } + @Override + public String getName() { + return "pickFirst"; + } - @Override - public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { - if (arguments == null | arguments.length == 0) { - return new StringValue(""); - } + @Override + public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { + if (arguments == null | arguments.length == 0) { + return new StringValue(""); + } - final String s1 = getValue(arguments[0]); - final String s2 = getValue(arguments[1]); + final String s1 = getValue(arguments[0]); + final String s2 = getValue(arguments[1]); - return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : ""); - } + return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? 
s2 : ""); + } - private String getValue(final Sequence arg) throws XPathException { - if (arg != null) { - final Item item = arg.head(); - if (item != null) { - return item.getStringValue(); - } - } - return ""; - } + private String getValue(final Sequence arg) throws XPathException { + if (arg != null) { + final Item item = arg.head(); + if (item != null) { + return item.getStringValue(); + } + } + return ""; + } - @Override - public int getMinimumNumberOfArguments() { - return 0; - } + @Override + public int getMinimumNumberOfArguments() { + return 0; + } - @Override - public int getMaximumNumberOfArguments() { - return 2; - } + @Override + public int getMaximumNumberOfArguments() { + return 2; + } - @Override - public SequenceType[] getArgumentTypes() { - return new SequenceType[] {SequenceType.OPTIONAL_ITEM}; - } + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { + SequenceType.OPTIONAL_ITEM + }; + } - @Override - public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { - return SequenceType.SINGLE_STRING; - } + @Override + public SequenceType getResultType(SequenceType[] suppliedArgumentTypes) { + return SequenceType.SINGLE_STRING; + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java index 18ce51887..b85d866f1 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java @@ -1,29 +1,32 @@ + package eu.dnetlib.dhp.utils.saxon; import java.io.StringReader; + import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.stream.StreamSource; + import net.sf.saxon.Configuration; import net.sf.saxon.TransformerFactoryImpl; public class SaxonTransformerFactory { - /** - * Creates the index record transformer from the given XSLT - * - * @param xslt - * @return - * @throws TransformerException - */ - public static Transformer newInstance(final String xslt) throws TransformerException { + /** + * Creates the index record transformer from the given XSLT + * + * @param xslt + * @return + * @throws TransformerException + */ + public static Transformer newInstance(final String xslt) throws TransformerException { - final TransformerFactoryImpl factory = new TransformerFactoryImpl(); - final Configuration conf = factory.getConfiguration(); - conf.registerExtensionFunction(new ExtractYear()); - conf.registerExtensionFunction(new NormalizeDate()); - conf.registerExtensionFunction(new PickFirst()); + final TransformerFactoryImpl factory = new TransformerFactoryImpl(); + final Configuration conf = factory.getConfiguration(); + conf.registerExtensionFunction(new ExtractYear()); + conf.registerExtensionFunction(new NormalizeDate()); + conf.registerExtensionFunction(new PickFirst()); - return factory.newTransformer(new StreamSource(new StringReader(xslt))); - } + return factory.newTransformer(new StreamSource(new StringReader(xslt))); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/message/Message.java b/dhp-common/src/main/java/eu/dnetlib/message/Message.java index b62afb19a..fc1c38291 100644 --- a/dhp-common/src/main/java/eu/dnetlib/message/Message.java +++ b/dhp-common/src/main/java/eu/dnetlib/message/Message.java @@ -1,73 +1,76 @@ + package eu.dnetlib.message; -import com.fasterxml.jackson.core.JsonProcessingException; -import 
com.fasterxml.jackson.databind.ObjectMapper; import java.io.IOException; import java.util.Map; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + public class Message { - private String workflowId; + private String workflowId; - private String jobName; + private String jobName; - private MessageType type; + private MessageType type; - private Map body; + private Map body; - public static Message fromJson(final String json) throws IOException { - final ObjectMapper jsonMapper = new ObjectMapper(); - return jsonMapper.readValue(json, Message.class); - } + public static Message fromJson(final String json) throws IOException { + final ObjectMapper jsonMapper = new ObjectMapper(); + return jsonMapper.readValue(json, Message.class); + } - public Message() {} + public Message() { + } - public Message(String workflowId, String jobName, MessageType type, Map body) { - this.workflowId = workflowId; - this.jobName = jobName; - this.type = type; - this.body = body; - } + public Message(String workflowId, String jobName, MessageType type, Map body) { + this.workflowId = workflowId; + this.jobName = jobName; + this.type = type; + this.body = body; + } - public String getWorkflowId() { - return workflowId; - } + public String getWorkflowId() { + return workflowId; + } - public void setWorkflowId(String workflowId) { - this.workflowId = workflowId; - } + public void setWorkflowId(String workflowId) { + this.workflowId = workflowId; + } - public String getJobName() { - return jobName; - } + public String getJobName() { + return jobName; + } - public void setJobName(String jobName) { - this.jobName = jobName; - } + public void setJobName(String jobName) { + this.jobName = jobName; + } - public MessageType getType() { - return type; - } + public MessageType getType() { + return type; + } - public void setType(MessageType type) { - this.type = type; - } + public void setType(MessageType type) { + this.type = type; + } - public Map getBody() { - return body; - } + public Map getBody() { + return body; + } - public void setBody(Map body) { - this.body = body; - } + public void setBody(Map body) { + this.body = body; + } - @Override - public String toString() { - final ObjectMapper jsonMapper = new ObjectMapper(); - try { - return jsonMapper.writeValueAsString(this); - } catch (JsonProcessingException e) { - return null; - } - } + @Override + public String toString() { + final ObjectMapper jsonMapper = new ObjectMapper(); + try { + return jsonMapper.writeValueAsString(this); + } catch (JsonProcessingException e) { + return null; + } + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java index 3df712a62..fb3f0bd95 100644 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java +++ b/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java @@ -1,45 +1,47 @@ + package eu.dnetlib.message; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.concurrent.LinkedBlockingQueue; + import com.rabbitmq.client.AMQP; import com.rabbitmq.client.Channel; import com.rabbitmq.client.DefaultConsumer; import com.rabbitmq.client.Envelope; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.concurrent.LinkedBlockingQueue; public class MessageConsumer extends DefaultConsumer { - final LinkedBlockingQueue queueMessages; + final LinkedBlockingQueue queueMessages; - /** - * Constructs a 
new instance and records its association to the passed-in channel. - * - * @param channel the channel to which this consumer is attached - * @param queueMessages - */ - public MessageConsumer(Channel channel, LinkedBlockingQueue queueMessages) { - super(channel); - this.queueMessages = queueMessages; - } + /** + * Constructs a new instance and records its association to the passed-in channel. + * + * @param channel the channel to which this consumer is attached + * @param queueMessages + */ + public MessageConsumer(Channel channel, LinkedBlockingQueue queueMessages) { + super(channel); + this.queueMessages = queueMessages; + } - @Override - public void handleDelivery( - String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body) - throws IOException { - final String json = new String(body, StandardCharsets.UTF_8); - Message message = Message.fromJson(json); - try { - this.queueMessages.put(message); - System.out.println("Receiving Message " + message); - } catch (InterruptedException e) { - if (message.getType() == MessageType.REPORT) - throw new RuntimeException("Error on sending message"); - else { - // TODO LOGGING EXCEPTION - } - } finally { - getChannel().basicAck(envelope.getDeliveryTag(), false); - } - } + @Override + public void handleDelivery( + String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body) + throws IOException { + final String json = new String(body, StandardCharsets.UTF_8); + Message message = Message.fromJson(json); + try { + this.queueMessages.put(message); + System.out.println("Receiving Message " + message); + } catch (InterruptedException e) { + if (message.getType() == MessageType.REPORT) + throw new RuntimeException("Error on sending message"); + else { + // TODO LOGGING EXCEPTION + } + } finally { + getChannel().basicAck(envelope.getDeliveryTag(), false); + } + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java index 8370a6cc8..4c5c48c55 100644 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java +++ b/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java @@ -1,134 +1,136 @@ + package eu.dnetlib.message; -import com.rabbitmq.client.Channel; -import com.rabbitmq.client.Connection; -import com.rabbitmq.client.ConnectionFactory; import java.io.IOException; import java.util.HashMap; import java.util.Map; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeoutException; +import com.rabbitmq.client.Channel; +import com.rabbitmq.client.Connection; +import com.rabbitmq.client.ConnectionFactory; + public class MessageManager { - private final String messageHost; + private final String messageHost; - private final String username; + private final String username; - private final String password; + private final String password; - private Connection connection; + private Connection connection; - private Map channels = new HashMap<>(); + private Map channels = new HashMap<>(); - private boolean durable; + private boolean durable; - private boolean autodelete; + private boolean autodelete; - private final LinkedBlockingQueue queueMessages; + private final LinkedBlockingQueue queueMessages; - public MessageManager( - String messageHost, - String username, - String password, - final LinkedBlockingQueue queueMessages) { - this.queueMessages = queueMessages; - this.messageHost = messageHost; - this.username = username; - this.password = password; - } + public MessageManager( 
+ String messageHost, + String username, + String password, + final LinkedBlockingQueue queueMessages) { + this.queueMessages = queueMessages; + this.messageHost = messageHost; + this.username = username; + this.password = password; + } - public MessageManager( - String messageHost, - String username, - String password, - boolean durable, - boolean autodelete, - final LinkedBlockingQueue queueMessages) { - this.queueMessages = queueMessages; - this.messageHost = messageHost; - this.username = username; - this.password = password; + public MessageManager( + String messageHost, + String username, + String password, + boolean durable, + boolean autodelete, + final LinkedBlockingQueue queueMessages) { + this.queueMessages = queueMessages; + this.messageHost = messageHost; + this.username = username; + this.password = password; - this.durable = durable; - this.autodelete = autodelete; - } + this.durable = durable; + this.autodelete = autodelete; + } - private Connection createConnection() throws IOException, TimeoutException { - ConnectionFactory factory = new ConnectionFactory(); - factory.setHost(this.messageHost); - factory.setUsername(this.username); - factory.setPassword(this.password); - return factory.newConnection(); - } + private Connection createConnection() throws IOException, TimeoutException { + ConnectionFactory factory = new ConnectionFactory(); + factory.setHost(this.messageHost); + factory.setUsername(this.username); + factory.setPassword(this.password); + return factory.newConnection(); + } - private Channel createChannel( - final Connection connection, - final String queueName, - final boolean durable, - final boolean autodelete) - throws Exception { - Map args = new HashMap<>(); - args.put("x-message-ttl", 10000); - Channel channel = connection.createChannel(); - channel.queueDeclare(queueName, durable, false, this.autodelete, args); - return channel; - } + private Channel createChannel( + final Connection connection, + final String queueName, + final boolean durable, + final boolean autodelete) + throws Exception { + Map args = new HashMap<>(); + args.put("x-message-ttl", 10000); + Channel channel = connection.createChannel(); + channel.queueDeclare(queueName, durable, false, this.autodelete, args); + return channel; + } - private Channel getOrCreateChannel(final String queueName, boolean durable, boolean autodelete) - throws Exception { - if (channels.containsKey(queueName)) { - return channels.get(queueName); - } + private Channel getOrCreateChannel(final String queueName, boolean durable, boolean autodelete) + throws Exception { + if (channels.containsKey(queueName)) { + return channels.get(queueName); + } - if (this.connection == null) { - this.connection = createConnection(); - } - channels.put(queueName, createChannel(this.connection, queueName, durable, autodelete)); - return channels.get(queueName); - } + if (this.connection == null) { + this.connection = createConnection(); + } + channels.put(queueName, createChannel(this.connection, queueName, durable, autodelete)); + return channels.get(queueName); + } - public void close() throws IOException { - channels - .values() - .forEach( - ch -> { - try { - ch.close(); - } catch (Exception e) { - // TODO LOG - } - }); + public void close() throws IOException { + channels + .values() + .forEach( + ch -> { + try { + ch.close(); + } catch (Exception e) { + // TODO LOG + } + }); - this.connection.close(); - } + this.connection.close(); + } - public boolean sendMessage(final Message message, String queueName) throws Exception 
{ - try { - Channel channel = getOrCreateChannel(queueName, this.durable, this.autodelete); - channel.basicPublish("", queueName, null, message.toString().getBytes()); - return true; - } catch (Throwable e) { - throw new RuntimeException(e); - } - } + public boolean sendMessage(final Message message, String queueName) throws Exception { + try { + Channel channel = getOrCreateChannel(queueName, this.durable, this.autodelete); + channel.basicPublish("", queueName, null, message.toString().getBytes()); + return true; + } catch (Throwable e) { + throw new RuntimeException(e); + } + } - public boolean sendMessage( - final Message message, String queueName, boolean durable_var, boolean autodelete_var) - throws Exception { - try { - Channel channel = getOrCreateChannel(queueName, durable_var, autodelete_var); - channel.basicPublish("", queueName, null, message.toString().getBytes()); - return true; - } catch (Throwable e) { - throw new RuntimeException(e); - } - } + public boolean sendMessage( + final Message message, String queueName, boolean durable_var, boolean autodelete_var) + throws Exception { + try { + Channel channel = getOrCreateChannel(queueName, durable_var, autodelete_var); + channel.basicPublish("", queueName, null, message.toString().getBytes()); + return true; + } catch (Throwable e) { + throw new RuntimeException(e); + } + } - public void startConsumingMessage( - final String queueName, final boolean durable, final boolean autodelete) throws Exception { + public void startConsumingMessage( + final String queueName, final boolean durable, final boolean autodelete) throws Exception { - Channel channel = createChannel(createConnection(), queueName, durable, autodelete); - channel.basicConsume(queueName, false, new MessageConsumer(channel, queueMessages)); - } + Channel channel = createChannel(createConnection(), queueName, durable, autodelete); + channel.basicConsume(queueName, false, new MessageConsumer(channel, queueMessages)); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java index edca90061..72cbda252 100644 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java +++ b/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java @@ -1,6 +1,6 @@ + package eu.dnetlib.message; public enum MessageType { - ONGOING, - REPORT + ONGOING, REPORT } diff --git a/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelInfo.java b/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelInfo.java index 1ae6e8ead..e07fcef66 100644 --- a/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelInfo.java +++ b/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelInfo.java @@ -1,24 +1,25 @@ + package eu.dnetlib.scholexplorer.relation; import java.io.Serializable; public class RelInfo implements Serializable { - private String original; - private String inverse; + private String original; + private String inverse; - public String getOriginal() { - return original; - } + public String getOriginal() { + return original; + } - public void setOriginal(String original) { - this.original = original; - } + public void setOriginal(String original) { + this.original = original; + } - public String getInverse() { - return inverse; - } + public String getInverse() { + return inverse; + } - public void setInverse(String inverse) { - this.inverse = inverse; - } + public void setInverse(String inverse) { + this.inverse = inverse; + } } diff --git 
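A hedged sketch of publishing an ONGOING message with the Message/MessageManager classes above; the broker host, credentials, queue name and body values are placeholders, a reachable RabbitMQ instance is assumed, and the generic type parameters (stripped in the diff text) are restored as assumed:

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.LinkedBlockingQueue;

import eu.dnetlib.message.Message;
import eu.dnetlib.message.MessageManager;
import eu.dnetlib.message.MessageType;

public class SendOngoingMessage {

    public static void main(String[] args) throws Exception {
        // this queue is only filled by MessageConsumer when consuming; it stays unused for plain sends
        final LinkedBlockingQueue<Message> incoming = new LinkedBlockingQueue<>();
        final MessageManager manager = new MessageManager("localhost", "guest", "guest", false, false, incoming);

        final Map<String, String> body = new HashMap<>();
        body.put("parsedItem", "300");

        // sendMessage lazily opens the connection and a channel for the target queue
        manager.sendMessage(new Message("wf-1", "Collection", MessageType.ONGOING, body), "dhp.ongoing");

        manager.close();
    }
}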
a/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelationMapper.java b/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelationMapper.java index 9cc995821..eb708c390 100644 --- a/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelationMapper.java +++ b/dhp-common/src/main/java/eu/dnetlib/scholexplorer/relation/RelationMapper.java @@ -1,18 +1,20 @@ + package eu.dnetlib.scholexplorer.relation; -import com.fasterxml.jackson.databind.ObjectMapper; import java.io.Serializable; import java.util.HashMap; + import org.apache.commons.io.IOUtils; +import com.fasterxml.jackson.databind.ObjectMapper; + public class RelationMapper extends HashMap implements Serializable { - public static RelationMapper load() throws Exception { + public static RelationMapper load() throws Exception { - final String json = - IOUtils.toString(RelationMapper.class.getResourceAsStream("relations.json")); + final String json = IOUtils.toString(RelationMapper.class.getResourceAsStream("relations.json")); - ObjectMapper mapper = new ObjectMapper(); - return mapper.readValue(json, RelationMapper.class); - } + ObjectMapper mapper = new ObjectMapper(); + return mapper.readValue(json, RelationMapper.class); + } } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java index bb7351745..e14020830 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.application; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -8,58 +9,59 @@ import org.junit.jupiter.api.Test; public class ArgumentApplicationParserTest { - @Test - public void testParseParameter() throws Exception { - final String jsonConfiguration = - IOUtils.toString( - this.getClass().getResourceAsStream("/eu/dnetlib/application/parameters.json")); - assertNotNull(jsonConfiguration); - ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument( - new String[] { - "-p", - "value0", - "-a", - "value1", - "-n", - "value2", - "-u", - "value3", - "-ru", - "value4", - "-rp", - "value5", - "-rh", - "value6", - "-ro", - "value7", - "-rr", - "value8", - "-w", - "value9", - "-cc", - ArgumentApplicationParser.compressArgument(jsonConfiguration) - }); - assertNotNull(parser.get("hdfsPath")); - assertNotNull(parser.get("apidescriptor")); - assertNotNull(parser.get("namenode")); - assertNotNull(parser.get("userHDFS")); - assertNotNull(parser.get("rabbitUser")); - assertNotNull(parser.get("rabbitPassWord")); - assertNotNull(parser.get("rabbitHost")); - assertNotNull(parser.get("rabbitOngoingQueue")); - assertNotNull(parser.get("rabbitReportQueue")); - assertNotNull(parser.get("workflowId")); - assertEquals("value0", parser.get("hdfsPath")); - assertEquals("value1", parser.get("apidescriptor")); - assertEquals("value2", parser.get("namenode")); - assertEquals("value3", parser.get("userHDFS")); - assertEquals("value4", parser.get("rabbitUser")); - assertEquals("value5", parser.get("rabbitPassWord")); - assertEquals("value6", parser.get("rabbitHost")); - assertEquals("value7", parser.get("rabbitOngoingQueue")); - assertEquals("value8", parser.get("rabbitReportQueue")); - assertEquals("value9", parser.get("workflowId")); - assertEquals(jsonConfiguration, parser.get("ccCoco")); - } + @Test + 
public void testParseParameter() throws Exception { + final String jsonConfiguration = IOUtils + .toString( + this.getClass().getResourceAsStream("/eu/dnetlib/application/parameters.json")); + assertNotNull(jsonConfiguration); + ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser + .parseArgument( + new String[] { + "-p", + "value0", + "-a", + "value1", + "-n", + "value2", + "-u", + "value3", + "-ru", + "value4", + "-rp", + "value5", + "-rh", + "value6", + "-ro", + "value7", + "-rr", + "value8", + "-w", + "value9", + "-cc", + ArgumentApplicationParser.compressArgument(jsonConfiguration) + }); + assertNotNull(parser.get("hdfsPath")); + assertNotNull(parser.get("apidescriptor")); + assertNotNull(parser.get("namenode")); + assertNotNull(parser.get("userHDFS")); + assertNotNull(parser.get("rabbitUser")); + assertNotNull(parser.get("rabbitPassWord")); + assertNotNull(parser.get("rabbitHost")); + assertNotNull(parser.get("rabbitOngoingQueue")); + assertNotNull(parser.get("rabbitReportQueue")); + assertNotNull(parser.get("workflowId")); + assertEquals("value0", parser.get("hdfsPath")); + assertEquals("value1", parser.get("apidescriptor")); + assertEquals("value2", parser.get("namenode")); + assertEquals("value3", parser.get("userHDFS")); + assertEquals("value4", parser.get("rabbitUser")); + assertEquals("value5", parser.get("rabbitPassWord")); + assertEquals("value6", parser.get("rabbitHost")); + assertEquals("value7", parser.get("rabbitOngoingQueue")); + assertEquals("value8", parser.get("rabbitReportQueue")); + assertEquals("value9", parser.get("workflowId")); + assertEquals(jsonConfiguration, parser.get("ccCoco")); + } } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java index a8f0bbb0d..870943816 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.common; import static org.junit.jupiter.api.Assertions.*; @@ -8,6 +9,7 @@ import java.nio.file.Path; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; + import org.apache.hadoop.conf.Configuration; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; @@ -15,63 +17,64 @@ import org.junit.jupiter.api.io.TempDir; public class HdfsSupportTest { - @Nested - class Remove { + @Nested + class Remove { - @Test - public void shouldThrowARuntimeExceptionOnError() { - // when - assertThrows(RuntimeException.class, () -> HdfsSupport.remove(null, new Configuration())); - } + @Test + public void shouldThrowARuntimeExceptionOnError() { + // when + assertThrows(RuntimeException.class, () -> HdfsSupport.remove(null, new Configuration())); + } - @Test - public void shouldRemoveADirFromHDFS(@TempDir Path tempDir) { - // when - HdfsSupport.remove(tempDir.toString(), new Configuration()); + @Test + public void shouldRemoveADirFromHDFS(@TempDir Path tempDir) { + // when + HdfsSupport.remove(tempDir.toString(), new Configuration()); - // then - assertFalse(Files.exists(tempDir)); - } + // then + assertFalse(Files.exists(tempDir)); + } - @Test - public void shouldRemoveAFileFromHDFS(@TempDir Path tempDir) throws IOException { - // given - Path file = Files.createTempFile(tempDir, "p", "s"); + @Test + public void shouldRemoveAFileFromHDFS(@TempDir Path tempDir) throws IOException { + // given + Path file = Files.createTempFile(tempDir, "p", 
"s"); - // when - HdfsSupport.remove(file.toString(), new Configuration()); + // when + HdfsSupport.remove(file.toString(), new Configuration()); - // then - assertFalse(Files.exists(file)); - } - } + // then + assertFalse(Files.exists(file)); + } + } - @Nested - class ListFiles { + @Nested + class ListFiles { - @Test - public void shouldThrowARuntimeExceptionOnError() { - // when - assertThrows(RuntimeException.class, () -> HdfsSupport.listFiles(null, new Configuration())); - } + @Test + public void shouldThrowARuntimeExceptionOnError() { + // when + assertThrows(RuntimeException.class, () -> HdfsSupport.listFiles(null, new Configuration())); + } - @Test - public void shouldListFilesLocatedInPath(@TempDir Path tempDir) throws IOException { - Path subDir1 = Files.createTempDirectory(tempDir, "list_me"); - Path subDir2 = Files.createTempDirectory(tempDir, "list_me"); + @Test + public void shouldListFilesLocatedInPath(@TempDir Path tempDir) throws IOException { + Path subDir1 = Files.createTempDirectory(tempDir, "list_me"); + Path subDir2 = Files.createTempDirectory(tempDir, "list_me"); - // when - List paths = HdfsSupport.listFiles(tempDir.toString(), new Configuration()); + // when + List paths = HdfsSupport.listFiles(tempDir.toString(), new Configuration()); - // then - assertEquals(2, paths.size()); - List expecteds = - Arrays.stream(new String[] {subDir1.toString(), subDir2.toString()}) - .sorted() - .collect(Collectors.toList()); - List actuals = paths.stream().sorted().collect(Collectors.toList()); - assertTrue(actuals.get(0).contains(expecteds.get(0))); - assertTrue(actuals.get(1).contains(expecteds.get(1))); - } - } + // then + assertEquals(2, paths.size()); + List expecteds = Arrays.stream(new String[] { + subDir1.toString(), subDir2.toString() + }) + .sorted() + .collect(Collectors.toList()); + List actuals = paths.stream().sorted().collect(Collectors.toList()); + assertTrue(actuals.get(0).contains(expecteds.get(0))); + assertTrue(actuals.get(1).contains(expecteds.get(1))); + } + } } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java index 698b9cea5..2f01c0863 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java @@ -1,55 +1,58 @@ + package eu.dnetlib.dhp.common; import static org.mockito.Mockito.*; -import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer; import java.util.function.Function; + import org.apache.spark.SparkConf; import org.apache.spark.sql.SparkSession; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; +import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer; + public class SparkSessionSupportTest { - @Nested - class RunWithSparkSession { + @Nested + class RunWithSparkSession { - @Test - public void shouldExecuteFunctionAndNotStopSparkSessionWhenSparkSessionIsNotManaged() - throws Exception { - // given - SparkSession spark = mock(SparkSession.class); - SparkConf conf = mock(SparkConf.class); - Function sparkSessionBuilder = mock(Function.class); - when(sparkSessionBuilder.apply(conf)).thenReturn(spark); - ThrowingConsumer fn = mock(ThrowingConsumer.class); + @Test + public void shouldExecuteFunctionAndNotStopSparkSessionWhenSparkSessionIsNotManaged() + throws Exception { + // given + SparkSession spark = mock(SparkSession.class); + SparkConf conf = mock(SparkConf.class); + 
Function sparkSessionBuilder = mock(Function.class); + when(sparkSessionBuilder.apply(conf)).thenReturn(spark); + ThrowingConsumer fn = mock(ThrowingConsumer.class); - // when - SparkSessionSupport.runWithSparkSession(sparkSessionBuilder, conf, false, fn); + // when + SparkSessionSupport.runWithSparkSession(sparkSessionBuilder, conf, false, fn); - // then - verify(sparkSessionBuilder).apply(conf); - verify(fn).accept(spark); - verify(spark, never()).stop(); - } + // then + verify(sparkSessionBuilder).apply(conf); + verify(fn).accept(spark); + verify(spark, never()).stop(); + } - @Test - public void shouldExecuteFunctionAndStopSparkSessionWhenSparkSessionIsManaged() - throws Exception { - // given - SparkSession spark = mock(SparkSession.class); - SparkConf conf = mock(SparkConf.class); - Function sparkSessionBuilder = mock(Function.class); - when(sparkSessionBuilder.apply(conf)).thenReturn(spark); - ThrowingConsumer fn = mock(ThrowingConsumer.class); + @Test + public void shouldExecuteFunctionAndStopSparkSessionWhenSparkSessionIsManaged() + throws Exception { + // given + SparkSession spark = mock(SparkSession.class); + SparkConf conf = mock(SparkConf.class); + Function sparkSessionBuilder = mock(Function.class); + when(sparkSessionBuilder.apply(conf)).thenReturn(spark); + ThrowingConsumer fn = mock(ThrowingConsumer.class); - // when - SparkSessionSupport.runWithSparkSession(sparkSessionBuilder, conf, true, fn); + // when + SparkSessionSupport.runWithSparkSession(sparkSessionBuilder, conf, true, fn); - // then - verify(sparkSessionBuilder).apply(conf); - verify(fn).accept(spark); - verify(spark, times(1)).stop(); - } - } + // then + verify(sparkSessionBuilder).apply(conf); + verify(fn).accept(spark); + verify(spark, times(1)).stop(); + } + } } diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java index 84cb08d95..cb4d0ab50 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.model.mdstore; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -6,10 +7,10 @@ import org.junit.jupiter.api.Test; public class MetadataRecordTest { - @Test - public void getTimestamp() { + @Test + public void getTimestamp() { - MetadataRecord r = new MetadataRecord(); - assertTrue(r.getDateOfCollection() > 0); - } + MetadataRecord r = new MetadataRecord(); + assertTrue(r.getDateOfCollection() > 0); + } } diff --git a/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java b/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java index a514f8573..442f7b5c2 100644 --- a/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.message; import static org.junit.jupiter.api.Assertions.*; @@ -5,46 +6,46 @@ import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; import java.util.HashMap; import java.util.Map; + import org.junit.jupiter.api.Test; public class MessageTest { - @Test - public void fromJsonTest() throws IOException { - Message m = new Message(); - m.setWorkflowId("wId"); - m.setType(MessageType.ONGOING); - m.setJobName("Collection"); - Map body = new HashMap<>(); - body.put("parsedItem", "300"); - body.put("ExecutionTime", "30s"); + @Test + public void fromJsonTest() throws IOException { + 
Message m = new Message(); + m.setWorkflowId("wId"); + m.setType(MessageType.ONGOING); + m.setJobName("Collection"); + Map body = new HashMap<>(); + body.put("parsedItem", "300"); + body.put("ExecutionTime", "30s"); - m.setBody(body); - System.out.println("m = " + m); - Message m1 = Message.fromJson(m.toString()); - assertEquals(m1.getWorkflowId(), m.getWorkflowId()); - assertEquals(m1.getType(), m.getType()); - assertEquals(m1.getJobName(), m.getJobName()); + m.setBody(body); + System.out.println("m = " + m); + Message m1 = Message.fromJson(m.toString()); + assertEquals(m1.getWorkflowId(), m.getWorkflowId()); + assertEquals(m1.getType(), m.getType()); + assertEquals(m1.getJobName(), m.getJobName()); - assertNotNull(m1.getBody()); - m1.getBody().keySet().forEach(it -> assertEquals(m1.getBody().get(it), m.getBody().get(it))); - assertEquals(m1.getJobName(), m.getJobName()); - } + assertNotNull(m1.getBody()); + m1.getBody().keySet().forEach(it -> assertEquals(m1.getBody().get(it), m.getBody().get(it))); + assertEquals(m1.getJobName(), m.getJobName()); + } - @Test - public void toStringTest() { - final String expectedJson = - "{\"workflowId\":\"wId\",\"jobName\":\"Collection\",\"type\":\"ONGOING\",\"body\":{\"ExecutionTime\":\"30s\",\"parsedItem\":\"300\"}}"; - Message m = new Message(); - m.setWorkflowId("wId"); - m.setType(MessageType.ONGOING); - m.setJobName("Collection"); - Map body = new HashMap<>(); - body.put("parsedItem", "300"); - body.put("ExecutionTime", "30s"); + @Test + public void toStringTest() { + final String expectedJson = "{\"workflowId\":\"wId\",\"jobName\":\"Collection\",\"type\":\"ONGOING\",\"body\":{\"ExecutionTime\":\"30s\",\"parsedItem\":\"300\"}}"; + Message m = new Message(); + m.setWorkflowId("wId"); + m.setType(MessageType.ONGOING); + m.setJobName("Collection"); + Map body = new HashMap<>(); + body.put("parsedItem", "300"); + body.put("ExecutionTime", "30s"); - m.setBody(body); + m.setBody(body); - assertEquals(expectedJson, m.toString()); - } + assertEquals(expectedJson, m.toString()); + } } diff --git a/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java b/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java index 9381cb01f..d1d1ada71 100644 --- a/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java @@ -1,13 +1,14 @@ + package eu.dnetlib.scholexplorer.relation; import org.junit.jupiter.api.Test; public class RelationMapperTest { - @Test - public void testLoadRels() throws Exception { + @Test + public void testLoadRels() throws Exception { - RelationMapper relationMapper = RelationMapper.load(); - relationMapper.keySet().forEach(System.out::println); - } + RelationMapper relationMapper = RelationMapper.load(); + relationMapper.keySet().forEach(System.out::println); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java index c803fab52..84b22c81c 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java @@ -1,36 +1,40 @@ + package eu.dnetlib.dhp.schema.action; -import com.fasterxml.jackson.databind.annotation.JsonDeserialize; -import eu.dnetlib.dhp.schema.oaf.Oaf; import java.io.Serializable; +import com.fasterxml.jackson.databind.annotation.JsonDeserialize; + +import 
eu.dnetlib.dhp.schema.oaf.Oaf; + @JsonDeserialize(using = AtomicActionDeserializer.class) public class AtomicAction implements Serializable { - private Class clazz; + private Class clazz; - private T payload; + private T payload; - public AtomicAction() {} + public AtomicAction() { + } - public AtomicAction(Class clazz, T payload) { - this.clazz = clazz; - this.payload = payload; - } + public AtomicAction(Class clazz, T payload) { + this.clazz = clazz; + this.payload = payload; + } - public Class getClazz() { - return clazz; - } + public Class getClazz() { + return clazz; + } - public void setClazz(Class clazz) { - this.clazz = clazz; - } + public void setClazz(Class clazz) { + this.clazz = clazz; + } - public T getPayload() { - return payload; - } + public T getPayload() { + return payload; + } - public void setPayload(T payload) { - this.payload = payload; - } + public void setPayload(T payload) { + this.payload = payload; + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java index 701833c42..a9543d27a 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java @@ -1,29 +1,32 @@ + package eu.dnetlib.dhp.schema.action; +import java.io.IOException; + import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.DeserializationContext; import com.fasterxml.jackson.databind.JsonDeserializer; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; + import eu.dnetlib.dhp.schema.oaf.Oaf; -import java.io.IOException; public class AtomicActionDeserializer extends JsonDeserializer { - @Override - public Object deserialize(JsonParser jp, DeserializationContext ctxt) - throws IOException, JsonProcessingException { - JsonNode node = jp.getCodec().readTree(jp); - String classTag = node.get("clazz").asText(); - JsonNode payload = node.get("payload"); - ObjectMapper mapper = new ObjectMapper(); + @Override + public Object deserialize(JsonParser jp, DeserializationContext ctxt) + throws IOException, JsonProcessingException { + JsonNode node = jp.getCodec().readTree(jp); + String classTag = node.get("clazz").asText(); + JsonNode payload = node.get("payload"); + ObjectMapper mapper = new ObjectMapper(); - try { - final Class clazz = Class.forName(classTag); - return new AtomicAction(clazz, (Oaf) mapper.readValue(payload.toString(), clazz)); - } catch (ClassNotFoundException e) { - throw new IOException(e); - } - } + try { + final Class clazz = Class.forName(classTag); + return new AtomicAction(clazz, (Oaf) mapper.readValue(payload.toString(), clazz)); + } catch (ClassNotFoundException e) { + throw new IOException(e); + } + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/EntityType.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/EntityType.java index d597ecb53..54f30cf33 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/EntityType.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/EntityType.java @@ -1,26 +1,21 @@ + package eu.dnetlib.dhp.schema.common; import eu.dnetlib.dhp.schema.oaf.OafEntity; /** Actual entity types in the Graph */ public enum EntityType { - publication, - dataset, - otherresearchproduct, - software, - datasource, - organization, - 
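A small round-trip sketch for AtomicAction and AtomicActionDeserializer; it assumes Jackson's default handling writes the clazz field as the fully-qualified class name (which is what the deserializer reads back) and uses Dataset, declared further below, purely as a sample payload:

import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.Dataset;

public class AtomicActionRoundTrip {

    public static void main(String[] args) throws Exception {
        final ObjectMapper mapper = new ObjectMapper();

        // serialized form looks like {"clazz":"eu.dnetlib.dhp.schema.oaf.Dataset","payload":{...}}
        final String json = mapper.writeValueAsString(new AtomicAction<>(Dataset.class, new Dataset()));

        // @JsonDeserialize routes this through AtomicActionDeserializer, which re-instantiates the payload class
        final AtomicAction<Dataset> decoded = mapper.readValue(json, AtomicAction.class);
        System.out.println(decoded.getClazz().getSimpleName()); // Dataset
        System.out.println(decoded.getPayload() != null);       // true
    }
}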
project; + publication, dataset, otherresearchproduct, software, datasource, organization, project; - /** - * Resolves the EntityType, given the relative class name - * - * @param clazz the given class name - * @param actual OafEntity subclass - * @return the EntityType associated to the given class - */ - public static EntityType fromClass(Class clazz) { + /** + * Resolves the EntityType, given the relative class name + * + * @param clazz the given class name + * @param actual OafEntity subclass + * @return the EntityType associated to the given class + */ + public static EntityType fromClass(Class clazz) { - return EntityType.valueOf(clazz.getSimpleName().toLowerCase()); - } + return EntityType.valueOf(clazz.getSimpleName().toLowerCase()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/MainEntityType.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/MainEntityType.java index 466cdc9e9..cda8ba484 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/MainEntityType.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/MainEntityType.java @@ -1,9 +1,7 @@ + package eu.dnetlib.dhp.schema.common; /** Main entity types in the Graph */ public enum MainEntityType { - result, - datasource, - organization, - project + result, datasource, organization, project } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java index 0dfdaad52..c6bfff12d 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java @@ -1,40 +1,41 @@ + package eu.dnetlib.dhp.schema.common; import eu.dnetlib.dhp.schema.oaf.Qualifier; public class ModelConstants { - public static final String DNET_RESULT_TYPOLOGIES = "dnet:result_typologies"; + public static final String DNET_RESULT_TYPOLOGIES = "dnet:result_typologies"; - public static final String DATASET_RESULTTYPE_CLASSID = "dataset"; - public static final String PUBLICATION_RESULTTYPE_CLASSID = "publication"; - public static final String SOFTWARE_RESULTTYPE_CLASSID = "software"; - public static final String ORP_RESULTTYPE_CLASSID = "other"; + public static final String DATASET_RESULTTYPE_CLASSID = "dataset"; + public static final String PUBLICATION_RESULTTYPE_CLASSID = "publication"; + public static final String SOFTWARE_RESULTTYPE_CLASSID = "software"; + public static final String ORP_RESULTTYPE_CLASSID = "other"; - public static Qualifier PUBLICATION_DEFAULT_RESULTTYPE = new Qualifier(); - public static Qualifier DATASET_DEFAULT_RESULTTYPE = new Qualifier(); - public static Qualifier SOFTWARE_DEFAULT_RESULTTYPE = new Qualifier(); - public static Qualifier ORP_DEFAULT_RESULTTYPE = new Qualifier(); + public static Qualifier PUBLICATION_DEFAULT_RESULTTYPE = new Qualifier(); + public static Qualifier DATASET_DEFAULT_RESULTTYPE = new Qualifier(); + public static Qualifier SOFTWARE_DEFAULT_RESULTTYPE = new Qualifier(); + public static Qualifier ORP_DEFAULT_RESULTTYPE = new Qualifier(); - static { - PUBLICATION_DEFAULT_RESULTTYPE.setClassid(PUBLICATION_RESULTTYPE_CLASSID); - PUBLICATION_DEFAULT_RESULTTYPE.setClassname(PUBLICATION_RESULTTYPE_CLASSID); - PUBLICATION_DEFAULT_RESULTTYPE.setSchemeid(DNET_RESULT_TYPOLOGIES); - PUBLICATION_DEFAULT_RESULTTYPE.setSchemename(DNET_RESULT_TYPOLOGIES); + static { + PUBLICATION_DEFAULT_RESULTTYPE.setClassid(PUBLICATION_RESULTTYPE_CLASSID); + 
PUBLICATION_DEFAULT_RESULTTYPE.setClassname(PUBLICATION_RESULTTYPE_CLASSID); + PUBLICATION_DEFAULT_RESULTTYPE.setSchemeid(DNET_RESULT_TYPOLOGIES); + PUBLICATION_DEFAULT_RESULTTYPE.setSchemename(DNET_RESULT_TYPOLOGIES); - DATASET_DEFAULT_RESULTTYPE.setClassid(DATASET_RESULTTYPE_CLASSID); - DATASET_DEFAULT_RESULTTYPE.setClassname(DATASET_RESULTTYPE_CLASSID); - DATASET_DEFAULT_RESULTTYPE.setSchemeid(DNET_RESULT_TYPOLOGIES); - DATASET_DEFAULT_RESULTTYPE.setSchemename(DNET_RESULT_TYPOLOGIES); + DATASET_DEFAULT_RESULTTYPE.setClassid(DATASET_RESULTTYPE_CLASSID); + DATASET_DEFAULT_RESULTTYPE.setClassname(DATASET_RESULTTYPE_CLASSID); + DATASET_DEFAULT_RESULTTYPE.setSchemeid(DNET_RESULT_TYPOLOGIES); + DATASET_DEFAULT_RESULTTYPE.setSchemename(DNET_RESULT_TYPOLOGIES); - SOFTWARE_DEFAULT_RESULTTYPE.setClassid(SOFTWARE_RESULTTYPE_CLASSID); - SOFTWARE_DEFAULT_RESULTTYPE.setClassname(SOFTWARE_RESULTTYPE_CLASSID); - SOFTWARE_DEFAULT_RESULTTYPE.setSchemeid(DNET_RESULT_TYPOLOGIES); - SOFTWARE_DEFAULT_RESULTTYPE.setSchemename(DNET_RESULT_TYPOLOGIES); + SOFTWARE_DEFAULT_RESULTTYPE.setClassid(SOFTWARE_RESULTTYPE_CLASSID); + SOFTWARE_DEFAULT_RESULTTYPE.setClassname(SOFTWARE_RESULTTYPE_CLASSID); + SOFTWARE_DEFAULT_RESULTTYPE.setSchemeid(DNET_RESULT_TYPOLOGIES); + SOFTWARE_DEFAULT_RESULTTYPE.setSchemename(DNET_RESULT_TYPOLOGIES); - ORP_DEFAULT_RESULTTYPE.setClassid(ORP_RESULTTYPE_CLASSID); - ORP_DEFAULT_RESULTTYPE.setClassname(ORP_RESULTTYPE_CLASSID); - ORP_DEFAULT_RESULTTYPE.setSchemeid(DNET_RESULT_TYPOLOGIES); - ORP_DEFAULT_RESULTTYPE.setSchemename(DNET_RESULT_TYPOLOGIES); - } + ORP_DEFAULT_RESULTTYPE.setClassid(ORP_RESULTTYPE_CLASSID); + ORP_DEFAULT_RESULTTYPE.setClassname(ORP_RESULTTYPE_CLASSID); + ORP_DEFAULT_RESULTTYPE.setSchemeid(DNET_RESULT_TYPOLOGIES); + ORP_DEFAULT_RESULTTYPE.setSchemename(DNET_RESULT_TYPOLOGIES); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java index e004e5800..89398455e 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -5,92 +6,95 @@ import java.util.*; public class Author implements Serializable { - private String fullname; + private String fullname; - private String name; + private String name; - private String surname; + private String surname; - private Integer rank; + private Integer rank; - private List pid; + private List pid; - private List> affiliation; + private List> affiliation; - public String getFullname() { - return fullname; - } + public String getFullname() { + return fullname; + } - public void setFullname(String fullname) { - this.fullname = fullname; - } + public void setFullname(String fullname) { + this.fullname = fullname; + } - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) { + this.name = name; + } - public String getSurname() { - return surname; - } + public String getSurname() { + return surname; + } - public void setSurname(String surname) { - this.surname = surname; - } + public void setSurname(String surname) { + this.surname = surname; + } - public Integer getRank() { - return rank; - } + public Integer getRank() { + return rank; + } - public void setRank(Integer rank) { - this.rank = rank; - } + public void setRank(Integer rank) { + 
this.rank = rank; + } - public List getPid() { - return pid; - } + public List getPid() { + return pid; + } - public void setPid(List pid) { - this.pid = pid; - } + public void setPid(List pid) { + this.pid = pid; + } - public List> getAffiliation() { - return affiliation; - } + public List> getAffiliation() { + return affiliation; + } - public void setAffiliation(List> affiliation) { - this.affiliation = affiliation; - } + public void setAffiliation(List> affiliation) { + this.affiliation = affiliation; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - Author author = (Author) o; - return Objects.equals(fullname, author.fullname) - && Objects.equals(name, author.name) - && Objects.equals(surname, author.surname) - && Objects.equals(rank, author.rank) - && Objects.equals(pid, author.pid) - && Objects.equals(affiliation, author.affiliation); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + Author author = (Author) o; + return Objects.equals(fullname, author.fullname) + && Objects.equals(name, author.name) + && Objects.equals(surname, author.surname) + && Objects.equals(rank, author.rank) + && Objects.equals(pid, author.pid) + && Objects.equals(affiliation, author.affiliation); + } - @Override - public int hashCode() { - return Objects.hash(fullname, name, surname, rank, pid, affiliation); - } + @Override + public int hashCode() { + return Objects.hash(fullname, name, surname, rank, pid, affiliation); + } - public void addPid(StructuredProperty pid) { + public void addPid(StructuredProperty pid) { - if (pid == null) return; + if (pid == null) + return; - if (this.pid == null) { - this.pid = Arrays.asList(pid); - } else { - this.pid.add(pid); - } - } + if (this.pid == null) { + this.pid = Arrays.asList(pid); + } else { + this.pid.add(pid); + } + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java index 7d930630d..57912c463 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java @@ -1,42 +1,46 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; import java.util.List; public class Context implements Serializable { - private String id; + private String id; - private List dataInfo; + private List dataInfo; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public List getDataInfo() { - return dataInfo; - } + public List getDataInfo() { + return dataInfo; + } - public void setDataInfo(List dataInfo) { - this.dataInfo = dataInfo; - } + public void setDataInfo(List dataInfo) { + this.dataInfo = dataInfo; + } - @Override - public int hashCode() { - return id == null ? 0 : id.hashCode(); - } + @Override + public int hashCode() { + return id == null ? 
0 : id.hashCode(); + } - @Override - public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; - Context other = (Context) obj; + Context other = (Context) obj; - return id.equals(other.getId()); - } + return id.equals(other.getId()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java index 388b9aab6..e25fdcade 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java @@ -1,30 +1,34 @@ + package eu.dnetlib.dhp.schema.oaf; import java.util.Objects; public class Country extends Qualifier { - private DataInfo dataInfo; + private DataInfo dataInfo; - public DataInfo getDataInfo() { - return dataInfo; - } + public DataInfo getDataInfo() { + return dataInfo; + } - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } + public void setDataInfo(DataInfo dataInfo) { + this.dataInfo = dataInfo; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - if (!super.equals(o)) return false; - Country country = (Country) o; - return Objects.equals(dataInfo, country.dataInfo); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + if (!super.equals(o)) + return false; + Country country = (Country) o; + return Objects.equals(dataInfo, country.dataInfo); + } - @Override - public int hashCode() { - return Objects.hash(super.hashCode(), dataInfo); - } + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), dataInfo); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/DataInfo.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/DataInfo.java index f65518a1f..cc77e1ea0 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/DataInfo.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/DataInfo.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -5,77 +6,80 @@ import java.util.Objects; public class DataInfo implements Serializable { - private Boolean invisible = false; - private Boolean inferred; - private Boolean deletedbyinference; - private String trust; - private String inferenceprovenance; - private Qualifier provenanceaction; + private Boolean invisible = false; + private Boolean inferred; + private Boolean deletedbyinference; + private String trust; + private String inferenceprovenance; + private Qualifier provenanceaction; - public Boolean getInvisible() { - return invisible; - } + public Boolean getInvisible() { + return invisible; + } - public void setInvisible(Boolean invisible) { - this.invisible = invisible; - } + public void setInvisible(Boolean invisible) { + this.invisible = invisible; + } - public Boolean getInferred() { - return inferred; - } + public Boolean getInferred() { + return inferred; + } - public void setInferred(Boolean inferred) { - this.inferred = inferred; - } + public void setInferred(Boolean inferred) { + this.inferred = inferred; + } - public Boolean getDeletedbyinference() { - return deletedbyinference; - } + public Boolean 
getDeletedbyinference() { + return deletedbyinference; + } - public void setDeletedbyinference(Boolean deletedbyinference) { - this.deletedbyinference = deletedbyinference; - } + public void setDeletedbyinference(Boolean deletedbyinference) { + this.deletedbyinference = deletedbyinference; + } - public String getTrust() { - return trust; - } + public String getTrust() { + return trust; + } - public void setTrust(String trust) { - this.trust = trust; - } + public void setTrust(String trust) { + this.trust = trust; + } - public String getInferenceprovenance() { - return inferenceprovenance; - } + public String getInferenceprovenance() { + return inferenceprovenance; + } - public void setInferenceprovenance(String inferenceprovenance) { - this.inferenceprovenance = inferenceprovenance; - } + public void setInferenceprovenance(String inferenceprovenance) { + this.inferenceprovenance = inferenceprovenance; + } - public Qualifier getProvenanceaction() { - return provenanceaction; - } + public Qualifier getProvenanceaction() { + return provenanceaction; + } - public void setProvenanceaction(Qualifier provenanceaction) { - this.provenanceaction = provenanceaction; - } + public void setProvenanceaction(Qualifier provenanceaction) { + this.provenanceaction = provenanceaction; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - DataInfo dataInfo = (DataInfo) o; - return Objects.equals(invisible, dataInfo.invisible) - && Objects.equals(inferred, dataInfo.inferred) - && Objects.equals(deletedbyinference, dataInfo.deletedbyinference) - && Objects.equals(trust, dataInfo.trust) - && Objects.equals(inferenceprovenance, dataInfo.inferenceprovenance) - && Objects.equals(provenanceaction, dataInfo.provenanceaction); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + DataInfo dataInfo = (DataInfo) o; + return Objects.equals(invisible, dataInfo.invisible) + && Objects.equals(inferred, dataInfo.inferred) + && Objects.equals(deletedbyinference, dataInfo.deletedbyinference) + && Objects.equals(trust, dataInfo.trust) + && Objects.equals(inferenceprovenance, dataInfo.inferenceprovenance) + && Objects.equals(provenanceaction, dataInfo.provenanceaction); + } - @Override - public int hashCode() { - return Objects.hash( - invisible, inferred, deletedbyinference, trust, inferenceprovenance, provenanceaction); - } + @Override + public int hashCode() { + return Objects + .hash( + invisible, inferred, deletedbyinference, trust, inferenceprovenance, provenanceaction); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Dataset.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Dataset.java index 93b51f352..07ddbb00e 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Dataset.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Dataset.java @@ -1,116 +1,115 @@ + package eu.dnetlib.dhp.schema.oaf; -import eu.dnetlib.dhp.schema.common.ModelConstants; import java.io.Serializable; import java.util.List; +import eu.dnetlib.dhp.schema.common.ModelConstants; + public class Dataset extends Result implements Serializable { - private Field storagedate; + private Field storagedate; - private Field device; + private Field device; - private Field size; + private Field size; - private Field version; + private Field version; - private Field lastmetadataupdate; + private Field lastmetadataupdate; - 
private Field metadataversionnumber; + private Field metadataversionnumber; - private List geolocation; + private List geolocation; - public Dataset() { - setResulttype(ModelConstants.DATASET_DEFAULT_RESULTTYPE); - } + public Dataset() { + setResulttype(ModelConstants.DATASET_DEFAULT_RESULTTYPE); + } - public Field getStoragedate() { - return storagedate; - } + public Field getStoragedate() { + return storagedate; + } - public void setStoragedate(Field storagedate) { - this.storagedate = storagedate; - } + public void setStoragedate(Field storagedate) { + this.storagedate = storagedate; + } - public Field getDevice() { - return device; - } + public Field getDevice() { + return device; + } - public void setDevice(Field device) { - this.device = device; - } + public void setDevice(Field device) { + this.device = device; + } - public Field getSize() { - return size; - } + public Field getSize() { + return size; + } - public void setSize(Field size) { - this.size = size; - } + public void setSize(Field size) { + this.size = size; + } - public Field getVersion() { - return version; - } + public Field getVersion() { + return version; + } - public void setVersion(Field version) { - this.version = version; - } + public void setVersion(Field version) { + this.version = version; + } - public Field getLastmetadataupdate() { - return lastmetadataupdate; - } + public Field getLastmetadataupdate() { + return lastmetadataupdate; + } - public void setLastmetadataupdate(Field lastmetadataupdate) { - this.lastmetadataupdate = lastmetadataupdate; - } + public void setLastmetadataupdate(Field lastmetadataupdate) { + this.lastmetadataupdate = lastmetadataupdate; + } - public Field getMetadataversionnumber() { - return metadataversionnumber; - } + public Field getMetadataversionnumber() { + return metadataversionnumber; + } - public void setMetadataversionnumber(Field metadataversionnumber) { - this.metadataversionnumber = metadataversionnumber; - } + public void setMetadataversionnumber(Field metadataversionnumber) { + this.metadataversionnumber = metadataversionnumber; + } - public List getGeolocation() { - return geolocation; - } + public List getGeolocation() { + return geolocation; + } - public void setGeolocation(List geolocation) { - this.geolocation = geolocation; - } + public void setGeolocation(List geolocation) { + this.geolocation = geolocation; + } - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); - if (!Dataset.class.isAssignableFrom(e.getClass())) { - return; - } + if (!Dataset.class.isAssignableFrom(e.getClass())) { + return; + } - final Dataset d = (Dataset) e; + final Dataset d = (Dataset) e; - storagedate = - d.getStoragedate() != null && compareTrust(this, e) < 0 ? d.getStoragedate() : storagedate; + storagedate = d.getStoragedate() != null && compareTrust(this, e) < 0 ? d.getStoragedate() : storagedate; - device = d.getDevice() != null && compareTrust(this, e) < 0 ? d.getDevice() : device; + device = d.getDevice() != null && compareTrust(this, e) < 0 ? d.getDevice() : device; - size = d.getSize() != null && compareTrust(this, e) < 0 ? d.getSize() : size; + size = d.getSize() != null && compareTrust(this, e) < 0 ? d.getSize() : size; - version = d.getVersion() != null && compareTrust(this, e) < 0 ? d.getVersion() : version; + version = d.getVersion() != null && compareTrust(this, e) < 0 ? 
d.getVersion() : version; - lastmetadataupdate = - d.getLastmetadataupdate() != null && compareTrust(this, e) < 0 - ? d.getLastmetadataupdate() - : lastmetadataupdate; + lastmetadataupdate = d.getLastmetadataupdate() != null && compareTrust(this, e) < 0 + ? d.getLastmetadataupdate() + : lastmetadataupdate; - metadataversionnumber = - d.getMetadataversionnumber() != null && compareTrust(this, e) < 0 - ? d.getMetadataversionnumber() - : metadataversionnumber; + metadataversionnumber = d.getMetadataversionnumber() != null && compareTrust(this, e) < 0 + ? d.getMetadataversionnumber() + : metadataversionnumber; - geolocation = mergeLists(geolocation, d.getGeolocation()); + geolocation = mergeLists(geolocation, d.getGeolocation()); - mergeOAFDataInfo(d); - } + mergeOAFDataInfo(d); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java index f0c797631..721798206 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -5,494 +6,467 @@ import java.util.List; public class Datasource extends OafEntity implements Serializable { - private Qualifier datasourcetype; + private Qualifier datasourcetype; - private Qualifier openairecompatibility; + private Qualifier openairecompatibility; - private Field officialname; + private Field officialname; - private Field englishname; + private Field englishname; - private Field websiteurl; + private Field websiteurl; - private Field logourl; + private Field logourl; - private Field contactemail; + private Field contactemail; - private Field namespaceprefix; + private Field namespaceprefix; - private Field latitude; + private Field latitude; - private Field longitude; + private Field longitude; - private Field dateofvalidation; + private Field dateofvalidation; - private Field description; + private Field description; - private List subjects; + private List subjects; - // opendoar specific fields (od*) - private Field odnumberofitems; + // opendoar specific fields (od*) + private Field odnumberofitems; - private Field odnumberofitemsdate; + private Field odnumberofitemsdate; - private Field odpolicies; + private Field odpolicies; - private List> odlanguages; + private List> odlanguages; - private List> odcontenttypes; + private List> odcontenttypes; - private List> accessinfopackage; + private List> accessinfopackage; - // re3data fields - private Field releasestartdate; + // re3data fields + private Field releasestartdate; - private Field releaseenddate; + private Field releaseenddate; - private Field missionstatementurl; + private Field missionstatementurl; - private Field dataprovider; + private Field dataprovider; - private Field serviceprovider; + private Field serviceprovider; - // {open, restricted or closed} - private Field databaseaccesstype; + // {open, restricted or closed} + private Field databaseaccesstype; - // {open, restricted or closed} - private Field datauploadtype; + // {open, restricted or closed} + private Field datauploadtype; - // {feeRequired, registration, other} - private Field databaseaccessrestriction; + // {feeRequired, registration, other} + private Field databaseaccessrestriction; - // {feeRequired, registration, other} - private Field datauploadrestriction; + // {feeRequired, registration, other} + private Field datauploadrestriction; - private Field 
versioning; + private Field versioning; - private Field citationguidelineurl; + private Field citationguidelineurl; - // {yes, no, uknown} - private Field qualitymanagementkind; + // {yes, no, uknown} + private Field qualitymanagementkind; - private Field pidsystems; + private Field pidsystems; - private Field certificates; + private Field certificates; - private List policies; + private List policies; - private Journal journal; + private Journal journal; - public Qualifier getDatasourcetype() { - return datasourcetype; - } + public Qualifier getDatasourcetype() { + return datasourcetype; + } - public void setDatasourcetype(Qualifier datasourcetype) { - this.datasourcetype = datasourcetype; - } + public void setDatasourcetype(Qualifier datasourcetype) { + this.datasourcetype = datasourcetype; + } - public Qualifier getOpenairecompatibility() { - return openairecompatibility; - } + public Qualifier getOpenairecompatibility() { + return openairecompatibility; + } - public void setOpenairecompatibility(Qualifier openairecompatibility) { - this.openairecompatibility = openairecompatibility; - } + public void setOpenairecompatibility(Qualifier openairecompatibility) { + this.openairecompatibility = openairecompatibility; + } - public Field getOfficialname() { - return officialname; - } + public Field getOfficialname() { + return officialname; + } - public void setOfficialname(Field officialname) { - this.officialname = officialname; - } + public void setOfficialname(Field officialname) { + this.officialname = officialname; + } - public Field getEnglishname() { - return englishname; - } + public Field getEnglishname() { + return englishname; + } - public void setEnglishname(Field englishname) { - this.englishname = englishname; - } + public void setEnglishname(Field englishname) { + this.englishname = englishname; + } - public Field getWebsiteurl() { - return websiteurl; - } + public Field getWebsiteurl() { + return websiteurl; + } - public void setWebsiteurl(Field websiteurl) { - this.websiteurl = websiteurl; - } + public void setWebsiteurl(Field websiteurl) { + this.websiteurl = websiteurl; + } - public Field getLogourl() { - return logourl; - } + public Field getLogourl() { + return logourl; + } - public void setLogourl(Field logourl) { - this.logourl = logourl; - } + public void setLogourl(Field logourl) { + this.logourl = logourl; + } - public Field getContactemail() { - return contactemail; - } + public Field getContactemail() { + return contactemail; + } - public void setContactemail(Field contactemail) { - this.contactemail = contactemail; - } + public void setContactemail(Field contactemail) { + this.contactemail = contactemail; + } - public Field getNamespaceprefix() { - return namespaceprefix; - } + public Field getNamespaceprefix() { + return namespaceprefix; + } - public void setNamespaceprefix(Field namespaceprefix) { - this.namespaceprefix = namespaceprefix; - } + public void setNamespaceprefix(Field namespaceprefix) { + this.namespaceprefix = namespaceprefix; + } - public Field getLatitude() { - return latitude; - } + public Field getLatitude() { + return latitude; + } - public void setLatitude(Field latitude) { - this.latitude = latitude; - } + public void setLatitude(Field latitude) { + this.latitude = latitude; + } - public Field getLongitude() { - return longitude; - } + public Field getLongitude() { + return longitude; + } - public void setLongitude(Field longitude) { - this.longitude = longitude; - } + public void setLongitude(Field longitude) { + this.longitude = 
longitude; + } - public Field getDateofvalidation() { - return dateofvalidation; - } + public Field getDateofvalidation() { + return dateofvalidation; + } - public void setDateofvalidation(Field dateofvalidation) { - this.dateofvalidation = dateofvalidation; - } + public void setDateofvalidation(Field dateofvalidation) { + this.dateofvalidation = dateofvalidation; + } - public Field getDescription() { - return description; - } + public Field getDescription() { + return description; + } - public void setDescription(Field description) { - this.description = description; - } + public void setDescription(Field description) { + this.description = description; + } - public List getSubjects() { - return subjects; - } + public List getSubjects() { + return subjects; + } - public void setSubjects(List subjects) { - this.subjects = subjects; - } + public void setSubjects(List subjects) { + this.subjects = subjects; + } - public Field getOdnumberofitems() { - return odnumberofitems; - } + public Field getOdnumberofitems() { + return odnumberofitems; + } - public void setOdnumberofitems(Field odnumberofitems) { - this.odnumberofitems = odnumberofitems; - } + public void setOdnumberofitems(Field odnumberofitems) { + this.odnumberofitems = odnumberofitems; + } - public Field getOdnumberofitemsdate() { - return odnumberofitemsdate; - } + public Field getOdnumberofitemsdate() { + return odnumberofitemsdate; + } - public void setOdnumberofitemsdate(Field odnumberofitemsdate) { - this.odnumberofitemsdate = odnumberofitemsdate; - } + public void setOdnumberofitemsdate(Field odnumberofitemsdate) { + this.odnumberofitemsdate = odnumberofitemsdate; + } - public Field getOdpolicies() { - return odpolicies; - } + public Field getOdpolicies() { + return odpolicies; + } - public void setOdpolicies(Field odpolicies) { - this.odpolicies = odpolicies; - } + public void setOdpolicies(Field odpolicies) { + this.odpolicies = odpolicies; + } - public List> getOdlanguages() { - return odlanguages; - } + public List> getOdlanguages() { + return odlanguages; + } - public void setOdlanguages(List> odlanguages) { - this.odlanguages = odlanguages; - } + public void setOdlanguages(List> odlanguages) { + this.odlanguages = odlanguages; + } - public List> getOdcontenttypes() { - return odcontenttypes; - } + public List> getOdcontenttypes() { + return odcontenttypes; + } - public void setOdcontenttypes(List> odcontenttypes) { - this.odcontenttypes = odcontenttypes; - } + public void setOdcontenttypes(List> odcontenttypes) { + this.odcontenttypes = odcontenttypes; + } - public List> getAccessinfopackage() { - return accessinfopackage; - } + public List> getAccessinfopackage() { + return accessinfopackage; + } - public void setAccessinfopackage(List> accessinfopackage) { - this.accessinfopackage = accessinfopackage; - } + public void setAccessinfopackage(List> accessinfopackage) { + this.accessinfopackage = accessinfopackage; + } - public Field getReleasestartdate() { - return releasestartdate; - } + public Field getReleasestartdate() { + return releasestartdate; + } - public void setReleasestartdate(Field releasestartdate) { - this.releasestartdate = releasestartdate; - } + public void setReleasestartdate(Field releasestartdate) { + this.releasestartdate = releasestartdate; + } - public Field getReleaseenddate() { - return releaseenddate; - } + public Field getReleaseenddate() { + return releaseenddate; + } - public void setReleaseenddate(Field releaseenddate) { - this.releaseenddate = releaseenddate; - } + public void 
setReleaseenddate(Field releaseenddate) { + this.releaseenddate = releaseenddate; + } - public Field getMissionstatementurl() { - return missionstatementurl; - } + public Field getMissionstatementurl() { + return missionstatementurl; + } - public void setMissionstatementurl(Field missionstatementurl) { - this.missionstatementurl = missionstatementurl; - } + public void setMissionstatementurl(Field missionstatementurl) { + this.missionstatementurl = missionstatementurl; + } - public Field getDataprovider() { - return dataprovider; - } + public Field getDataprovider() { + return dataprovider; + } - public void setDataprovider(Field dataprovider) { - this.dataprovider = dataprovider; - } + public void setDataprovider(Field dataprovider) { + this.dataprovider = dataprovider; + } - public Field getServiceprovider() { - return serviceprovider; - } + public Field getServiceprovider() { + return serviceprovider; + } - public void setServiceprovider(Field serviceprovider) { - this.serviceprovider = serviceprovider; - } + public void setServiceprovider(Field serviceprovider) { + this.serviceprovider = serviceprovider; + } - public Field getDatabaseaccesstype() { - return databaseaccesstype; - } + public Field getDatabaseaccesstype() { + return databaseaccesstype; + } - public void setDatabaseaccesstype(Field databaseaccesstype) { - this.databaseaccesstype = databaseaccesstype; - } - - public Field getDatauploadtype() { - return datauploadtype; - } - - public void setDatauploadtype(Field datauploadtype) { - this.datauploadtype = datauploadtype; - } - - public Field getDatabaseaccessrestriction() { - return databaseaccessrestriction; - } - - public void setDatabaseaccessrestriction(Field databaseaccessrestriction) { - this.databaseaccessrestriction = databaseaccessrestriction; - } - - public Field getDatauploadrestriction() { - return datauploadrestriction; - } - - public void setDatauploadrestriction(Field datauploadrestriction) { - this.datauploadrestriction = datauploadrestriction; - } - - public Field getVersioning() { - return versioning; - } - - public void setVersioning(Field versioning) { - this.versioning = versioning; - } - - public Field getCitationguidelineurl() { - return citationguidelineurl; - } - - public void setCitationguidelineurl(Field citationguidelineurl) { - this.citationguidelineurl = citationguidelineurl; - } - - public Field getQualitymanagementkind() { - return qualitymanagementkind; - } - - public void setQualitymanagementkind(Field qualitymanagementkind) { - this.qualitymanagementkind = qualitymanagementkind; - } - - public Field getPidsystems() { - return pidsystems; - } - - public void setPidsystems(Field pidsystems) { - this.pidsystems = pidsystems; - } - - public Field getCertificates() { - return certificates; - } - - public void setCertificates(Field certificates) { - this.certificates = certificates; - } - - public List getPolicies() { - return policies; - } - - public void setPolicies(List policies) { - this.policies = policies; - } - - public Journal getJournal() { - return journal; - } - - public void setJournal(Journal journal) { - this.journal = journal; - } - - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - - if (!Datasource.class.isAssignableFrom(e.getClass())) { - return; - } - - Datasource d = (Datasource) e; - - datasourcetype = - d.getDatasourcetype() != null && compareTrust(this, e) < 0 - ? d.getDatasourcetype() - : datasourcetype; - openairecompatibility = - d.getOpenairecompatibility() != null && compareTrust(this, e) < 0 - ? 
d.getOpenairecompatibility() - : openairecompatibility; - officialname = - d.getOfficialname() != null && compareTrust(this, e) < 0 - ? d.getOfficialname() - : officialname; - englishname = - d.getEnglishname() != null && compareTrust(this, e) < 0 ? d.getEnglishname() : officialname; - websiteurl = - d.getWebsiteurl() != null && compareTrust(this, e) < 0 ? d.getWebsiteurl() : websiteurl; - logourl = d.getLogourl() != null && compareTrust(this, e) < 0 ? d.getLogourl() : getLogourl(); - contactemail = - d.getContactemail() != null && compareTrust(this, e) < 0 - ? d.getContactemail() - : contactemail; - namespaceprefix = - d.getNamespaceprefix() != null && compareTrust(this, e) < 0 - ? d.getNamespaceprefix() - : namespaceprefix; - latitude = d.getLatitude() != null && compareTrust(this, e) < 0 ? d.getLatitude() : latitude; - longitude = - d.getLongitude() != null && compareTrust(this, e) < 0 ? d.getLongitude() : longitude; - dateofvalidation = - d.getDateofvalidation() != null && compareTrust(this, e) < 0 - ? d.getDateofvalidation() - : dateofvalidation; - description = - d.getDescription() != null && compareTrust(this, e) < 0 ? d.getDescription() : description; - subjects = mergeLists(subjects, d.getSubjects()); - - // opendoar specific fields (od*) - odnumberofitems = - d.getOdnumberofitems() != null && compareTrust(this, e) < 0 - ? d.getOdnumberofitems() - : odnumberofitems; - odnumberofitemsdate = - d.getOdnumberofitemsdate() != null && compareTrust(this, e) < 0 - ? d.getOdnumberofitemsdate() - : odnumberofitemsdate; - odpolicies = - d.getOdpolicies() != null && compareTrust(this, e) < 0 ? d.getOdpolicies() : odpolicies; - odlanguages = mergeLists(odlanguages, d.getOdlanguages()); - odcontenttypes = mergeLists(odcontenttypes, d.getOdcontenttypes()); - accessinfopackage = mergeLists(accessinfopackage, d.getAccessinfopackage()); - - // re3data fields - releasestartdate = - d.getReleasestartdate() != null && compareTrust(this, e) < 0 - ? d.getReleasestartdate() - : releasestartdate; - releaseenddate = - d.getReleaseenddate() != null && compareTrust(this, e) < 0 - ? d.getReleaseenddate() - : releaseenddate; - missionstatementurl = - d.getMissionstatementurl() != null && compareTrust(this, e) < 0 - ? d.getMissionstatementurl() - : missionstatementurl; - dataprovider = - d.getDataprovider() != null && compareTrust(this, e) < 0 - ? d.getDataprovider() - : dataprovider; - serviceprovider = - d.getServiceprovider() != null && compareTrust(this, e) < 0 - ? d.getServiceprovider() - : serviceprovider; - - // {open, restricted or closed} - databaseaccesstype = - d.getDatabaseaccesstype() != null && compareTrust(this, e) < 0 - ? d.getDatabaseaccesstype() - : databaseaccesstype; - - // {open, restricted or closed} - datauploadtype = - d.getDatauploadtype() != null && compareTrust(this, e) < 0 - ? d.getDatauploadtype() - : datauploadtype; - - // {feeRequired, registration, other} - databaseaccessrestriction = - d.getDatabaseaccessrestriction() != null && compareTrust(this, e) < 0 - ? d.getDatabaseaccessrestriction() - : databaseaccessrestriction; - - // {feeRequired, registration, other} - datauploadrestriction = - d.getDatauploadrestriction() != null && compareTrust(this, e) < 0 - ? d.getDatauploadrestriction() - : datauploadrestriction; - - versioning = - d.getVersioning() != null && compareTrust(this, e) < 0 ? d.getVersioning() : versioning; - citationguidelineurl = - d.getCitationguidelineurl() != null && compareTrust(this, e) < 0 - ? 
d.getCitationguidelineurl() - : citationguidelineurl; - - // {yes, no, unknown} - qualitymanagementkind = - d.getQualitymanagementkind() != null && compareTrust(this, e) < 0 - ? d.getQualitymanagementkind() - : qualitymanagementkind; - pidsystems = - d.getPidsystems() != null && compareTrust(this, e) < 0 ? d.getPidsystems() : pidsystems; - - certificates = - d.getCertificates() != null && compareTrust(this, e) < 0 - ? d.getCertificates() - : certificates; - - policies = mergeLists(policies, d.getPolicies()); - - journal = d.getJournal() != null && compareTrust(this, e) < 0 ? d.getJournal() : journal; - - mergeOAFDataInfo(e); - } + public void setDatabaseaccesstype(Field databaseaccesstype) { + this.databaseaccesstype = databaseaccesstype; + } + + public Field getDatauploadtype() { + return datauploadtype; + } + + public void setDatauploadtype(Field datauploadtype) { + this.datauploadtype = datauploadtype; + } + + public Field getDatabaseaccessrestriction() { + return databaseaccessrestriction; + } + + public void setDatabaseaccessrestriction(Field databaseaccessrestriction) { + this.databaseaccessrestriction = databaseaccessrestriction; + } + + public Field getDatauploadrestriction() { + return datauploadrestriction; + } + + public void setDatauploadrestriction(Field datauploadrestriction) { + this.datauploadrestriction = datauploadrestriction; + } + + public Field getVersioning() { + return versioning; + } + + public void setVersioning(Field versioning) { + this.versioning = versioning; + } + + public Field getCitationguidelineurl() { + return citationguidelineurl; + } + + public void setCitationguidelineurl(Field citationguidelineurl) { + this.citationguidelineurl = citationguidelineurl; + } + + public Field getQualitymanagementkind() { + return qualitymanagementkind; + } + + public void setQualitymanagementkind(Field qualitymanagementkind) { + this.qualitymanagementkind = qualitymanagementkind; + } + + public Field getPidsystems() { + return pidsystems; + } + + public void setPidsystems(Field pidsystems) { + this.pidsystems = pidsystems; + } + + public Field getCertificates() { + return certificates; + } + + public void setCertificates(Field certificates) { + this.certificates = certificates; + } + + public List getPolicies() { + return policies; + } + + public void setPolicies(List policies) { + this.policies = policies; + } + + public Journal getJournal() { + return journal; + } + + public void setJournal(Journal journal) { + this.journal = journal; + } + + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); + + if (!Datasource.class.isAssignableFrom(e.getClass())) { + return; + } + + Datasource d = (Datasource) e; + + datasourcetype = d.getDatasourcetype() != null && compareTrust(this, e) < 0 + ? d.getDatasourcetype() + : datasourcetype; + openairecompatibility = d.getOpenairecompatibility() != null && compareTrust(this, e) < 0 + ? d.getOpenairecompatibility() + : openairecompatibility; + officialname = d.getOfficialname() != null && compareTrust(this, e) < 0 + ? d.getOfficialname() + : officialname; + englishname = d.getEnglishname() != null && compareTrust(this, e) < 0 ? d.getEnglishname() : officialname; + websiteurl = d.getWebsiteurl() != null && compareTrust(this, e) < 0 ? d.getWebsiteurl() : websiteurl; + logourl = d.getLogourl() != null && compareTrust(this, e) < 0 ? d.getLogourl() : getLogourl(); + contactemail = d.getContactemail() != null && compareTrust(this, e) < 0 + ? 
d.getContactemail() + : contactemail; + namespaceprefix = d.getNamespaceprefix() != null && compareTrust(this, e) < 0 + ? d.getNamespaceprefix() + : namespaceprefix; + latitude = d.getLatitude() != null && compareTrust(this, e) < 0 ? d.getLatitude() : latitude; + longitude = d.getLongitude() != null && compareTrust(this, e) < 0 ? d.getLongitude() : longitude; + dateofvalidation = d.getDateofvalidation() != null && compareTrust(this, e) < 0 + ? d.getDateofvalidation() + : dateofvalidation; + description = d.getDescription() != null && compareTrust(this, e) < 0 ? d.getDescription() : description; + subjects = mergeLists(subjects, d.getSubjects()); + + // opendoar specific fields (od*) + odnumberofitems = d.getOdnumberofitems() != null && compareTrust(this, e) < 0 + ? d.getOdnumberofitems() + : odnumberofitems; + odnumberofitemsdate = d.getOdnumberofitemsdate() != null && compareTrust(this, e) < 0 + ? d.getOdnumberofitemsdate() + : odnumberofitemsdate; + odpolicies = d.getOdpolicies() != null && compareTrust(this, e) < 0 ? d.getOdpolicies() : odpolicies; + odlanguages = mergeLists(odlanguages, d.getOdlanguages()); + odcontenttypes = mergeLists(odcontenttypes, d.getOdcontenttypes()); + accessinfopackage = mergeLists(accessinfopackage, d.getAccessinfopackage()); + + // re3data fields + releasestartdate = d.getReleasestartdate() != null && compareTrust(this, e) < 0 + ? d.getReleasestartdate() + : releasestartdate; + releaseenddate = d.getReleaseenddate() != null && compareTrust(this, e) < 0 + ? d.getReleaseenddate() + : releaseenddate; + missionstatementurl = d.getMissionstatementurl() != null && compareTrust(this, e) < 0 + ? d.getMissionstatementurl() + : missionstatementurl; + dataprovider = d.getDataprovider() != null && compareTrust(this, e) < 0 + ? d.getDataprovider() + : dataprovider; + serviceprovider = d.getServiceprovider() != null && compareTrust(this, e) < 0 + ? d.getServiceprovider() + : serviceprovider; + + // {open, restricted or closed} + databaseaccesstype = d.getDatabaseaccesstype() != null && compareTrust(this, e) < 0 + ? d.getDatabaseaccesstype() + : databaseaccesstype; + + // {open, restricted or closed} + datauploadtype = d.getDatauploadtype() != null && compareTrust(this, e) < 0 + ? d.getDatauploadtype() + : datauploadtype; + + // {feeRequired, registration, other} + databaseaccessrestriction = d.getDatabaseaccessrestriction() != null && compareTrust(this, e) < 0 + ? d.getDatabaseaccessrestriction() + : databaseaccessrestriction; + + // {feeRequired, registration, other} + datauploadrestriction = d.getDatauploadrestriction() != null && compareTrust(this, e) < 0 + ? d.getDatauploadrestriction() + : datauploadrestriction; + + versioning = d.getVersioning() != null && compareTrust(this, e) < 0 ? d.getVersioning() : versioning; + citationguidelineurl = d.getCitationguidelineurl() != null && compareTrust(this, e) < 0 + ? d.getCitationguidelineurl() + : citationguidelineurl; + + // {yes, no, unknown} + qualitymanagementkind = d.getQualitymanagementkind() != null && compareTrust(this, e) < 0 + ? d.getQualitymanagementkind() + : qualitymanagementkind; + pidsystems = d.getPidsystems() != null && compareTrust(this, e) < 0 ? d.getPidsystems() : pidsystems; + + certificates = d.getCertificates() != null && compareTrust(this, e) < 0 + ? d.getCertificates() + : certificates; + + policies = mergeLists(policies, d.getPolicies()); + + journal = d.getJournal() != null && compareTrust(this, e) < 0 ? 
d.getJournal() : journal; + + mergeOAFDataInfo(e); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExternalReference.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExternalReference.java index 67b48ed16..d509b954e 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExternalReference.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExternalReference.java @@ -1,115 +1,119 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; import java.util.Objects; public class ExternalReference implements Serializable { - // source - private String sitename; + // source + private String sitename; - // title - private String label; + // title + private String label; - // text() - private String url; + // text() + private String url; - // ?? not mapped yet ?? - private String description; + // ?? not mapped yet ?? + private String description; - // type - private Qualifier qualifier; + // type + private Qualifier qualifier; - // site internal identifier - private String refidentifier; + // site internal identifier + private String refidentifier; - // maps the oaf:reference/@query attribute - private String query; + // maps the oaf:reference/@query attribute + private String query; - // ExternalReferences might be also inferred - private DataInfo dataInfo; + // ExternalReferences might be also inferred + private DataInfo dataInfo; - public String getSitename() { - return sitename; - } + public String getSitename() { + return sitename; + } - public void setSitename(String sitename) { - this.sitename = sitename; - } + public void setSitename(String sitename) { + this.sitename = sitename; + } - public String getLabel() { - return label; - } + public String getLabel() { + return label; + } - public void setLabel(String label) { - this.label = label; - } + public void setLabel(String label) { + this.label = label; + } - public String getUrl() { - return url; - } + public String getUrl() { + return url; + } - public void setUrl(String url) { - this.url = url; - } + public void setUrl(String url) { + this.url = url; + } - public String getDescription() { - return description; - } + public String getDescription() { + return description; + } - public void setDescription(String description) { - this.description = description; - } + public void setDescription(String description) { + this.description = description; + } - public Qualifier getQualifier() { - return qualifier; - } + public Qualifier getQualifier() { + return qualifier; + } - public void setQualifier(Qualifier qualifier) { - this.qualifier = qualifier; - } + public void setQualifier(Qualifier qualifier) { + this.qualifier = qualifier; + } - public String getRefidentifier() { - return refidentifier; - } + public String getRefidentifier() { + return refidentifier; + } - public void setRefidentifier(String refidentifier) { - this.refidentifier = refidentifier; - } + public void setRefidentifier(String refidentifier) { + this.refidentifier = refidentifier; + } - public String getQuery() { - return query; - } + public String getQuery() { + return query; + } - public void setQuery(String query) { - this.query = query; - } + public void setQuery(String query) { + this.query = query; + } - public DataInfo getDataInfo() { - return dataInfo; - } + public DataInfo getDataInfo() { + return dataInfo; + } - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } + public void setDataInfo(DataInfo dataInfo) { + this.dataInfo = dataInfo; + } - @Override - public boolean 
equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - ExternalReference that = (ExternalReference) o; - return Objects.equals(sitename, that.sitename) - && Objects.equals(label, that.label) - && Objects.equals(url, that.url) - && Objects.equals(description, that.description) - && Objects.equals(qualifier, that.qualifier) - && Objects.equals(refidentifier, that.refidentifier) - && Objects.equals(query, that.query) - && Objects.equals(dataInfo, that.dataInfo); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + ExternalReference that = (ExternalReference) o; + return Objects.equals(sitename, that.sitename) + && Objects.equals(label, that.label) + && Objects.equals(url, that.url) + && Objects.equals(description, that.description) + && Objects.equals(qualifier, that.qualifier) + && Objects.equals(refidentifier, that.refidentifier) + && Objects.equals(query, that.query) + && Objects.equals(dataInfo, that.dataInfo); + } - @Override - public int hashCode() { - return Objects.hash( - sitename, label, url, description, qualifier, refidentifier, query, dataInfo); - } + @Override + public int hashCode() { + return Objects + .hash( + sitename, label, url, description, qualifier, refidentifier, query, dataInfo); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExtraInfo.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExtraInfo.java index c19c08f5f..3682cc2aa 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExtraInfo.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExtraInfo.java @@ -1,74 +1,77 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; import java.util.Objects; public class ExtraInfo implements Serializable { - private String name; + private String name; - private String typology; + private String typology; - private String provenance; + private String provenance; - private String trust; + private String trust; - // json containing a Citation or Statistics - private String value; + // json containing a Citation or Statistics + private String value; - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) { + this.name = name; + } - public String getTypology() { - return typology; - } + public String getTypology() { + return typology; + } - public void setTypology(String typology) { - this.typology = typology; - } + public void setTypology(String typology) { + this.typology = typology; + } - public String getProvenance() { - return provenance; - } + public String getProvenance() { + return provenance; + } - public void setProvenance(String provenance) { - this.provenance = provenance; - } + public void setProvenance(String provenance) { + this.provenance = provenance; + } - public String getTrust() { - return trust; - } + public String getTrust() { + return trust; + } - public void setTrust(String trust) { - this.trust = trust; - } + public void setTrust(String trust) { + this.trust = trust; + } - public String getValue() { - return value; - } + public String getValue() { + return value; + } - public void setValue(String value) { - this.value = value; - } + public void setValue(String value) { + this.value = value; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != 
o.getClass()) return false; - ExtraInfo extraInfo = (ExtraInfo) o; - return Objects.equals(name, extraInfo.name) - && Objects.equals(typology, extraInfo.typology) - && Objects.equals(provenance, extraInfo.provenance) - && Objects.equals(trust, extraInfo.trust) - && Objects.equals(value, extraInfo.value); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + ExtraInfo extraInfo = (ExtraInfo) o; + return Objects.equals(name, extraInfo.name) + && Objects.equals(typology, extraInfo.typology) + && Objects.equals(provenance, extraInfo.provenance) + && Objects.equals(trust, extraInfo.trust) + && Objects.equals(value, extraInfo.value); + } - @Override - public int hashCode() { - return Objects.hash(name, typology, provenance, trust, value); - } + @Override + public int hashCode() { + return Objects.hash(name, typology, provenance, trust, value); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java index 1854b85c1..1a85c6842 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java @@ -1,40 +1,44 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; public class Field implements Serializable { - private T value; + private T value; - private DataInfo dataInfo; + private DataInfo dataInfo; - public T getValue() { - return value; - } + public T getValue() { + return value; + } - public void setValue(T value) { - this.value = value; - } + public void setValue(T value) { + this.value = value; + } - public DataInfo getDataInfo() { - return dataInfo; - } + public DataInfo getDataInfo() { + return dataInfo; + } - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } + public void setDataInfo(DataInfo dataInfo) { + this.dataInfo = dataInfo; + } - @Override - public int hashCode() { - return getValue() == null ? 0 : getValue().hashCode(); - } + @Override + public int hashCode() { + return getValue() == null ? 
0 : getValue().hashCode(); + } - @Override - public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; - Field other = (Field) obj; - return getValue().equals(other.getValue()); - } + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + Field other = (Field) obj; + return getValue().equals(other.getValue()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java index 741f19002..7ed313a59 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java @@ -1,69 +1,76 @@ + package eu.dnetlib.dhp.schema.oaf; -import com.fasterxml.jackson.annotation.JsonIgnore; import java.io.Serializable; + import org.apache.commons.lang3.StringUtils; +import com.fasterxml.jackson.annotation.JsonIgnore; + public class GeoLocation implements Serializable { - private String point; + private String point; - private String box; + private String box; - private String place; + private String place; - public String getPoint() { - return point; - } + public String getPoint() { + return point; + } - public void setPoint(String point) { - this.point = point; - } + public void setPoint(String point) { + this.point = point; + } - public String getBox() { - return box; - } + public String getBox() { + return box; + } - public void setBox(String box) { - this.box = box; - } + public void setBox(String box) { + this.box = box; + } - public String getPlace() { - return place; - } + public String getPlace() { + return place; + } - public void setPlace(String place) { - this.place = place; - } + public void setPlace(String place) { + this.place = place; + } - @JsonIgnore - public boolean isBlank() { - return StringUtils.isBlank(point) && StringUtils.isBlank(box) && StringUtils.isBlank(place); - } + @JsonIgnore + public boolean isBlank() { + return StringUtils.isBlank(point) && StringUtils.isBlank(box) && StringUtils.isBlank(place); + } - public String toComparableString() { - return isBlank() - ? "" - : String.format( - "%s::%s%s", - point != null ? point.toLowerCase() : "", - box != null ? box.toLowerCase() : "", - place != null ? place.toLowerCase() : ""); - } + public String toComparableString() { + return isBlank() + ? "" + : String + .format( + "%s::%s%s", + point != null ? point.toLowerCase() : "", + box != null ? box.toLowerCase() : "", + place != null ? 
place.toLowerCase() : ""); + } - @Override - public int hashCode() { - return toComparableString().hashCode(); - } + @Override + public int hashCode() { + return toComparableString().hashCode(); + } - @Override - public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; - GeoLocation other = (GeoLocation) obj; + GeoLocation other = (GeoLocation) obj; - return toComparableString().equals(other.toComparableString()); - } + return toComparableString().equals(other.toComparableString()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java index 76b72cfbc..2b7d3846c 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -5,143 +6,147 @@ import java.util.List; public class Instance implements Serializable { - private Field license; + private Field license; - private Qualifier accessright; + private Qualifier accessright; - private Qualifier instancetype; + private Qualifier instancetype; - private KeyValue hostedby; + private KeyValue hostedby; - private List url; + private List url; - // other research products specifc - private String distributionlocation; + // other research products specifc + private String distributionlocation; - private KeyValue collectedfrom; + private KeyValue collectedfrom; - private Field dateofacceptance; + private Field dateofacceptance; - // ( article | book ) processing charges. Defined here to cope with possible wrongly typed - // results - private Field processingchargeamount; + // ( article | book ) processing charges. Defined here to cope with possible wrongly typed + // results + private Field processingchargeamount; - // currency - alphabetic code describe in ISO-4217. Defined here to cope with possible wrongly - // typed results - private Field processingchargecurrency; + // currency - alphabetic code describe in ISO-4217. 
Defined here to cope with possible wrongly + // typed results + private Field processingchargecurrency; - private Field refereed; // peer-review status + private Field refereed; // peer-review status - public Field getLicense() { - return license; - } + public Field getLicense() { + return license; + } - public void setLicense(Field license) { - this.license = license; - } + public void setLicense(Field license) { + this.license = license; + } - public Qualifier getAccessright() { - return accessright; - } + public Qualifier getAccessright() { + return accessright; + } - public void setAccessright(Qualifier accessright) { - this.accessright = accessright; - } + public void setAccessright(Qualifier accessright) { + this.accessright = accessright; + } - public Qualifier getInstancetype() { - return instancetype; - } + public Qualifier getInstancetype() { + return instancetype; + } - public void setInstancetype(Qualifier instancetype) { - this.instancetype = instancetype; - } + public void setInstancetype(Qualifier instancetype) { + this.instancetype = instancetype; + } - public KeyValue getHostedby() { - return hostedby; - } + public KeyValue getHostedby() { + return hostedby; + } - public void setHostedby(KeyValue hostedby) { - this.hostedby = hostedby; - } + public void setHostedby(KeyValue hostedby) { + this.hostedby = hostedby; + } - public List getUrl() { - return url; - } + public List getUrl() { + return url; + } - public void setUrl(List url) { - this.url = url; - } + public void setUrl(List url) { + this.url = url; + } - public String getDistributionlocation() { - return distributionlocation; - } + public String getDistributionlocation() { + return distributionlocation; + } - public void setDistributionlocation(String distributionlocation) { - this.distributionlocation = distributionlocation; - } + public void setDistributionlocation(String distributionlocation) { + this.distributionlocation = distributionlocation; + } - public KeyValue getCollectedfrom() { - return collectedfrom; - } + public KeyValue getCollectedfrom() { + return collectedfrom; + } - public void setCollectedfrom(KeyValue collectedfrom) { - this.collectedfrom = collectedfrom; - } + public void setCollectedfrom(KeyValue collectedfrom) { + this.collectedfrom = collectedfrom; + } - public Field getDateofacceptance() { - return dateofacceptance; - } + public Field getDateofacceptance() { + return dateofacceptance; + } - public void setDateofacceptance(Field dateofacceptance) { - this.dateofacceptance = dateofacceptance; - } + public void setDateofacceptance(Field dateofacceptance) { + this.dateofacceptance = dateofacceptance; + } - public Field getProcessingchargeamount() { - return processingchargeamount; - } + public Field getProcessingchargeamount() { + return processingchargeamount; + } - public void setProcessingchargeamount(Field processingchargeamount) { - this.processingchargeamount = processingchargeamount; - } + public void setProcessingchargeamount(Field processingchargeamount) { + this.processingchargeamount = processingchargeamount; + } - public Field getProcessingchargecurrency() { - return processingchargecurrency; - } + public Field getProcessingchargecurrency() { + return processingchargecurrency; + } - public void setProcessingchargecurrency(Field processingchargecurrency) { - this.processingchargecurrency = processingchargecurrency; - } + public void setProcessingchargecurrency(Field processingchargecurrency) { + this.processingchargecurrency = processingchargecurrency; + } - public Field 
getRefereed() { - return refereed; - } + public Field getRefereed() { + return refereed; + } - public void setRefereed(Field refereed) { - this.refereed = refereed; - } + public void setRefereed(Field refereed) { + this.refereed = refereed; + } - public String toComparableString() { - return String.format( - "%s::%s::%s::%s", - hostedby != null && hostedby.getKey() != null ? hostedby.getKey().toLowerCase() : "", - accessright != null && accessright.getClassid() != null ? accessright.getClassid() : "", - instancetype != null && instancetype.getClassid() != null ? instancetype.getClassid() : "", - url != null ? url : ""); - } + public String toComparableString() { + return String + .format( + "%s::%s::%s::%s", + hostedby != null && hostedby.getKey() != null ? hostedby.getKey().toLowerCase() : "", + accessright != null && accessright.getClassid() != null ? accessright.getClassid() : "", + instancetype != null && instancetype.getClassid() != null ? instancetype.getClassid() : "", + url != null ? url : ""); + } - @Override - public int hashCode() { - return toComparableString().hashCode(); - } + @Override + public int hashCode() { + return toComparableString().hashCode(); + } - @Override - public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; - Instance other = (Instance) obj; + Instance other = (Instance) obj; - return toComparableString().equals(other.toComparableString()); - } + return toComparableString().equals(other.toComparableString()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Journal.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Journal.java index bdf64f812..7a375e28b 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Journal.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Journal.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -5,159 +6,162 @@ import java.util.Objects; public class Journal implements Serializable { - private String name; + private String name; - private String issnPrinted; + private String issnPrinted; - private String issnOnline; + private String issnOnline; - private String issnLinking; + private String issnLinking; - private String ep; + private String ep; - private String iss; + private String iss; - private String sp; + private String sp; - private String vol; + private String vol; - private String edition; + private String edition; - private String conferenceplace; + private String conferenceplace; - private String conferencedate; + private String conferencedate; - private DataInfo dataInfo; + private DataInfo dataInfo; - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) { + this.name = name; + } - public String getIssnPrinted() { - return issnPrinted; - } + public String getIssnPrinted() { + return issnPrinted; + } - public void setIssnPrinted(String issnPrinted) { - this.issnPrinted = issnPrinted; - } + public void setIssnPrinted(String issnPrinted) { + this.issnPrinted = issnPrinted; + } - public String getIssnOnline() { - return issnOnline; - } + public String getIssnOnline() { + return issnOnline; + } - public void setIssnOnline(String issnOnline) 
{ - this.issnOnline = issnOnline; - } + public void setIssnOnline(String issnOnline) { + this.issnOnline = issnOnline; + } - public String getIssnLinking() { - return issnLinking; - } + public String getIssnLinking() { + return issnLinking; + } - public void setIssnLinking(String issnLinking) { - this.issnLinking = issnLinking; - } + public void setIssnLinking(String issnLinking) { + this.issnLinking = issnLinking; + } - public String getEp() { - return ep; - } + public String getEp() { + return ep; + } - public void setEp(String ep) { - this.ep = ep; - } + public void setEp(String ep) { + this.ep = ep; + } - public String getIss() { - return iss; - } + public String getIss() { + return iss; + } - public void setIss(String iss) { - this.iss = iss; - } + public void setIss(String iss) { + this.iss = iss; + } - public String getSp() { - return sp; - } + public String getSp() { + return sp; + } - public void setSp(String sp) { - this.sp = sp; - } + public void setSp(String sp) { + this.sp = sp; + } - public String getVol() { - return vol; - } + public String getVol() { + return vol; + } - public void setVol(String vol) { - this.vol = vol; - } + public void setVol(String vol) { + this.vol = vol; + } - public String getEdition() { - return edition; - } + public String getEdition() { + return edition; + } - public void setEdition(String edition) { - this.edition = edition; - } + public void setEdition(String edition) { + this.edition = edition; + } - public String getConferenceplace() { - return conferenceplace; - } + public String getConferenceplace() { + return conferenceplace; + } - public void setConferenceplace(String conferenceplace) { - this.conferenceplace = conferenceplace; - } + public void setConferenceplace(String conferenceplace) { + this.conferenceplace = conferenceplace; + } - public String getConferencedate() { - return conferencedate; - } + public String getConferencedate() { + return conferencedate; + } - public void setConferencedate(String conferencedate) { - this.conferencedate = conferencedate; - } + public void setConferencedate(String conferencedate) { + this.conferencedate = conferencedate; + } - public DataInfo getDataInfo() { - return dataInfo; - } + public DataInfo getDataInfo() { + return dataInfo; + } - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } + public void setDataInfo(DataInfo dataInfo) { + this.dataInfo = dataInfo; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - Journal journal = (Journal) o; - return Objects.equals(name, journal.name) - && Objects.equals(issnPrinted, journal.issnPrinted) - && Objects.equals(issnOnline, journal.issnOnline) - && Objects.equals(issnLinking, journal.issnLinking) - && Objects.equals(ep, journal.ep) - && Objects.equals(iss, journal.iss) - && Objects.equals(sp, journal.sp) - && Objects.equals(vol, journal.vol) - && Objects.equals(edition, journal.edition) - && Objects.equals(conferenceplace, journal.conferenceplace) - && Objects.equals(conferencedate, journal.conferencedate) - && Objects.equals(dataInfo, journal.dataInfo); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + Journal journal = (Journal) o; + return Objects.equals(name, journal.name) + && Objects.equals(issnPrinted, journal.issnPrinted) + && Objects.equals(issnOnline, journal.issnOnline) + && Objects.equals(issnLinking, journal.issnLinking) + && 
Objects.equals(ep, journal.ep) + && Objects.equals(iss, journal.iss) + && Objects.equals(sp, journal.sp) + && Objects.equals(vol, journal.vol) + && Objects.equals(edition, journal.edition) + && Objects.equals(conferenceplace, journal.conferenceplace) + && Objects.equals(conferencedate, journal.conferencedate) + && Objects.equals(dataInfo, journal.dataInfo); + } - @Override - public int hashCode() { - return Objects.hash( - name, - issnPrinted, - issnOnline, - issnLinking, - ep, - iss, - sp, - vol, - edition, - conferenceplace, - conferencedate, - dataInfo); - } + @Override + public int hashCode() { + return Objects + .hash( + name, + issnPrinted, + issnOnline, + issnLinking, + ep, + iss, + sp, + vol, + edition, + conferenceplace, + conferencedate, + dataInfo); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java index 31b898788..4e2d60138 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java @@ -1,67 +1,74 @@ + package eu.dnetlib.dhp.schema.oaf; -import com.fasterxml.jackson.annotation.JsonIgnore; import java.io.Serializable; + import org.apache.commons.lang3.StringUtils; +import com.fasterxml.jackson.annotation.JsonIgnore; + public class KeyValue implements Serializable { - private String key; + private String key; - private String value; + private String value; - private DataInfo dataInfo; + private DataInfo dataInfo; - public String getKey() { - return key; - } + public String getKey() { + return key; + } - public void setKey(String key) { - this.key = key; - } + public void setKey(String key) { + this.key = key; + } - public String getValue() { - return value; - } + public String getValue() { + return value; + } - public void setValue(String value) { - this.value = value; - } + public void setValue(String value) { + this.value = value; + } - public DataInfo getDataInfo() { - return dataInfo; - } + public DataInfo getDataInfo() { + return dataInfo; + } - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } + public void setDataInfo(DataInfo dataInfo) { + this.dataInfo = dataInfo; + } - public String toComparableString() { - return isBlank() - ? "" - : String.format( - "%s::%s", - key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : ""); - } + public String toComparableString() { + return isBlank() + ? "" + : String + .format( + "%s::%s", + key != null ? key.toLowerCase() : "", value != null ? 
value.toLowerCase() : ""); + } - @JsonIgnore - public boolean isBlank() { - return StringUtils.isBlank(key) && StringUtils.isBlank(value); - } + @JsonIgnore + public boolean isBlank() { + return StringUtils.isBlank(key) && StringUtils.isBlank(value); + } - @Override - public int hashCode() { - return toComparableString().hashCode(); - } + @Override + public int hashCode() { + return toComparableString().hashCode(); + } - @Override - public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; - KeyValue other = (KeyValue) obj; + KeyValue other = (KeyValue) obj; - return toComparableString().equals(other.toComparableString()); - } + return toComparableString().equals(other.toComparableString()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OAIProvenance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OAIProvenance.java index 5798adae9..88d74afbf 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OAIProvenance.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OAIProvenance.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -5,26 +6,28 @@ import java.util.Objects; public class OAIProvenance implements Serializable { - private OriginDescription originDescription; + private OriginDescription originDescription; - public OriginDescription getOriginDescription() { - return originDescription; - } + public OriginDescription getOriginDescription() { + return originDescription; + } - public void setOriginDescription(OriginDescription originDescription) { - this.originDescription = originDescription; - } + public void setOriginDescription(OriginDescription originDescription) { + this.originDescription = originDescription; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - OAIProvenance that = (OAIProvenance) o; - return Objects.equals(originDescription, that.originDescription); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + OAIProvenance that = (OAIProvenance) o; + return Objects.equals(originDescription, that.originDescription); + } - @Override - public int hashCode() { - return Objects.hash(originDescription); - } + @Override + public int hashCode() { + return Objects.hash(originDescription); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java index d6561f5cb..4bfc05039 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -6,60 +7,64 @@ import java.util.Objects; public abstract class Oaf implements Serializable { - protected List collectedfrom; + protected List collectedfrom; - private DataInfo dataInfo; + private DataInfo dataInfo; - private Long lastupdatetimestamp; + private Long lastupdatetimestamp; - public List getCollectedfrom() { - return collectedfrom; - } + public List getCollectedfrom() { + return collectedfrom; + } - public void setCollectedfrom(List collectedfrom) { - 
this.collectedfrom = collectedfrom; - } + public void setCollectedfrom(List collectedfrom) { + this.collectedfrom = collectedfrom; + } - public DataInfo getDataInfo() { - return dataInfo; - } + public DataInfo getDataInfo() { + return dataInfo; + } - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } + public void setDataInfo(DataInfo dataInfo) { + this.dataInfo = dataInfo; + } - public Long getLastupdatetimestamp() { - return lastupdatetimestamp; - } + public Long getLastupdatetimestamp() { + return lastupdatetimestamp; + } - public void setLastupdatetimestamp(Long lastupdatetimestamp) { - this.lastupdatetimestamp = lastupdatetimestamp; - } + public void setLastupdatetimestamp(Long lastupdatetimestamp) { + this.lastupdatetimestamp = lastupdatetimestamp; + } - public void mergeOAFDataInfo(Oaf e) { - if (e.getDataInfo() != null && compareTrust(this, e) < 0) dataInfo = e.getDataInfo(); - } + public void mergeOAFDataInfo(Oaf e) { + if (e.getDataInfo() != null && compareTrust(this, e) < 0) + dataInfo = e.getDataInfo(); + } - protected String extractTrust(Oaf e) { - if (e == null || e.getDataInfo() == null || e.getDataInfo().getTrust() == null) return "0.0"; - return e.getDataInfo().getTrust(); - } + protected String extractTrust(Oaf e) { + if (e == null || e.getDataInfo() == null || e.getDataInfo().getTrust() == null) + return "0.0"; + return e.getDataInfo().getTrust(); + } - protected int compareTrust(Oaf a, Oaf b) { - return extractTrust(a).compareTo(extractTrust(b)); - } + protected int compareTrust(Oaf a, Oaf b) { + return extractTrust(a).compareTo(extractTrust(b)); + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - Oaf oaf = (Oaf) o; - return Objects.equals(dataInfo, oaf.dataInfo) - && Objects.equals(lastupdatetimestamp, oaf.lastupdatetimestamp); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + Oaf oaf = (Oaf) o; + return Objects.equals(dataInfo, oaf.dataInfo) + && Objects.equals(lastupdatetimestamp, oaf.lastupdatetimestamp); + } - @Override - public int hashCode() { - return Objects.hash(dataInfo, lastupdatetimestamp); - } + @Override + public int hashCode() { + return Objects.hash(dataInfo, lastupdatetimestamp); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java index fbc73bb0a..09742748d 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -6,118 +7,123 @@ import java.util.stream.Collectors; public abstract class OafEntity extends Oaf implements Serializable { - private String id; + private String id; - private List originalId; + private List originalId; - private List pid; + private List pid; - private String dateofcollection; + private String dateofcollection; - private String dateoftransformation; + private String dateoftransformation; - private List extraInfo; + private List extraInfo; - private OAIProvenance oaiprovenance; + private OAIProvenance oaiprovenance; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public List getOriginalId() { 
- return originalId; - } + public List getOriginalId() { + return originalId; + } - public void setOriginalId(List originalId) { - this.originalId = originalId; - } + public void setOriginalId(List originalId) { + this.originalId = originalId; + } - public List getPid() { - return pid; - } + public List getPid() { + return pid; + } - public void setPid(List pid) { - this.pid = pid; - } + public void setPid(List pid) { + this.pid = pid; + } - public String getDateofcollection() { - return dateofcollection; - } + public String getDateofcollection() { + return dateofcollection; + } - public void setDateofcollection(String dateofcollection) { - this.dateofcollection = dateofcollection; - } + public void setDateofcollection(String dateofcollection) { + this.dateofcollection = dateofcollection; + } - public String getDateoftransformation() { - return dateoftransformation; - } + public String getDateoftransformation() { + return dateoftransformation; + } - public void setDateoftransformation(String dateoftransformation) { - this.dateoftransformation = dateoftransformation; - } + public void setDateoftransformation(String dateoftransformation) { + this.dateoftransformation = dateoftransformation; + } - public List getExtraInfo() { - return extraInfo; - } + public List getExtraInfo() { + return extraInfo; + } - public void setExtraInfo(List extraInfo) { - this.extraInfo = extraInfo; - } + public void setExtraInfo(List extraInfo) { + this.extraInfo = extraInfo; + } - public OAIProvenance getOaiprovenance() { - return oaiprovenance; - } + public OAIProvenance getOaiprovenance() { + return oaiprovenance; + } - public void setOaiprovenance(OAIProvenance oaiprovenance) { - this.oaiprovenance = oaiprovenance; - } + public void setOaiprovenance(OAIProvenance oaiprovenance) { + this.oaiprovenance = oaiprovenance; + } - public void mergeFrom(OafEntity e) { + public void mergeFrom(OafEntity e) { - if (e == null) return; + if (e == null) + return; - originalId = mergeLists(originalId, e.getOriginalId()); + originalId = mergeLists(originalId, e.getOriginalId()); - collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom()); + collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom()); - pid = mergeLists(pid, e.getPid()); + pid = mergeLists(pid, e.getPid()); - if (e.getDateofcollection() != null && compareTrust(this, e) < 0) - dateofcollection = e.getDateofcollection(); + if (e.getDateofcollection() != null && compareTrust(this, e) < 0) + dateofcollection = e.getDateofcollection(); - if (e.getDateoftransformation() != null && compareTrust(this, e) < 0) - dateoftransformation = e.getDateoftransformation(); + if (e.getDateoftransformation() != null && compareTrust(this, e) < 0) + dateoftransformation = e.getDateoftransformation(); - extraInfo = mergeLists(extraInfo, e.getExtraInfo()); + extraInfo = mergeLists(extraInfo, e.getExtraInfo()); - if (e.getOaiprovenance() != null && compareTrust(this, e) < 0) - oaiprovenance = e.getOaiprovenance(); - } + if (e.getOaiprovenance() != null && compareTrust(this, e) < 0) + oaiprovenance = e.getOaiprovenance(); + } - protected List mergeLists(final List... lists) { + protected List mergeLists(final List... 
lists) { - return Arrays.stream(lists) - .filter(Objects::nonNull) - .flatMap(List::stream) - .distinct() - .collect(Collectors.toList()); - } + return Arrays + .stream(lists) + .filter(Objects::nonNull) + .flatMap(List::stream) + .distinct() + .collect(Collectors.toList()); + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - if (!super.equals(o)) return false; - OafEntity oafEntity = (OafEntity) o; - return Objects.equals(id, oafEntity.id); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + if (!super.equals(o)) + return false; + OafEntity oafEntity = (OafEntity) o; + return Objects.equals(id, oafEntity.id); + } - @Override - public int hashCode() { - return Objects.hash(super.hashCode(), id); - } + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), id); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Organization.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Organization.java index 4339ff5b4..a5f9bce30 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Organization.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Organization.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -5,221 +6,209 @@ import java.util.List; public class Organization extends OafEntity implements Serializable { - private Field legalshortname; + private Field legalshortname; - private Field legalname; + private Field legalname; - private List> alternativeNames; + private List> alternativeNames; - private Field websiteurl; + private Field websiteurl; - private Field logourl; + private Field logourl; - private Field eclegalbody; + private Field eclegalbody; - private Field eclegalperson; + private Field eclegalperson; - private Field ecnonprofit; + private Field ecnonprofit; - private Field ecresearchorganization; + private Field ecresearchorganization; - private Field echighereducation; + private Field echighereducation; - private Field ecinternationalorganizationeurinterests; + private Field ecinternationalorganizationeurinterests; - private Field ecinternationalorganization; + private Field ecinternationalorganization; - private Field ecenterprise; + private Field ecenterprise; - private Field ecsmevalidated; + private Field ecsmevalidated; - private Field ecnutscode; + private Field ecnutscode; - private Qualifier country; + private Qualifier country; - public Field getLegalshortname() { - return legalshortname; - } + public Field getLegalshortname() { + return legalshortname; + } - public void setLegalshortname(Field legalshortname) { - this.legalshortname = legalshortname; - } + public void setLegalshortname(Field legalshortname) { + this.legalshortname = legalshortname; + } - public Field getLegalname() { - return legalname; - } + public Field getLegalname() { + return legalname; + } - public void setLegalname(Field legalname) { - this.legalname = legalname; - } + public void setLegalname(Field legalname) { + this.legalname = legalname; + } - public List> getAlternativeNames() { - return alternativeNames; - } + public List> getAlternativeNames() { + return alternativeNames; + } - public void setAlternativeNames(List> alternativeNames) { - this.alternativeNames = alternativeNames; - } + public void setAlternativeNames(List> alternativeNames) { + this.alternativeNames = alternativeNames; + } - public 
Field getWebsiteurl() { - return websiteurl; - } + public Field getWebsiteurl() { + return websiteurl; + } - public void setWebsiteurl(Field websiteurl) { - this.websiteurl = websiteurl; - } + public void setWebsiteurl(Field websiteurl) { + this.websiteurl = websiteurl; + } - public Field getLogourl() { - return logourl; - } + public Field getLogourl() { + return logourl; + } - public void setLogourl(Field logourl) { - this.logourl = logourl; - } + public void setLogourl(Field logourl) { + this.logourl = logourl; + } - public Field getEclegalbody() { - return eclegalbody; - } + public Field getEclegalbody() { + return eclegalbody; + } - public void setEclegalbody(Field eclegalbody) { - this.eclegalbody = eclegalbody; - } + public void setEclegalbody(Field eclegalbody) { + this.eclegalbody = eclegalbody; + } - public Field getEclegalperson() { - return eclegalperson; - } + public Field getEclegalperson() { + return eclegalperson; + } - public void setEclegalperson(Field eclegalperson) { - this.eclegalperson = eclegalperson; - } + public void setEclegalperson(Field eclegalperson) { + this.eclegalperson = eclegalperson; + } - public Field getEcnonprofit() { - return ecnonprofit; - } + public Field getEcnonprofit() { + return ecnonprofit; + } - public void setEcnonprofit(Field ecnonprofit) { - this.ecnonprofit = ecnonprofit; - } + public void setEcnonprofit(Field ecnonprofit) { + this.ecnonprofit = ecnonprofit; + } - public Field getEcresearchorganization() { - return ecresearchorganization; - } + public Field getEcresearchorganization() { + return ecresearchorganization; + } - public void setEcresearchorganization(Field ecresearchorganization) { - this.ecresearchorganization = ecresearchorganization; - } + public void setEcresearchorganization(Field ecresearchorganization) { + this.ecresearchorganization = ecresearchorganization; + } - public Field getEchighereducation() { - return echighereducation; - } + public Field getEchighereducation() { + return echighereducation; + } - public void setEchighereducation(Field echighereducation) { - this.echighereducation = echighereducation; - } + public void setEchighereducation(Field echighereducation) { + this.echighereducation = echighereducation; + } - public Field getEcinternationalorganizationeurinterests() { - return ecinternationalorganizationeurinterests; - } + public Field getEcinternationalorganizationeurinterests() { + return ecinternationalorganizationeurinterests; + } - public void setEcinternationalorganizationeurinterests( - Field ecinternationalorganizationeurinterests) { - this.ecinternationalorganizationeurinterests = ecinternationalorganizationeurinterests; - } + public void setEcinternationalorganizationeurinterests( + Field ecinternationalorganizationeurinterests) { + this.ecinternationalorganizationeurinterests = ecinternationalorganizationeurinterests; + } - public Field getEcinternationalorganization() { - return ecinternationalorganization; - } + public Field getEcinternationalorganization() { + return ecinternationalorganization; + } - public void setEcinternationalorganization(Field ecinternationalorganization) { - this.ecinternationalorganization = ecinternationalorganization; - } + public void setEcinternationalorganization(Field ecinternationalorganization) { + this.ecinternationalorganization = ecinternationalorganization; + } - public Field getEcenterprise() { - return ecenterprise; - } + public Field getEcenterprise() { + return ecenterprise; + } - public void setEcenterprise(Field ecenterprise) { - this.ecenterprise = 
ecenterprise; - } + public void setEcenterprise(Field ecenterprise) { + this.ecenterprise = ecenterprise; + } - public Field getEcsmevalidated() { - return ecsmevalidated; - } + public Field getEcsmevalidated() { + return ecsmevalidated; + } - public void setEcsmevalidated(Field ecsmevalidated) { - this.ecsmevalidated = ecsmevalidated; - } + public void setEcsmevalidated(Field ecsmevalidated) { + this.ecsmevalidated = ecsmevalidated; + } - public Field getEcnutscode() { - return ecnutscode; - } + public Field getEcnutscode() { + return ecnutscode; + } - public void setEcnutscode(Field ecnutscode) { - this.ecnutscode = ecnutscode; - } + public void setEcnutscode(Field ecnutscode) { + this.ecnutscode = ecnutscode; + } - public Qualifier getCountry() { - return country; - } + public Qualifier getCountry() { + return country; + } - public void setCountry(Qualifier country) { - this.country = country; - } + public void setCountry(Qualifier country) { + this.country = country; + } - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); - if (!Organization.class.isAssignableFrom(e.getClass())) { - return; - } + if (!Organization.class.isAssignableFrom(e.getClass())) { + return; + } - final Organization o = (Organization) e; - legalshortname = - o.getLegalshortname() != null && compareTrust(this, e) < 0 - ? o.getLegalshortname() - : legalshortname; - legalname = - o.getLegalname() != null && compareTrust(this, e) < 0 ? o.getLegalname() : legalname; - alternativeNames = mergeLists(o.getAlternativeNames(), alternativeNames); - websiteurl = - o.getWebsiteurl() != null && compareTrust(this, e) < 0 ? o.getWebsiteurl() : websiteurl; - logourl = o.getLogourl() != null && compareTrust(this, e) < 0 ? o.getLogourl() : logourl; - eclegalbody = - o.getEclegalbody() != null && compareTrust(this, e) < 0 ? o.getEclegalbody() : eclegalbody; - eclegalperson = - o.getEclegalperson() != null && compareTrust(this, e) < 0 - ? o.getEclegalperson() - : eclegalperson; - ecnonprofit = - o.getEcnonprofit() != null && compareTrust(this, e) < 0 ? o.getEcnonprofit() : ecnonprofit; - ecresearchorganization = - o.getEcresearchorganization() != null && compareTrust(this, e) < 0 - ? o.getEcresearchorganization() - : ecresearchorganization; - echighereducation = - o.getEchighereducation() != null && compareTrust(this, e) < 0 - ? o.getEchighereducation() - : echighereducation; - ecinternationalorganizationeurinterests = - o.getEcinternationalorganizationeurinterests() != null && compareTrust(this, e) < 0 - ? o.getEcinternationalorganizationeurinterests() - : ecinternationalorganizationeurinterests; - ecinternationalorganization = - o.getEcinternationalorganization() != null && compareTrust(this, e) < 0 - ? o.getEcinternationalorganization() - : ecinternationalorganization; - ecenterprise = - o.getEcenterprise() != null && compareTrust(this, e) < 0 - ? o.getEcenterprise() - : ecenterprise; - ecsmevalidated = - o.getEcsmevalidated() != null && compareTrust(this, e) < 0 - ? o.getEcsmevalidated() - : ecsmevalidated; - ecnutscode = - o.getEcnutscode() != null && compareTrust(this, e) < 0 ? o.getEcnutscode() : ecnutscode; - country = o.getCountry() != null && compareTrust(this, e) < 0 ? o.getCountry() : country; - mergeOAFDataInfo(o); - } + final Organization o = (Organization) e; + legalshortname = o.getLegalshortname() != null && compareTrust(this, e) < 0 + ? 
o.getLegalshortname() + : legalshortname; + legalname = o.getLegalname() != null && compareTrust(this, e) < 0 ? o.getLegalname() : legalname; + alternativeNames = mergeLists(o.getAlternativeNames(), alternativeNames); + websiteurl = o.getWebsiteurl() != null && compareTrust(this, e) < 0 ? o.getWebsiteurl() : websiteurl; + logourl = o.getLogourl() != null && compareTrust(this, e) < 0 ? o.getLogourl() : logourl; + eclegalbody = o.getEclegalbody() != null && compareTrust(this, e) < 0 ? o.getEclegalbody() : eclegalbody; + eclegalperson = o.getEclegalperson() != null && compareTrust(this, e) < 0 + ? o.getEclegalperson() + : eclegalperson; + ecnonprofit = o.getEcnonprofit() != null && compareTrust(this, e) < 0 ? o.getEcnonprofit() : ecnonprofit; + ecresearchorganization = o.getEcresearchorganization() != null && compareTrust(this, e) < 0 + ? o.getEcresearchorganization() + : ecresearchorganization; + echighereducation = o.getEchighereducation() != null && compareTrust(this, e) < 0 + ? o.getEchighereducation() + : echighereducation; + ecinternationalorganizationeurinterests = o.getEcinternationalorganizationeurinterests() != null + && compareTrust(this, e) < 0 + ? o.getEcinternationalorganizationeurinterests() + : ecinternationalorganizationeurinterests; + ecinternationalorganization = o.getEcinternationalorganization() != null && compareTrust(this, e) < 0 + ? o.getEcinternationalorganization() + : ecinternationalorganization; + ecenterprise = o.getEcenterprise() != null && compareTrust(this, e) < 0 + ? o.getEcenterprise() + : ecenterprise; + ecsmevalidated = o.getEcsmevalidated() != null && compareTrust(this, e) < 0 + ? o.getEcsmevalidated() + : ecsmevalidated; + ecnutscode = o.getEcnutscode() != null && compareTrust(this, e) < 0 ? o.getEcnutscode() : ecnutscode; + country = o.getCountry() != null && compareTrust(this, e) < 0 ? 
o.getCountry() : country; + mergeOAFDataInfo(o); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OriginDescription.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OriginDescription.java index 5bdabb558..a275fc1a9 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OriginDescription.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OriginDescription.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -5,81 +6,83 @@ import java.util.Objects; public class OriginDescription implements Serializable { - private String harvestDate; + private String harvestDate; - private Boolean altered = true; + private Boolean altered = true; - private String baseURL; + private String baseURL; - private String identifier; + private String identifier; - private String datestamp; + private String datestamp; - private String metadataNamespace; + private String metadataNamespace; - public String getHarvestDate() { - return harvestDate; - } + public String getHarvestDate() { + return harvestDate; + } - public void setHarvestDate(String harvestDate) { - this.harvestDate = harvestDate; - } + public void setHarvestDate(String harvestDate) { + this.harvestDate = harvestDate; + } - public Boolean getAltered() { - return altered; - } + public Boolean getAltered() { + return altered; + } - public void setAltered(Boolean altered) { - this.altered = altered; - } + public void setAltered(Boolean altered) { + this.altered = altered; + } - public String getBaseURL() { - return baseURL; - } + public String getBaseURL() { + return baseURL; + } - public void setBaseURL(String baseURL) { - this.baseURL = baseURL; - } + public void setBaseURL(String baseURL) { + this.baseURL = baseURL; + } - public String getIdentifier() { - return identifier; - } + public String getIdentifier() { + return identifier; + } - public void setIdentifier(String identifier) { - this.identifier = identifier; - } + public void setIdentifier(String identifier) { + this.identifier = identifier; + } - public String getDatestamp() { - return datestamp; - } + public String getDatestamp() { + return datestamp; + } - public void setDatestamp(String datestamp) { - this.datestamp = datestamp; - } + public void setDatestamp(String datestamp) { + this.datestamp = datestamp; + } - public String getMetadataNamespace() { - return metadataNamespace; - } + public String getMetadataNamespace() { + return metadataNamespace; + } - public void setMetadataNamespace(String metadataNamespace) { - this.metadataNamespace = metadataNamespace; - } + public void setMetadataNamespace(String metadataNamespace) { + this.metadataNamespace = metadataNamespace; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - OriginDescription that = (OriginDescription) o; - return Objects.equals(harvestDate, that.harvestDate) - && Objects.equals(altered, that.altered) - && Objects.equals(baseURL, that.baseURL) - && Objects.equals(identifier, that.identifier) - && Objects.equals(datestamp, that.datestamp) - && Objects.equals(metadataNamespace, that.metadataNamespace); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + OriginDescription that = (OriginDescription) o; + return Objects.equals(harvestDate, that.harvestDate) + && Objects.equals(altered, that.altered) + && Objects.equals(baseURL, that.baseURL) + && 
Objects.equals(identifier, that.identifier) + && Objects.equals(datestamp, that.datestamp) + && Objects.equals(metadataNamespace, that.metadataNamespace); + } - @Override - public int hashCode() { - return Objects.hash(harvestDate, altered, baseURL, identifier, datestamp, metadataNamespace); - } + @Override + public int hashCode() { + return Objects.hash(harvestDate, altered, baseURL, identifier, datestamp, metadataNamespace); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OtherResearchProduct.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OtherResearchProduct.java index 6cd803ec5..b04934c23 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OtherResearchProduct.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OtherResearchProduct.java @@ -1,58 +1,60 @@ + package eu.dnetlib.dhp.schema.oaf; -import eu.dnetlib.dhp.schema.common.ModelConstants; import java.io.Serializable; import java.util.List; +import eu.dnetlib.dhp.schema.common.ModelConstants; + public class OtherResearchProduct extends Result implements Serializable { - private List> contactperson; + private List> contactperson; - private List> contactgroup; + private List> contactgroup; - private List> tool; + private List> tool; - public OtherResearchProduct() { - setResulttype(ModelConstants.ORP_DEFAULT_RESULTTYPE); - } + public OtherResearchProduct() { + setResulttype(ModelConstants.ORP_DEFAULT_RESULTTYPE); + } - public List> getContactperson() { - return contactperson; - } + public List> getContactperson() { + return contactperson; + } - public void setContactperson(List> contactperson) { - this.contactperson = contactperson; - } + public void setContactperson(List> contactperson) { + this.contactperson = contactperson; + } - public List> getContactgroup() { - return contactgroup; - } + public List> getContactgroup() { + return contactgroup; + } - public void setContactgroup(List> contactgroup) { - this.contactgroup = contactgroup; - } + public void setContactgroup(List> contactgroup) { + this.contactgroup = contactgroup; + } - public List> getTool() { - return tool; - } + public List> getTool() { + return tool; + } - public void setTool(List> tool) { - this.tool = tool; - } + public void setTool(List> tool) { + this.tool = tool; + } - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); - if (!OtherResearchProduct.class.isAssignableFrom(e.getClass())) { - return; - } + if (!OtherResearchProduct.class.isAssignableFrom(e.getClass())) { + return; + } - OtherResearchProduct o = (OtherResearchProduct) e; + OtherResearchProduct o = (OtherResearchProduct) e; - contactperson = mergeLists(contactperson, o.getContactperson()); - contactgroup = mergeLists(contactgroup, o.getContactgroup()); - tool = mergeLists(tool, o.getTool()); - mergeOAFDataInfo(e); - } + contactperson = mergeLists(contactperson, o.getContactperson()); + contactgroup = mergeLists(contactgroup, o.getContactgroup()); + tool = mergeLists(tool, o.getTool()); + mergeOAFDataInfo(e); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java index 2187a8828..924c08cc9 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -5,335 +6,320 @@ import java.util.List; 
public class Project extends OafEntity implements Serializable { - private Field websiteurl; + private Field websiteurl; - private Field code; + private Field code; - private Field acronym; + private Field acronym; - private Field title; + private Field title; - private Field startdate; + private Field startdate; - private Field enddate; + private Field enddate; - private Field callidentifier; + private Field callidentifier; - private Field keywords; + private Field keywords; - private Field duration; + private Field duration; - private Field ecsc39; + private Field ecsc39; - private Field oamandatepublications; + private Field oamandatepublications; - private Field ecarticle29_3; + private Field ecarticle29_3; - private List subjects; + private List subjects; - private List> fundingtree; + private List> fundingtree; - private Qualifier contracttype; + private Qualifier contracttype; - private Field optional1; + private Field optional1; - private Field optional2; + private Field optional2; - private Field jsonextrainfo; + private Field jsonextrainfo; - private Field contactfullname; + private Field contactfullname; - private Field contactfax; + private Field contactfax; - private Field contactphone; + private Field contactphone; - private Field contactemail; + private Field contactemail; - private Field summary; + private Field summary; - private Field currency; + private Field currency; - private Float totalcost; + private Float totalcost; - private Float fundedamount; + private Float fundedamount; - public Field getWebsiteurl() { - return websiteurl; - } + public Field getWebsiteurl() { + return websiteurl; + } - public void setWebsiteurl(Field websiteurl) { - this.websiteurl = websiteurl; - } + public void setWebsiteurl(Field websiteurl) { + this.websiteurl = websiteurl; + } - public Field getCode() { - return code; - } + public Field getCode() { + return code; + } - public void setCode(Field code) { - this.code = code; - } + public void setCode(Field code) { + this.code = code; + } - public Field getAcronym() { - return acronym; - } + public Field getAcronym() { + return acronym; + } - public void setAcronym(Field acronym) { - this.acronym = acronym; - } + public void setAcronym(Field acronym) { + this.acronym = acronym; + } - public Field getTitle() { - return title; - } + public Field getTitle() { + return title; + } - public void setTitle(Field title) { - this.title = title; - } + public void setTitle(Field title) { + this.title = title; + } - public Field getStartdate() { - return startdate; - } + public Field getStartdate() { + return startdate; + } - public void setStartdate(Field startdate) { - this.startdate = startdate; - } + public void setStartdate(Field startdate) { + this.startdate = startdate; + } - public Field getEnddate() { - return enddate; - } + public Field getEnddate() { + return enddate; + } - public void setEnddate(Field enddate) { - this.enddate = enddate; - } + public void setEnddate(Field enddate) { + this.enddate = enddate; + } - public Field getCallidentifier() { - return callidentifier; - } + public Field getCallidentifier() { + return callidentifier; + } - public void setCallidentifier(Field callidentifier) { - this.callidentifier = callidentifier; - } + public void setCallidentifier(Field callidentifier) { + this.callidentifier = callidentifier; + } - public Field getKeywords() { - return keywords; - } + public Field getKeywords() { + return keywords; + } - public void setKeywords(Field keywords) { - this.keywords = keywords; - } + public void 
setKeywords(Field keywords) { + this.keywords = keywords; + } - public Field getDuration() { - return duration; - } + public Field getDuration() { + return duration; + } - public void setDuration(Field duration) { - this.duration = duration; - } + public void setDuration(Field duration) { + this.duration = duration; + } - public Field getEcsc39() { - return ecsc39; - } + public Field getEcsc39() { + return ecsc39; + } - public void setEcsc39(Field ecsc39) { - this.ecsc39 = ecsc39; - } + public void setEcsc39(Field ecsc39) { + this.ecsc39 = ecsc39; + } - public Field getOamandatepublications() { - return oamandatepublications; - } + public Field getOamandatepublications() { + return oamandatepublications; + } - public void setOamandatepublications(Field oamandatepublications) { - this.oamandatepublications = oamandatepublications; - } + public void setOamandatepublications(Field oamandatepublications) { + this.oamandatepublications = oamandatepublications; + } - public Field getEcarticle29_3() { - return ecarticle29_3; - } + public Field getEcarticle29_3() { + return ecarticle29_3; + } - public void setEcarticle29_3(Field ecarticle29_3) { - this.ecarticle29_3 = ecarticle29_3; - } + public void setEcarticle29_3(Field ecarticle29_3) { + this.ecarticle29_3 = ecarticle29_3; + } - public List getSubjects() { - return subjects; - } + public List getSubjects() { + return subjects; + } - public void setSubjects(List subjects) { - this.subjects = subjects; - } + public void setSubjects(List subjects) { + this.subjects = subjects; + } - public List> getFundingtree() { - return fundingtree; - } + public List> getFundingtree() { + return fundingtree; + } - public void setFundingtree(List> fundingtree) { - this.fundingtree = fundingtree; - } + public void setFundingtree(List> fundingtree) { + this.fundingtree = fundingtree; + } - public Qualifier getContracttype() { - return contracttype; - } + public Qualifier getContracttype() { + return contracttype; + } - public void setContracttype(Qualifier contracttype) { - this.contracttype = contracttype; - } + public void setContracttype(Qualifier contracttype) { + this.contracttype = contracttype; + } - public Field getOptional1() { - return optional1; - } + public Field getOptional1() { + return optional1; + } - public void setOptional1(Field optional1) { - this.optional1 = optional1; - } - - public Field getOptional2() { - return optional2; - } - - public void setOptional2(Field optional2) { - this.optional2 = optional2; - } - - public Field getJsonextrainfo() { - return jsonextrainfo; - } - - public void setJsonextrainfo(Field jsonextrainfo) { - this.jsonextrainfo = jsonextrainfo; - } - - public Field getContactfullname() { - return contactfullname; - } - - public void setContactfullname(Field contactfullname) { - this.contactfullname = contactfullname; - } - - public Field getContactfax() { - return contactfax; - } - - public void setContactfax(Field contactfax) { - this.contactfax = contactfax; - } - - public Field getContactphone() { - return contactphone; - } - - public void setContactphone(Field contactphone) { - this.contactphone = contactphone; - } - - public Field getContactemail() { - return contactemail; - } - - public void setContactemail(Field contactemail) { - this.contactemail = contactemail; - } - - public Field getSummary() { - return summary; - } - - public void setSummary(Field summary) { - this.summary = summary; - } - - public Field getCurrency() { - return currency; - } - - public void setCurrency(Field currency) { - this.currency = 
currency; - } - - public Float getTotalcost() { - return totalcost; - } - - public void setTotalcost(Float totalcost) { - this.totalcost = totalcost; - } - - public Float getFundedamount() { - return fundedamount; - } - - public void setFundedamount(Float fundedamount) { - this.fundedamount = fundedamount; - } - - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - - if (!Project.class.isAssignableFrom(e.getClass())) { - return; - } - - Project p = (Project) e; - - websiteurl = - p.getWebsiteurl() != null && compareTrust(this, e) < 0 ? p.getWebsiteurl() : websiteurl; - code = p.getCode() != null && compareTrust(this, e) < 0 ? p.getCode() : code; - acronym = p.getAcronym() != null && compareTrust(this, e) < 0 ? p.getAcronym() : acronym; - title = p.getTitle() != null && compareTrust(this, e) < 0 ? p.getTitle() : title; - startdate = - p.getStartdate() != null && compareTrust(this, e) < 0 ? p.getStartdate() : startdate; - enddate = p.getEnddate() != null && compareTrust(this, e) < 0 ? p.getEnddate() : enddate; - callidentifier = - p.getCallidentifier() != null && compareTrust(this, e) < 0 - ? p.getCallidentifier() - : callidentifier; - keywords = p.getKeywords() != null && compareTrust(this, e) < 0 ? p.getKeywords() : keywords; - duration = p.getDuration() != null && compareTrust(this, e) < 0 ? p.getDuration() : duration; - ecsc39 = p.getEcsc39() != null && compareTrust(this, e) < 0 ? p.getEcsc39() : ecsc39; - oamandatepublications = - p.getOamandatepublications() != null && compareTrust(this, e) < 0 - ? p.getOamandatepublications() - : oamandatepublications; - ecarticle29_3 = - p.getEcarticle29_3() != null && compareTrust(this, e) < 0 - ? p.getEcarticle29_3() - : ecarticle29_3; - subjects = mergeLists(subjects, p.getSubjects()); - fundingtree = mergeLists(fundingtree, p.getFundingtree()); - contracttype = - p.getContracttype() != null && compareTrust(this, e) < 0 - ? p.getContracttype() - : contracttype; - optional1 = - p.getOptional1() != null && compareTrust(this, e) < 0 ? p.getOptional1() : optional1; - optional2 = - p.getOptional2() != null && compareTrust(this, e) < 0 ? p.getOptional2() : optional2; - jsonextrainfo = - p.getJsonextrainfo() != null && compareTrust(this, e) < 0 - ? p.getJsonextrainfo() - : jsonextrainfo; - contactfullname = - p.getContactfullname() != null && compareTrust(this, e) < 0 - ? p.getContactfullname() - : contactfullname; - contactfax = - p.getContactfax() != null && compareTrust(this, e) < 0 ? p.getContactfax() : contactfax; - contactphone = - p.getContactphone() != null && compareTrust(this, e) < 0 - ? p.getContactphone() - : contactphone; - contactemail = - p.getContactemail() != null && compareTrust(this, e) < 0 - ? p.getContactemail() - : contactemail; - summary = p.getSummary() != null && compareTrust(this, e) < 0 ? p.getSummary() : summary; - currency = p.getCurrency() != null && compareTrust(this, e) < 0 ? p.getCurrency() : currency; - totalcost = - p.getTotalcost() != null && compareTrust(this, e) < 0 ? p.getTotalcost() : totalcost; - fundedamount = - p.getFundedamount() != null && compareTrust(this, e) < 0 - ? 
p.getFundedamount() - : fundedamount; - mergeOAFDataInfo(e); - } + public void setOptional1(Field optional1) { + this.optional1 = optional1; + } + + public Field getOptional2() { + return optional2; + } + + public void setOptional2(Field optional2) { + this.optional2 = optional2; + } + + public Field getJsonextrainfo() { + return jsonextrainfo; + } + + public void setJsonextrainfo(Field jsonextrainfo) { + this.jsonextrainfo = jsonextrainfo; + } + + public Field getContactfullname() { + return contactfullname; + } + + public void setContactfullname(Field contactfullname) { + this.contactfullname = contactfullname; + } + + public Field getContactfax() { + return contactfax; + } + + public void setContactfax(Field contactfax) { + this.contactfax = contactfax; + } + + public Field getContactphone() { + return contactphone; + } + + public void setContactphone(Field contactphone) { + this.contactphone = contactphone; + } + + public Field getContactemail() { + return contactemail; + } + + public void setContactemail(Field contactemail) { + this.contactemail = contactemail; + } + + public Field getSummary() { + return summary; + } + + public void setSummary(Field summary) { + this.summary = summary; + } + + public Field getCurrency() { + return currency; + } + + public void setCurrency(Field currency) { + this.currency = currency; + } + + public Float getTotalcost() { + return totalcost; + } + + public void setTotalcost(Float totalcost) { + this.totalcost = totalcost; + } + + public Float getFundedamount() { + return fundedamount; + } + + public void setFundedamount(Float fundedamount) { + this.fundedamount = fundedamount; + } + + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); + + if (!Project.class.isAssignableFrom(e.getClass())) { + return; + } + + Project p = (Project) e; + + websiteurl = p.getWebsiteurl() != null && compareTrust(this, e) < 0 ? p.getWebsiteurl() : websiteurl; + code = p.getCode() != null && compareTrust(this, e) < 0 ? p.getCode() : code; + acronym = p.getAcronym() != null && compareTrust(this, e) < 0 ? p.getAcronym() : acronym; + title = p.getTitle() != null && compareTrust(this, e) < 0 ? p.getTitle() : title; + startdate = p.getStartdate() != null && compareTrust(this, e) < 0 ? p.getStartdate() : startdate; + enddate = p.getEnddate() != null && compareTrust(this, e) < 0 ? p.getEnddate() : enddate; + callidentifier = p.getCallidentifier() != null && compareTrust(this, e) < 0 + ? p.getCallidentifier() + : callidentifier; + keywords = p.getKeywords() != null && compareTrust(this, e) < 0 ? p.getKeywords() : keywords; + duration = p.getDuration() != null && compareTrust(this, e) < 0 ? p.getDuration() : duration; + ecsc39 = p.getEcsc39() != null && compareTrust(this, e) < 0 ? p.getEcsc39() : ecsc39; + oamandatepublications = p.getOamandatepublications() != null && compareTrust(this, e) < 0 + ? p.getOamandatepublications() + : oamandatepublications; + ecarticle29_3 = p.getEcarticle29_3() != null && compareTrust(this, e) < 0 + ? p.getEcarticle29_3() + : ecarticle29_3; + subjects = mergeLists(subjects, p.getSubjects()); + fundingtree = mergeLists(fundingtree, p.getFundingtree()); + contracttype = p.getContracttype() != null && compareTrust(this, e) < 0 + ? p.getContracttype() + : contracttype; + optional1 = p.getOptional1() != null && compareTrust(this, e) < 0 ? p.getOptional1() : optional1; + optional2 = p.getOptional2() != null && compareTrust(this, e) < 0 ? 
p.getOptional2() : optional2; + jsonextrainfo = p.getJsonextrainfo() != null && compareTrust(this, e) < 0 + ? p.getJsonextrainfo() + : jsonextrainfo; + contactfullname = p.getContactfullname() != null && compareTrust(this, e) < 0 + ? p.getContactfullname() + : contactfullname; + contactfax = p.getContactfax() != null && compareTrust(this, e) < 0 ? p.getContactfax() : contactfax; + contactphone = p.getContactphone() != null && compareTrust(this, e) < 0 + ? p.getContactphone() + : contactphone; + contactemail = p.getContactemail() != null && compareTrust(this, e) < 0 + ? p.getContactemail() + : contactemail; + summary = p.getSummary() != null && compareTrust(this, e) < 0 ? p.getSummary() : summary; + currency = p.getCurrency() != null && compareTrust(this, e) < 0 ? p.getCurrency() : currency; + totalcost = p.getTotalcost() != null && compareTrust(this, e) < 0 ? p.getTotalcost() : totalcost; + fundedamount = p.getFundedamount() != null && compareTrust(this, e) < 0 + ? p.getFundedamount() + : fundedamount; + mergeOAFDataInfo(e); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java index 9227df6ee..3058c262b 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java @@ -1,36 +1,39 @@ + package eu.dnetlib.dhp.schema.oaf; -import eu.dnetlib.dhp.schema.common.ModelConstants; import java.io.Serializable; +import eu.dnetlib.dhp.schema.common.ModelConstants; + public class Publication extends Result implements Serializable { - // publication specific - private Journal journal; + // publication specific + private Journal journal; - public Publication() { - setResulttype(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE); - } + public Publication() { + setResulttype(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE); + } - public Journal getJournal() { - return journal; - } + public Journal getJournal() { + return journal; + } - public void setJournal(Journal journal) { - this.journal = journal; - } + public void setJournal(Journal journal) { + this.journal = journal; + } - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); - if (!Publication.class.isAssignableFrom(e.getClass())) { - return; - } + if (!Publication.class.isAssignableFrom(e.getClass())) { + return; + } - Publication p = (Publication) e; + Publication p = (Publication) e; - if (p.getJournal() != null && compareTrust(this, e) < 0) journal = p.getJournal(); - mergeOAFDataInfo(e); - } + if (p.getJournal() != null && compareTrust(this, e) < 0) + journal = p.getJournal(); + mergeOAFDataInfo(e); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java index 003d4a7a4..87ecb55f1 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java @@ -1,80 +1,87 @@ + package eu.dnetlib.dhp.schema.oaf; -import com.fasterxml.jackson.annotation.JsonIgnore; import java.io.Serializable; + import org.apache.commons.lang3.StringUtils; +import com.fasterxml.jackson.annotation.JsonIgnore; + public class Qualifier implements Serializable { - private String classid; - private String classname; - private String schemeid; - private String schemename; + private String classid; + private String 
classname; + private String schemeid; + private String schemename; - public String getClassid() { - return classid; - } + public String getClassid() { + return classid; + } - public void setClassid(String classid) { - this.classid = classid; - } + public void setClassid(String classid) { + this.classid = classid; + } - public String getClassname() { - return classname; - } + public String getClassname() { + return classname; + } - public void setClassname(String classname) { - this.classname = classname; - } + public void setClassname(String classname) { + this.classname = classname; + } - public String getSchemeid() { - return schemeid; - } + public String getSchemeid() { + return schemeid; + } - public void setSchemeid(String schemeid) { - this.schemeid = schemeid; - } + public void setSchemeid(String schemeid) { + this.schemeid = schemeid; + } - public String getSchemename() { - return schemename; - } + public String getSchemename() { + return schemename; + } - public void setSchemename(String schemename) { - this.schemename = schemename; - } + public void setSchemename(String schemename) { + this.schemename = schemename; + } - public String toComparableString() { - return isBlank() - ? "" - : String.format( - "%s::%s::%s::%s", - classid != null ? classid : "", - classname != null ? classname : "", - schemeid != null ? schemeid : "", - schemename != null ? schemename : ""); - } + public String toComparableString() { + return isBlank() + ? "" + : String + .format( + "%s::%s::%s::%s", + classid != null ? classid : "", + classname != null ? classname : "", + schemeid != null ? schemeid : "", + schemename != null ? schemename : ""); + } - @JsonIgnore - public boolean isBlank() { - return StringUtils.isBlank(classid) - && StringUtils.isBlank(classname) - && StringUtils.isBlank(schemeid) - && StringUtils.isBlank(schemename); - } + @JsonIgnore + public boolean isBlank() { + return StringUtils.isBlank(classid) + && StringUtils.isBlank(classname) + && StringUtils.isBlank(schemeid) + && StringUtils.isBlank(schemename); + } - @Override - public int hashCode() { - return toComparableString().hashCode(); - } + @Override + public int hashCode() { + return toComparableString().hashCode(); + } - @Override - public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; - Qualifier other = (Qualifier) obj; + Qualifier other = (Qualifier) obj; - return toComparableString().equals(other.toComparableString()); - } + return toComparableString().equals(other.toComparableString()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java index 197adfb81..2c282c29e 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import static com.google.common.base.Preconditions.checkArgument; @@ -8,91 +9,96 @@ import java.util.stream.Stream; public class Relation extends Oaf { - private String relType; + private String relType; - private String subRelType; + private String subRelType; - private String relClass; + private String relClass; - private String source; + private String source; - private String target; + private 
String target; - public String getRelType() { - return relType; - } + public String getRelType() { + return relType; + } - public void setRelType(final String relType) { - this.relType = relType; - } + public void setRelType(final String relType) { + this.relType = relType; + } - public String getSubRelType() { - return subRelType; - } + public String getSubRelType() { + return subRelType; + } - public void setSubRelType(final String subRelType) { - this.subRelType = subRelType; - } + public void setSubRelType(final String subRelType) { + this.subRelType = subRelType; + } - public String getRelClass() { - return relClass; - } + public String getRelClass() { + return relClass; + } - public void setRelClass(final String relClass) { - this.relClass = relClass; - } + public void setRelClass(final String relClass) { + this.relClass = relClass; + } - public String getSource() { - return source; - } + public String getSource() { + return source; + } - public void setSource(final String source) { - this.source = source; - } + public void setSource(final String source) { + this.source = source; + } - public String getTarget() { - return target; - } + public String getTarget() { + return target; + } - public void setTarget(final String target) { - this.target = target; - } + public void setTarget(final String target) { + this.target = target; + } - public void mergeFrom(final Relation r) { + public void mergeFrom(final Relation r) { - checkArgument(Objects.equals(getSource(), r.getSource()), "source ids must be equal"); - checkArgument(Objects.equals(getTarget(), r.getTarget()), "target ids must be equal"); - checkArgument(Objects.equals(getRelType(), r.getRelType()), "relType(s) must be equal"); - checkArgument( - Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal"); - checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal"); + checkArgument(Objects.equals(getSource(), r.getSource()), "source ids must be equal"); + checkArgument(Objects.equals(getTarget(), r.getTarget()), "target ids must be equal"); + checkArgument(Objects.equals(getRelType(), r.getRelType()), "relType(s) must be equal"); + checkArgument( + Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal"); + checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal"); - setCollectedfrom( - Stream.concat( - Optional.ofNullable(getCollectedfrom()) - .map(Collection::stream) - .orElse(Stream.empty()), - Optional.ofNullable(r.getCollectedfrom()) - .map(Collection::stream) - .orElse(Stream.empty())) - .distinct() // relies on KeyValue.equals - .collect(Collectors.toList())); - } + setCollectedfrom( + Stream + .concat( + Optional + .ofNullable(getCollectedfrom()) + .map(Collection::stream) + .orElse(Stream.empty()), + Optional + .ofNullable(r.getCollectedfrom()) + .map(Collection::stream) + .orElse(Stream.empty())) + .distinct() // relies on KeyValue.equals + .collect(Collectors.toList())); + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - Relation relation = (Relation) o; - return relType.equals(relation.relType) - && subRelType.equals(relation.subRelType) - && relClass.equals(relation.relClass) - && source.equals(relation.source) - && target.equals(relation.target); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + Relation relation 
= (Relation) o; + return relType.equals(relation.relType) + && subRelType.equals(relation.subRelType) + && relClass.equals(relation.relClass) + && source.equals(relation.source) + && target.equals(relation.target); + } - @Override - public int hashCode() { - return Objects.hash(relType, subRelType, relClass, source, target, collectedfrom); - } + @Override + public int hashCode() { + return Objects.hash(relType, subRelType, relClass, source, target, collectedfrom); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java index 5da50b921..711b1ca68 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; @@ -6,286 +7,291 @@ import java.util.List; public class Result extends OafEntity implements Serializable { - private List author; + private List author; - // resulttype allows subclassing results into publications | datasets | software - private Qualifier resulttype; + // resulttype allows subclassing results into publications | datasets | software + private Qualifier resulttype; - // common fields - private Qualifier language; + // common fields + private Qualifier language; - private List country; + private List country; - private List subject; + private List subject; - private List title; + private List title; - private List relevantdate; + private List relevantdate; - private List> description; + private List> description; - private Field dateofacceptance; + private Field dateofacceptance; - private Field publisher; + private Field publisher; - private Field embargoenddate; + private Field embargoenddate; - private List> source; + private List> source; - private List> fulltext; // remove candidate + private List> fulltext; // remove candidate - private List> format; + private List> format; - private List> contributor; + private List> contributor; - private Qualifier resourcetype; + private Qualifier resourcetype; - private List> coverage; + private List> coverage; - private Qualifier bestaccessright; + private Qualifier bestaccessright; - private List context; + private List context; - private List externalReference; + private List externalReference; - private List instance; + private List instance; - public List getAuthor() { - return author; - } + public List getAuthor() { + return author; + } - public void setAuthor(List author) { - this.author = author; - } + public void setAuthor(List author) { + this.author = author; + } - public Qualifier getResulttype() { - return resulttype; - } + public Qualifier getResulttype() { + return resulttype; + } - public void setResulttype(Qualifier resulttype) { - this.resulttype = resulttype; - } + public void setResulttype(Qualifier resulttype) { + this.resulttype = resulttype; + } - public Qualifier getLanguage() { - return language; - } + public Qualifier getLanguage() { + return language; + } - public void setLanguage(Qualifier language) { - this.language = language; - } + public void setLanguage(Qualifier language) { + this.language = language; + } - public List getCountry() { - return country; - } + public List getCountry() { + return country; + } - public void setCountry(List country) { - this.country = country; - } + public void setCountry(List country) { + this.country = country; + } - public List getSubject() { - return subject; - } + public List getSubject() { + return subject; + 
} - public void setSubject(List subject) { - this.subject = subject; - } + public void setSubject(List subject) { + this.subject = subject; + } - public List getTitle() { - return title; - } + public List getTitle() { + return title; + } - public void setTitle(List title) { - this.title = title; - } + public void setTitle(List title) { + this.title = title; + } - public List getRelevantdate() { - return relevantdate; - } + public List getRelevantdate() { + return relevantdate; + } - public void setRelevantdate(List relevantdate) { - this.relevantdate = relevantdate; - } + public void setRelevantdate(List relevantdate) { + this.relevantdate = relevantdate; + } - public List> getDescription() { - return description; - } + public List> getDescription() { + return description; + } - public void setDescription(List> description) { - this.description = description; - } + public void setDescription(List> description) { + this.description = description; + } - public Field getDateofacceptance() { - return dateofacceptance; - } + public Field getDateofacceptance() { + return dateofacceptance; + } - public void setDateofacceptance(Field dateofacceptance) { - this.dateofacceptance = dateofacceptance; - } + public void setDateofacceptance(Field dateofacceptance) { + this.dateofacceptance = dateofacceptance; + } - public Field getPublisher() { - return publisher; - } + public Field getPublisher() { + return publisher; + } - public void setPublisher(Field publisher) { - this.publisher = publisher; - } + public void setPublisher(Field publisher) { + this.publisher = publisher; + } - public Field getEmbargoenddate() { - return embargoenddate; - } + public Field getEmbargoenddate() { + return embargoenddate; + } - public void setEmbargoenddate(Field embargoenddate) { - this.embargoenddate = embargoenddate; - } + public void setEmbargoenddate(Field embargoenddate) { + this.embargoenddate = embargoenddate; + } - public List> getSource() { - return source; - } + public List> getSource() { + return source; + } - public void setSource(List> source) { - this.source = source; - } + public void setSource(List> source) { + this.source = source; + } - public List> getFulltext() { - return fulltext; - } + public List> getFulltext() { + return fulltext; + } - public void setFulltext(List> fulltext) { - this.fulltext = fulltext; - } + public void setFulltext(List> fulltext) { + this.fulltext = fulltext; + } - public List> getFormat() { - return format; - } + public List> getFormat() { + return format; + } - public void setFormat(List> format) { - this.format = format; - } + public void setFormat(List> format) { + this.format = format; + } - public List> getContributor() { - return contributor; - } + public List> getContributor() { + return contributor; + } - public void setContributor(List> contributor) { - this.contributor = contributor; - } + public void setContributor(List> contributor) { + this.contributor = contributor; + } - public Qualifier getResourcetype() { - return resourcetype; - } + public Qualifier getResourcetype() { + return resourcetype; + } - public void setResourcetype(Qualifier resourcetype) { - this.resourcetype = resourcetype; - } + public void setResourcetype(Qualifier resourcetype) { + this.resourcetype = resourcetype; + } - public List> getCoverage() { - return coverage; - } + public List> getCoverage() { + return coverage; + } - public void setCoverage(List> coverage) { - this.coverage = coverage; - } + public void setCoverage(List> coverage) { + this.coverage = coverage; + } - public Qualifier 
getBestaccessright() { - return bestaccessright; - } + public Qualifier getBestaccessright() { + return bestaccessright; + } - public void setBestaccessright(Qualifier bestaccessright) { - this.bestaccessright = bestaccessright; - } + public void setBestaccessright(Qualifier bestaccessright) { + this.bestaccessright = bestaccessright; + } - public List getContext() { - return context; - } + public List getContext() { + return context; + } - public void setContext(List context) { - this.context = context; - } + public void setContext(List context) { + this.context = context; + } - public List getExternalReference() { - return externalReference; - } + public List getExternalReference() { + return externalReference; + } - public void setExternalReference(List externalReference) { - this.externalReference = externalReference; - } + public void setExternalReference(List externalReference) { + this.externalReference = externalReference; + } - public List getInstance() { - return instance; - } + public List getInstance() { + return instance; + } - public void setInstance(List instance) { - this.instance = instance; - } + public void setInstance(List instance) { + this.instance = instance; + } - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); - if (!Result.class.isAssignableFrom(e.getClass())) { - return; - } + if (!Result.class.isAssignableFrom(e.getClass())) { + return; + } - Result r = (Result) e; + Result r = (Result) e; - instance = mergeLists(instance, r.getInstance()); + instance = mergeLists(instance, r.getInstance()); - if (r.getBestaccessright() != null && compareTrust(this, r) < 0) - bestaccessright = r.getBestaccessright(); + if (r.getBestaccessright() != null && compareTrust(this, r) < 0) + bestaccessright = r.getBestaccessright(); - if (r.getResulttype() != null && compareTrust(this, r) < 0) resulttype = r.getResulttype(); + if (r.getResulttype() != null && compareTrust(this, r) < 0) + resulttype = r.getResulttype(); - if (r.getLanguage() != null && compareTrust(this, r) < 0) language = r.getLanguage(); + if (r.getLanguage() != null && compareTrust(this, r) < 0) + language = r.getLanguage(); - country = mergeLists(country, r.getCountry()); + country = mergeLists(country, r.getCountry()); - subject = mergeLists(subject, r.getSubject()); + subject = mergeLists(subject, r.getSubject()); - title = mergeLists(title, r.getTitle()); + title = mergeLists(title, r.getTitle()); - relevantdate = mergeLists(relevantdate, r.getRelevantdate()); + relevantdate = mergeLists(relevantdate, r.getRelevantdate()); - description = longestLists(description, r.getDescription()); + description = longestLists(description, r.getDescription()); - if (r.getPublisher() != null && compareTrust(this, r) < 0) publisher = r.getPublisher(); + if (r.getPublisher() != null && compareTrust(this, r) < 0) + publisher = r.getPublisher(); - if (r.getEmbargoenddate() != null && compareTrust(this, r) < 0) - embargoenddate = r.getEmbargoenddate(); + if (r.getEmbargoenddate() != null && compareTrust(this, r) < 0) + embargoenddate = r.getEmbargoenddate(); - source = mergeLists(source, r.getSource()); + source = mergeLists(source, r.getSource()); - fulltext = mergeLists(fulltext, r.getFulltext()); + fulltext = mergeLists(fulltext, r.getFulltext()); - format = mergeLists(format, r.getFormat()); + format = mergeLists(format, r.getFormat()); - contributor = mergeLists(contributor, r.getContributor()); + contributor = 
mergeLists(contributor, r.getContributor()); - if (r.getResourcetype() != null) resourcetype = r.getResourcetype(); + if (r.getResourcetype() != null) + resourcetype = r.getResourcetype(); - coverage = mergeLists(coverage, r.getCoverage()); + coverage = mergeLists(coverage, r.getCoverage()); - context = mergeLists(context, r.getContext()); + context = mergeLists(context, r.getContext()); - externalReference = mergeLists(externalReference, r.getExternalReference()); - } + externalReference = mergeLists(externalReference, r.getExternalReference()); + } - private List> longestLists(List> a, List> b) { - if (a == null || b == null) return a == null ? b : a; - if (a.size() == b.size()) { - int msa = - a.stream() - .filter(i -> i.getValue() != null) - .map(i -> i.getValue().length()) - .max(Comparator.naturalOrder()) - .orElse(0); - int msb = - b.stream() - .filter(i -> i.getValue() != null) - .map(i -> i.getValue().length()) - .max(Comparator.naturalOrder()) - .orElse(0); - return msa > msb ? a : b; - } - return a.size() > b.size() ? a : b; - } + private List> longestLists(List> a, List> b) { + if (a == null || b == null) + return a == null ? b : a; + if (a.size() == b.size()) { + int msa = a + .stream() + .filter(i -> i.getValue() != null) + .map(i -> i.getValue().length()) + .max(Comparator.naturalOrder()) + .orElse(0); + int msb = b + .stream() + .filter(i -> i.getValue() != null) + .map(i -> i.getValue().length()) + .max(Comparator.naturalOrder()) + .orElse(0); + return msa > msb ? a : b; + } + return a.size() > b.size() ? a : b; + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Software.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Software.java index ffb7e03f7..40332bf53 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Software.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Software.java @@ -1,78 +1,78 @@ + package eu.dnetlib.dhp.schema.oaf; -import eu.dnetlib.dhp.schema.common.ModelConstants; import java.io.Serializable; import java.util.List; +import eu.dnetlib.dhp.schema.common.ModelConstants; + public class Software extends Result implements Serializable { - private List> documentationUrl; + private List> documentationUrl; - private List license; + private List license; - private Field codeRepositoryUrl; + private Field codeRepositoryUrl; - private Qualifier programmingLanguage; + private Qualifier programmingLanguage; - public Software() { - setResulttype(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE); - } + public Software() { + setResulttype(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE); + } - public List> getDocumentationUrl() { - return documentationUrl; - } + public List> getDocumentationUrl() { + return documentationUrl; + } - public void setDocumentationUrl(List> documentationUrl) { - this.documentationUrl = documentationUrl; - } + public void setDocumentationUrl(List> documentationUrl) { + this.documentationUrl = documentationUrl; + } - public List getLicense() { - return license; - } + public List getLicense() { + return license; + } - public void setLicense(List license) { - this.license = license; - } + public void setLicense(List license) { + this.license = license; + } - public Field getCodeRepositoryUrl() { - return codeRepositoryUrl; - } + public Field getCodeRepositoryUrl() { + return codeRepositoryUrl; + } - public void setCodeRepositoryUrl(Field codeRepositoryUrl) { - this.codeRepositoryUrl = codeRepositoryUrl; - } + public void setCodeRepositoryUrl(Field codeRepositoryUrl) { + this.codeRepositoryUrl = 
codeRepositoryUrl; + } - public Qualifier getProgrammingLanguage() { - return programmingLanguage; - } + public Qualifier getProgrammingLanguage() { + return programmingLanguage; + } - public void setProgrammingLanguage(Qualifier programmingLanguage) { - this.programmingLanguage = programmingLanguage; - } + public void setProgrammingLanguage(Qualifier programmingLanguage) { + this.programmingLanguage = programmingLanguage; + } - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); - if (!Software.class.isAssignableFrom(e.getClass())) { - return; - } + if (!Software.class.isAssignableFrom(e.getClass())) { + return; + } - final Software s = (Software) e; - documentationUrl = mergeLists(documentationUrl, s.getDocumentationUrl()); + final Software s = (Software) e; + documentationUrl = mergeLists(documentationUrl, s.getDocumentationUrl()); - license = mergeLists(license, s.getLicense()); + license = mergeLists(license, s.getLicense()); - codeRepositoryUrl = - s.getCodeRepositoryUrl() != null && compareTrust(this, s) < 0 - ? s.getCodeRepositoryUrl() - : codeRepositoryUrl; + codeRepositoryUrl = s.getCodeRepositoryUrl() != null && compareTrust(this, s) < 0 + ? s.getCodeRepositoryUrl() + : codeRepositoryUrl; - programmingLanguage = - s.getProgrammingLanguage() != null && compareTrust(this, s) < 0 - ? s.getProgrammingLanguage() - : programmingLanguage; + programmingLanguage = s.getProgrammingLanguage() != null && compareTrust(this, s) < 0 + ? s.getProgrammingLanguage() + : programmingLanguage; - mergeOAFDataInfo(e); - } + mergeOAFDataInfo(e); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java index 2e77389a3..1fa0de0be 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java @@ -1,56 +1,60 @@ + package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; public class StructuredProperty implements Serializable { - private String value; + private String value; - private Qualifier qualifier; + private Qualifier qualifier; - private DataInfo dataInfo; + private DataInfo dataInfo; - public String getValue() { - return value; - } + public String getValue() { + return value; + } - public void setValue(String value) { - this.value = value; - } + public void setValue(String value) { + this.value = value; + } - public Qualifier getQualifier() { - return qualifier; - } + public Qualifier getQualifier() { + return qualifier; + } - public void setQualifier(Qualifier qualifier) { - this.qualifier = qualifier; - } + public void setQualifier(Qualifier qualifier) { + this.qualifier = qualifier; + } - public DataInfo getDataInfo() { - return dataInfo; - } + public DataInfo getDataInfo() { + return dataInfo; + } - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } + public void setDataInfo(DataInfo dataInfo) { + this.dataInfo = dataInfo; + } - public String toComparableString() { - return value != null ? value.toLowerCase() : ""; - } + public String toComparableString() { + return value != null ? 
value.toLowerCase() : ""; + } - @Override - public int hashCode() { - return toComparableString().hashCode(); - } + @Override + public int hashCode() { + return toComparableString().hashCode(); + } - @Override - public boolean equals(Object obj) { - if (this == obj) return true; - if (obj == null) return false; - if (getClass() != obj.getClass()) return false; + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; - StructuredProperty other = (StructuredProperty) obj; + StructuredProperty other = (StructuredProperty) obj; - return toComparableString().equals(other.toComparableString()); - } + return toComparableString().equals(other.toComparableString()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIDataset.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIDataset.java index e1569787b..421b4ecaa 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIDataset.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIDataset.java @@ -1,83 +1,89 @@ + package eu.dnetlib.dhp.schema.scholexplorer; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.OafEntity; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; + import org.apache.commons.lang3.StringUtils; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.OafEntity; + public class DLIDataset extends Dataset { - private String originalObjIdentifier; + private String originalObjIdentifier; - private List<ProvenaceInfo> dlicollectedfrom; + private List<ProvenaceInfo> dlicollectedfrom; - private String completionStatus; + private String completionStatus; - public String getCompletionStatus() { - return completionStatus; - } + public String getCompletionStatus() { + return completionStatus; + } - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } + public void setCompletionStatus(String completionStatus) { + this.completionStatus = completionStatus; + } - public List<ProvenaceInfo> getDlicollectedfrom() { - return dlicollectedfrom; - } + public List<ProvenaceInfo> getDlicollectedfrom() { + return dlicollectedfrom; + } - public void setDlicollectedfrom(List<ProvenaceInfo> dlicollectedfrom) { - this.dlicollectedfrom = dlicollectedfrom; - } + public void setDlicollectedfrom(List<ProvenaceInfo> dlicollectedfrom) { + this.dlicollectedfrom = dlicollectedfrom; + } - public String getOriginalObjIdentifier() { - return originalObjIdentifier; - } + public String getOriginalObjIdentifier() { + return originalObjIdentifier; + } - public void setOriginalObjIdentifier(String originalObjIdentifier) { - this.originalObjIdentifier = originalObjIdentifier; - } + public void setOriginalObjIdentifier(String originalObjIdentifier) { + this.originalObjIdentifier = originalObjIdentifier; + } - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - DLIDataset p = (DLIDataset) e; - if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus)) - completionStatus = p.completionStatus; - if ("complete".equalsIgnoreCase(p.completionStatus)) completionStatus = "complete"; - dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom()); - } + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); + DLIDataset p = (DLIDataset) e; + if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus)) + 
completionStatus = p.completionStatus; + if ("complete".equalsIgnoreCase(p.completionStatus)) + completionStatus = "complete"; + dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom()); + } - private List mergeProvenance( - final List a, final List b) { - Map result = new HashMap<>(); - if (a != null) - a.forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + private List mergeProvenance( + final List a, final List b) { + Map result = new HashMap<>(); + if (a != null) + a + .forEach( + p -> { + if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { + if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); - if (b != null) - b.forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); + if (b != null) + b + .forEach( + p -> { + if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { + if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); - return new ArrayList<>(result.values()); - } + return new ArrayList<>(result.values()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIPublication.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIPublication.java index 2cfb6515c..c899a899c 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIPublication.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIPublication.java @@ -1,81 +1,87 @@ + package eu.dnetlib.dhp.schema.scholexplorer; +import java.io.Serializable; +import java.util.*; + +import org.apache.commons.lang3.StringUtils; + import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.Publication; -import java.io.Serializable; -import java.util.*; -import org.apache.commons.lang3.StringUtils; public class DLIPublication extends Publication implements Serializable { - private String originalObjIdentifier; + private String originalObjIdentifier; - private List dlicollectedfrom; + private List dlicollectedfrom; - private String completionStatus; + private String completionStatus; - public String getCompletionStatus() { - return completionStatus; - } + public String getCompletionStatus() { + return completionStatus; + } - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } + public void setCompletionStatus(String completionStatus) { + this.completionStatus = completionStatus; + } - public List 
getDlicollectedfrom() { - return dlicollectedfrom; - } + public List getDlicollectedfrom() { + return dlicollectedfrom; + } - public void setDlicollectedfrom(List dlicollectedfrom) { - this.dlicollectedfrom = dlicollectedfrom; - } + public void setDlicollectedfrom(List dlicollectedfrom) { + this.dlicollectedfrom = dlicollectedfrom; + } - public String getOriginalObjIdentifier() { - return originalObjIdentifier; - } + public String getOriginalObjIdentifier() { + return originalObjIdentifier; + } - public void setOriginalObjIdentifier(String originalObjIdentifier) { - this.originalObjIdentifier = originalObjIdentifier; - } + public void setOriginalObjIdentifier(String originalObjIdentifier) { + this.originalObjIdentifier = originalObjIdentifier; + } - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - DLIPublication p = (DLIPublication) e; - if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus)) - completionStatus = p.completionStatus; - if ("complete".equalsIgnoreCase(p.completionStatus)) completionStatus = "complete"; - dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom()); - } + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); + DLIPublication p = (DLIPublication) e; + if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus)) + completionStatus = p.completionStatus; + if ("complete".equalsIgnoreCase(p.completionStatus)) + completionStatus = "complete"; + dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom()); + } - private List mergeProvenance( - final List a, final List b) { - Map result = new HashMap<>(); - if (a != null) - a.forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + private List mergeProvenance( + final List a, final List b) { + Map result = new HashMap<>(); + if (a != null) + a + .forEach( + p -> { + if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { + if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); - if (b != null) - b.forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); + if (b != null) + b + .forEach( + p -> { + if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { + if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); - return new ArrayList<>(result.values()); - } + return new ArrayList<>(result.values()); + } } diff --git 
a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIRelation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIRelation.java index b58483cbb..d2d2089c0 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIRelation.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIRelation.java @@ -1,15 +1,16 @@ + package eu.dnetlib.dhp.schema.scholexplorer; import eu.dnetlib.dhp.schema.oaf.Relation; public class DLIRelation extends Relation { - private String dateOfCollection; + private String dateOfCollection; - public String getDateOfCollection() { - return dateOfCollection; - } + public String getDateOfCollection() { + return dateOfCollection; + } - public void setDateOfCollection(String dateOfCollection) { - this.dateOfCollection = dateOfCollection; - } + public void setDateOfCollection(String dateOfCollection) { + this.dateOfCollection = dateOfCollection; + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIUnknown.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIUnknown.java index 6a58ab54f..e9b670d03 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIUnknown.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIUnknown.java @@ -1,109 +1,115 @@ + package eu.dnetlib.dhp.schema.scholexplorer; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; + import org.apache.commons.lang3.StringUtils; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + public class DLIUnknown extends Oaf implements Serializable { - private String id; + private String id; - private List<StructuredProperty> pid; + private List<StructuredProperty> pid; - private String dateofcollection; + private String dateofcollection; - private String dateoftransformation; + private String dateoftransformation; - private List<ProvenaceInfo> dlicollectedfrom; + private List<ProvenaceInfo> dlicollectedfrom; - private String completionStatus = "incomplete"; + private String completionStatus = "incomplete"; - public String getCompletionStatus() { - return completionStatus; - } + public String getCompletionStatus() { + return completionStatus; + } - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } + public void setCompletionStatus(String completionStatus) { + this.completionStatus = completionStatus; + } - public List<ProvenaceInfo> getDlicollectedfrom() { - return dlicollectedfrom; - } + public List<ProvenaceInfo> getDlicollectedfrom() { + return dlicollectedfrom; + } - public void setDlicollectedfrom(List<ProvenaceInfo> dlicollectedfrom) { - this.dlicollectedfrom = dlicollectedfrom; - } + public void setDlicollectedfrom(List<ProvenaceInfo> dlicollectedfrom) { + this.dlicollectedfrom = dlicollectedfrom; + } - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public List<StructuredProperty> getPid() { - return pid; - } + public List<StructuredProperty> getPid() { + return pid; + } - public void setPid(List<StructuredProperty> pid) { - this.pid = pid; - } + public void setPid(List<StructuredProperty> pid) { + this.pid = pid; + } - public String getDateofcollection() { - return dateofcollection; - } + public String getDateofcollection() { + return dateofcollection; + } - public void setDateofcollection(String dateofcollection) { - this.dateofcollection = 
dateofcollection; - } + public void setDateofcollection(String dateofcollection) { + this.dateofcollection = dateofcollection; + } - public String getDateoftransformation() { - return dateoftransformation; - } + public String getDateoftransformation() { + return dateoftransformation; + } - public void setDateoftransformation(String dateoftransformation) { - this.dateoftransformation = dateoftransformation; - } + public void setDateoftransformation(String dateoftransformation) { + this.dateoftransformation = dateoftransformation; + } - public void mergeFrom(DLIUnknown p) { - if ("complete".equalsIgnoreCase(p.completionStatus)) completionStatus = "complete"; - dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom()); - } + public void mergeFrom(DLIUnknown p) { + if ("complete".equalsIgnoreCase(p.completionStatus)) + completionStatus = "complete"; + dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom()); + } - private List mergeProvenance( - final List a, final List b) { - Map result = new HashMap<>(); - if (a != null) - a.forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + private List mergeProvenance( + final List a, final List b) { + Map result = new HashMap<>(); + if (a != null) + a + .forEach( + p -> { + if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { + if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); - if (b != null) - b.forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); + if (b != null) + b + .forEach( + p -> { + if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { + if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) + && StringUtils.isNotBlank(p.getCompletionStatus())) { + result.put(p.getId(), p); + } - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); + } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) + result.put(p.getId(), p); + }); - return new ArrayList<>(result.values()); - } + return new ArrayList<>(result.values()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/ProvenaceInfo.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/ProvenaceInfo.java index 52f7161b9..b1188f064 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/ProvenaceInfo.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/ProvenaceInfo.java @@ -1,46 +1,47 @@ + package eu.dnetlib.dhp.schema.scholexplorer; import java.io.Serializable; public class ProvenaceInfo implements Serializable { - private String id; + private String id; - private String name; + private String name; - private String completionStatus; + 
private String completionStatus; - private String collectionMode = "collected"; + private String collectionMode = "collected"; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) { + this.name = name; + } - public String getCompletionStatus() { - return completionStatus; - } + public String getCompletionStatus() { + return completionStatus; + } - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } + public void setCompletionStatus(String completionStatus) { + this.completionStatus = completionStatus; + } - public String getCollectionMode() { - return collectionMode; - } + public String getCollectionMode() { + return collectionMode; + } - public void setCollectionMode(String collectionMode) { - this.collectionMode = collectionMode; - } + public void setCollectionMode(String collectionMode) { + this.collectionMode = collectionMode; + } } diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java index 482c1c223..4d31591a0 100644 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java @@ -1,36 +1,40 @@ + package eu.dnetlib.dhp.schema.action; import static org.junit.jupiter.api.Assertions.*; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.schema.oaf.Relation; import java.io.IOException; + import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.Test; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Relation; + /** @author claudio.atzori */ public class AtomicActionTest { - @Test - public void serializationTest() throws IOException { + @Test + public void serializationTest() throws IOException { - Relation rel = new Relation(); - rel.setSource("1"); - rel.setTarget("2"); - rel.setRelType("resultResult"); - rel.setSubRelType("dedup"); - rel.setRelClass("merges"); + Relation rel = new Relation(); + rel.setSource("1"); + rel.setTarget("2"); + rel.setRelType("resultResult"); + rel.setSubRelType("dedup"); + rel.setRelClass("merges"); - AtomicAction aa1 = new AtomicAction(Relation.class, rel); + AtomicAction aa1 = new AtomicAction(Relation.class, rel); - final ObjectMapper mapper = new ObjectMapper(); - String json = mapper.writeValueAsString(aa1); + final ObjectMapper mapper = new ObjectMapper(); + String json = mapper.writeValueAsString(aa1); - assertTrue(StringUtils.isNotBlank(json)); + assertTrue(StringUtils.isNotBlank(json)); - AtomicAction aa2 = mapper.readValue(json, AtomicAction.class); + AtomicAction aa2 = mapper.readValue(json, AtomicAction.class); - assertEquals(aa1.getClazz(), aa2.getClazz()); - assertEquals(aa1.getPayload(), aa2.getPayload()); - } + assertEquals(aa1.getClazz(), aa2.getClazz()); + assertEquals(aa1.getPayload(), aa2.getPayload()); + } } diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/common/ModelSupportTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/common/ModelSupportTest.java index 3e07ea87c..73e8c47ff 100644 --- 
a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/common/ModelSupportTest.java +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/common/ModelSupportTest.java @@ -1,35 +1,37 @@ + package eu.dnetlib.dhp.schema.common; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Result; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Result; + public class ModelSupportTest { - @Nested - class IsSubClass { + @Nested + class IsSubClass { - @Test - public void shouldReturnFalseWhenSubClassDoesNotExtendSuperClass() { - // when - Boolean result = ModelSupport.isSubClass(Relation.class, OafEntity.class); + @Test + public void shouldReturnFalseWhenSubClassDoesNotExtendSuperClass() { + // when + Boolean result = ModelSupport.isSubClass(Relation.class, OafEntity.class); - // then - assertFalse(result); - } + // then + assertFalse(result); + } - @Test - public void shouldReturnTrueWhenSubClassExtendsSuperClass() { - // when - Boolean result = ModelSupport.isSubClass(Result.class, OafEntity.class); + @Test + public void shouldReturnTrueWhenSubClassExtendsSuperClass() { + // when + Boolean result = ModelSupport.isSubClass(Result.class, OafEntity.class); - // then - assertTrue(result); - } - } + // then + assertTrue(result); + } + } } diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java index bb5b824f6..f91646f2c 100644 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java @@ -1,86 +1,88 @@ + package eu.dnetlib.dhp.schema.oaf; import static org.junit.jupiter.api.Assertions.*; import java.util.Arrays; import java.util.List; + import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; public class MergeTest { - OafEntity oaf; + OafEntity oaf; - @BeforeEach - public void setUp() { - oaf = new Publication(); - } + @BeforeEach + public void setUp() { + oaf = new Publication(); + } - @Test - public void mergeListsTest() { + @Test + public void mergeListsTest() { - // string list merge test - List a = Arrays.asList("a", "b", "c", "e"); - List b = Arrays.asList("a", "b", "c", "d"); - List c = null; + // string list merge test + List a = Arrays.asList("a", "b", "c", "e"); + List b = Arrays.asList("a", "b", "c", "d"); + List c = null; - System.out.println("merge result 1 = " + oaf.mergeLists(a, b)); + System.out.println("merge result 1 = " + oaf.mergeLists(a, b)); - System.out.println("merge result 2 = " + oaf.mergeLists(a, c)); + System.out.println("merge result 2 = " + oaf.mergeLists(a, c)); - System.out.println("merge result 3 = " + oaf.mergeLists(c, c)); - } + System.out.println("merge result 3 = " + oaf.mergeLists(c, c)); + } - @Test - public void mergePublicationCollectedFromTest() { + @Test + public void mergePublicationCollectedFromTest() { - Publication a = new Publication(); - Publication b = new Publication(); + Publication a = new Publication(); + Publication b = new Publication(); - a.setCollectedfrom(Arrays.asList(setKV("a", "open"), setKV("b", "closed"))); - b.setCollectedfrom(Arrays.asList(setKV("A", "open"), setKV("b", "Open"))); + 
a.setCollectedfrom(Arrays.asList(setKV("a", "open"), setKV("b", "closed"))); + b.setCollectedfrom(Arrays.asList(setKV("A", "open"), setKV("b", "Open"))); - a.mergeFrom(b); + a.mergeFrom(b); - assertNotNull(a.getCollectedfrom()); - assertEquals(3, a.getCollectedfrom().size()); - } + assertNotNull(a.getCollectedfrom()); + assertEquals(3, a.getCollectedfrom().size()); + } - @Test - public void mergePublicationSubjectTest() { + @Test + public void mergePublicationSubjectTest() { - Publication a = new Publication(); - Publication b = new Publication(); + Publication a = new Publication(); + Publication b = new Publication(); - a.setSubject(Arrays.asList(setSP("a", "open", "classe"), setSP("b", "open", "classe"))); - b.setSubject(Arrays.asList(setSP("A", "open", "classe"), setSP("c", "open", "classe"))); + a.setSubject(Arrays.asList(setSP("a", "open", "classe"), setSP("b", "open", "classe"))); + b.setSubject(Arrays.asList(setSP("A", "open", "classe"), setSP("c", "open", "classe"))); - a.mergeFrom(b); + a.mergeFrom(b); - assertNotNull(a.getSubject()); - assertEquals(3, a.getSubject().size()); - } + assertNotNull(a.getSubject()); + assertEquals(3, a.getSubject().size()); + } - private KeyValue setKV(final String key, final String value) { + private KeyValue setKV(final String key, final String value) { - KeyValue k = new KeyValue(); + KeyValue k = new KeyValue(); - k.setKey(key); - k.setValue(value); + k.setKey(key); + k.setValue(value); - return k; - } + return k; + } - private StructuredProperty setSP( - final String value, final String schema, final String classname) { - StructuredProperty s = new StructuredProperty(); - s.setValue(value); - Qualifier q = new Qualifier(); - q.setClassname(classname); - q.setClassid(classname); - q.setSchemename(schema); - q.setSchemeid(schema); - s.setQualifier(q); - return s; - } + private StructuredProperty setSP( + final String value, final String schema, final String classname) { + StructuredProperty s = new StructuredProperty(); + s.setValue(value); + Qualifier q = new Qualifier(); + q.setClassname(classname); + q.setClassid(classname); + q.setSchemename(schema); + q.setSchemeid(schema); + s.setQualifier(q); + return s; + } } diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java index 4f82cfe10..e4596fcdd 100644 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java @@ -1,76 +1,83 @@ + package eu.dnetlib.dhp.schema.scholexplorer; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; + +import org.junit.jupiter.api.Test; + import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; + import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; -import org.junit.jupiter.api.Test; public class DLItest { - @Test - public void testMergePublication() throws JsonProcessingException { - DLIPublication a1 = new DLIPublication(); - a1.setPid(Arrays.asList(createSP("123456", "pdb", "dnet:pid_types"))); - a1.setTitle(Collections.singletonList(createSP("Un Titolo", "title", "dnetTitle"))); - 
a1.setDlicollectedfrom(Arrays.asList(createCollectedFrom("znd", "Zenodo", "complete"))); - a1.setCompletionStatus("complete"); + @Test + public void testMergePublication() throws JsonProcessingException { + DLIPublication a1 = new DLIPublication(); + a1.setPid(Arrays.asList(createSP("123456", "pdb", "dnet:pid_types"))); + a1.setTitle(Collections.singletonList(createSP("Un Titolo", "title", "dnetTitle"))); + a1.setDlicollectedfrom(Arrays.asList(createCollectedFrom("znd", "Zenodo", "complete"))); + a1.setCompletionStatus("complete"); - DLIPublication a = new DLIPublication(); - a.setPid( - Arrays.asList( - createSP("10.11", "doi", "dnet:pid_types"), - createSP("123456", "pdb", "dnet:pid_types"))); - a.setTitle(Collections.singletonList(createSP("A Title", "title", "dnetTitle"))); - a.setDlicollectedfrom( - Arrays.asList( - createCollectedFrom("dct", "datacite", "complete"), - createCollectedFrom("dct", "datacite", "incomplete"))); - a.setCompletionStatus("incomplete"); + DLIPublication a = new DLIPublication(); + a + .setPid( + Arrays + .asList( + createSP("10.11", "doi", "dnet:pid_types"), + createSP("123456", "pdb", "dnet:pid_types"))); + a.setTitle(Collections.singletonList(createSP("A Title", "title", "dnetTitle"))); + a + .setDlicollectedfrom( + Arrays + .asList( + createCollectedFrom("dct", "datacite", "complete"), + createCollectedFrom("dct", "datacite", "incomplete"))); + a.setCompletionStatus("incomplete"); - a.mergeFrom(a1); + a.mergeFrom(a1); - ObjectMapper mapper = new ObjectMapper(); - System.out.println(mapper.writeValueAsString(a)); - } + ObjectMapper mapper = new ObjectMapper(); + System.out.println(mapper.writeValueAsString(a)); + } - @Test - public void testDeserialization() throws IOException { + @Test + public void testDeserialization() throws IOException { - final String json = - "{\"dataInfo\":{\"invisible\":false,\"inferred\":null,\"deletedbyinference\":false,\"trust\":\"0.9\",\"inferenceprovenance\":null,\"provenanceaction\":null},\"lastupdatetimestamp\":null,\"id\":\"60|bd9352547098929a394655ad1a44a479\",\"originalId\":[\"bd9352547098929a394655ad1a44a479\"],\"collectedfrom\":[{\"key\":\"dli_________::datacite\",\"value\":\"Datasets in Datacite\",\"dataInfo\":null,\"blank\":false}],\"pid\":[{\"value\":\"10.7925/DRS1.DUCHAS_5078760\",\"qualifier\":{\"classid\":\"doi\",\"classname\":\"doi\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\",\"blank\":false},\"dataInfo\":null}],\"dateofcollection\":\"2020-01-09T08:29:31.885Z\",\"dateoftransformation\":null,\"extraInfo\":null,\"oaiprovenance\":null,\"author\":[{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Ireland. Department of Arts, Culture, and the Gaeltacht\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"University College Dublin\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"National Folklore Foundation\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Cathail, S. 
Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null}],\"resulttype\":null,\"language\":null,\"country\":null,\"subject\":[{\"value\":\"Recreation\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Entertainments and recreational activities\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Siamsaíocht agus caitheamh aimsire\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null}],\"title\":[{\"value\":\"Games We Play\",\"qualifier\":null,\"dataInfo\":null}],\"relevantdate\":[{\"value\":\"1938-09-28\",\"qualifier\":{\"classid\":\"date\",\"classname\":\"date\",\"schemeid\":\"dnet::date\",\"schemename\":\"dnet::date\",\"blank\":false},\"dataInfo\":null}],\"description\":[{\"value\":\"Story collected by Breda Mc Donnell, a student at Tenure school (Tinure, Co. Louth) (no informant identified).\",\"dataInfo\":null}],\"dateofacceptance\":null,\"publisher\":{\"value\":\"University College Dublin\",\"dataInfo\":null},\"embargoenddate\":null,\"source\":null,\"fulltext\":null,\"format\":null,\"contributor\":null,\"resourcetype\":null,\"coverage\":null,\"refereed\":null,\"context\":null,\"processingchargeamount\":null,\"processingchargecurrency\":null,\"externalReference\":null,\"instance\":[],\"storagedate\":null,\"device\":null,\"size\":null,\"version\":null,\"lastmetadataupdate\":null,\"metadataversionnumber\":null,\"geolocation\":null,\"dlicollectedfrom\":[{\"id\":\"dli_________::datacite\",\"name\":\"Datasets in Datacite\",\"completionStatus\":\"complete\",\"collectionMode\":\"resolved\"}],\"completionStatus\":\"complete\"}"; + final String json = "{\"dataInfo\":{\"invisible\":false,\"inferred\":null,\"deletedbyinference\":false,\"trust\":\"0.9\",\"inferenceprovenance\":null,\"provenanceaction\":null},\"lastupdatetimestamp\":null,\"id\":\"60|bd9352547098929a394655ad1a44a479\",\"originalId\":[\"bd9352547098929a394655ad1a44a479\"],\"collectedfrom\":[{\"key\":\"dli_________::datacite\",\"value\":\"Datasets in Datacite\",\"dataInfo\":null,\"blank\":false}],\"pid\":[{\"value\":\"10.7925/DRS1.DUCHAS_5078760\",\"qualifier\":{\"classid\":\"doi\",\"classname\":\"doi\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\",\"blank\":false},\"dataInfo\":null}],\"dateofcollection\":\"2020-01-09T08:29:31.885Z\",\"dateoftransformation\":null,\"extraInfo\":null,\"oaiprovenance\":null,\"author\":[{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Ireland. Department of Arts, Culture, and the Gaeltacht\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"University College Dublin\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"National Folklore Foundation\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Cathail, S. 
Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null}],\"resulttype\":null,\"language\":null,\"country\":null,\"subject\":[{\"value\":\"Recreation\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Entertainments and recreational activities\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Siamsaíocht agus caitheamh aimsire\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null}],\"title\":[{\"value\":\"Games We Play\",\"qualifier\":null,\"dataInfo\":null}],\"relevantdate\":[{\"value\":\"1938-09-28\",\"qualifier\":{\"classid\":\"date\",\"classname\":\"date\",\"schemeid\":\"dnet::date\",\"schemename\":\"dnet::date\",\"blank\":false},\"dataInfo\":null}],\"description\":[{\"value\":\"Story collected by Breda Mc Donnell, a student at Tenure school (Tinure, Co. Louth) (no informant identified).\",\"dataInfo\":null}],\"dateofacceptance\":null,\"publisher\":{\"value\":\"University College Dublin\",\"dataInfo\":null},\"embargoenddate\":null,\"source\":null,\"fulltext\":null,\"format\":null,\"contributor\":null,\"resourcetype\":null,\"coverage\":null,\"refereed\":null,\"context\":null,\"processingchargeamount\":null,\"processingchargecurrency\":null,\"externalReference\":null,\"instance\":[],\"storagedate\":null,\"device\":null,\"size\":null,\"version\":null,\"lastmetadataupdate\":null,\"metadataversionnumber\":null,\"geolocation\":null,\"dlicollectedfrom\":[{\"id\":\"dli_________::datacite\",\"name\":\"Datasets in Datacite\",\"completionStatus\":\"complete\",\"collectionMode\":\"resolved\"}],\"completionStatus\":\"complete\"}"; - ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - DLIDataset dliDataset = mapper.readValue(json, DLIDataset.class); - mapper.enable(SerializationFeature.INDENT_OUTPUT); - System.out.println(mapper.writeValueAsString(dliDataset)); - } + ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + DLIDataset dliDataset = mapper.readValue(json, DLIDataset.class); + mapper.enable(SerializationFeature.INDENT_OUTPUT); + System.out.println(mapper.writeValueAsString(dliDataset)); + } - private ProvenaceInfo createCollectedFrom( - final String id, final String name, final String completionStatus) { - ProvenaceInfo p = new ProvenaceInfo(); - p.setId(id); - p.setName(name); - p.setCompletionStatus(completionStatus); - return p; - } + private ProvenaceInfo createCollectedFrom( + final String id, final String name, final String completionStatus) { + ProvenaceInfo p = new ProvenaceInfo(); + p.setId(id); + p.setName(name); + p.setCompletionStatus(completionStatus); + return p; + } - private StructuredProperty createSP( - final String value, final String className, final String schemeName) { - StructuredProperty p = new StructuredProperty(); - p.setValue(value); - Qualifier schema = new Qualifier(); - schema.setClassname(className); - schema.setClassid(className); - schema.setSchemename(schemeName); - schema.setSchemeid(schemeName); - 
p.setQualifier(schema); - return p; - } + private StructuredProperty createSP( + final String value, final String className, final String schemeName) { + StructuredProperty p = new StructuredProperty(); + p.setValue(value); + Qualifier schema = new Qualifier(); + schema.setClassname(className); + schema.setClassid(className); + schema.setSchemename(schemeName); + schema.setSchemeid(schemeName); + p.setQualifier(schema); + return p; + } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java index 42ca86f5f..091438195 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java @@ -1,8 +1,23 @@ + package eu.dnetlib.dhp.actionmanager; +import java.io.Serializable; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; +import java.util.NoSuchElementException; +import java.util.stream.Collectors; + +import org.dom4j.Document; +import org.dom4j.Element; +import org.dom4j.io.SAXReader; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; + import eu.dnetlib.actionmanager.rmi.ActionManagerException; import eu.dnetlib.actionmanager.set.ActionManagerSet; import eu.dnetlib.actionmanager.set.ActionManagerSet.ImpactTypes; @@ -10,130 +25,120 @@ import eu.dnetlib.dhp.actionmanager.partition.PartitionActionSetsByPayloadTypeJo import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import java.io.Serializable; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.List; -import java.util.NoSuchElementException; -import java.util.stream.Collectors; -import org.dom4j.Document; -import org.dom4j.Element; -import org.dom4j.io.SAXReader; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class ISClient implements Serializable { - private static final Logger log = - LoggerFactory.getLogger(PartitionActionSetsByPayloadTypeJob.class); + private static final Logger log = LoggerFactory.getLogger(PartitionActionSetsByPayloadTypeJob.class); - private static final String INPUT_ACTION_SET_ID_SEPARATOR = ","; + private static final String INPUT_ACTION_SET_ID_SEPARATOR = ","; - private ISLookUpService isLookup; + private ISLookUpService isLookup; - public ISClient(String isLookupUrl) { - isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl); - } + public ISClient(String isLookupUrl) { + isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl); + } - public List getLatestRawsetPaths(String setIds) { + public List getLatestRawsetPaths(String setIds) { - List ids = - Lists.newArrayList( - Splitter.on(INPUT_ACTION_SET_ID_SEPARATOR) - .omitEmptyStrings() - .trimResults() - .split(setIds)); + List ids = Lists + .newArrayList( + Splitter + .on(INPUT_ACTION_SET_ID_SEPARATOR) + .omitEmptyStrings() + .trimResults() + .split(setIds)); - return ids.stream() - .map(id -> getSet(isLookup, id)) - .map(as -> as.getPathToLatest()) - .collect(Collectors.toCollection(ArrayList::new)); - } + return ids + .stream() + .map(id -> getSet(isLookup, id)) + .map(as -> as.getPathToLatest()) + .collect(Collectors.toCollection(ArrayList::new)); + } - private 
ActionManagerSet getSet(ISLookUpService isLookup, final String setId) { + private ActionManagerSet getSet(ISLookUpService isLookup, final String setId) { - final String q = - "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') " - + "where $x//SET/@id = '" - + setId - + "' return $x"; + final String q = "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') " + + "where $x//SET/@id = '" + + setId + + "' return $x"; - try { - final String basePath = getBasePathHDFS(isLookup); - final String setProfile = isLookup.getResourceProfileByQuery(q); - return getActionManagerSet(basePath, setProfile); - } catch (ISLookUpException | ActionManagerException e) { - throw new RuntimeException("Error accessing Sets, using query: " + q); - } - } + try { + final String basePath = getBasePathHDFS(isLookup); + final String setProfile = isLookup.getResourceProfileByQuery(q); + return getActionManagerSet(basePath, setProfile); + } catch (ISLookUpException | ActionManagerException e) { + throw new RuntimeException("Error accessing Sets, using query: " + q); + } + } - private ActionManagerSet getActionManagerSet(final String basePath, final String profile) - throws ActionManagerException { - final SAXReader reader = new SAXReader(); - final ActionManagerSet set = new ActionManagerSet(); + private ActionManagerSet getActionManagerSet(final String basePath, final String profile) + throws ActionManagerException { + final SAXReader reader = new SAXReader(); + final ActionManagerSet set = new ActionManagerSet(); - try { - final Document doc = reader.read(new StringReader(profile)); + try { + final Document doc = reader.read(new StringReader(profile)); - set.setId(doc.valueOf("//SET/@id").trim()); - set.setName(doc.valueOf("//SET").trim()); - set.setImpact(ImpactTypes.valueOf(doc.valueOf("//IMPACT").trim())); - set.setLatest( - doc.valueOf("//RAW_SETS/LATEST/@id"), - doc.valueOf("//RAW_SETS/LATEST/@creationDate"), - doc.valueOf("//RAW_SETS/LATEST/@lastUpdate")); - set.setDirectory(doc.valueOf("//SET/@directory")); - final List expiredNodes = doc.selectNodes("//RAW_SETS/EXPIRED"); - if (expiredNodes != null) { - for (int i = 0; i < expiredNodes.size(); i++) { - Element ex = (Element) expiredNodes.get(i); - set.addExpired( - ex.attributeValue("id"), - ex.attributeValue("creationDate"), - ex.attributeValue("lastUpdate")); - } - } + set.setId(doc.valueOf("//SET/@id").trim()); + set.setName(doc.valueOf("//SET").trim()); + set.setImpact(ImpactTypes.valueOf(doc.valueOf("//IMPACT").trim())); + set + .setLatest( + doc.valueOf("//RAW_SETS/LATEST/@id"), + doc.valueOf("//RAW_SETS/LATEST/@creationDate"), + doc.valueOf("//RAW_SETS/LATEST/@lastUpdate")); + set.setDirectory(doc.valueOf("//SET/@directory")); + final List expiredNodes = doc.selectNodes("//RAW_SETS/EXPIRED"); + if (expiredNodes != null) { + for (int i = 0; i < expiredNodes.size(); i++) { + Element ex = (Element) expiredNodes.get(i); + set + .addExpired( + ex.attributeValue("id"), + ex.attributeValue("creationDate"), + ex.attributeValue("lastUpdate")); + } + } - final StringBuilder sb = new StringBuilder(); - sb.append(basePath); - sb.append("/"); - sb.append(doc.valueOf("//SET/@directory")); - sb.append("/"); - sb.append(doc.valueOf("//RAW_SETS/LATEST/@id")); - set.setPathToLatest(sb.toString()); + final StringBuilder sb = new StringBuilder(); + sb.append(basePath); + sb.append("/"); + sb.append(doc.valueOf("//SET/@directory")); + sb.append("/"); + 
sb.append(doc.valueOf("//RAW_SETS/LATEST/@id")); + set.setPathToLatest(sb.toString()); - return set; - } catch (Exception e) { - throw new ActionManagerException("Error creating set from profile: " + profile, e); - } - } + return set; + } catch (Exception e) { + throw new ActionManagerException("Error creating set from profile: " + profile, e); + } + } - private String getBasePathHDFS(ISLookUpService isLookup) throws ActionManagerException { - return queryServiceProperty(isLookup, "basePath"); - } + private String getBasePathHDFS(ISLookUpService isLookup) throws ActionManagerException { + return queryServiceProperty(isLookup, "basePath"); + } - private String queryServiceProperty(ISLookUpService isLookup, final String propertyName) - throws ActionManagerException { - final String q = - "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='" - + propertyName - + "']/@value/string()"; - log.debug("quering for service property: " + q); - try { - final List value = isLookup.quickSearchProfile(q); - return Iterables.getOnlyElement(value); - } catch (ISLookUpException e) { - String msg = "Error accessing service profile, using query: " + q; - log.error(msg, e); - throw new ActionManagerException(msg, e); - } catch (NoSuchElementException e) { - String msg = "missing service property: " + propertyName; - log.error(msg, e); - throw new ActionManagerException(msg, e); - } catch (IllegalArgumentException e) { - String msg = "found more than one service property: " + propertyName; - log.error(msg, e); - throw new ActionManagerException(msg, e); - } - } + private String queryServiceProperty(ISLookUpService isLookup, final String propertyName) + throws ActionManagerException { + final String q = "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='" + + propertyName + + "']/@value/string()"; + log.debug("quering for service property: " + q); + try { + final List value = isLookup.quickSearchProfile(q); + return Iterables.getOnlyElement(value); + } catch (ISLookUpException e) { + String msg = "Error accessing service profile, using query: " + q; + log.error(msg, e); + throw new ActionManagerException(msg, e); + } catch (NoSuchElementException e) { + String msg = "missing service property: " + propertyName; + log.error(msg, e); + throw new ActionManagerException(msg, e); + } catch (IllegalArgumentException e) { + String msg = "found more than one service property: " + propertyName; + log.error(msg, e); + throw new ActionManagerException(msg, e); + } + } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/LicenseComparator.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/LicenseComparator.java index ae498c411..7b6046f8b 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/LicenseComparator.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/LicenseComparator.java @@ -1,47 +1,69 @@ + package eu.dnetlib.dhp.actionmanager.migration; -import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; import java.util.Comparator; +import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; + public class LicenseComparator implements Comparator { - @Override - public int compare(Qualifier left, Qualifier right) { + @Override + public int compare(Qualifier left, Qualifier right) { - if 
(left == null && right == null) return 0; - if (left == null) return 1; - if (right == null) return -1; + if (left == null && right == null) + return 0; + if (left == null) + return 1; + if (right == null) + return -1; - String lClass = left.getClassid(); - String rClass = right.getClassid(); + String lClass = left.getClassid(); + String rClass = right.getClassid(); - if (lClass.equals(rClass)) return 0; + if (lClass.equals(rClass)) + return 0; - if (lClass.equals("OPEN SOURCE")) return -1; - if (rClass.equals("OPEN SOURCE")) return 1; + if (lClass.equals("OPEN SOURCE")) + return -1; + if (rClass.equals("OPEN SOURCE")) + return 1; - if (lClass.equals("OPEN")) return -1; - if (rClass.equals("OPEN")) return 1; + if (lClass.equals("OPEN")) + return -1; + if (rClass.equals("OPEN")) + return 1; - if (lClass.equals("6MONTHS")) return -1; - if (rClass.equals("6MONTHS")) return 1; + if (lClass.equals("6MONTHS")) + return -1; + if (rClass.equals("6MONTHS")) + return 1; - if (lClass.equals("12MONTHS")) return -1; - if (rClass.equals("12MONTHS")) return 1; + if (lClass.equals("12MONTHS")) + return -1; + if (rClass.equals("12MONTHS")) + return 1; - if (lClass.equals("EMBARGO")) return -1; - if (rClass.equals("EMBARGO")) return 1; + if (lClass.equals("EMBARGO")) + return -1; + if (rClass.equals("EMBARGO")) + return 1; - if (lClass.equals("RESTRICTED")) return -1; - if (rClass.equals("RESTRICTED")) return 1; + if (lClass.equals("RESTRICTED")) + return -1; + if (rClass.equals("RESTRICTED")) + return 1; - if (lClass.equals("CLOSED")) return -1; - if (rClass.equals("CLOSED")) return 1; + if (lClass.equals("CLOSED")) + return -1; + if (rClass.equals("CLOSED")) + return 1; - if (lClass.equals("UNKNOWN")) return -1; - if (rClass.equals("UNKNOWN")) return 1; + if (lClass.equals("UNKNOWN")) + return -1; + if (rClass.equals("UNKNOWN")) + return 1; - // Else (but unlikely), lexicographical ordering will do. - return lClass.compareTo(rClass); - } + // Else (but unlikely), lexicographical ordering will do. 
+ return lClass.compareTo(rClass); + } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/MigrateActionSet.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/MigrateActionSet.java index 43ad7c5e3..89cb63fab 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/MigrateActionSet.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/MigrateActionSet.java @@ -1,11 +1,6 @@ + package eu.dnetlib.dhp.actionmanager.migration; -import com.google.common.base.Splitter; -import com.google.common.collect.Lists; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import java.io.File; import java.io.FileOutputStream; import java.io.OutputStream; @@ -14,6 +9,7 @@ import java.util.LinkedList; import java.util.List; import java.util.Properties; import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; @@ -25,164 +21,174 @@ import org.apache.hadoop.util.ToolRunner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + public class MigrateActionSet { - private static final Logger log = LoggerFactory.getLogger(MigrateActionSet.class); + private static final Logger log = LoggerFactory.getLogger(MigrateActionSet.class); - private static final String SEPARATOR = "/"; - private static final String TARGET_PATHS = "target_paths"; - private static final String RAWSET_PREFIX = "rawset_"; + private static final String SEPARATOR = "/"; + private static final String TARGET_PATHS = "target_paths"; + private static final String RAWSET_PREFIX = "rawset_"; - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - MigrateActionSet.class.getResourceAsStream( - "/eu/dnetlib/dhp/actionmanager/migration/migrate_actionsets_parameters.json"))); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + MigrateActionSet.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/migration/migrate_actionsets_parameters.json"))); + parser.parseArgument(args); - new MigrateActionSet().run(parser); - } + new MigrateActionSet().run(parser); + } - private void run(ArgumentApplicationParser parser) throws Exception { + private void run(ArgumentApplicationParser parser) throws Exception { - final String isLookupUrl = parser.get("isLookupUrl"); - final String sourceNN = parser.get("sourceNameNode"); - final String targetNN = parser.get("targetNameNode"); - final String workDir = parser.get("workingDirectory"); - final Integer distcp_num_maps = Integer.parseInt(parser.get("distcp_num_maps")); + final String isLookupUrl = parser.get("isLookupUrl"); + final String sourceNN = parser.get("sourceNameNode"); + final String targetNN = 
parser.get("targetNameNode"); + final String workDir = parser.get("workingDirectory"); + final Integer distcp_num_maps = Integer.parseInt(parser.get("distcp_num_maps")); - final String distcp_memory_mb = parser.get("distcp_memory_mb"); - final String distcp_task_timeout = parser.get("distcp_task_timeout"); + final String distcp_memory_mb = parser.get("distcp_memory_mb"); + final String distcp_task_timeout = parser.get("distcp_task_timeout"); - final String transform_only_s = parser.get("transform_only"); + final String transform_only_s = parser.get("transform_only"); - log.info("transform only param: {}", transform_only_s); + log.info("transform only param: {}", transform_only_s); - final Boolean transformOnly = Boolean.valueOf(parser.get("transform_only")); + final Boolean transformOnly = Boolean.valueOf(parser.get("transform_only")); - log.info("transform only: {}", transformOnly); + log.info("transform only: {}", transformOnly); - ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); + ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); - Configuration conf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps); - FileSystem targetFS = FileSystem.get(conf); + Configuration conf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps); + FileSystem targetFS = FileSystem.get(conf); - Configuration sourceConf = - getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps); - sourceConf.set(FileSystem.FS_DEFAULT_NAME_KEY, sourceNN); - FileSystem sourceFS = FileSystem.get(sourceConf); + Configuration sourceConf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps); + sourceConf.set(FileSystem.FS_DEFAULT_NAME_KEY, sourceNN); + FileSystem sourceFS = FileSystem.get(sourceConf); - Properties props = new Properties(); + Properties props = new Properties(); - List targetPaths = new ArrayList<>(); + List targetPaths = new ArrayList<>(); - final List sourcePaths = getSourcePaths(sourceNN, isLookUp); - log.info( - "paths to process:\n{}", - sourcePaths.stream().map(p -> p.toString()).collect(Collectors.joining("\n"))); - for (Path source : sourcePaths) { + final List sourcePaths = getSourcePaths(sourceNN, isLookUp); + log + .info( + "paths to process:\n{}", + sourcePaths.stream().map(p -> p.toString()).collect(Collectors.joining("\n"))); + for (Path source : sourcePaths) { - if (!sourceFS.exists(source)) { - log.warn("skipping unexisting path: {}", source); - } else { + if (!sourceFS.exists(source)) { + log.warn("skipping unexisting path: {}", source); + } else { - LinkedList pathQ = - Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath())); + LinkedList pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath())); - final String rawSet = pathQ.pollLast(); - log.info("got RAWSET: {}", rawSet); + final String rawSet = pathQ.pollLast(); + log.info("got RAWSET: {}", rawSet); - if (StringUtils.isNotBlank(rawSet) && rawSet.startsWith(RAWSET_PREFIX)) { + if (StringUtils.isNotBlank(rawSet) && rawSet.startsWith(RAWSET_PREFIX)) { - final String actionSetDirectory = pathQ.pollLast(); + final String actionSetDirectory = pathQ.pollLast(); - final Path targetPath = - new Path(targetNN + workDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawSet); + final Path targetPath = new Path( + targetNN + workDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawSet); - log.info("using TARGET PATH: {}", targetPath); + log.info("using TARGET PATH: {}", 
targetPath); - if (!transformOnly) { - if (targetFS.exists(targetPath)) { - targetFS.delete(targetPath, true); - } - runDistcp( - distcp_num_maps, distcp_memory_mb, distcp_task_timeout, conf, source, targetPath); - } + if (!transformOnly) { + if (targetFS.exists(targetPath)) { + targetFS.delete(targetPath, true); + } + runDistcp( + distcp_num_maps, distcp_memory_mb, distcp_task_timeout, conf, source, targetPath); + } - targetPaths.add(targetPath); - } - } - } + targetPaths.add(targetPath); + } + } + } - props.setProperty( - TARGET_PATHS, targetPaths.stream().map(p -> p.toString()).collect(Collectors.joining(","))); - File file = new File(System.getProperty("oozie.action.output.properties")); + props + .setProperty( + TARGET_PATHS, targetPaths.stream().map(p -> p.toString()).collect(Collectors.joining(","))); + File file = new File(System.getProperty("oozie.action.output.properties")); - try (OutputStream os = new FileOutputStream(file)) { - props.store(os, ""); - } - System.out.println(file.getAbsolutePath()); - } + try (OutputStream os = new FileOutputStream(file)) { + props.store(os, ""); + } + System.out.println(file.getAbsolutePath()); + } - private void runDistcp( - Integer distcp_num_maps, - String distcp_memory_mb, - String distcp_task_timeout, - Configuration conf, - Path source, - Path targetPath) - throws Exception { + private void runDistcp( + Integer distcp_num_maps, + String distcp_memory_mb, + String distcp_task_timeout, + Configuration conf, + Path source, + Path targetPath) + throws Exception { - final DistCpOptions op = new DistCpOptions(source, targetPath); - op.setMaxMaps(distcp_num_maps); - op.preserve(DistCpOptions.FileAttribute.BLOCKSIZE); - op.preserve(DistCpOptions.FileAttribute.REPLICATION); - op.preserve(DistCpOptions.FileAttribute.CHECKSUMTYPE); + final DistCpOptions op = new DistCpOptions(source, targetPath); + op.setMaxMaps(distcp_num_maps); + op.preserve(DistCpOptions.FileAttribute.BLOCKSIZE); + op.preserve(DistCpOptions.FileAttribute.REPLICATION); + op.preserve(DistCpOptions.FileAttribute.CHECKSUMTYPE); - int res = - ToolRunner.run( - new DistCp(conf, op), - new String[] { - "-Dmapred.task.timeout=" + distcp_task_timeout, - "-Dmapreduce.map.memory.mb=" + distcp_memory_mb, - "-pb", - "-m " + distcp_num_maps, - source.toString(), - targetPath.toString() - }); + int res = ToolRunner + .run( + new DistCp(conf, op), + new String[] { + "-Dmapred.task.timeout=" + distcp_task_timeout, + "-Dmapreduce.map.memory.mb=" + distcp_memory_mb, + "-pb", + "-m " + distcp_num_maps, + source.toString(), + targetPath.toString() + }); - if (res != 0) { - throw new RuntimeException(String.format("distcp exited with code %s", res)); - } - } + if (res != 0) { + throw new RuntimeException(String.format("distcp exited with code %s", res)); + } + } - private Configuration getConfiguration( - String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) { - final Configuration conf = new Configuration(); - conf.set("dfs.webhdfs.socket.connect-timeout", distcp_task_timeout); - conf.set("dfs.webhdfs.socket.read-timeout", distcp_task_timeout); - conf.set("dfs.http.client.retry.policy.enabled", "true"); - conf.set("mapred.task.timeout", distcp_task_timeout); - conf.set("mapreduce.map.memory.mb", distcp_memory_mb); - conf.set("mapred.map.tasks", String.valueOf(distcp_num_maps)); - return conf; - } + private Configuration getConfiguration( + String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) { + final Configuration conf = new Configuration(); + 
conf.set("dfs.webhdfs.socket.connect-timeout", distcp_task_timeout); + conf.set("dfs.webhdfs.socket.read-timeout", distcp_task_timeout); + conf.set("dfs.http.client.retry.policy.enabled", "true"); + conf.set("mapred.task.timeout", distcp_task_timeout); + conf.set("mapreduce.map.memory.mb", distcp_memory_mb); + conf.set("mapred.map.tasks", String.valueOf(distcp_num_maps)); + return conf; + } - private List getSourcePaths(String sourceNN, ISLookUpService isLookUp) - throws ISLookUpException { - String XQUERY = - "distinct-values(\n" - + "let $basePath := collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n" - + "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n" - + "let $setDir := $x//SET/@directory/string()\n" - + "let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n" - + "return concat($basePath, '/', $setDir, '/', $rawSet))"; + private List getSourcePaths(String sourceNN, ISLookUpService isLookUp) + throws ISLookUpException { + String XQUERY = "distinct-values(\n" + + "let $basePath := collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n" + + "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n" + + "let $setDir := $x//SET/@directory/string()\n" + + "let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n" + + "return concat($basePath, '/', $setDir, '/', $rawSet))"; - log.info(String.format("running xquery:\n%s", XQUERY)); - return isLookUp.quickSearchProfile(XQUERY).stream() - .map(p -> sourceNN + p) - .map(Path::new) - .collect(Collectors.toList()); - } + log.info(String.format("running xquery:\n%s", XQUERY)); + return isLookUp + .quickSearchProfile(XQUERY) + .stream() + .map(p -> sourceNN + p) + .map(Path::new) + .collect(Collectors.toList()); + } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java index 894804e25..456113c43 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.actionmanager.migration; import static eu.dnetlib.data.proto.KindProtos.Kind.entity; @@ -5,569 +6,659 @@ import static eu.dnetlib.data.proto.KindProtos.Kind.relation; import static eu.dnetlib.data.proto.TypeProtos.*; import static eu.dnetlib.data.proto.TypeProtos.Type.*; -import com.google.common.collect.Lists; -import com.googlecode.protobuf.format.JsonFormat; -import eu.dnetlib.data.proto.*; -import eu.dnetlib.dhp.schema.oaf.*; import java.io.Serializable; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; + import org.apache.commons.lang3.StringUtils; +import com.google.common.collect.Lists; +import com.googlecode.protobuf.format.JsonFormat; + +import eu.dnetlib.data.proto.*; +import eu.dnetlib.dhp.schema.oaf.*; + public class ProtoConverter implements Serializable { - public static final String UNKNOWN = "UNKNOWN"; - public static final String NOT_AVAILABLE = "not available"; - public static final String DNET_ACCESS_MODES = "dnet:access_modes"; + public static final String UNKNOWN = "UNKNOWN"; + public static final String NOT_AVAILABLE 
= "not available"; + public static final String DNET_ACCESS_MODES = "dnet:access_modes"; - public static Oaf convert(OafProtos.Oaf oaf) { - try { - switch (oaf.getKind()) { - case entity: - return convertEntity(oaf); - case relation: - return convertRelation(oaf); - default: - throw new IllegalArgumentException("invalid kind " + oaf.getKind()); - } - } catch (Throwable e) { - throw new RuntimeException("error on getting " + JsonFormat.printToString(oaf), e); - } - } + public static Oaf convert(OafProtos.Oaf oaf) { + try { + switch (oaf.getKind()) { + case entity: + return convertEntity(oaf); + case relation: + return convertRelation(oaf); + default: + throw new IllegalArgumentException("invalid kind " + oaf.getKind()); + } + } catch (Throwable e) { + throw new RuntimeException("error on getting " + JsonFormat.printToString(oaf), e); + } + } - private static Relation convertRelation(OafProtos.Oaf oaf) { - final OafProtos.OafRel r = oaf.getRel(); - final Relation rel = new Relation(); - rel.setDataInfo(mapDataInfo(oaf.getDataInfo())); - rel.setLastupdatetimestamp(oaf.getLastupdatetimestamp()); - rel.setSource(r.getSource()); - rel.setTarget(r.getTarget()); - rel.setRelType(r.getRelType().toString()); - rel.setSubRelType(r.getSubRelType().toString()); - rel.setRelClass(r.getRelClass()); - rel.setCollectedfrom( - r.getCollectedfromCount() > 0 - ? r.getCollectedfromList().stream().map(kv -> mapKV(kv)).collect(Collectors.toList()) - : null); - return rel; - } + private static Relation convertRelation(OafProtos.Oaf oaf) { + final OafProtos.OafRel r = oaf.getRel(); + final Relation rel = new Relation(); + rel.setDataInfo(mapDataInfo(oaf.getDataInfo())); + rel.setLastupdatetimestamp(oaf.getLastupdatetimestamp()); + rel.setSource(r.getSource()); + rel.setTarget(r.getTarget()); + rel.setRelType(r.getRelType().toString()); + rel.setSubRelType(r.getSubRelType().toString()); + rel.setRelClass(r.getRelClass()); + rel + .setCollectedfrom( + r.getCollectedfromCount() > 0 + ? 
r.getCollectedfromList().stream().map(kv -> mapKV(kv)).collect(Collectors.toList()) + : null); + return rel; + } - private static OafEntity convertEntity(OafProtos.Oaf oaf) { + private static OafEntity convertEntity(OafProtos.Oaf oaf) { - switch (oaf.getEntity().getType()) { - case result: - final Result r = convertResult(oaf); - r.setInstance(convertInstances(oaf)); - return r; - case project: - return convertProject(oaf); - case datasource: - return convertDataSource(oaf); - case organization: - return convertOrganization(oaf); - default: - throw new RuntimeException("received unknown type"); - } - } + switch (oaf.getEntity().getType()) { + case result: + final Result r = convertResult(oaf); + r.setInstance(convertInstances(oaf)); + return r; + case project: + return convertProject(oaf); + case datasource: + return convertDataSource(oaf); + case organization: + return convertOrganization(oaf); + default: + throw new RuntimeException("received unknown type"); + } + } - private static List convertInstances(OafProtos.Oaf oaf) { + private static List convertInstances(OafProtos.Oaf oaf) { - final ResultProtos.Result r = oaf.getEntity().getResult(); - if (r.getInstanceCount() > 0) { - return r.getInstanceList().stream().map(i -> convertInstance(i)).collect(Collectors.toList()); - } - return Lists.newArrayList(); - } + final ResultProtos.Result r = oaf.getEntity().getResult(); + if (r.getInstanceCount() > 0) { + return r.getInstanceList().stream().map(i -> convertInstance(i)).collect(Collectors.toList()); + } + return Lists.newArrayList(); + } - private static Instance convertInstance(ResultProtos.Result.Instance ri) { - final Instance i = new Instance(); - i.setAccessright(mapQualifier(ri.getAccessright())); - i.setCollectedfrom(mapKV(ri.getCollectedfrom())); - i.setDateofacceptance(mapStringField(ri.getDateofacceptance())); - i.setDistributionlocation(ri.getDistributionlocation()); - i.setHostedby(mapKV(ri.getHostedby())); - i.setInstancetype(mapQualifier(ri.getInstancetype())); - i.setLicense(mapStringField(ri.getLicense())); - i.setUrl(ri.getUrlList()); - i.setRefereed(mapStringField(ri.getRefereed())); - i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount())); - i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency())); - return i; - } + private static Instance convertInstance(ResultProtos.Result.Instance ri) { + final Instance i = new Instance(); + i.setAccessright(mapQualifier(ri.getAccessright())); + i.setCollectedfrom(mapKV(ri.getCollectedfrom())); + i.setDateofacceptance(mapStringField(ri.getDateofacceptance())); + i.setDistributionlocation(ri.getDistributionlocation()); + i.setHostedby(mapKV(ri.getHostedby())); + i.setInstancetype(mapQualifier(ri.getInstancetype())); + i.setLicense(mapStringField(ri.getLicense())); + i.setUrl(ri.getUrlList()); + i.setRefereed(mapStringField(ri.getRefereed())); + i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount())); + i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency())); + return i; + } - private static Organization convertOrganization(OafProtos.Oaf oaf) { - final OrganizationProtos.Organization.Metadata m = - oaf.getEntity().getOrganization().getMetadata(); - final Organization org = setOaf(new Organization(), oaf); - setEntity(org, oaf); - org.setLegalshortname(mapStringField(m.getLegalshortname())); - org.setLegalname(mapStringField(m.getLegalname())); - org.setAlternativeNames( - m.getAlternativeNamesList().stream() - .map(ProtoConverter::mapStringField) - 
.collect(Collectors.toList())); - org.setWebsiteurl(mapStringField(m.getWebsiteurl())); - org.setLogourl(mapStringField(m.getLogourl())); - org.setEclegalbody(mapStringField(m.getEclegalbody())); - org.setEclegalperson(mapStringField(m.getEclegalperson())); - org.setEcnonprofit(mapStringField(m.getEcnonprofit())); - org.setEcresearchorganization(mapStringField(m.getEcresearchorganization())); - org.setEchighereducation(mapStringField(m.getEchighereducation())); - org.setEcinternationalorganizationeurinterests( - mapStringField(m.getEcinternationalorganizationeurinterests())); - org.setEcinternationalorganization(mapStringField(m.getEcinternationalorganization())); - org.setEcenterprise(mapStringField(m.getEcenterprise())); - org.setEcsmevalidated(mapStringField(m.getEcsmevalidated())); - org.setEcnutscode(mapStringField(m.getEcnutscode())); - org.setCountry(mapQualifier(m.getCountry())); + private static Organization convertOrganization(OafProtos.Oaf oaf) { + final OrganizationProtos.Organization.Metadata m = oaf.getEntity().getOrganization().getMetadata(); + final Organization org = setOaf(new Organization(), oaf); + setEntity(org, oaf); + org.setLegalshortname(mapStringField(m.getLegalshortname())); + org.setLegalname(mapStringField(m.getLegalname())); + org + .setAlternativeNames( + m + .getAlternativeNamesList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + org.setWebsiteurl(mapStringField(m.getWebsiteurl())); + org.setLogourl(mapStringField(m.getLogourl())); + org.setEclegalbody(mapStringField(m.getEclegalbody())); + org.setEclegalperson(mapStringField(m.getEclegalperson())); + org.setEcnonprofit(mapStringField(m.getEcnonprofit())); + org.setEcresearchorganization(mapStringField(m.getEcresearchorganization())); + org.setEchighereducation(mapStringField(m.getEchighereducation())); + org + .setEcinternationalorganizationeurinterests( + mapStringField(m.getEcinternationalorganizationeurinterests())); + org.setEcinternationalorganization(mapStringField(m.getEcinternationalorganization())); + org.setEcenterprise(mapStringField(m.getEcenterprise())); + org.setEcsmevalidated(mapStringField(m.getEcsmevalidated())); + org.setEcnutscode(mapStringField(m.getEcnutscode())); + org.setCountry(mapQualifier(m.getCountry())); - return org; - } + return org; + } - private static Datasource convertDataSource(OafProtos.Oaf oaf) { - final DatasourceProtos.Datasource.Metadata m = oaf.getEntity().getDatasource().getMetadata(); - final Datasource datasource = setOaf(new Datasource(), oaf); - setEntity(datasource, oaf); - datasource.setAccessinfopackage( - m.getAccessinfopackageList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - datasource.setCertificates(mapStringField(m.getCertificates())); - datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl())); - datasource.setContactemail(mapStringField(m.getContactemail())); - datasource.setDatabaseaccessrestriction(mapStringField(m.getDatabaseaccessrestriction())); - datasource.setDatabaseaccesstype(mapStringField(m.getDatabaseaccesstype())); - datasource.setDataprovider(mapBoolField(m.getDataprovider())); - datasource.setDatasourcetype(mapQualifier(m.getDatasourcetype())); - datasource.setDatauploadrestriction(mapStringField(m.getDatauploadrestriction())); - datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl())); - datasource.setDatauploadtype(mapStringField(m.getDatauploadtype())); - 
datasource.setDateofvalidation(mapStringField(m.getDateofvalidation())); - datasource.setDescription(mapStringField(m.getDescription())); - datasource.setEnglishname(mapStringField(m.getEnglishname())); - datasource.setLatitude(mapStringField(m.getLatitude())); - datasource.setLongitude(mapStringField(m.getLongitude())); - datasource.setLogourl(mapStringField(m.getLogourl())); - datasource.setMissionstatementurl(mapStringField(m.getMissionstatementurl())); - datasource.setNamespaceprefix(mapStringField(m.getNamespaceprefix())); - datasource.setOdcontenttypes( - m.getOdcontenttypesList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - datasource.setOdlanguages( - m.getOdlanguagesList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - datasource.setOdnumberofitems(mapStringField(m.getOdnumberofitems())); - datasource.setOdnumberofitemsdate(mapStringField(m.getOdnumberofitemsdate())); - datasource.setOdpolicies(mapStringField(m.getOdpolicies())); - datasource.setOfficialname(mapStringField(m.getOfficialname())); - datasource.setOpenairecompatibility(mapQualifier(m.getOpenairecompatibility())); - datasource.setPidsystems(mapStringField(m.getPidsystems())); - datasource.setPolicies( - m.getPoliciesList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList())); - datasource.setQualitymanagementkind(mapStringField(m.getQualitymanagementkind())); - datasource.setReleaseenddate(mapStringField(m.getReleaseenddate())); - datasource.setServiceprovider(mapBoolField(m.getServiceprovider())); - datasource.setReleasestartdate(mapStringField(m.getReleasestartdate())); - datasource.setSubjects( - m.getSubjectsList().stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - datasource.setVersioning(mapBoolField(m.getVersioning())); - datasource.setWebsiteurl(mapStringField(m.getWebsiteurl())); - datasource.setJournal(mapJournal(m.getJournal())); + private static Datasource convertDataSource(OafProtos.Oaf oaf) { + final DatasourceProtos.Datasource.Metadata m = oaf.getEntity().getDatasource().getMetadata(); + final Datasource datasource = setOaf(new Datasource(), oaf); + setEntity(datasource, oaf); + datasource + .setAccessinfopackage( + m + .getAccessinfopackageList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + datasource.setCertificates(mapStringField(m.getCertificates())); + datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl())); + datasource.setContactemail(mapStringField(m.getContactemail())); + datasource.setDatabaseaccessrestriction(mapStringField(m.getDatabaseaccessrestriction())); + datasource.setDatabaseaccesstype(mapStringField(m.getDatabaseaccesstype())); + datasource.setDataprovider(mapBoolField(m.getDataprovider())); + datasource.setDatasourcetype(mapQualifier(m.getDatasourcetype())); + datasource.setDatauploadrestriction(mapStringField(m.getDatauploadrestriction())); + datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl())); + datasource.setDatauploadtype(mapStringField(m.getDatauploadtype())); + datasource.setDateofvalidation(mapStringField(m.getDateofvalidation())); + datasource.setDescription(mapStringField(m.getDescription())); + datasource.setEnglishname(mapStringField(m.getEnglishname())); + datasource.setLatitude(mapStringField(m.getLatitude())); + datasource.setLongitude(mapStringField(m.getLongitude())); + datasource.setLogourl(mapStringField(m.getLogourl())); + 
datasource.setMissionstatementurl(mapStringField(m.getMissionstatementurl())); + datasource.setNamespaceprefix(mapStringField(m.getNamespaceprefix())); + datasource + .setOdcontenttypes( + m + .getOdcontenttypesList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + datasource + .setOdlanguages( + m + .getOdlanguagesList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + datasource.setOdnumberofitems(mapStringField(m.getOdnumberofitems())); + datasource.setOdnumberofitemsdate(mapStringField(m.getOdnumberofitemsdate())); + datasource.setOdpolicies(mapStringField(m.getOdpolicies())); + datasource.setOfficialname(mapStringField(m.getOfficialname())); + datasource.setOpenairecompatibility(mapQualifier(m.getOpenairecompatibility())); + datasource.setPidsystems(mapStringField(m.getPidsystems())); + datasource + .setPolicies( + m.getPoliciesList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList())); + datasource.setQualitymanagementkind(mapStringField(m.getQualitymanagementkind())); + datasource.setReleaseenddate(mapStringField(m.getReleaseenddate())); + datasource.setServiceprovider(mapBoolField(m.getServiceprovider())); + datasource.setReleasestartdate(mapStringField(m.getReleasestartdate())); + datasource + .setSubjects( + m + .getSubjectsList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + datasource.setVersioning(mapBoolField(m.getVersioning())); + datasource.setWebsiteurl(mapStringField(m.getWebsiteurl())); + datasource.setJournal(mapJournal(m.getJournal())); - return datasource; - } + return datasource; + } - private static Project convertProject(OafProtos.Oaf oaf) { - final ProjectProtos.Project.Metadata m = oaf.getEntity().getProject().getMetadata(); - final Project project = setOaf(new Project(), oaf); - setEntity(project, oaf); - project.setAcronym(mapStringField(m.getAcronym())); - project.setCallidentifier(mapStringField(m.getCallidentifier())); - project.setCode(mapStringField(m.getCode())); - project.setContactemail(mapStringField(m.getContactemail())); - project.setContactfax(mapStringField(m.getContactfax())); - project.setContactfullname(mapStringField(m.getContactfullname())); - project.setContactphone(mapStringField(m.getContactphone())); - project.setContracttype(mapQualifier(m.getContracttype())); - project.setCurrency(mapStringField(m.getCurrency())); - project.setDuration(mapStringField(m.getDuration())); - project.setEcarticle29_3(mapStringField(m.getEcarticle293())); - project.setEcsc39(mapStringField(m.getEcsc39())); - project.setOamandatepublications(mapStringField(m.getOamandatepublications())); - project.setStartdate(mapStringField(m.getStartdate())); - project.setEnddate(mapStringField(m.getEnddate())); - project.setFundedamount(m.getFundedamount()); - project.setTotalcost(m.getTotalcost()); - project.setKeywords(mapStringField(m.getKeywords())); - project.setSubjects( - m.getSubjectsList().stream() - .map(sp -> mapStructuredProperty(sp)) - .collect(Collectors.toList())); - project.setTitle(mapStringField(m.getTitle())); - project.setWebsiteurl(mapStringField(m.getWebsiteurl())); - project.setFundingtree( - m.getFundingtreeList().stream().map(f -> mapStringField(f)).collect(Collectors.toList())); - project.setJsonextrainfo(mapStringField(m.getJsonextrainfo())); - project.setSummary(mapStringField(m.getSummary())); - project.setOptional1(mapStringField(m.getOptional1())); - project.setOptional2(mapStringField(m.getOptional2())); - 
return project; - } + private static Project convertProject(OafProtos.Oaf oaf) { + final ProjectProtos.Project.Metadata m = oaf.getEntity().getProject().getMetadata(); + final Project project = setOaf(new Project(), oaf); + setEntity(project, oaf); + project.setAcronym(mapStringField(m.getAcronym())); + project.setCallidentifier(mapStringField(m.getCallidentifier())); + project.setCode(mapStringField(m.getCode())); + project.setContactemail(mapStringField(m.getContactemail())); + project.setContactfax(mapStringField(m.getContactfax())); + project.setContactfullname(mapStringField(m.getContactfullname())); + project.setContactphone(mapStringField(m.getContactphone())); + project.setContracttype(mapQualifier(m.getContracttype())); + project.setCurrency(mapStringField(m.getCurrency())); + project.setDuration(mapStringField(m.getDuration())); + project.setEcarticle29_3(mapStringField(m.getEcarticle293())); + project.setEcsc39(mapStringField(m.getEcsc39())); + project.setOamandatepublications(mapStringField(m.getOamandatepublications())); + project.setStartdate(mapStringField(m.getStartdate())); + project.setEnddate(mapStringField(m.getEnddate())); + project.setFundedamount(m.getFundedamount()); + project.setTotalcost(m.getTotalcost()); + project.setKeywords(mapStringField(m.getKeywords())); + project + .setSubjects( + m + .getSubjectsList() + .stream() + .map(sp -> mapStructuredProperty(sp)) + .collect(Collectors.toList())); + project.setTitle(mapStringField(m.getTitle())); + project.setWebsiteurl(mapStringField(m.getWebsiteurl())); + project + .setFundingtree( + m.getFundingtreeList().stream().map(f -> mapStringField(f)).collect(Collectors.toList())); + project.setJsonextrainfo(mapStringField(m.getJsonextrainfo())); + project.setSummary(mapStringField(m.getSummary())); + project.setOptional1(mapStringField(m.getOptional1())); + project.setOptional2(mapStringField(m.getOptional2())); + return project; + } - private static Result convertResult(OafProtos.Oaf oaf) { - switch (oaf.getEntity().getResult().getMetadata().getResulttype().getClassid()) { - case "dataset": - return createDataset(oaf); - case "publication": - return createPublication(oaf); - case "software": - return createSoftware(oaf); - case "other": - return createORP(oaf); - default: - Result result = setOaf(new Result(), oaf); - setEntity(result, oaf); - return setResult(result, oaf); - } - } + private static Result convertResult(OafProtos.Oaf oaf) { + switch (oaf.getEntity().getResult().getMetadata().getResulttype().getClassid()) { + case "dataset": + return createDataset(oaf); + case "publication": + return createPublication(oaf); + case "software": + return createSoftware(oaf); + case "other": + return createORP(oaf); + default: + Result result = setOaf(new Result(), oaf); + setEntity(result, oaf); + return setResult(result, oaf); + } + } - private static Software createSoftware(OafProtos.Oaf oaf) { - ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); - Software software = setOaf(new Software(), oaf); - setEntity(software, oaf); - setResult(software, oaf); + private static Software createSoftware(OafProtos.Oaf oaf) { + ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + Software software = setOaf(new Software(), oaf); + setEntity(software, oaf); + setResult(software, oaf); - software.setDocumentationUrl( - m.getDocumentationUrlList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - software.setLicense( - m.getLicenseList().stream() - 
.map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - software.setCodeRepositoryUrl(mapStringField(m.getCodeRepositoryUrl())); - software.setProgrammingLanguage(mapQualifier(m.getProgrammingLanguage())); - return software; - } + software + .setDocumentationUrl( + m + .getDocumentationUrlList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + software + .setLicense( + m + .getLicenseList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + software.setCodeRepositoryUrl(mapStringField(m.getCodeRepositoryUrl())); + software.setProgrammingLanguage(mapQualifier(m.getProgrammingLanguage())); + return software; + } - private static OtherResearchProduct createORP(OafProtos.Oaf oaf) { - ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); - OtherResearchProduct otherResearchProducts = setOaf(new OtherResearchProduct(), oaf); - setEntity(otherResearchProducts, oaf); - setResult(otherResearchProducts, oaf); - otherResearchProducts.setContactperson( - m.getContactpersonList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - otherResearchProducts.setContactgroup( - m.getContactgroupList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - otherResearchProducts.setTool( - m.getToolList().stream().map(ProtoConverter::mapStringField).collect(Collectors.toList())); + private static OtherResearchProduct createORP(OafProtos.Oaf oaf) { + ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + OtherResearchProduct otherResearchProducts = setOaf(new OtherResearchProduct(), oaf); + setEntity(otherResearchProducts, oaf); + setResult(otherResearchProducts, oaf); + otherResearchProducts + .setContactperson( + m + .getContactpersonList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + otherResearchProducts + .setContactgroup( + m + .getContactgroupList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + otherResearchProducts + .setTool( + m.getToolList().stream().map(ProtoConverter::mapStringField).collect(Collectors.toList())); - return otherResearchProducts; - } + return otherResearchProducts; + } - private static Publication createPublication(OafProtos.Oaf oaf) { + private static Publication createPublication(OafProtos.Oaf oaf) { - ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); - Publication publication = setOaf(new Publication(), oaf); - setEntity(publication, oaf); - setResult(publication, oaf); - publication.setJournal(mapJournal(m.getJournal())); - return publication; - } + ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + Publication publication = setOaf(new Publication(), oaf); + setEntity(publication, oaf); + setResult(publication, oaf); + publication.setJournal(mapJournal(m.getJournal())); + return publication; + } - private static Dataset createDataset(OafProtos.Oaf oaf) { + private static Dataset createDataset(OafProtos.Oaf oaf) { - ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); - Dataset dataset = setOaf(new Dataset(), oaf); - setEntity(dataset, oaf); - setResult(dataset, oaf); - dataset.setStoragedate(mapStringField(m.getStoragedate())); - dataset.setDevice(mapStringField(m.getDevice())); - dataset.setSize(mapStringField(m.getSize())); - dataset.setVersion(mapStringField(m.getVersion())); - 
dataset.setLastmetadataupdate(mapStringField(m.getLastmetadataupdate())); - dataset.setMetadataversionnumber(mapStringField(m.getMetadataversionnumber())); - dataset.setGeolocation( - m.getGeolocationList().stream() - .map(ProtoConverter::mapGeolocation) - .collect(Collectors.toList())); - return dataset; - } + ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + Dataset dataset = setOaf(new Dataset(), oaf); + setEntity(dataset, oaf); + setResult(dataset, oaf); + dataset.setStoragedate(mapStringField(m.getStoragedate())); + dataset.setDevice(mapStringField(m.getDevice())); + dataset.setSize(mapStringField(m.getSize())); + dataset.setVersion(mapStringField(m.getVersion())); + dataset.setLastmetadataupdate(mapStringField(m.getLastmetadataupdate())); + dataset.setMetadataversionnumber(mapStringField(m.getMetadataversionnumber())); + dataset + .setGeolocation( + m + .getGeolocationList() + .stream() + .map(ProtoConverter::mapGeolocation) + .collect(Collectors.toList())); + return dataset; + } - public static T setOaf(T oaf, OafProtos.Oaf o) { - oaf.setDataInfo(mapDataInfo(o.getDataInfo())); - oaf.setLastupdatetimestamp(o.getLastupdatetimestamp()); - return oaf; - } + public static T setOaf(T oaf, OafProtos.Oaf o) { + oaf.setDataInfo(mapDataInfo(o.getDataInfo())); + oaf.setLastupdatetimestamp(o.getLastupdatetimestamp()); + return oaf; + } - public static T setEntity(T entity, OafProtos.Oaf oaf) { - // setting Entity fields - final OafProtos.OafEntity e = oaf.getEntity(); - entity.setId(e.getId()); - entity.setOriginalId(e.getOriginalIdList()); - entity.setCollectedfrom( - e.getCollectedfromList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList())); - entity.setPid( - e.getPidList().stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - entity.setDateofcollection(e.getDateofcollection()); - entity.setDateoftransformation(e.getDateoftransformation()); - entity.setExtraInfo( - e.getExtraInfoList().stream() - .map(ProtoConverter::mapExtraInfo) - .collect(Collectors.toList())); - return entity; - } + public static T setEntity(T entity, OafProtos.Oaf oaf) { + // setting Entity fields + final OafProtos.OafEntity e = oaf.getEntity(); + entity.setId(e.getId()); + entity.setOriginalId(e.getOriginalIdList()); + entity + .setCollectedfrom( + e.getCollectedfromList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList())); + entity + .setPid( + e + .getPidList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity.setDateofcollection(e.getDateofcollection()); + entity.setDateoftransformation(e.getDateoftransformation()); + entity + .setExtraInfo( + e + .getExtraInfoList() + .stream() + .map(ProtoConverter::mapExtraInfo) + .collect(Collectors.toList())); + return entity; + } - public static T setResult(T entity, OafProtos.Oaf oaf) { - // setting Entity fields - final ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); - entity.setAuthor( - m.getAuthorList().stream().map(ProtoConverter::mapAuthor).collect(Collectors.toList())); - entity.setResulttype(mapQualifier(m.getResulttype())); - entity.setLanguage(mapQualifier(m.getLanguage())); - entity.setCountry( - m.getCountryList().stream() - .map(ProtoConverter::mapQualifierAsCountry) - .collect(Collectors.toList())); - entity.setSubject( - m.getSubjectList().stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - entity.setTitle( - m.getTitleList().stream() - 
.map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - entity.setRelevantdate( - m.getRelevantdateList().stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - entity.setDescription( - m.getDescriptionList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setDateofacceptance(mapStringField(m.getDateofacceptance())); - entity.setPublisher(mapStringField(m.getPublisher())); - entity.setEmbargoenddate(mapStringField(m.getEmbargoenddate())); - entity.setSource( - m.getSourceList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setFulltext( - m.getFulltextList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setFormat( - m.getFormatList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setContributor( - m.getContributorList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setResourcetype(mapQualifier(m.getResourcetype())); - entity.setCoverage( - m.getCoverageList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setContext( - m.getContextList().stream().map(ProtoConverter::mapContext).collect(Collectors.toList())); + public static T setResult(T entity, OafProtos.Oaf oaf) { + // setting Entity fields + final ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); + entity + .setAuthor( + m.getAuthorList().stream().map(ProtoConverter::mapAuthor).collect(Collectors.toList())); + entity.setResulttype(mapQualifier(m.getResulttype())); + entity.setLanguage(mapQualifier(m.getLanguage())); + entity + .setCountry( + m + .getCountryList() + .stream() + .map(ProtoConverter::mapQualifierAsCountry) + .collect(Collectors.toList())); + entity + .setSubject( + m + .getSubjectList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity + .setTitle( + m + .getTitleList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity + .setRelevantdate( + m + .getRelevantdateList() + .stream() + .map(ProtoConverter::mapStructuredProperty) + .collect(Collectors.toList())); + entity + .setDescription( + m + .getDescriptionList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setDateofacceptance(mapStringField(m.getDateofacceptance())); + entity.setPublisher(mapStringField(m.getPublisher())); + entity.setEmbargoenddate(mapStringField(m.getEmbargoenddate())); + entity + .setSource( + m + .getSourceList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity + .setFulltext( + m + .getFulltextList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity + .setFormat( + m + .getFormatList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity + .setContributor( + m + .getContributorList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity.setResourcetype(mapQualifier(m.getResourcetype())); + entity + .setCoverage( + m + .getCoverageList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + entity + .setContext( + m.getContextList().stream().map(ProtoConverter::mapContext).collect(Collectors.toList())); - 
entity.setBestaccessright(getBestAccessRights(oaf.getEntity().getResult().getInstanceList())); + entity.setBestaccessright(getBestAccessRights(oaf.getEntity().getResult().getInstanceList())); - return entity; - } + return entity; + } - private static Qualifier getBestAccessRights(List instanceList) { - if (instanceList != null) { - final Optional min = - instanceList.stream().map(i -> i.getAccessright()).min(new LicenseComparator()); + private static Qualifier getBestAccessRights(List instanceList) { + if (instanceList != null) { + final Optional min = instanceList + .stream() + .map(i -> i.getAccessright()) + .min(new LicenseComparator()); - final Qualifier rights = min.isPresent() ? mapQualifier(min.get()) : new Qualifier(); + final Qualifier rights = min.isPresent() ? mapQualifier(min.get()) : new Qualifier(); - if (StringUtils.isBlank(rights.getClassid())) { - rights.setClassid(UNKNOWN); - } - if (StringUtils.isBlank(rights.getClassname()) - || UNKNOWN.equalsIgnoreCase(rights.getClassname())) { - rights.setClassname(NOT_AVAILABLE); - } - if (StringUtils.isBlank(rights.getSchemeid())) { - rights.setSchemeid(DNET_ACCESS_MODES); - } - if (StringUtils.isBlank(rights.getSchemename())) { - rights.setSchemename(DNET_ACCESS_MODES); - } + if (StringUtils.isBlank(rights.getClassid())) { + rights.setClassid(UNKNOWN); + } + if (StringUtils.isBlank(rights.getClassname()) + || UNKNOWN.equalsIgnoreCase(rights.getClassname())) { + rights.setClassname(NOT_AVAILABLE); + } + if (StringUtils.isBlank(rights.getSchemeid())) { + rights.setSchemeid(DNET_ACCESS_MODES); + } + if (StringUtils.isBlank(rights.getSchemename())) { + rights.setSchemename(DNET_ACCESS_MODES); + } - return rights; - } - return null; - } + return rights; + } + return null; + } - private static Context mapContext(ResultProtos.Result.Context context) { + private static Context mapContext(ResultProtos.Result.Context context) { - final Context entity = new Context(); - entity.setId(context.getId()); - entity.setDataInfo( - context.getDataInfoList().stream() - .map(ProtoConverter::mapDataInfo) - .collect(Collectors.toList())); - return entity; - } + final Context entity = new Context(); + entity.setId(context.getId()); + entity + .setDataInfo( + context + .getDataInfoList() + .stream() + .map(ProtoConverter::mapDataInfo) + .collect(Collectors.toList())); + return entity; + } - public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) { - final KeyValue keyValue = new KeyValue(); - keyValue.setKey(kv.getKey()); - keyValue.setValue(kv.getValue()); - keyValue.setDataInfo(mapDataInfo(kv.getDataInfo())); - return keyValue; - } + public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) { + final KeyValue keyValue = new KeyValue(); + keyValue.setKey(kv.getKey()); + keyValue.setValue(kv.getValue()); + keyValue.setDataInfo(mapDataInfo(kv.getDataInfo())); + return keyValue; + } - public static DataInfo mapDataInfo(FieldTypeProtos.DataInfo d) { - final DataInfo dataInfo = new DataInfo(); - dataInfo.setDeletedbyinference(d.getDeletedbyinference()); - dataInfo.setInferenceprovenance(d.getInferenceprovenance()); - dataInfo.setInferred(d.getInferred()); - dataInfo.setInvisible(d.getInvisible()); - dataInfo.setProvenanceaction(mapQualifier(d.getProvenanceaction())); - dataInfo.setTrust(d.getTrust()); - return dataInfo; - } + public static DataInfo mapDataInfo(FieldTypeProtos.DataInfo d) { + final DataInfo dataInfo = new DataInfo(); + dataInfo.setDeletedbyinference(d.getDeletedbyinference()); + dataInfo.setInferenceprovenance(d.getInferenceprovenance()); + 
dataInfo.setInferred(d.getInferred()); + dataInfo.setInvisible(d.getInvisible()); + dataInfo.setProvenanceaction(mapQualifier(d.getProvenanceaction())); + dataInfo.setTrust(d.getTrust()); + return dataInfo; + } - public static Qualifier mapQualifier(FieldTypeProtos.Qualifier q) { - final Qualifier qualifier = new Qualifier(); - qualifier.setClassid(q.getClassid()); - qualifier.setClassname(q.getClassname()); - qualifier.setSchemeid(q.getSchemeid()); - qualifier.setSchemename(q.getSchemename()); - return qualifier; - } + public static Qualifier mapQualifier(FieldTypeProtos.Qualifier q) { + final Qualifier qualifier = new Qualifier(); + qualifier.setClassid(q.getClassid()); + qualifier.setClassname(q.getClassname()); + qualifier.setSchemeid(q.getSchemeid()); + qualifier.setSchemename(q.getSchemename()); + return qualifier; + } - public static Country mapQualifierAsCountry(FieldTypeProtos.Qualifier q) { - final Country c = new Country(); - c.setClassid(q.getClassid()); - c.setClassname(q.getClassname()); - c.setSchemeid(q.getSchemeid()); - c.setSchemename(q.getSchemename()); - c.setDataInfo(mapDataInfo(q.getDataInfo())); - return c; - } + public static Country mapQualifierAsCountry(FieldTypeProtos.Qualifier q) { + final Country c = new Country(); + c.setClassid(q.getClassid()); + c.setClassname(q.getClassname()); + c.setSchemeid(q.getSchemeid()); + c.setSchemename(q.getSchemename()); + c.setDataInfo(mapDataInfo(q.getDataInfo())); + return c; + } - public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) { - final StructuredProperty structuredProperty = new StructuredProperty(); - structuredProperty.setValue(sp.getValue()); - structuredProperty.setQualifier(mapQualifier(sp.getQualifier())); - structuredProperty.setDataInfo(mapDataInfo(sp.getDataInfo())); - return structuredProperty; - } + public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) { + final StructuredProperty structuredProperty = new StructuredProperty(); + structuredProperty.setValue(sp.getValue()); + structuredProperty.setQualifier(mapQualifier(sp.getQualifier())); + structuredProperty.setDataInfo(mapDataInfo(sp.getDataInfo())); + return structuredProperty; + } - public static ExtraInfo mapExtraInfo(FieldTypeProtos.ExtraInfo extraInfo) { - final ExtraInfo entity = new ExtraInfo(); - entity.setName(extraInfo.getName()); - entity.setTypology(extraInfo.getTypology()); - entity.setProvenance(extraInfo.getProvenance()); - entity.setTrust(extraInfo.getTrust()); - entity.setValue(extraInfo.getValue()); - return entity; - } + public static ExtraInfo mapExtraInfo(FieldTypeProtos.ExtraInfo extraInfo) { + final ExtraInfo entity = new ExtraInfo(); + entity.setName(extraInfo.getName()); + entity.setTypology(extraInfo.getTypology()); + entity.setProvenance(extraInfo.getProvenance()); + entity.setTrust(extraInfo.getTrust()); + entity.setValue(extraInfo.getValue()); + return entity; + } - public static OAIProvenance mapOAIProvenance(FieldTypeProtos.OAIProvenance oaiProvenance) { - final OAIProvenance entity = new OAIProvenance(); - entity.setOriginDescription(mapOriginalDescription(oaiProvenance.getOriginDescription())); - return entity; - } + public static OAIProvenance mapOAIProvenance(FieldTypeProtos.OAIProvenance oaiProvenance) { + final OAIProvenance entity = new OAIProvenance(); + entity.setOriginDescription(mapOriginalDescription(oaiProvenance.getOriginDescription())); + return entity; + } - public static OriginDescription mapOriginalDescription( - 
FieldTypeProtos.OAIProvenance.OriginDescription originDescription) { - final OriginDescription originDescriptionResult = new OriginDescription(); - originDescriptionResult.setHarvestDate(originDescription.getHarvestDate()); - originDescriptionResult.setAltered(originDescription.getAltered()); - originDescriptionResult.setBaseURL(originDescription.getBaseURL()); - originDescriptionResult.setIdentifier(originDescription.getIdentifier()); - originDescriptionResult.setDatestamp(originDescription.getDatestamp()); - originDescriptionResult.setMetadataNamespace(originDescription.getMetadataNamespace()); - return originDescriptionResult; - } + public static OriginDescription mapOriginalDescription( + FieldTypeProtos.OAIProvenance.OriginDescription originDescription) { + final OriginDescription originDescriptionResult = new OriginDescription(); + originDescriptionResult.setHarvestDate(originDescription.getHarvestDate()); + originDescriptionResult.setAltered(originDescription.getAltered()); + originDescriptionResult.setBaseURL(originDescription.getBaseURL()); + originDescriptionResult.setIdentifier(originDescription.getIdentifier()); + originDescriptionResult.setDatestamp(originDescription.getDatestamp()); + originDescriptionResult.setMetadataNamespace(originDescription.getMetadataNamespace()); + return originDescriptionResult; + } - public static Field mapStringField(FieldTypeProtos.StringField s) { - final Field stringField = new Field<>(); - stringField.setValue(s.getValue()); - stringField.setDataInfo(mapDataInfo(s.getDataInfo())); - return stringField; - } + public static Field mapStringField(FieldTypeProtos.StringField s) { + final Field stringField = new Field<>(); + stringField.setValue(s.getValue()); + stringField.setDataInfo(mapDataInfo(s.getDataInfo())); + return stringField; + } - public static Field mapBoolField(FieldTypeProtos.BoolField b) { - final Field booleanField = new Field<>(); - booleanField.setValue(b.getValue()); - booleanField.setDataInfo(mapDataInfo(b.getDataInfo())); - return booleanField; - } + public static Field mapBoolField(FieldTypeProtos.BoolField b) { + final Field booleanField = new Field<>(); + booleanField.setValue(b.getValue()); + booleanField.setDataInfo(mapDataInfo(b.getDataInfo())); + return booleanField; + } - public static Field mapIntField(FieldTypeProtos.IntField b) { - final Field entity = new Field<>(); - entity.setValue(b.getValue()); - entity.setDataInfo(mapDataInfo(b.getDataInfo())); - return entity; - } + public static Field mapIntField(FieldTypeProtos.IntField b) { + final Field entity = new Field<>(); + entity.setValue(b.getValue()); + entity.setDataInfo(mapDataInfo(b.getDataInfo())); + return entity; + } - public static Journal mapJournal(FieldTypeProtos.Journal j) { - final Journal journal = new Journal(); - journal.setConferencedate(j.getConferencedate()); - journal.setConferenceplace(j.getConferenceplace()); - journal.setEdition(j.getEdition()); - journal.setEp(j.getEp()); - journal.setIss(j.getIss()); - journal.setIssnLinking(j.getIssnLinking()); - journal.setIssnOnline(j.getIssnOnline()); - journal.setIssnPrinted(j.getIssnPrinted()); - journal.setName(j.getName()); - journal.setSp(j.getSp()); - journal.setVol(j.getVol()); - journal.setDataInfo(mapDataInfo(j.getDataInfo())); - return journal; - } + public static Journal mapJournal(FieldTypeProtos.Journal j) { + final Journal journal = new Journal(); + journal.setConferencedate(j.getConferencedate()); + journal.setConferenceplace(j.getConferenceplace()); + journal.setEdition(j.getEdition()); + 
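// Illustrative sketch, not part of the patch: mapAuthor (further below) converts each protobuf
// KeyValue pid into a StructuredProperty whose qualifier reuses the key as both classid and
// classname. The helper shows that conversion in isolation, using only schema classes already
// referenced by ProtoConverter; the example values are hypothetical.
static StructuredProperty pidFromKeyValue(final String key, final String value) {
	final StructuredProperty sp = new StructuredProperty();
	sp.setValue(value); // e.g. "0000-0002-1825-0097"
	final Qualifier q = new Qualifier();
	q.setClassid(key); // e.g. "orcid"
	q.setClassname(key); // classname intentionally mirrors the key, exactly as in mapAuthor
	sp.setQualifier(q);
	return sp;
}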
journal.setEp(j.getEp()); + journal.setIss(j.getIss()); + journal.setIssnLinking(j.getIssnLinking()); + journal.setIssnOnline(j.getIssnOnline()); + journal.setIssnPrinted(j.getIssnPrinted()); + journal.setName(j.getName()); + journal.setSp(j.getSp()); + journal.setVol(j.getVol()); + journal.setDataInfo(mapDataInfo(j.getDataInfo())); + return journal; + } - public static Author mapAuthor(FieldTypeProtos.Author author) { - final Author entity = new Author(); - entity.setFullname(author.getFullname()); - entity.setName(author.getName()); - entity.setSurname(author.getSurname()); - entity.setRank(author.getRank()); - entity.setPid( - author.getPidList().stream() - .map( - kv -> { - final StructuredProperty sp = new StructuredProperty(); - sp.setValue(kv.getValue()); - final Qualifier q = new Qualifier(); - q.setClassid(kv.getKey()); - q.setClassname(kv.getKey()); - sp.setQualifier(q); - return sp; - }) - .collect(Collectors.toList())); - entity.setAffiliation( - author.getAffiliationList().stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - return entity; - } + public static Author mapAuthor(FieldTypeProtos.Author author) { + final Author entity = new Author(); + entity.setFullname(author.getFullname()); + entity.setName(author.getName()); + entity.setSurname(author.getSurname()); + entity.setRank(author.getRank()); + entity + .setPid( + author + .getPidList() + .stream() + .map( + kv -> { + final StructuredProperty sp = new StructuredProperty(); + sp.setValue(kv.getValue()); + final Qualifier q = new Qualifier(); + q.setClassid(kv.getKey()); + q.setClassname(kv.getKey()); + sp.setQualifier(q); + return sp; + }) + .collect(Collectors.toList())); + entity + .setAffiliation( + author + .getAffiliationList() + .stream() + .map(ProtoConverter::mapStringField) + .collect(Collectors.toList())); + return entity; + } - public static GeoLocation mapGeolocation(ResultProtos.Result.GeoLocation geoLocation) { - final GeoLocation entity = new GeoLocation(); - entity.setPoint(geoLocation.getPoint()); - entity.setBox(geoLocation.getBox()); - entity.setPlace(geoLocation.getPlace()); - return entity; - } + public static GeoLocation mapGeolocation(ResultProtos.Result.GeoLocation geoLocation) { + final GeoLocation entity = new GeoLocation(); + entity.setPoint(geoLocation.getPoint()); + entity.setBox(geoLocation.getBox()); + entity.setPlace(geoLocation.getPlace()); + return entity; + } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/TransformActions.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/TransformActions.java index d200ac18f..490668606 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/TransformActions.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/TransformActions.java @@ -1,23 +1,14 @@ + package eu.dnetlib.dhp.actionmanager.migration; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.base.Splitter; -import com.google.common.collect.Lists; -import com.google.protobuf.InvalidProtocolBufferException; -import eu.dnetlib.data.proto.OafProtos; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.action.AtomicAction; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import 
eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import java.io.IOException; import java.io.Serializable; import java.util.LinkedList; import java.util.Objects; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.fs.FileSystem; @@ -29,136 +20,153 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; +import com.google.protobuf.InvalidProtocolBufferException; + +import eu.dnetlib.data.proto.OafProtos; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import scala.Tuple2; public class TransformActions implements Serializable { - private static final Logger log = LoggerFactory.getLogger(TransformActions.class); + private static final Logger log = LoggerFactory.getLogger(TransformActions.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final String SEPARATOR = "/"; + private static final String SEPARATOR = "/"; - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - MigrateActionSet.class.getResourceAsStream( - "/eu/dnetlib/dhp/actionmanager/migration/transform_actionsets_parameters.json"))); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + MigrateActionSet.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/migration/transform_actionsets_parameters.json"))); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String isLookupUrl = parser.get("isLookupUrl"); - log.info("isLookupUrl: {}", isLookupUrl); + final String isLookupUrl = parser.get("isLookupUrl"); + log.info("isLookupUrl: {}", isLookupUrl); - final String inputPaths = parser.get("inputPaths"); + final String inputPaths = parser.get("inputPaths"); - if (StringUtils.isBlank(inputPaths)) { - throw new RuntimeException("empty inputPaths"); - } - log.info("inputPaths: {}", inputPaths); + if (StringUtils.isBlank(inputPaths)) { + throw new RuntimeException("empty inputPaths"); + } + log.info("inputPaths: {}", inputPaths); - final String targetBaseDir = getTargetBaseDir(isLookupUrl); + final String targetBaseDir = getTargetBaseDir(isLookupUrl); - SparkConf conf = new SparkConf(); + SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, isSparkSessionManaged, spark -> transformActions(inputPaths, targetBaseDir, spark)); - } + 
runWithSparkSession( + conf, isSparkSessionManaged, spark -> transformActions(inputPaths, targetBaseDir, spark)); + } - private static void transformActions(String inputPaths, String targetBaseDir, SparkSession spark) - throws IOException { - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); + private static void transformActions(String inputPaths, String targetBaseDir, SparkSession spark) + throws IOException { + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); - for (String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) { + for (String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) { - LinkedList pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath)); + LinkedList pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath)); - final String rawset = pathQ.pollLast(); - final String actionSetDirectory = pathQ.pollLast(); + final String rawset = pathQ.pollLast(); + final String actionSetDirectory = pathQ.pollLast(); - final Path targetDirectory = - new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset); + final Path targetDirectory = new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset); - if (fs.exists(targetDirectory)) { - log.info("found target directory '{}", targetDirectory); - fs.delete(targetDirectory, true); - log.info("deleted target directory '{}", targetDirectory); - } + if (fs.exists(targetDirectory)) { + log.info("found target directory '{}", targetDirectory); + fs.delete(targetDirectory, true); + log.info("deleted target directory '{}", targetDirectory); + } - log.info("transforming actions from '{}' to '{}'", sourcePath, targetDirectory); + log.info("transforming actions from '{}' to '{}'", sourcePath, targetDirectory); - sc.sequenceFile(sourcePath, Text.class, Text.class) - .map(a -> eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON(a._2().toString())) - .map(TransformActions::doTransform) - .filter(Objects::nonNull) - .mapToPair( - a -> new Tuple2<>(a.getClazz().toString(), OBJECT_MAPPER.writeValueAsString(a))) - .mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2()))) - .saveAsNewAPIHadoopFile( - targetDirectory.toString(), - Text.class, - Text.class, - SequenceFileOutputFormat.class, - sc.hadoopConfiguration()); - } - } + sc + .sequenceFile(sourcePath, Text.class, Text.class) + .map(a -> eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON(a._2().toString())) + .map(TransformActions::doTransform) + .filter(Objects::nonNull) + .mapToPair( + a -> new Tuple2<>(a.getClazz().toString(), OBJECT_MAPPER.writeValueAsString(a))) + .mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2()))) + .saveAsNewAPIHadoopFile( + targetDirectory.toString(), + Text.class, + Text.class, + SequenceFileOutputFormat.class, + sc.hadoopConfiguration()); + } + } - private static AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa) - throws InvalidProtocolBufferException { + private static AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa) + throws InvalidProtocolBufferException { - // dedup similarity relations had empty target value, don't migrate them - if (aa.getTargetValue().length == 0) { - return null; - } - final OafProtos.Oaf proto_oaf = OafProtos.Oaf.parseFrom(aa.getTargetValue()); 
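// Illustrative sketch, not part of the patch: transformActions above derives the raw set name
// and the action set directory from the tail of each source path with Splitter + pollLast.
// The main method below only demonstrates that splitting on a hypothetical path.
import java.util.LinkedList;

import com.google.common.base.Splitter;
import com.google.common.collect.Lists;

class SourcePathSketch {
	public static void main(String[] args) {
		final String sourcePath = "/var/actionmanager/someActionSet/rawset_2020-01-01"; // hypothetical
		final LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on("/").split(sourcePath));
		final String rawset = pathQ.pollLast(); // "rawset_2020-01-01"
		final String actionSetDirectory = pathQ.pollLast(); // "someActionSet"
		System.out.println(actionSetDirectory + "/" + rawset);
	}
}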
- final Oaf oaf = ProtoConverter.convert(proto_oaf); - switch (proto_oaf.getKind()) { - case entity: - switch (proto_oaf.getEntity().getType()) { - case datasource: - return new AtomicAction<>(Datasource.class, (Datasource) oaf); - case organization: - return new AtomicAction<>(Organization.class, (Organization) oaf); - case project: - return new AtomicAction<>(Project.class, (Project) oaf); - case result: - final String resulttypeid = - proto_oaf.getEntity().getResult().getMetadata().getResulttype().getClassid(); - switch (resulttypeid) { - case "publication": - return new AtomicAction<>(Publication.class, (Publication) oaf); - case "software": - return new AtomicAction<>(Software.class, (Software) oaf); - case "other": - return new AtomicAction<>(OtherResearchProduct.class, (OtherResearchProduct) oaf); - case "dataset": - return new AtomicAction<>(Dataset.class, (Dataset) oaf); - default: - // can be an update, where the resulttype is not specified - return new AtomicAction<>(Result.class, (Result) oaf); - } - default: - throw new IllegalArgumentException( - "invalid entity type: " + proto_oaf.getEntity().getType()); - } - case relation: - return new AtomicAction<>(Relation.class, (Relation) oaf); - default: - throw new IllegalArgumentException("invalid kind: " + proto_oaf.getKind()); - } - } + // dedup similarity relations had empty target value, don't migrate them + if (aa.getTargetValue().length == 0) { + return null; + } + final OafProtos.Oaf proto_oaf = OafProtos.Oaf.parseFrom(aa.getTargetValue()); + final Oaf oaf = ProtoConverter.convert(proto_oaf); + switch (proto_oaf.getKind()) { + case entity: + switch (proto_oaf.getEntity().getType()) { + case datasource: + return new AtomicAction<>(Datasource.class, (Datasource) oaf); + case organization: + return new AtomicAction<>(Organization.class, (Organization) oaf); + case project: + return new AtomicAction<>(Project.class, (Project) oaf); + case result: + final String resulttypeid = proto_oaf + .getEntity() + .getResult() + .getMetadata() + .getResulttype() + .getClassid(); + switch (resulttypeid) { + case "publication": + return new AtomicAction<>(Publication.class, (Publication) oaf); + case "software": + return new AtomicAction<>(Software.class, (Software) oaf); + case "other": + return new AtomicAction<>(OtherResearchProduct.class, (OtherResearchProduct) oaf); + case "dataset": + return new AtomicAction<>(Dataset.class, (Dataset) oaf); + default: + // can be an update, where the resulttype is not specified + return new AtomicAction<>(Result.class, (Result) oaf); + } + default: + throw new IllegalArgumentException( + "invalid entity type: " + proto_oaf.getEntity().getType()); + } + case relation: + return new AtomicAction<>(Relation.class, (Relation) oaf); + default: + throw new IllegalArgumentException("invalid kind: " + proto_oaf.getKind()); + } + } - private static String getTargetBaseDir(String isLookupUrl) throws ISLookUpException { - ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); - String XQUERY = - "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()"; - return isLookUp.getResourceProfileByQuery(XQUERY); - } + private static String getTargetBaseDir(String isLookupUrl) throws ISLookUpException { + ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); + String XQUERY = "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 
'basePath']/@value/string()"; + return isLookUp.getResourceProfileByQuery(XQUERY); + } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJob.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJob.java index 6eb0bac3b..af3ef0c12 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJob.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJob.java @@ -1,15 +1,13 @@ + package eu.dnetlib.dhp.actionmanager.partition; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static org.apache.spark.sql.functions.*; -import eu.dnetlib.dhp.actionmanager.ISClient; -import eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJob; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; import java.util.Arrays; import java.util.List; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.spark.SparkConf; @@ -20,117 +18,127 @@ import org.apache.spark.sql.types.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.actionmanager.ISClient; +import eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJob; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; + /** Partitions given set of action sets by payload type. */ public class PartitionActionSetsByPayloadTypeJob { - private static final Logger logger = - LoggerFactory.getLogger(PartitionActionSetsByPayloadTypeJob.class); + private static final Logger logger = LoggerFactory.getLogger(PartitionActionSetsByPayloadTypeJob.class); - private static final StructType KV_SCHEMA = - StructType$.MODULE$.apply( - Arrays.asList( - StructField$.MODULE$.apply("key", DataTypes.StringType, false, Metadata.empty()), - StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty()))); + private static final StructType KV_SCHEMA = StructType$.MODULE$ + .apply( + Arrays + .asList( + StructField$.MODULE$.apply("key", DataTypes.StringType, false, Metadata.empty()), + StructField$.MODULE$.apply("value", DataTypes.StringType, false, Metadata.empty()))); - private static final StructType ATOMIC_ACTION_SCHEMA = - StructType$.MODULE$.apply( - Arrays.asList( - StructField$.MODULE$.apply("clazz", DataTypes.StringType, false, Metadata.empty()), - StructField$.MODULE$.apply( - "payload", DataTypes.StringType, false, Metadata.empty()))); + private static final StructType ATOMIC_ACTION_SCHEMA = StructType$.MODULE$ + .apply( + Arrays + .asList( + StructField$.MODULE$.apply("clazz", DataTypes.StringType, false, Metadata.empty()), + StructField$.MODULE$ + .apply( + "payload", DataTypes.StringType, false, Metadata.empty()))); - private ISClient isClient; + private ISClient isClient; - public PartitionActionSetsByPayloadTypeJob(String isLookupUrl) { - this.isClient = new ISClient(isLookupUrl); - } + public PartitionActionSetsByPayloadTypeJob(String isLookupUrl) { + this.isClient = new ISClient(isLookupUrl); + } - public PartitionActionSetsByPayloadTypeJob() {} + public PartitionActionSetsByPayloadTypeJob() { + } - public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - 
PromoteActionPayloadForGraphTableJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/actionmanager/partition/partition_action_sets_by_payload_type_input_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + PromoteActionPayloadForGraphTableJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/partition/partition_action_sets_by_payload_type_input_parameters.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - logger.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + logger.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputActionSetIds = parser.get("inputActionSetIds"); - logger.info("inputActionSetIds: {}", inputActionSetIds); + String inputActionSetIds = parser.get("inputActionSetIds"); + logger.info("inputActionSetIds: {}", inputActionSetIds); - String outputPath = parser.get("outputPath"); - logger.info("outputPath: {}", outputPath); + String outputPath = parser.get("outputPath"); + logger.info("outputPath: {}", outputPath); - String isLookupUrl = parser.get("isLookupUrl"); - logger.info("isLookupUrl: {}", isLookupUrl); + String isLookupUrl = parser.get("isLookupUrl"); + logger.info("isLookupUrl: {}", isLookupUrl); - new PartitionActionSetsByPayloadTypeJob(isLookupUrl) - .run(isSparkSessionManaged, inputActionSetIds, outputPath); - } + new PartitionActionSetsByPayloadTypeJob(isLookupUrl) + .run(isSparkSessionManaged, inputActionSetIds, outputPath); + } - protected void run(Boolean isSparkSessionManaged, String inputActionSetIds, String outputPath) { + protected void run(Boolean isSparkSessionManaged, String inputActionSetIds, String outputPath) { - List inputActionSetPaths = getIsClient().getLatestRawsetPaths(inputActionSetIds); - logger.info("inputActionSetPaths: {}", String.join(",", inputActionSetPaths)); + List inputActionSetPaths = getIsClient().getLatestRawsetPaths(inputActionSetIds); + logger.info("inputActionSetPaths: {}", String.join(",", inputActionSetPaths)); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - readAndWriteActionSetsFromPaths(spark, inputActionSetPaths, outputPath); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + readAndWriteActionSetsFromPaths(spark, inputActionSetPaths, outputPath); + }); + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } - private static void readAndWriteActionSetsFromPaths( - SparkSession spark, List inputActionSetPaths, String outputPath) { - 
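// Illustrative sketch, not part of the patch: the core flow of this partition job condensed
// into one expression. Sequence-file key/values become (key, value) Rows, the JSON payload in
// the "value" column is exploded with ATOMIC_ACTION_SCHEMA, and the result is written
// partitioned by payload class. spark, inputPath and outputPath are assumed to be in scope.
final Dataset<Row> actions = spark
	.createDataFrame(
		JavaSparkContext
			.fromSparkContext(spark.sparkContext())
			.sequenceFile(inputPath, Text.class, Text.class)
			.map(x -> RowFactory.create(x._1().toString(), x._2().toString())),
		KV_SCHEMA)
	.withColumn("atomic_action", from_json(col("value"), ATOMIC_ACTION_SCHEMA))
	.select(expr("atomic_action.*"));
actions.write().partitionBy("clazz").mode(SaveMode.Append).parquet(outputPath);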
inputActionSetPaths.stream() - .filter(path -> HdfsSupport.exists(path, spark.sparkContext().hadoopConfiguration())) - .forEach( - inputActionSetPath -> { - Dataset actionDS = readActionSetFromPath(spark, inputActionSetPath); - saveActions(actionDS, outputPath); - }); - } + private static void readAndWriteActionSetsFromPaths( + SparkSession spark, List inputActionSetPaths, String outputPath) { + inputActionSetPaths + .stream() + .filter(path -> HdfsSupport.exists(path, spark.sparkContext().hadoopConfiguration())) + .forEach( + inputActionSetPath -> { + Dataset actionDS = readActionSetFromPath(spark, inputActionSetPath); + saveActions(actionDS, outputPath); + }); + } - private static Dataset readActionSetFromPath(SparkSession spark, String path) { - logger.info("Reading actions from path: {}", path); + private static Dataset readActionSetFromPath(SparkSession spark, String path) { + logger.info("Reading actions from path: {}", path); - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaRDD rdd = - sc.sequenceFile(path, Text.class, Text.class) - .map(x -> RowFactory.create(x._1().toString(), x._2().toString())); + JavaRDD rdd = sc + .sequenceFile(path, Text.class, Text.class) + .map(x -> RowFactory.create(x._1().toString(), x._2().toString())); - return spark - .createDataFrame(rdd, KV_SCHEMA) - .withColumn("atomic_action", from_json(col("value"), ATOMIC_ACTION_SCHEMA)) - .select(expr("atomic_action.*")); - } + return spark + .createDataFrame(rdd, KV_SCHEMA) + .withColumn("atomic_action", from_json(col("value"), ATOMIC_ACTION_SCHEMA)) + .select(expr("atomic_action.*")); + } - private static void saveActions(Dataset actionDS, String path) { - logger.info("Saving actions to path: {}", path); - actionDS.write().partitionBy("clazz").mode(SaveMode.Append).parquet(path); - } + private static void saveActions(Dataset actionDS, String path) { + logger.info("Saving actions to path: {}", path); + actionDS.write().partitionBy("clazz").mode(SaveMode.Append).parquet(path); + } - public ISClient getIsClient() { - return isClient; - } + public ISClient getIsClient() { + return isClient; + } - public void setIsClient(ISClient isClient) { - this.isClient = isClient; - } + public void setIsClient(ISClient isClient) { + this.isClient = isClient; + } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGet.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGet.java index ac8291842..fbb072957 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGet.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGet.java @@ -1,82 +1,87 @@ + package eu.dnetlib.dhp.actionmanager.promote; import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; +import java.util.function.BiFunction; + import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.Relation; -import java.util.function.BiFunction; /** OAF model merging support. */ public class MergeAndGet { - private MergeAndGet() {} + private MergeAndGet() { + } - /** - * Strategy for merging OAF model objects. - * - *
<p>
MERGE_FROM_AND_GET: use OAF 'mergeFrom' method SELECT_NEWER_AND_GET: use last update - * timestamp to return newer instance - */ - public enum Strategy { - MERGE_FROM_AND_GET, - SELECT_NEWER_AND_GET - } + /** + * Strategy for merging OAF model objects. + *
<p>
+ * MERGE_FROM_AND_GET: use OAF 'mergeFrom' method SELECT_NEWER_AND_GET: use last update timestamp to return newer + * instance + */ + public enum Strategy { + MERGE_FROM_AND_GET, SELECT_NEWER_AND_GET + } - /** - * Returns a function for merging OAF model objects. - * - * @param strategy Strategy to be used to merge objects - * @param Graph table type - * @param Action payload type - * @return BiFunction to be used to merge OAF objects - */ - public static - SerializableSupplier> functionFor(Strategy strategy) { - switch (strategy) { - case MERGE_FROM_AND_GET: - return () -> MergeAndGet::mergeFromAndGet; - case SELECT_NEWER_AND_GET: - return () -> MergeAndGet::selectNewerAndGet; - } - throw new RuntimeException(); - } + /** + * Returns a function for merging OAF model objects. + * + * @param strategy Strategy to be used to merge objects + * @param Graph table type + * @param Action payload type + * @return BiFunction to be used to merge OAF objects + */ + public static SerializableSupplier> functionFor( + Strategy strategy) { + switch (strategy) { + case MERGE_FROM_AND_GET: + return () -> MergeAndGet::mergeFromAndGet; + case SELECT_NEWER_AND_GET: + return () -> MergeAndGet::selectNewerAndGet; + } + throw new RuntimeException(); + } - private static G mergeFromAndGet(G x, A y) { - if (isSubClass(x, Relation.class) && isSubClass(y, Relation.class)) { - ((Relation) x).mergeFrom((Relation) y); - return x; - } else if (isSubClass(x, OafEntity.class) - && isSubClass(y, OafEntity.class) - && isSubClass(x, y)) { - ((OafEntity) x).mergeFrom((OafEntity) y); - return x; - } - throw new RuntimeException( - String.format( - "MERGE_FROM_AND_GET incompatible types: %s, %s", - x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); - } + private static G mergeFromAndGet(G x, A y) { + if (isSubClass(x, Relation.class) && isSubClass(y, Relation.class)) { + ((Relation) x).mergeFrom((Relation) y); + return x; + } else if (isSubClass(x, OafEntity.class) + && isSubClass(y, OafEntity.class) + && isSubClass(x, y)) { + ((OafEntity) x).mergeFrom((OafEntity) y); + return x; + } + throw new RuntimeException( + String + .format( + "MERGE_FROM_AND_GET incompatible types: %s, %s", + x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); + } - private static G selectNewerAndGet(G x, A y) { - if (x.getClass().equals(y.getClass()) - && x.getLastupdatetimestamp() > y.getLastupdatetimestamp()) { - return x; - } else if (x.getClass().equals(y.getClass()) - && x.getLastupdatetimestamp() < y.getLastupdatetimestamp()) { - return (G) y; - } else if (isSubClass(x, y) && x.getLastupdatetimestamp() > y.getLastupdatetimestamp()) { - return x; - } else if (isSubClass(x, y) && x.getLastupdatetimestamp() < y.getLastupdatetimestamp()) { - throw new RuntimeException( - String.format( - "SELECT_NEWER_AND_GET cannot return right type when it is not the same as left type: %s, %s", - x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); - } - throw new RuntimeException( - String.format( - "SELECT_NEWER_AND_GET cannot be used when left is not subtype of right: %s, %s", - x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); - } + private static G selectNewerAndGet(G x, A y) { + if (x.getClass().equals(y.getClass()) + && x.getLastupdatetimestamp() > y.getLastupdatetimestamp()) { + return x; + } else if (x.getClass().equals(y.getClass()) + && x.getLastupdatetimestamp() < y.getLastupdatetimestamp()) { + return (G) y; + } else if (isSubClass(x, y) && x.getLastupdatetimestamp() > 
y.getLastupdatetimestamp()) { + return x; + } else if (isSubClass(x, y) && x.getLastupdatetimestamp() < y.getLastupdatetimestamp()) { + throw new RuntimeException( + String + .format( + "SELECT_NEWER_AND_GET cannot return right type when it is not the same as left type: %s, %s", + x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); + } + throw new RuntimeException( + String + .format( + "SELECT_NEWER_AND_GET cannot be used when left is not subtype of right: %s, %s", + x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); + } } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java index 24af1973f..17bfc4af3 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java @@ -1,18 +1,14 @@ + package eu.dnetlib.dhp.actionmanager.promote; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.Objects; import java.util.Optional; import java.util.function.BiFunction; import java.util.function.Function; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -23,204 +19,207 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; + /** Applies a given action payload file to graph table of compatible type. 
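// Illustrative sketch, not part of the patch: how a caller obtains and applies a merge function
// from MergeAndGet (the class whose diff ends just above). Publication stands in for any
// OafEntity subtype; graphPublication and actionPayloadPublication are assumed to exist. With
// MERGE_FROM_AND_GET the graph-side object absorbs the action payload via mergeFrom and is returned.
final SerializableSupplier<BiFunction<Publication, Publication, Publication>> merger = MergeAndGet
	.functionFor(MergeAndGet.Strategy.MERGE_FROM_AND_GET);
final Publication merged = merger.get().apply(graphPublication, actionPayloadPublication);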
*/ public class PromoteActionPayloadForGraphTableJob { - private static final Logger logger = - LoggerFactory.getLogger(PromoteActionPayloadForGraphTableJob.class); + private static final Logger logger = LoggerFactory.getLogger(PromoteActionPayloadForGraphTableJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - PromoteActionPayloadForGraphTableJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/actionmanager/promote/promote_action_payload_for_graph_table_input_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + PromoteActionPayloadForGraphTableJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/promote/promote_action_payload_for_graph_table_input_parameters.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - logger.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + logger.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputGraphTablePath = parser.get("inputGraphTablePath"); - logger.info("inputGraphTablePath: {}", inputGraphTablePath); + String inputGraphTablePath = parser.get("inputGraphTablePath"); + logger.info("inputGraphTablePath: {}", inputGraphTablePath); - String graphTableClassName = parser.get("graphTableClassName"); - logger.info("graphTableClassName: {}", graphTableClassName); + String graphTableClassName = parser.get("graphTableClassName"); + logger.info("graphTableClassName: {}", graphTableClassName); - String inputActionPayloadPath = parser.get("inputActionPayloadPath"); - logger.info("inputActionPayloadPath: {}", inputActionPayloadPath); + String inputActionPayloadPath = parser.get("inputActionPayloadPath"); + logger.info("inputActionPayloadPath: {}", inputActionPayloadPath); - String actionPayloadClassName = parser.get("actionPayloadClassName"); - logger.info("actionPayloadClassName: {}", actionPayloadClassName); + String actionPayloadClassName = parser.get("actionPayloadClassName"); + logger.info("actionPayloadClassName: {}", actionPayloadClassName); - String outputGraphTablePath = parser.get("outputGraphTablePath"); - logger.info("outputGraphTablePath: {}", outputGraphTablePath); + String outputGraphTablePath = parser.get("outputGraphTablePath"); + logger.info("outputGraphTablePath: {}", outputGraphTablePath); - MergeAndGet.Strategy strategy = - MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase()); - logger.info("strategy: {}", strategy); + MergeAndGet.Strategy strategy = MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase()); + logger.info("strategy: {}", strategy); - Class rowClazz = (Class) Class.forName(graphTableClassName); - Class actionPayloadClazz = - (Class) Class.forName(actionPayloadClassName); + Class rowClazz = (Class) Class.forName(graphTableClassName); + Class actionPayloadClazz = (Class) 
Class.forName(actionPayloadClassName); - throwIfGraphTableClassIsNotSubClassOfActionPayloadClass(rowClazz, actionPayloadClazz); + throwIfGraphTableClassIsNotSubClassOfActionPayloadClass(rowClazz, actionPayloadClazz); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputGraphTablePath); - promoteActionPayloadForGraphTable( - spark, - inputGraphTablePath, - inputActionPayloadPath, - outputGraphTablePath, - strategy, - rowClazz, - actionPayloadClazz); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputGraphTablePath); + promoteActionPayloadForGraphTable( + spark, + inputGraphTablePath, + inputActionPayloadPath, + outputGraphTablePath, + strategy, + rowClazz, + actionPayloadClazz); + }); + } - private static void throwIfGraphTableClassIsNotSubClassOfActionPayloadClass( - Class rowClazz, Class actionPayloadClazz) { - if (!isSubClass(rowClazz, actionPayloadClazz)) { - String msg = - String.format( - "graph table class is not a subclass of action payload class: graph=%s, action=%s", - rowClazz.getCanonicalName(), actionPayloadClazz.getCanonicalName()); - throw new RuntimeException(msg); - } - } + private static void throwIfGraphTableClassIsNotSubClassOfActionPayloadClass( + Class rowClazz, Class actionPayloadClazz) { + if (!isSubClass(rowClazz, actionPayloadClazz)) { + String msg = String + .format( + "graph table class is not a subclass of action payload class: graph=%s, action=%s", + rowClazz.getCanonicalName(), actionPayloadClazz.getCanonicalName()); + throw new RuntimeException(msg); + } + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } - private static void promoteActionPayloadForGraphTable( - SparkSession spark, - String inputGraphTablePath, - String inputActionPayloadPath, - String outputGraphTablePath, - MergeAndGet.Strategy strategy, - Class rowClazz, - Class actionPayloadClazz) { - Dataset rowDS = readGraphTable(spark, inputGraphTablePath, rowClazz); - Dataset actionPayloadDS = - readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz); + private static void promoteActionPayloadForGraphTable( + SparkSession spark, + String inputGraphTablePath, + String inputActionPayloadPath, + String outputGraphTablePath, + MergeAndGet.Strategy strategy, + Class rowClazz, + Class actionPayloadClazz) { + Dataset rowDS = readGraphTable(spark, inputGraphTablePath, rowClazz); + Dataset actionPayloadDS = readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz); - Dataset result = - promoteActionPayloadForGraphTable( - rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz) - .map((MapFunction) value -> value, Encoders.bean(rowClazz)); + Dataset result = promoteActionPayloadForGraphTable( + rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz) + .map((MapFunction) value -> value, Encoders.bean(rowClazz)); - saveGraphTable(result, outputGraphTablePath); - } + 
saveGraphTable(result, outputGraphTablePath); + } - private static Dataset readGraphTable( - SparkSession spark, String path, Class rowClazz) { - logger.info("Reading graph table from path: {}", path); + private static Dataset readGraphTable( + SparkSession spark, String path, Class rowClazz) { + logger.info("Reading graph table from path: {}", path); - return spark - .read() - .textFile(path) - .map( - (MapFunction) value -> OBJECT_MAPPER.readValue(value, rowClazz), - Encoders.bean(rowClazz)); + return spark + .read() + .textFile(path) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, rowClazz), + Encoders.bean(rowClazz)); - /* - * return spark .read() .parquet(path) .as(Encoders.bean(rowClazz)); - */ - } + /* + * return spark .read() .parquet(path) .as(Encoders.bean(rowClazz)); + */ + } - private static Dataset readActionPayload( - SparkSession spark, String path, Class actionPayloadClazz) { - logger.info("Reading action payload from path: {}", path); - return spark - .read() - .parquet(path) - .map( - (MapFunction) - value -> - OBJECT_MAPPER.readValue(value.getAs("payload"), actionPayloadClazz), - Encoders.bean(actionPayloadClazz)); - } + private static Dataset readActionPayload( + SparkSession spark, String path, Class actionPayloadClazz) { + logger.info("Reading action payload from path: {}", path); + return spark + .read() + .parquet(path) + .map( + (MapFunction) value -> OBJECT_MAPPER + .readValue(value. getAs("payload"), actionPayloadClazz), + Encoders.bean(actionPayloadClazz)); + } - private static Dataset promoteActionPayloadForGraphTable( - Dataset rowDS, - Dataset actionPayloadDS, - MergeAndGet.Strategy strategy, - Class rowClazz, - Class actionPayloadClazz) { - logger.info( - "Promoting action payload for graph table: payload={}, table={}", - actionPayloadClazz.getSimpleName(), - rowClazz.getSimpleName()); + private static Dataset promoteActionPayloadForGraphTable( + Dataset rowDS, + Dataset actionPayloadDS, + MergeAndGet.Strategy strategy, + Class rowClazz, + Class actionPayloadClazz) { + logger + .info( + "Promoting action payload for graph table: payload={}, table={}", + actionPayloadClazz.getSimpleName(), + rowClazz.getSimpleName()); - SerializableSupplier> rowIdFn = ModelSupport::idFn; - SerializableSupplier> actionPayloadIdFn = ModelSupport::idFn; - SerializableSupplier> mergeRowWithActionPayloadAndGetFn = - MergeAndGet.functionFor(strategy); - SerializableSupplier> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy); - SerializableSupplier zeroFn = zeroFn(rowClazz); - SerializableSupplier> isNotZeroFn = - PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSource; + SerializableSupplier> rowIdFn = ModelSupport::idFn; + SerializableSupplier> actionPayloadIdFn = ModelSupport::idFn; + SerializableSupplier> mergeRowWithActionPayloadAndGetFn = MergeAndGet.functionFor(strategy); + SerializableSupplier> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy); + SerializableSupplier zeroFn = zeroFn(rowClazz); + SerializableSupplier> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSource; - Dataset joinedAndMerged = - PromoteActionPayloadFunctions.joinGraphTableWithActionPayloadAndMerge( - rowDS, - actionPayloadDS, - rowIdFn, - actionPayloadIdFn, - mergeRowWithActionPayloadAndGetFn, - rowClazz, - actionPayloadClazz); + Dataset joinedAndMerged = PromoteActionPayloadFunctions + .joinGraphTableWithActionPayloadAndMerge( + rowDS, + actionPayloadDS, + rowIdFn, + actionPayloadIdFn, + mergeRowWithActionPayloadAndGetFn, + rowClazz, + 
actionPayloadClazz); - return PromoteActionPayloadFunctions.groupGraphTableByIdAndMerge( - joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz); - } + return PromoteActionPayloadFunctions + .groupGraphTableByIdAndMerge( + joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz); + } - private static SerializableSupplier zeroFn(Class clazz) { - switch (clazz.getCanonicalName()) { - case "eu.dnetlib.dhp.schema.oaf.Dataset": - return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Dataset()); - case "eu.dnetlib.dhp.schema.oaf.Datasource": - return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Datasource()); - case "eu.dnetlib.dhp.schema.oaf.Organization": - return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Organization()); - case "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct": - return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.OtherResearchProduct()); - case "eu.dnetlib.dhp.schema.oaf.Project": - return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Project()); - case "eu.dnetlib.dhp.schema.oaf.Publication": - return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Publication()); - case "eu.dnetlib.dhp.schema.oaf.Relation": - return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Relation()); - case "eu.dnetlib.dhp.schema.oaf.Software": - return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Software()); - default: - throw new RuntimeException("unknown class: " + clazz.getCanonicalName()); - } - } + private static SerializableSupplier zeroFn(Class clazz) { + switch (clazz.getCanonicalName()) { + case "eu.dnetlib.dhp.schema.oaf.Dataset": + return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Dataset()); + case "eu.dnetlib.dhp.schema.oaf.Datasource": + return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Datasource()); + case "eu.dnetlib.dhp.schema.oaf.Organization": + return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Organization()); + case "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct": + return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.OtherResearchProduct()); + case "eu.dnetlib.dhp.schema.oaf.Project": + return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Project()); + case "eu.dnetlib.dhp.schema.oaf.Publication": + return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Publication()); + case "eu.dnetlib.dhp.schema.oaf.Relation": + return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Relation()); + case "eu.dnetlib.dhp.schema.oaf.Software": + return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Software()); + default: + throw new RuntimeException("unknown class: " + clazz.getCanonicalName()); + } + } - private static Function isNotZeroFnUsingIdOrSource() { - return t -> { - if (isSubClass(t, Relation.class)) { - return Objects.nonNull(((Relation) t).getSource()); - } - return Objects.nonNull(((OafEntity) t).getId()); - }; - } + private static Function isNotZeroFnUsingIdOrSource() { + return t -> { + if (isSubClass(t, Relation.class)) { + return Objects.nonNull(((Relation) t).getSource()); + } + return Objects.nonNull(((OafEntity) t).getId()); + }; + } - private static void saveGraphTable(Dataset result, String path) { - logger.info("Saving graph table to path: {}", path); - result.toJSON().write().option("compression", "gzip").text(path); - } + private static void saveGraphTable(Dataset result, String path) { + logger.info("Saving graph table to path: {}", path); + result.toJSON().write().option("compression", "gzip").text(path); + } } diff --git 
a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java index cff964003..ffde658bd 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java @@ -1,13 +1,13 @@ + package eu.dnetlib.dhp.actionmanager.promote; import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; -import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; -import eu.dnetlib.dhp.schema.oaf.Oaf; import java.util.Objects; import java.util.Optional; import java.util.function.BiFunction; import java.util.function.Function; + import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; @@ -15,171 +15,170 @@ import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.TypedColumn; import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; +import eu.dnetlib.dhp.schema.oaf.Oaf; import scala.Tuple2; /** Promote action payload functions. */ public class PromoteActionPayloadFunctions { - private PromoteActionPayloadFunctions() {} + private PromoteActionPayloadFunctions() { + } - /** - * Joins dataset representing graph table with dataset representing action payload using supplied - * functions. - * - * @param rowDS Dataset representing graph table - * @param actionPayloadDS Dataset representing action payload - * @param rowIdFn Function used to get the id of graph table row - * @param actionPayloadIdFn Function used to get id of action payload instance - * @param mergeAndGetFn Function used to merge graph table row and action payload instance - * @param rowClazz Class of graph table - * @param actionPayloadClazz Class of action payload - * @param Type of graph table row - * @param Type of action payload instance - * @return Dataset of merged graph table rows and action payload instances - */ - public static Dataset joinGraphTableWithActionPayloadAndMerge( - Dataset rowDS, - Dataset actionPayloadDS, - SerializableSupplier> rowIdFn, - SerializableSupplier> actionPayloadIdFn, - SerializableSupplier> mergeAndGetFn, - Class rowClazz, - Class actionPayloadClazz) { - if (!isSubClass(rowClazz, actionPayloadClazz)) { - throw new RuntimeException( - "action payload type must be the same or be a super type of table row type"); - } + /** + * Joins dataset representing graph table with dataset representing action payload using supplied functions. 
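// Illustrative sketch, not part of the patch: how PromoteActionPayloadForGraphTableJob invokes
// the join implemented in this class. The id and merge suppliers mirror the ones used by that
// job; rowDS and actionPayloadDS are assumed to be Dataset<Publication> instances already loaded.
final SerializableSupplier<Function<Publication, String>> idFn = ModelSupport::idFn;
final SerializableSupplier<BiFunction<Publication, Publication, Publication>> mergeFn = MergeAndGet
	.functionFor(MergeAndGet.Strategy.MERGE_FROM_AND_GET);
final Dataset<Publication> joined = PromoteActionPayloadFunctions
	.joinGraphTableWithActionPayloadAndMerge(
		rowDS, actionPayloadDS, idFn, idFn, mergeFn, Publication.class, Publication.class);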
+ * + * @param rowDS Dataset representing graph table + * @param actionPayloadDS Dataset representing action payload + * @param rowIdFn Function used to get the id of graph table row + * @param actionPayloadIdFn Function used to get id of action payload instance + * @param mergeAndGetFn Function used to merge graph table row and action payload instance + * @param rowClazz Class of graph table + * @param actionPayloadClazz Class of action payload + * @param Type of graph table row + * @param Type of action payload instance + * @return Dataset of merged graph table rows and action payload instances + */ + public static Dataset joinGraphTableWithActionPayloadAndMerge( + Dataset rowDS, + Dataset actionPayloadDS, + SerializableSupplier> rowIdFn, + SerializableSupplier> actionPayloadIdFn, + SerializableSupplier> mergeAndGetFn, + Class rowClazz, + Class actionPayloadClazz) { + if (!isSubClass(rowClazz, actionPayloadClazz)) { + throw new RuntimeException( + "action payload type must be the same or be a super type of table row type"); + } - Dataset> rowWithIdDS = mapToTupleWithId(rowDS, rowIdFn, rowClazz); - Dataset> actionPayloadWithIdDS = - mapToTupleWithId(actionPayloadDS, actionPayloadIdFn, actionPayloadClazz); + Dataset> rowWithIdDS = mapToTupleWithId(rowDS, rowIdFn, rowClazz); + Dataset> actionPayloadWithIdDS = mapToTupleWithId( + actionPayloadDS, actionPayloadIdFn, actionPayloadClazz); - return rowWithIdDS - .joinWith( - actionPayloadWithIdDS, - rowWithIdDS.col("_1").equalTo(actionPayloadWithIdDS.col("_1")), - "full_outer") - .map( - (MapFunction, Tuple2>, G>) - value -> { - Optional rowOpt = Optional.ofNullable(value._1()).map(Tuple2::_2); - Optional actionPayloadOpt = Optional.ofNullable(value._2()).map(Tuple2::_2); - return rowOpt - .map( - row -> - actionPayloadOpt - .map( - actionPayload -> - mergeAndGetFn.get().apply(row, actionPayload)) - .orElse(row)) - .orElseGet( - () -> - actionPayloadOpt - .filter( - actionPayload -> actionPayload.getClass().equals(rowClazz)) - .map(rowClazz::cast) - .orElse(null)); - }, - Encoders.kryo(rowClazz)) - .filter((FilterFunction) Objects::nonNull); - } + return rowWithIdDS + .joinWith( + actionPayloadWithIdDS, + rowWithIdDS.col("_1").equalTo(actionPayloadWithIdDS.col("_1")), + "full_outer") + .map( + (MapFunction, Tuple2>, G>) value -> { + Optional rowOpt = Optional.ofNullable(value._1()).map(Tuple2::_2); + Optional actionPayloadOpt = Optional.ofNullable(value._2()).map(Tuple2::_2); + return rowOpt + .map( + row -> actionPayloadOpt + .map( + actionPayload -> mergeAndGetFn.get().apply(row, actionPayload)) + .orElse(row)) + .orElseGet( + () -> actionPayloadOpt + .filter( + actionPayload -> actionPayload.getClass().equals(rowClazz)) + .map(rowClazz::cast) + .orElse(null)); + }, + Encoders.kryo(rowClazz)) + .filter((FilterFunction) Objects::nonNull); + } - private static Dataset> mapToTupleWithId( - Dataset ds, SerializableSupplier> idFn, Class clazz) { - return ds.map( - (MapFunction>) value -> new Tuple2<>(idFn.get().apply(value), value), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); - } + private static Dataset> mapToTupleWithId( + Dataset ds, SerializableSupplier> idFn, Class clazz) { + return ds + .map( + (MapFunction>) value -> new Tuple2<>(idFn.get().apply(value), value), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + } - /** - * Groups graph table by id and aggregates using supplied functions. 
- * - * @param rowDS Dataset representing graph table - * @param rowIdFn Function used to get the id of graph table row - * @param mergeAndGetFn Function used to merge graph table rows - * @param zeroFn Function to create a zero/empty instance of graph table row - * @param isNotZeroFn Function to check if graph table row is not zero/empty - * @param rowClazz Class of graph table - * @param Type of graph table row - * @return Dataset of aggregated graph table rows - */ - public static Dataset groupGraphTableByIdAndMerge( - Dataset rowDS, - SerializableSupplier> rowIdFn, - SerializableSupplier> mergeAndGetFn, - SerializableSupplier zeroFn, - SerializableSupplier> isNotZeroFn, - Class rowClazz) { - TypedColumn aggregator = - new TableAggregator<>(zeroFn, mergeAndGetFn, isNotZeroFn, rowClazz).toColumn(); - return rowDS - .groupByKey((MapFunction) x -> rowIdFn.get().apply(x), Encoders.STRING()) - .agg(aggregator) - .map((MapFunction, G>) Tuple2::_2, Encoders.kryo(rowClazz)); - } + /** + * Groups graph table by id and aggregates using supplied functions. + * + * @param rowDS Dataset representing graph table + * @param rowIdFn Function used to get the id of graph table row + * @param mergeAndGetFn Function used to merge graph table rows + * @param zeroFn Function to create a zero/empty instance of graph table row + * @param isNotZeroFn Function to check if graph table row is not zero/empty + * @param rowClazz Class of graph table + * @param Type of graph table row + * @return Dataset of aggregated graph table rows + */ + public static Dataset groupGraphTableByIdAndMerge( + Dataset rowDS, + SerializableSupplier> rowIdFn, + SerializableSupplier> mergeAndGetFn, + SerializableSupplier zeroFn, + SerializableSupplier> isNotZeroFn, + Class rowClazz) { + TypedColumn aggregator = new TableAggregator<>(zeroFn, mergeAndGetFn, isNotZeroFn, rowClazz).toColumn(); + return rowDS + .groupByKey((MapFunction) x -> rowIdFn.get().apply(x), Encoders.STRING()) + .agg(aggregator) + .map((MapFunction, G>) Tuple2::_2, Encoders.kryo(rowClazz)); + } - /** - * Aggregator to be used for aggregating graph table rows during grouping. - * - * @param Type of graph table row - */ - public static class TableAggregator extends Aggregator { - private SerializableSupplier zeroFn; - private SerializableSupplier> mergeAndGetFn; - private SerializableSupplier> isNotZeroFn; - private Class rowClazz; + /** + * Aggregator to be used for aggregating graph table rows during grouping. 
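// Illustrative sketch, not part of the patch: the grouping step that follows the join, backed
// by the TableAggregator defined below. The zero element is an empty Publication and a row
// counts as non-zero when it has an id; joined, idFn and mergeFn are assumed from the previous sketch.
final SerializableSupplier<Publication> zeroFn = Publication::new;
final SerializableSupplier<Function<Publication, Boolean>> isNotZeroFn = () -> p -> Objects.nonNull(p.getId());
final Dataset<Publication> promoted = PromoteActionPayloadFunctions
	.groupGraphTableByIdAndMerge(joined, idFn, mergeFn, zeroFn, isNotZeroFn, Publication.class);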
+ * + * @param Type of graph table row + */ + public static class TableAggregator extends Aggregator { + private SerializableSupplier zeroFn; + private SerializableSupplier> mergeAndGetFn; + private SerializableSupplier> isNotZeroFn; + private Class rowClazz; - public TableAggregator( - SerializableSupplier zeroFn, - SerializableSupplier> mergeAndGetFn, - SerializableSupplier> isNotZeroFn, - Class rowClazz) { - this.zeroFn = zeroFn; - this.mergeAndGetFn = mergeAndGetFn; - this.isNotZeroFn = isNotZeroFn; - this.rowClazz = rowClazz; - } + public TableAggregator( + SerializableSupplier zeroFn, + SerializableSupplier> mergeAndGetFn, + SerializableSupplier> isNotZeroFn, + Class rowClazz) { + this.zeroFn = zeroFn; + this.mergeAndGetFn = mergeAndGetFn; + this.isNotZeroFn = isNotZeroFn; + this.rowClazz = rowClazz; + } - @Override - public G zero() { - return zeroFn.get(); - } + @Override + public G zero() { + return zeroFn.get(); + } - @Override - public G reduce(G b, G a) { - return zeroSafeMergeAndGet(b, a); - } + @Override + public G reduce(G b, G a) { + return zeroSafeMergeAndGet(b, a); + } - @Override - public G merge(G b1, G b2) { - return zeroSafeMergeAndGet(b1, b2); - } + @Override + public G merge(G b1, G b2) { + return zeroSafeMergeAndGet(b1, b2); + } - private G zeroSafeMergeAndGet(G left, G right) { - Function isNotZero = isNotZeroFn.get(); - if (isNotZero.apply(left) && isNotZero.apply(right)) { - return mergeAndGetFn.get().apply(left, right); - } else if (isNotZero.apply(left) && !isNotZero.apply(right)) { - return left; - } else if (!isNotZero.apply(left) && isNotZero.apply(right)) { - return right; - } - throw new RuntimeException("internal aggregation error: left and right objects are zero"); - } + private G zeroSafeMergeAndGet(G left, G right) { + Function isNotZero = isNotZeroFn.get(); + if (isNotZero.apply(left) && isNotZero.apply(right)) { + return mergeAndGetFn.get().apply(left, right); + } else if (isNotZero.apply(left) && !isNotZero.apply(right)) { + return left; + } else if (!isNotZero.apply(left) && isNotZero.apply(right)) { + return right; + } + throw new RuntimeException("internal aggregation error: left and right objects are zero"); + } - @Override - public G finish(G reduction) { - return reduction; - } + @Override + public G finish(G reduction) { + return reduction; + } - @Override - public Encoder bufferEncoder() { - return Encoders.kryo(rowClazz); - } + @Override + public Encoder bufferEncoder() { + return Encoders.kryo(rowClazz); + } - @Override - public Encoder outputEncoder() { - return Encoders.kryo(rowClazz); - } - } + @Override + public Encoder outputEncoder() { + return Encoders.kryo(rowClazz); + } + } } diff --git a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJobTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJobTest.java index 3d36cef69..f51c697f4 100644 --- a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJobTest.java +++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJobTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.actionmanager.partition; import static eu.dnetlib.dhp.common.ThrowingSupport.rethrowAsRuntimeException; @@ -5,16 +6,13 @@ import static org.apache.spark.sql.functions.*; import static org.junit.jupiter.api.Assertions.assertIterableEquals; import 
static scala.collection.JavaConversions.mutableSeqAsJavaList; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.ISClient; -import eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest; -import eu.dnetlib.dhp.schema.oaf.*; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; import java.util.stream.Collectors; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; @@ -32,197 +30,212 @@ import org.junit.jupiter.api.io.TempDir; import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.actionmanager.ISClient; +import eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest; +import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; import scala.collection.mutable.Seq; @ExtendWith(MockitoExtension.class) public class PartitionActionSetsByPayloadTypeJobTest { - private static final ClassLoader cl = - PartitionActionSetsByPayloadTypeJobTest.class.getClassLoader(); + private static final ClassLoader cl = PartitionActionSetsByPayloadTypeJobTest.class.getClassLoader(); - private static Configuration configuration; - private static SparkSession spark; + private static Configuration configuration; + private static SparkSession spark; - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final StructType ATOMIC_ACTION_SCHEMA = - StructType$.MODULE$.apply( - Arrays.asList( - StructField$.MODULE$.apply("clazz", DataTypes.StringType, false, Metadata.empty()), - StructField$.MODULE$.apply( - "payload", DataTypes.StringType, false, Metadata.empty()))); + private static final StructType ATOMIC_ACTION_SCHEMA = StructType$.MODULE$ + .apply( + Arrays + .asList( + StructField$.MODULE$.apply("clazz", DataTypes.StringType, false, Metadata.empty()), + StructField$.MODULE$ + .apply( + "payload", DataTypes.StringType, false, Metadata.empty()))); - @BeforeAll - public static void beforeAll() throws IOException { - configuration = Job.getInstance().getConfiguration(); - SparkConf conf = new SparkConf(); - conf.setAppName(PromoteActionPayloadForGraphTableJobTest.class.getSimpleName()); - conf.setMaster("local"); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - spark = SparkSession.builder().config(conf).getOrCreate(); - } + @BeforeAll + public static void beforeAll() throws IOException { + configuration = Job.getInstance().getConfiguration(); + SparkConf conf = new SparkConf(); + conf.setAppName(PromoteActionPayloadForGraphTableJobTest.class.getSimpleName()); + conf.setMaster("local"); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + spark = SparkSession.builder().config(conf).getOrCreate(); + } - @AfterAll - public static void afterAll() { - spark.stop(); - } + @AfterAll + public static void afterAll() { + spark.stop(); + } - @DisplayName("Job") - @Nested - class Main { + @DisplayName("Job") + @Nested + class Main { - @Mock private ISClient isClient; + @Mock + private ISClient isClient; - @Test - public void shouldPartitionActionSetsByPayloadType(@TempDir Path workingDir) throws Exception { - // given - Path inputActionSetsBaseDir = workingDir.resolve("input").resolve("action_sets"); - Path outputDir = 
workingDir.resolve("output"); + @Test + public void shouldPartitionActionSetsByPayloadType(@TempDir Path workingDir) throws Exception { + // given + Path inputActionSetsBaseDir = workingDir.resolve("input").resolve("action_sets"); + Path outputDir = workingDir.resolve("output"); - Map> oafsByClassName = createActionSets(inputActionSetsBaseDir); + Map> oafsByClassName = createActionSets(inputActionSetsBaseDir); - List inputActionSetsPaths = resolveInputActionSetPaths(inputActionSetsBaseDir); + List inputActionSetsPaths = resolveInputActionSetPaths(inputActionSetsBaseDir); - // when - Mockito.when(isClient.getLatestRawsetPaths(Mockito.anyString())) - .thenReturn(inputActionSetsPaths); + // when + Mockito + .when(isClient.getLatestRawsetPaths(Mockito.anyString())) + .thenReturn(inputActionSetsPaths); - PartitionActionSetsByPayloadTypeJob job = new PartitionActionSetsByPayloadTypeJob(); - job.setIsClient(isClient); - job.run( - Boolean.FALSE, - "", // it can be empty we're mocking the response from isClient - // to - // resolve the - // paths - outputDir.toString()); + PartitionActionSetsByPayloadTypeJob job = new PartitionActionSetsByPayloadTypeJob(); + job.setIsClient(isClient); + job + .run( + Boolean.FALSE, + "", // it can be empty we're mocking the response from isClient + // to + // resolve the + // paths + outputDir.toString()); - // then - Files.exists(outputDir); + // then + Files.exists(outputDir); - assertForOafType(outputDir, oafsByClassName, eu.dnetlib.dhp.schema.oaf.Dataset.class); - assertForOafType(outputDir, oafsByClassName, Datasource.class); - assertForOafType(outputDir, oafsByClassName, Organization.class); - assertForOafType(outputDir, oafsByClassName, OtherResearchProduct.class); - assertForOafType(outputDir, oafsByClassName, Project.class); - assertForOafType(outputDir, oafsByClassName, Publication.class); - assertForOafType(outputDir, oafsByClassName, Result.class); - assertForOafType(outputDir, oafsByClassName, Relation.class); - assertForOafType(outputDir, oafsByClassName, Software.class); - } - } + assertForOafType(outputDir, oafsByClassName, eu.dnetlib.dhp.schema.oaf.Dataset.class); + assertForOafType(outputDir, oafsByClassName, Datasource.class); + assertForOafType(outputDir, oafsByClassName, Organization.class); + assertForOafType(outputDir, oafsByClassName, OtherResearchProduct.class); + assertForOafType(outputDir, oafsByClassName, Project.class); + assertForOafType(outputDir, oafsByClassName, Publication.class); + assertForOafType(outputDir, oafsByClassName, Result.class); + assertForOafType(outputDir, oafsByClassName, Relation.class); + assertForOafType(outputDir, oafsByClassName, Software.class); + } + } - private List resolveInputActionSetPaths(Path inputActionSetsBaseDir) throws IOException { - Path inputActionSetJsonDumpsDir = getInputActionSetJsonDumpsDir(); - return Files.list(inputActionSetJsonDumpsDir) - .map( - path -> { - String inputActionSetId = path.getFileName().toString(); - return inputActionSetsBaseDir.resolve(inputActionSetId).toString(); - }) - .collect(Collectors.toCollection(ArrayList::new)); - } + private List resolveInputActionSetPaths(Path inputActionSetsBaseDir) throws IOException { + Path inputActionSetJsonDumpsDir = getInputActionSetJsonDumpsDir(); + return Files + .list(inputActionSetJsonDumpsDir) + .map( + path -> { + String inputActionSetId = path.getFileName().toString(); + return inputActionSetsBaseDir.resolve(inputActionSetId).toString(); + }) + .collect(Collectors.toCollection(ArrayList::new)); + } - private static Map> 
createActionSets(Path inputActionSetsDir) - throws IOException { - Path inputActionSetJsonDumpsDir = getInputActionSetJsonDumpsDir(); + private static Map> createActionSets(Path inputActionSetsDir) + throws IOException { + Path inputActionSetJsonDumpsDir = getInputActionSetJsonDumpsDir(); - Map> oafsByType = new HashMap<>(); - Files.list(inputActionSetJsonDumpsDir) - .forEach( - inputActionSetJsonDumpFile -> { - String inputActionSetId = inputActionSetJsonDumpFile.getFileName().toString(); - Path inputActionSetDir = inputActionSetsDir.resolve(inputActionSetId); + Map> oafsByType = new HashMap<>(); + Files + .list(inputActionSetJsonDumpsDir) + .forEach( + inputActionSetJsonDumpFile -> { + String inputActionSetId = inputActionSetJsonDumpFile.getFileName().toString(); + Path inputActionSetDir = inputActionSetsDir.resolve(inputActionSetId); - Dataset actionDS = - readActionsFromJsonDump(inputActionSetJsonDumpFile.toString()).cache(); + Dataset actionDS = readActionsFromJsonDump(inputActionSetJsonDumpFile.toString()).cache(); - writeActionsAsJobInput(actionDS, inputActionSetId, inputActionSetDir.toString()); + writeActionsAsJobInput(actionDS, inputActionSetId, inputActionSetDir.toString()); - Map> actionSetOafsByType = - actionDS - .withColumn("atomic_action", from_json(col("value"), ATOMIC_ACTION_SCHEMA)) - .select(expr("atomic_action.*")).groupBy(col("clazz")) - .agg(collect_list(col("payload")).as("payload_list")).collectAsList().stream() - .map( - row -> - new AbstractMap.SimpleEntry<>( - row.getAs("clazz"), - mutableSeqAsJavaList(row.>getAs("payload_list")))) - .collect( - Collectors.toMap( - AbstractMap.SimpleEntry::getKey, AbstractMap.SimpleEntry::getValue)); + Map> actionSetOafsByType = actionDS + .withColumn("atomic_action", from_json(col("value"), ATOMIC_ACTION_SCHEMA)) + .select(expr("atomic_action.*")) + .groupBy(col("clazz")) + .agg(collect_list(col("payload")).as("payload_list")) + .collectAsList() + .stream() + .map( + row -> new AbstractMap.SimpleEntry<>( + row. 
getAs("clazz"), + mutableSeqAsJavaList(row.> getAs("payload_list")))) + .collect( + Collectors + .toMap( + AbstractMap.SimpleEntry::getKey, AbstractMap.SimpleEntry::getValue)); - actionSetOafsByType - .keySet() - .forEach( - x -> { - if (oafsByType.containsKey(x)) { - List collected = new ArrayList<>(); - collected.addAll(oafsByType.get(x)); - collected.addAll(actionSetOafsByType.get(x)); - oafsByType.put(x, collected); - } else { - oafsByType.put(x, actionSetOafsByType.get(x)); - } - }); - }); + actionSetOafsByType + .keySet() + .forEach( + x -> { + if (oafsByType.containsKey(x)) { + List collected = new ArrayList<>(); + collected.addAll(oafsByType.get(x)); + collected.addAll(actionSetOafsByType.get(x)); + oafsByType.put(x, collected); + } else { + oafsByType.put(x, actionSetOafsByType.get(x)); + } + }); + }); - return oafsByType; - } + return oafsByType; + } - private static Path getInputActionSetJsonDumpsDir() { - return Paths.get( - Objects.requireNonNull(cl.getResource("eu/dnetlib/dhp/actionmanager/partition/input/")) - .getFile()); - } + private static Path getInputActionSetJsonDumpsDir() { + return Paths + .get( + Objects + .requireNonNull(cl.getResource("eu/dnetlib/dhp/actionmanager/partition/input/")) + .getFile()); + } - private static Dataset readActionsFromJsonDump(String path) { - return spark.read().textFile(path); - } + private static Dataset readActionsFromJsonDump(String path) { + return spark.read().textFile(path); + } - private static void writeActionsAsJobInput( - Dataset actionDS, String inputActionSetId, String path) { - actionDS - .javaRDD() - .mapToPair(json -> new Tuple2<>(new Text(inputActionSetId), new Text(json))) - .saveAsNewAPIHadoopFile( - path, Text.class, Text.class, SequenceFileOutputFormat.class, configuration); - } + private static void writeActionsAsJobInput( + Dataset actionDS, String inputActionSetId, String path) { + actionDS + .javaRDD() + .mapToPair(json -> new Tuple2<>(new Text(inputActionSetId), new Text(json))) + .saveAsNewAPIHadoopFile( + path, Text.class, Text.class, SequenceFileOutputFormat.class, configuration); + } - private static void assertForOafType( - Path outputDir, Map> oafsByClassName, Class clazz) { - Path outputDatasetDir = outputDir.resolve(String.format("clazz=%s", clazz.getCanonicalName())); - Files.exists(outputDatasetDir); + private static void assertForOafType( + Path outputDir, Map> oafsByClassName, Class clazz) { + Path outputDatasetDir = outputDir.resolve(String.format("clazz=%s", clazz.getCanonicalName())); + Files.exists(outputDatasetDir); - List actuals = - readActionPayloadFromJobOutput(outputDatasetDir.toString(), clazz).collectAsList(); - actuals.sort(Comparator.comparingInt(Object::hashCode)); + List actuals = readActionPayloadFromJobOutput(outputDatasetDir.toString(), clazz).collectAsList(); + actuals.sort(Comparator.comparingInt(Object::hashCode)); - List expecteds = - oafsByClassName.get(clazz.getCanonicalName()).stream() - .map(json -> mapToOaf(json, clazz)) - .sorted(Comparator.comparingInt(Object::hashCode)) - .collect(Collectors.toList()); + List expecteds = oafsByClassName + .get(clazz.getCanonicalName()) + .stream() + .map(json -> mapToOaf(json, clazz)) + .sorted(Comparator.comparingInt(Object::hashCode)) + .collect(Collectors.toList()); - assertIterableEquals(expecteds, actuals); - } + assertIterableEquals(expecteds, actuals); + } - private static Dataset readActionPayloadFromJobOutput( - String path, Class clazz) { - return spark - .read() - .parquet(path) - .map( - (MapFunction) - value -> 
OBJECT_MAPPER.readValue(value.getAs("payload"), clazz), - Encoders.bean(clazz)); - } + private static Dataset readActionPayloadFromJobOutput( + String path, Class clazz) { + return spark + .read() + .parquet(path) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value. getAs("payload"), clazz), + Encoders.bean(clazz)); + } - private static T mapToOaf(String json, Class clazz) { - return rethrowAsRuntimeException( - () -> OBJECT_MAPPER.readValue(json, clazz), - String.format( - "failed to map json to class: json=%s, class=%s", json, clazz.getCanonicalName())); - } + private static T mapToOaf(String json, Class clazz) { + return rethrowAsRuntimeException( + () -> OBJECT_MAPPER.readValue(json, clazz), + String + .format( + "failed to map json to class: json=%s, class=%s", json, clazz.getCanonicalName())); + } } diff --git a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGetTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGetTest.java index 0de6f6b4f..b2248d77a 100644 --- a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGetTest.java +++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGetTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.actionmanager.promote; import static eu.dnetlib.dhp.actionmanager.promote.MergeAndGet.Strategy; @@ -5,254 +6,252 @@ import static eu.dnetlib.dhp.actionmanager.promote.MergeAndGet.functionFor; import static org.junit.jupiter.api.Assertions.*; import static org.mockito.Mockito.*; -import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.function.BiFunction; + import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; +import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; +import eu.dnetlib.dhp.schema.oaf.*; + public class MergeAndGetTest { - @Nested - class MergeFromAndGetStrategy { + @Nested + class MergeFromAndGetStrategy { - @Test - public void shouldThrowForOafAndOaf() { - // given - Oaf a = mock(Oaf.class); - Oaf b = mock(Oaf.class); + @Test + public void shouldThrowForOafAndOaf() { + // given + Oaf a = mock(Oaf.class); + Oaf b = mock(Oaf.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldThrowForOafAndRelation() { - // given - Oaf a = mock(Oaf.class); - Relation b = mock(Relation.class); + @Test + public void shouldThrowForOafAndRelation() { + // given + Oaf a = mock(Oaf.class); + Relation b = mock(Relation.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldThrowForOafAndOafEntity() { - // given - Oaf a = mock(Oaf.class); - OafEntity b = mock(OafEntity.class); + @Test + public void shouldThrowForOafAndOafEntity() { + // given + Oaf a = mock(Oaf.class); + OafEntity b = mock(OafEntity.class); - // when - SerializableSupplier> fn = 
functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldThrowForRelationAndOaf() { - // given - Relation a = mock(Relation.class); - Oaf b = mock(Oaf.class); + @Test + public void shouldThrowForRelationAndOaf() { + // given + Relation a = mock(Relation.class); + Oaf b = mock(Oaf.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldThrowForRelationAndOafEntity() { - // given - Relation a = mock(Relation.class); - OafEntity b = mock(OafEntity.class); + @Test + public void shouldThrowForRelationAndOafEntity() { + // given + Relation a = mock(Relation.class); + OafEntity b = mock(OafEntity.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldBehaveProperlyForRelationAndRelation() { - // given - Relation a = mock(Relation.class); - Relation b = mock(Relation.class); + @Test + public void shouldBehaveProperlyForRelationAndRelation() { + // given + Relation a = mock(Relation.class); + Relation b = mock(Relation.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - Oaf x = fn.get().apply(a, b); - assertTrue(Relation.class.isAssignableFrom(x.getClass())); - verify(a).mergeFrom(b); - assertEquals(a, x); - } + // then + Oaf x = fn.get().apply(a, b); + assertTrue(Relation.class.isAssignableFrom(x.getClass())); + verify(a).mergeFrom(b); + assertEquals(a, x); + } - @Test - public void shouldThrowForOafEntityAndOaf() { - // given - OafEntity a = mock(OafEntity.class); - Oaf b = mock(Oaf.class); + @Test + public void shouldThrowForOafEntityAndOaf() { + // given + OafEntity a = mock(OafEntity.class); + Oaf b = mock(Oaf.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldThrowForOafEntityAndRelation() { - // given - OafEntity a = mock(OafEntity.class); - Relation b = mock(Relation.class); + @Test + public void shouldThrowForOafEntityAndRelation() { + // given + OafEntity a = mock(OafEntity.class); + Relation b = mock(Relation.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void 
shouldThrowForOafEntityAndOafEntityButNotSubclasses() { - // given - class OafEntitySub1 extends OafEntity {} - class OafEntitySub2 extends OafEntity {} + @Test + public void shouldThrowForOafEntityAndOafEntityButNotSubclasses() { + // given + class OafEntitySub1 extends OafEntity { + } + class OafEntitySub2 extends OafEntity { + } - OafEntitySub1 a = mock(OafEntitySub1.class); - OafEntitySub2 b = mock(OafEntitySub2.class); + OafEntitySub1 a = mock(OafEntitySub1.class); + OafEntitySub2 b = mock(OafEntitySub2.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldBehaveProperlyForOafEntityAndOafEntity() { - // given - OafEntity a = mock(OafEntity.class); - OafEntity b = mock(OafEntity.class); + @Test + public void shouldBehaveProperlyForOafEntityAndOafEntity() { + // given + OafEntity a = mock(OafEntity.class); + OafEntity b = mock(OafEntity.class); - // when - SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.MERGE_FROM_AND_GET); - // then - Oaf x = fn.get().apply(a, b); - assertTrue(OafEntity.class.isAssignableFrom(x.getClass())); - verify(a).mergeFrom(b); - assertEquals(a, x); - } - } + // then + Oaf x = fn.get().apply(a, b); + assertTrue(OafEntity.class.isAssignableFrom(x.getClass())); + verify(a).mergeFrom(b); + assertEquals(a, x); + } + } - @Nested - class SelectNewerAndGetStrategy { + @Nested + class SelectNewerAndGetStrategy { - @Test - public void shouldThrowForOafEntityAndRelation() { - // given - OafEntity a = mock(OafEntity.class); - Relation b = mock(Relation.class); + @Test + public void shouldThrowForOafEntityAndRelation() { + // given + OafEntity a = mock(OafEntity.class); + Relation b = mock(Relation.class); - // when - SerializableSupplier> fn = - functionFor(Strategy.SELECT_NEWER_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.SELECT_NEWER_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldThrowForRelationAndOafEntity() { - // given - Relation a = mock(Relation.class); - OafEntity b = mock(OafEntity.class); + @Test + public void shouldThrowForRelationAndOafEntity() { + // given + Relation a = mock(Relation.class); + OafEntity b = mock(OafEntity.class); - // when - SerializableSupplier> fn = - functionFor(Strategy.SELECT_NEWER_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.SELECT_NEWER_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldThrowForOafEntityAndResult() { - // given - OafEntity a = mock(OafEntity.class); - Result b = mock(Result.class); + @Test + public void shouldThrowForOafEntityAndResult() { + // given + OafEntity a = mock(OafEntity.class); + Result b = mock(Result.class); - // when - SerializableSupplier> fn = - functionFor(Strategy.SELECT_NEWER_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.SELECT_NEWER_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + 
assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldThrowWhenSuperTypeIsNewerForResultAndOafEntity() { - // given - // real types must be used because subclass-superclass resolution does not work for - // mocks - Dataset a = new Dataset(); - a.setLastupdatetimestamp(1L); - Result b = new Result(); - b.setLastupdatetimestamp(2L); + @Test + public void shouldThrowWhenSuperTypeIsNewerForResultAndOafEntity() { + // given + // real types must be used because subclass-superclass resolution does not work for + // mocks + Dataset a = new Dataset(); + a.setLastupdatetimestamp(1L); + Result b = new Result(); + b.setLastupdatetimestamp(2L); - // when - SerializableSupplier> fn = - functionFor(Strategy.SELECT_NEWER_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.SELECT_NEWER_AND_GET); - // then - assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); - } + // then + assertThrows(RuntimeException.class, () -> fn.get().apply(a, b)); + } - @Test - public void shouldShouldReturnLeftForOafEntityAndOafEntity() { - // given - OafEntity a = mock(OafEntity.class); - when(a.getLastupdatetimestamp()).thenReturn(1L); - OafEntity b = mock(OafEntity.class); - when(b.getLastupdatetimestamp()).thenReturn(2L); + @Test + public void shouldShouldReturnLeftForOafEntityAndOafEntity() { + // given + OafEntity a = mock(OafEntity.class); + when(a.getLastupdatetimestamp()).thenReturn(1L); + OafEntity b = mock(OafEntity.class); + when(b.getLastupdatetimestamp()).thenReturn(2L); - // when - SerializableSupplier> fn = - functionFor(Strategy.SELECT_NEWER_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.SELECT_NEWER_AND_GET); - // then - Oaf x = fn.get().apply(a, b); - assertTrue(OafEntity.class.isAssignableFrom(x.getClass())); - assertEquals(b, x); - } + // then + Oaf x = fn.get().apply(a, b); + assertTrue(OafEntity.class.isAssignableFrom(x.getClass())); + assertEquals(b, x); + } - @Test - public void shouldShouldReturnRightForOafEntityAndOafEntity() { - // given - OafEntity a = mock(OafEntity.class); - when(a.getLastupdatetimestamp()).thenReturn(2L); - OafEntity b = mock(OafEntity.class); - when(b.getLastupdatetimestamp()).thenReturn(1L); + @Test + public void shouldShouldReturnRightForOafEntityAndOafEntity() { + // given + OafEntity a = mock(OafEntity.class); + when(a.getLastupdatetimestamp()).thenReturn(2L); + OafEntity b = mock(OafEntity.class); + when(b.getLastupdatetimestamp()).thenReturn(1L); - // when - SerializableSupplier> fn = - functionFor(Strategy.SELECT_NEWER_AND_GET); + // when + SerializableSupplier> fn = functionFor(Strategy.SELECT_NEWER_AND_GET); - // then - Oaf x = fn.get().apply(a, b); - assertTrue(OafEntity.class.isAssignableFrom(x.getClass())); - assertEquals(a, x); - } - } + // then + Oaf x = fn.get().apply(a, b); + assertTrue(OafEntity.class.isAssignableFrom(x.getClass())); + assertEquals(a, x); + } + } } diff --git a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJobTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJobTest.java index e8f802585..129daadcc 100644 --- a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJobTest.java +++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJobTest.java @@ -1,11 +1,9 @@ + package 
eu.dnetlib.dhp.actionmanager.promote; import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.params.provider.Arguments.arguments; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -15,6 +13,7 @@ import java.util.List; import java.util.Objects; import java.util.stream.Collectors; import java.util.stream.Stream; + import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -26,253 +25,256 @@ import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.Arguments; import org.junit.jupiter.params.provider.MethodSource; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; + public class PromoteActionPayloadForGraphTableJobTest { - private static final ClassLoader cl = - PromoteActionPayloadForGraphTableJobTest.class.getClassLoader(); + private static final ClassLoader cl = PromoteActionPayloadForGraphTableJobTest.class.getClassLoader(); - private static SparkSession spark; + private static SparkSession spark; - private Path workingDir; - private Path inputDir; - private Path inputGraphRootDir; - private Path inputActionPayloadRootDir; - private Path outputDir; + private Path workingDir; + private Path inputDir; + private Path inputGraphRootDir; + private Path inputActionPayloadRootDir; + private Path outputDir; - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - @BeforeAll - public static void beforeAll() { - SparkConf conf = new SparkConf(); - conf.setAppName(PromoteActionPayloadForGraphTableJobTest.class.getSimpleName()); - conf.setMaster("local"); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - spark = SparkSession.builder().config(conf).getOrCreate(); - } + @BeforeAll + public static void beforeAll() { + SparkConf conf = new SparkConf(); + conf.setAppName(PromoteActionPayloadForGraphTableJobTest.class.getSimpleName()); + conf.setMaster("local"); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + spark = SparkSession.builder().config(conf).getOrCreate(); + } - @BeforeEach - public void beforeEach() throws IOException { - workingDir = - Files.createTempDirectory(PromoteActionPayloadForGraphTableJobTest.class.getSimpleName()); - inputDir = workingDir.resolve("input"); - inputGraphRootDir = inputDir.resolve("graph"); - inputActionPayloadRootDir = inputDir.resolve("action_payload"); - outputDir = workingDir.resolve("output"); - } + @BeforeEach + public void beforeEach() throws IOException { + workingDir = Files.createTempDirectory(PromoteActionPayloadForGraphTableJobTest.class.getSimpleName()); + inputDir = workingDir.resolve("input"); + inputGraphRootDir = inputDir.resolve("graph"); + inputActionPayloadRootDir = inputDir.resolve("action_payload"); + outputDir = workingDir.resolve("output"); + } - @AfterEach - public void afterEach() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - } + @AfterEach + public void afterEach() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + } - @AfterAll 
- public static void afterAll() { - spark.stop(); - } + @AfterAll + public static void afterAll() { + spark.stop(); + } - @DisplayName("Job") - @Nested - class Main { + @DisplayName("Job") + @Nested + class Main { - @Test - public void shouldThrowWhenGraphTableClassIsNotASubClassOfActionPayloadClass() { - // given - Class rowClazz = Relation.class; - Class actionPayloadClazz = OafEntity.class; + @Test + public void shouldThrowWhenGraphTableClassIsNotASubClassOfActionPayloadClass() { + // given + Class rowClazz = Relation.class; + Class actionPayloadClazz = OafEntity.class; - // when - RuntimeException exception = - assertThrows( - RuntimeException.class, - () -> - PromoteActionPayloadForGraphTableJob.main( - new String[] { - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-inputGraphTablePath", - "", - "-graphTableClassName", - rowClazz.getCanonicalName(), - "-inputActionPayloadPath", - "", - "-actionPayloadClassName", - actionPayloadClazz.getCanonicalName(), - "-outputGraphTablePath", - "", - "-mergeAndGetStrategy", - MergeAndGet.Strategy.SELECT_NEWER_AND_GET.name() - })); + // when + RuntimeException exception = assertThrows( + RuntimeException.class, + () -> PromoteActionPayloadForGraphTableJob + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-inputGraphTablePath", + "", + "-graphTableClassName", + rowClazz.getCanonicalName(), + "-inputActionPayloadPath", + "", + "-actionPayloadClassName", + actionPayloadClazz.getCanonicalName(), + "-outputGraphTablePath", + "", + "-mergeAndGetStrategy", + MergeAndGet.Strategy.SELECT_NEWER_AND_GET.name() + })); - // then - String msg = - String.format( - "graph table class is not a subclass of action payload class: graph=%s, action=%s", - rowClazz.getCanonicalName(), actionPayloadClazz.getCanonicalName()); - assertTrue(exception.getMessage().contains(msg)); - } + // then + String msg = String + .format( + "graph table class is not a subclass of action payload class: graph=%s, action=%s", + rowClazz.getCanonicalName(), actionPayloadClazz.getCanonicalName()); + assertTrue(exception.getMessage().contains(msg)); + } - @ParameterizedTest(name = "strategy: {0}, graph table: {1}, action payload: {2}") - @MethodSource( - "eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest#promoteJobTestParams") - public void shouldPromoteActionPayloadForGraphTable( - MergeAndGet.Strategy strategy, - Class rowClazz, - Class actionPayloadClazz) - throws Exception { - // given - Path inputGraphTableDir = createGraphTable(inputGraphRootDir, rowClazz); - Path inputActionPayloadDir = - createActionPayload(inputActionPayloadRootDir, rowClazz, actionPayloadClazz); - Path outputGraphTableDir = - outputDir.resolve("graph").resolve(rowClazz.getSimpleName().toLowerCase()); + @ParameterizedTest(name = "strategy: {0}, graph table: {1}, action payload: {2}") + @MethodSource("eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest#promoteJobTestParams") + public void shouldPromoteActionPayloadForGraphTable( + MergeAndGet.Strategy strategy, + Class rowClazz, + Class actionPayloadClazz) + throws Exception { + // given + Path inputGraphTableDir = createGraphTable(inputGraphRootDir, rowClazz); + Path inputActionPayloadDir = createActionPayload(inputActionPayloadRootDir, rowClazz, actionPayloadClazz); + Path outputGraphTableDir = outputDir.resolve("graph").resolve(rowClazz.getSimpleName().toLowerCase()); - // when - PromoteActionPayloadForGraphTableJob.main( - new String[] { - "-isSparkSessionManaged", - 
Boolean.FALSE.toString(), - "-inputGraphTablePath", - inputGraphTableDir.toString(), - "-graphTableClassName", - rowClazz.getCanonicalName(), - "-inputActionPayloadPath", - inputActionPayloadDir.toString(), - "-actionPayloadClassName", - actionPayloadClazz.getCanonicalName(), - "-outputGraphTablePath", - outputGraphTableDir.toString(), - "-mergeAndGetStrategy", - strategy.name() - }); + // when + PromoteActionPayloadForGraphTableJob + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-inputGraphTablePath", + inputGraphTableDir.toString(), + "-graphTableClassName", + rowClazz.getCanonicalName(), + "-inputActionPayloadPath", + inputActionPayloadDir.toString(), + "-actionPayloadClassName", + actionPayloadClazz.getCanonicalName(), + "-outputGraphTablePath", + outputGraphTableDir.toString(), + "-mergeAndGetStrategy", + strategy.name() + }); - // then - assertTrue(Files.exists(outputGraphTableDir)); + // then + assertTrue(Files.exists(outputGraphTableDir)); - List actualOutputRows = - readGraphTableFromJobOutput(outputGraphTableDir.toString(), rowClazz).collectAsList() - .stream() - .sorted(Comparator.comparingInt(Object::hashCode)) - .collect(Collectors.toList()); - String expectedOutputGraphTableJsonDumpPath = - resultFileLocation(strategy, rowClazz, actionPayloadClazz); - Path expectedOutputGraphTableJsonDumpFile = - Paths.get( - Objects.requireNonNull(cl.getResource(expectedOutputGraphTableJsonDumpPath)) - .getFile()); - List expectedOutputRows = - readGraphTableFromJsonDump(expectedOutputGraphTableJsonDumpFile.toString(), rowClazz) - .collectAsList().stream() - .sorted(Comparator.comparingInt(Object::hashCode)) - .collect(Collectors.toList()); - assertIterableEquals(expectedOutputRows, actualOutputRows); - } - } + List actualOutputRows = readGraphTableFromJobOutput(outputGraphTableDir.toString(), rowClazz) + .collectAsList() + .stream() + .sorted(Comparator.comparingInt(Object::hashCode)) + .collect(Collectors.toList()); + String expectedOutputGraphTableJsonDumpPath = resultFileLocation(strategy, rowClazz, actionPayloadClazz); + Path expectedOutputGraphTableJsonDumpFile = Paths + .get( + Objects + .requireNonNull(cl.getResource(expectedOutputGraphTableJsonDumpPath)) + .getFile()); + List expectedOutputRows = readGraphTableFromJsonDump( + expectedOutputGraphTableJsonDumpFile.toString(), rowClazz) + .collectAsList() + .stream() + .sorted(Comparator.comparingInt(Object::hashCode)) + .collect(Collectors.toList()); + assertIterableEquals(expectedOutputRows, actualOutputRows); + } + } - public static Stream promoteJobTestParams() { - return Stream.of( - arguments( - MergeAndGet.Strategy.MERGE_FROM_AND_GET, - eu.dnetlib.dhp.schema.oaf.Dataset.class, - eu.dnetlib.dhp.schema.oaf.Dataset.class), - arguments( - MergeAndGet.Strategy.MERGE_FROM_AND_GET, - eu.dnetlib.dhp.schema.oaf.Dataset.class, - eu.dnetlib.dhp.schema.oaf.Result.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Datasource.class, Datasource.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Organization.class, Organization.class), - arguments( - MergeAndGet.Strategy.MERGE_FROM_AND_GET, - OtherResearchProduct.class, - OtherResearchProduct.class), - arguments( - MergeAndGet.Strategy.MERGE_FROM_AND_GET, OtherResearchProduct.class, Result.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Project.class, Project.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Publication.class, Publication.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, 
Publication.class, Result.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Relation.class, Relation.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Software.class, Software.class), - arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Software.class, Result.class)); - } + public static Stream promoteJobTestParams() { + return Stream + .of( + arguments( + MergeAndGet.Strategy.MERGE_FROM_AND_GET, + eu.dnetlib.dhp.schema.oaf.Dataset.class, + eu.dnetlib.dhp.schema.oaf.Dataset.class), + arguments( + MergeAndGet.Strategy.MERGE_FROM_AND_GET, + eu.dnetlib.dhp.schema.oaf.Dataset.class, + eu.dnetlib.dhp.schema.oaf.Result.class), + arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Datasource.class, Datasource.class), + arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Organization.class, Organization.class), + arguments( + MergeAndGet.Strategy.MERGE_FROM_AND_GET, + OtherResearchProduct.class, + OtherResearchProduct.class), + arguments( + MergeAndGet.Strategy.MERGE_FROM_AND_GET, OtherResearchProduct.class, Result.class), + arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Project.class, Project.class), + arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Publication.class, Publication.class), + arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Publication.class, Result.class), + arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Relation.class, Relation.class), + arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Software.class, Software.class), + arguments(MergeAndGet.Strategy.MERGE_FROM_AND_GET, Software.class, Result.class)); + } - private static Path createGraphTable(Path inputGraphRootDir, Class rowClazz) { - String inputGraphTableJsonDumpPath = inputGraphTableJsonDumpLocation(rowClazz); - Path inputGraphTableJsonDumpFile = - Paths.get(Objects.requireNonNull(cl.getResource(inputGraphTableJsonDumpPath)).getFile()); - Dataset rowDS = readGraphTableFromJsonDump(inputGraphTableJsonDumpFile.toString(), rowClazz); - String inputGraphTableName = rowClazz.getSimpleName().toLowerCase(); - Path inputGraphTableDir = inputGraphRootDir.resolve(inputGraphTableName); - writeGraphTableAaJobInput(rowDS, inputGraphTableDir.toString()); - return inputGraphTableDir; - } + private static Path createGraphTable(Path inputGraphRootDir, Class rowClazz) { + String inputGraphTableJsonDumpPath = inputGraphTableJsonDumpLocation(rowClazz); + Path inputGraphTableJsonDumpFile = Paths + .get(Objects.requireNonNull(cl.getResource(inputGraphTableJsonDumpPath)).getFile()); + Dataset rowDS = readGraphTableFromJsonDump(inputGraphTableJsonDumpFile.toString(), rowClazz); + String inputGraphTableName = rowClazz.getSimpleName().toLowerCase(); + Path inputGraphTableDir = inputGraphRootDir.resolve(inputGraphTableName); + writeGraphTableAaJobInput(rowDS, inputGraphTableDir.toString()); + return inputGraphTableDir; + } - private static String inputGraphTableJsonDumpLocation(Class rowClazz) { - return String.format( - "%s/%s.json", - "eu/dnetlib/dhp/actionmanager/promote/input/graph", rowClazz.getSimpleName().toLowerCase()); - } + private static String inputGraphTableJsonDumpLocation(Class rowClazz) { + return String + .format( + "%s/%s.json", + "eu/dnetlib/dhp/actionmanager/promote/input/graph", rowClazz.getSimpleName().toLowerCase()); + } - private static Dataset readGraphTableFromJsonDump( - String path, Class rowClazz) { - return spark - .read() - .textFile(path) - .map( - (MapFunction) json -> OBJECT_MAPPER.readValue(json, rowClazz), - Encoders.bean(rowClazz)); - } + private static Dataset 
readGraphTableFromJsonDump( + String path, Class rowClazz) { + return spark + .read() + .textFile(path) + .map( + (MapFunction) json -> OBJECT_MAPPER.readValue(json, rowClazz), + Encoders.bean(rowClazz)); + } - private static void writeGraphTableAaJobInput(Dataset rowDS, String path) { - rowDS.write().option("compression", "gzip").json(path); - } + private static void writeGraphTableAaJobInput(Dataset rowDS, String path) { + rowDS.write().option("compression", "gzip").json(path); + } - private static Path createActionPayload( - Path inputActionPayloadRootDir, Class rowClazz, Class actionPayloadClazz) { - String inputActionPayloadJsonDumpPath = - inputActionPayloadJsonDumpLocation(rowClazz, actionPayloadClazz); - Path inputActionPayloadJsonDumpFile = - Paths.get(Objects.requireNonNull(cl.getResource(inputActionPayloadJsonDumpPath)).getFile()); - Dataset actionPayloadDS = - readActionPayloadFromJsonDump(inputActionPayloadJsonDumpFile.toString()); - Path inputActionPayloadDir = - inputActionPayloadRootDir.resolve(actionPayloadClazz.getSimpleName().toLowerCase()); - writeActionPayloadAsJobInput(actionPayloadDS, inputActionPayloadDir.toString()); - return inputActionPayloadDir; - } + private static Path createActionPayload( + Path inputActionPayloadRootDir, Class rowClazz, Class actionPayloadClazz) { + String inputActionPayloadJsonDumpPath = inputActionPayloadJsonDumpLocation(rowClazz, actionPayloadClazz); + Path inputActionPayloadJsonDumpFile = Paths + .get(Objects.requireNonNull(cl.getResource(inputActionPayloadJsonDumpPath)).getFile()); + Dataset actionPayloadDS = readActionPayloadFromJsonDump(inputActionPayloadJsonDumpFile.toString()); + Path inputActionPayloadDir = inputActionPayloadRootDir + .resolve(actionPayloadClazz.getSimpleName().toLowerCase()); + writeActionPayloadAsJobInput(actionPayloadDS, inputActionPayloadDir.toString()); + return inputActionPayloadDir; + } - private static String inputActionPayloadJsonDumpLocation( - Class rowClazz, Class actionPayloadClazz) { + private static String inputActionPayloadJsonDumpLocation( + Class rowClazz, Class actionPayloadClazz) { - return String.format( - "eu/dnetlib/dhp/actionmanager/promote/input/action_payload/%s_table/%s.json", - rowClazz.getSimpleName().toLowerCase(), actionPayloadClazz.getSimpleName().toLowerCase()); - } + return String + .format( + "eu/dnetlib/dhp/actionmanager/promote/input/action_payload/%s_table/%s.json", + rowClazz.getSimpleName().toLowerCase(), actionPayloadClazz.getSimpleName().toLowerCase()); + } - private static Dataset readActionPayloadFromJsonDump(String path) { - return spark.read().textFile(path); - } + private static Dataset readActionPayloadFromJsonDump(String path) { + return spark.read().textFile(path); + } - private static void writeActionPayloadAsJobInput(Dataset actionPayloadDS, String path) { - actionPayloadDS.withColumnRenamed("value", "payload").write().parquet(path); - } + private static void writeActionPayloadAsJobInput(Dataset actionPayloadDS, String path) { + actionPayloadDS.withColumnRenamed("value", "payload").write().parquet(path); + } - private static Dataset readGraphTableFromJobOutput( - String path, Class rowClazz) { - return spark - .read() - .textFile(path) - .map( - (MapFunction) json -> OBJECT_MAPPER.readValue(json, rowClazz), - Encoders.bean(rowClazz)); - } + private static Dataset readGraphTableFromJobOutput( + String path, Class rowClazz) { + return spark + .read() + .textFile(path) + .map( + (MapFunction) json -> OBJECT_MAPPER.readValue(json, rowClazz), + 
Encoders.bean(rowClazz)); + } - private static String resultFileLocation( - MergeAndGet.Strategy strategy, - Class rowClazz, - Class actionPayloadClazz) { - return String.format( - "eu/dnetlib/dhp/actionmanager/promote/output/graph/%s/%s/%s_action_payload/result.json", - strategy.name().toLowerCase(), - rowClazz.getSimpleName().toLowerCase(), - actionPayloadClazz.getSimpleName().toLowerCase()); - } + private static String resultFileLocation( + MergeAndGet.Strategy strategy, + Class rowClazz, + Class actionPayloadClazz) { + return String + .format( + "eu/dnetlib/dhp/actionmanager/promote/output/graph/%s/%s/%s_action_payload/result.json", + strategy.name().toLowerCase(), + rowClazz.getSimpleName().toLowerCase(), + actionPayloadClazz.getSimpleName().toLowerCase()); + } } diff --git a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java index 9abb0858f..477e4b204 100644 --- a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java +++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java @@ -1,15 +1,15 @@ + package eu.dnetlib.dhp.actionmanager.promote; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertThrows; -import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; -import eu.dnetlib.dhp.schema.oaf.Oaf; import java.util.Arrays; import java.util.List; import java.util.Objects; import java.util.function.BiFunction; import java.util.function.Function; + import org.apache.spark.SparkConf; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -19,314 +19,311 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Nested; import org.junit.jupiter.api.Test; +import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier; +import eu.dnetlib.dhp.schema.oaf.Oaf; + public class PromoteActionPayloadFunctionsTest { - private static SparkSession spark; + private static SparkSession spark; - @BeforeAll - public static void beforeAll() { - SparkConf conf = new SparkConf(); - conf.setMaster("local"); - conf.setAppName(PromoteActionPayloadFunctionsTest.class.getSimpleName()); - conf.set("spark.driver.host", "localhost"); - spark = SparkSession.builder().config(conf).getOrCreate(); - } + @BeforeAll + public static void beforeAll() { + SparkConf conf = new SparkConf(); + conf.setMaster("local"); + conf.setAppName(PromoteActionPayloadFunctionsTest.class.getSimpleName()); + conf.set("spark.driver.host", "localhost"); + spark = SparkSession.builder().config(conf).getOrCreate(); + } - @AfterAll - public static void afterAll() { - spark.stop(); - } + @AfterAll + public static void afterAll() { + spark.stop(); + } - @Nested - class JoinTableWithActionPayloadAndMerge { + @Nested + class JoinTableWithActionPayloadAndMerge { - @Test - public void shouldThrowWhenTableTypeIsNotSubtypeOfActionPayloadType() { - // given - class OafImpl extends Oaf {} + @Test + public void shouldThrowWhenTableTypeIsNotSubtypeOfActionPayloadType() { + // given + class OafImpl extends Oaf { + } - // when - assertThrows( - RuntimeException.class, - () -> - PromoteActionPayloadFunctions.joinGraphTableWithActionPayloadAndMerge( - null, null, null, null, null, OafImplSubSub.class, 
OafImpl.class)); - } + // when + assertThrows( + RuntimeException.class, + () -> PromoteActionPayloadFunctions + .joinGraphTableWithActionPayloadAndMerge( + null, null, null, null, null, OafImplSubSub.class, OafImpl.class)); + } - @Test - public void shouldRunProperlyWhenActionPayloadTypeAndTableTypeAreTheSame() { - // given - String id0 = "id0"; - String id1 = "id1"; - String id2 = "id2"; - String id3 = "id3"; - String id4 = "id4"; - List rowData = - Arrays.asList( - createOafImplSubSub(id0), - createOafImplSubSub(id1), - createOafImplSubSub(id2), - createOafImplSubSub(id3)); - Dataset rowDS = - spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); + @Test + public void shouldRunProperlyWhenActionPayloadTypeAndTableTypeAreTheSame() { + // given + String id0 = "id0"; + String id1 = "id1"; + String id2 = "id2"; + String id3 = "id3"; + String id4 = "id4"; + List rowData = Arrays + .asList( + createOafImplSubSub(id0), + createOafImplSubSub(id1), + createOafImplSubSub(id2), + createOafImplSubSub(id3)); + Dataset rowDS = spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); - List actionPayloadData = - Arrays.asList( - createOafImplSubSub(id1), - createOafImplSubSub(id2), - createOafImplSubSub(id2), - createOafImplSubSub(id3), - createOafImplSubSub(id3), - createOafImplSubSub(id3), - createOafImplSubSub(id4), - createOafImplSubSub(id4), - createOafImplSubSub(id4), - createOafImplSubSub(id4)); - Dataset actionPayloadDS = - spark.createDataset(actionPayloadData, Encoders.bean(OafImplSubSub.class)); + List actionPayloadData = Arrays + .asList( + createOafImplSubSub(id1), + createOafImplSubSub(id2), + createOafImplSubSub(id2), + createOafImplSubSub(id3), + createOafImplSubSub(id3), + createOafImplSubSub(id3), + createOafImplSubSub(id4), + createOafImplSubSub(id4), + createOafImplSubSub(id4), + createOafImplSubSub(id4)); + Dataset actionPayloadDS = spark + .createDataset(actionPayloadData, Encoders.bean(OafImplSubSub.class)); - SerializableSupplier> rowIdFn = () -> OafImplRoot::getId; - SerializableSupplier> actionPayloadIdFn = - () -> OafImplRoot::getId; - SerializableSupplier> mergeAndGetFn = - () -> - (x, y) -> { - x.merge(y); - return x; - }; + SerializableSupplier> rowIdFn = () -> OafImplRoot::getId; + SerializableSupplier> actionPayloadIdFn = () -> OafImplRoot::getId; + SerializableSupplier> mergeAndGetFn = () -> (x, + y) -> { + x.merge(y); + return x; + }; - // when - List results = - PromoteActionPayloadFunctions.joinGraphTableWithActionPayloadAndMerge( - rowDS, - actionPayloadDS, - rowIdFn, - actionPayloadIdFn, - mergeAndGetFn, - OafImplSubSub.class, - OafImplSubSub.class) - .collectAsList(); + // when + List results = PromoteActionPayloadFunctions + .joinGraphTableWithActionPayloadAndMerge( + rowDS, + actionPayloadDS, + rowIdFn, + actionPayloadIdFn, + mergeAndGetFn, + OafImplSubSub.class, + OafImplSubSub.class) + .collectAsList(); - // then - assertEquals(11, results.size()); - assertEquals(1, results.stream().filter(x -> x.getId().equals(id0)).count()); - assertEquals(1, results.stream().filter(x -> x.getId().equals(id1)).count()); - assertEquals(2, results.stream().filter(x -> x.getId().equals(id2)).count()); - assertEquals(3, results.stream().filter(x -> x.getId().equals(id3)).count()); - assertEquals(4, results.stream().filter(x -> x.getId().equals(id4)).count()); + // then + assertEquals(11, results.size()); + assertEquals(1, results.stream().filter(x -> x.getId().equals(id0)).count()); + assertEquals(1, results.stream().filter(x -> 
x.getId().equals(id1)).count()); + assertEquals(2, results.stream().filter(x -> x.getId().equals(id2)).count()); + assertEquals(3, results.stream().filter(x -> x.getId().equals(id3)).count()); + assertEquals(4, results.stream().filter(x -> x.getId().equals(id4)).count()); - results.forEach( - result -> { - switch (result.getId()) { - case "id0": - assertEquals(1, result.getMerged()); - break; - case "id1": - case "id2": - case "id3": - assertEquals(2, result.getMerged()); - break; - case "id4": - assertEquals(1, result.getMerged()); - break; - default: - throw new RuntimeException(); - } - }); - } + results + .forEach( + result -> { + switch (result.getId()) { + case "id0": + assertEquals(1, result.getMerged()); + break; + case "id1": + case "id2": + case "id3": + assertEquals(2, result.getMerged()); + break; + case "id4": + assertEquals(1, result.getMerged()); + break; + default: + throw new RuntimeException(); + } + }); + } - @Test - public void shouldRunProperlyWhenActionPayloadTypeIsSuperTypeOfTableType() { - // given - String id0 = "id0"; - String id1 = "id1"; - String id2 = "id2"; - String id3 = "id3"; - String id4 = "id4"; - List rowData = - Arrays.asList( - createOafImplSubSub(id0), - createOafImplSubSub(id1), - createOafImplSubSub(id2), - createOafImplSubSub(id3)); - Dataset rowDS = - spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); + @Test + public void shouldRunProperlyWhenActionPayloadTypeIsSuperTypeOfTableType() { + // given + String id0 = "id0"; + String id1 = "id1"; + String id2 = "id2"; + String id3 = "id3"; + String id4 = "id4"; + List rowData = Arrays + .asList( + createOafImplSubSub(id0), + createOafImplSubSub(id1), + createOafImplSubSub(id2), + createOafImplSubSub(id3)); + Dataset rowDS = spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); - List actionPayloadData = - Arrays.asList( - createOafImplSub(id1), - createOafImplSub(id2), - createOafImplSub(id2), - createOafImplSub(id3), - createOafImplSub(id3), - createOafImplSub(id3), - createOafImplSub(id4), - createOafImplSub(id4), - createOafImplSub(id4), - createOafImplSub(id4)); - Dataset actionPayloadDS = - spark.createDataset(actionPayloadData, Encoders.bean(OafImplSub.class)); + List actionPayloadData = Arrays + .asList( + createOafImplSub(id1), + createOafImplSub(id2), + createOafImplSub(id2), + createOafImplSub(id3), + createOafImplSub(id3), + createOafImplSub(id3), + createOafImplSub(id4), + createOafImplSub(id4), + createOafImplSub(id4), + createOafImplSub(id4)); + Dataset actionPayloadDS = spark + .createDataset(actionPayloadData, Encoders.bean(OafImplSub.class)); - SerializableSupplier> rowIdFn = () -> OafImplRoot::getId; - SerializableSupplier> actionPayloadIdFn = - () -> OafImplRoot::getId; - SerializableSupplier> mergeAndGetFn = - () -> - (x, y) -> { - x.merge(y); - return x; - }; + SerializableSupplier> rowIdFn = () -> OafImplRoot::getId; + SerializableSupplier> actionPayloadIdFn = () -> OafImplRoot::getId; + SerializableSupplier> mergeAndGetFn = () -> (x, y) -> { + x.merge(y); + return x; + }; - // when - List results = - PromoteActionPayloadFunctions.joinGraphTableWithActionPayloadAndMerge( - rowDS, - actionPayloadDS, - rowIdFn, - actionPayloadIdFn, - mergeAndGetFn, - OafImplSubSub.class, - OafImplSub.class) - .collectAsList(); + // when + List results = PromoteActionPayloadFunctions + .joinGraphTableWithActionPayloadAndMerge( + rowDS, + actionPayloadDS, + rowIdFn, + actionPayloadIdFn, + mergeAndGetFn, + OafImplSubSub.class, + OafImplSub.class) + .collectAsList(); - // then 
- assertEquals(7, results.size()); - assertEquals(1, results.stream().filter(x -> x.getId().equals(id0)).count()); - assertEquals(1, results.stream().filter(x -> x.getId().equals(id1)).count()); - assertEquals(2, results.stream().filter(x -> x.getId().equals(id2)).count()); - assertEquals(3, results.stream().filter(x -> x.getId().equals(id3)).count()); - assertEquals(0, results.stream().filter(x -> x.getId().equals(id4)).count()); + // then + assertEquals(7, results.size()); + assertEquals(1, results.stream().filter(x -> x.getId().equals(id0)).count()); + assertEquals(1, results.stream().filter(x -> x.getId().equals(id1)).count()); + assertEquals(2, results.stream().filter(x -> x.getId().equals(id2)).count()); + assertEquals(3, results.stream().filter(x -> x.getId().equals(id3)).count()); + assertEquals(0, results.stream().filter(x -> x.getId().equals(id4)).count()); - results.forEach( - result -> { - switch (result.getId()) { - case "id0": - assertEquals(1, result.getMerged()); - break; - case "id1": - case "id2": - case "id3": - assertEquals(2, result.getMerged()); - break; - default: - throw new RuntimeException(); - } - }); - } - } + results + .forEach( + result -> { + switch (result.getId()) { + case "id0": + assertEquals(1, result.getMerged()); + break; + case "id1": + case "id2": + case "id3": + assertEquals(2, result.getMerged()); + break; + default: + throw new RuntimeException(); + } + }); + } + } - @Nested - class GroupTableByIdAndMerge { + @Nested + class GroupTableByIdAndMerge { - @Test - public void shouldRunProperly() { - // given - String id1 = "id1"; - String id2 = "id2"; - String id3 = "id3"; - List rowData = - Arrays.asList( - createOafImplSubSub(id1), - createOafImplSubSub(id2), - createOafImplSubSub(id2), - createOafImplSubSub(id3), - createOafImplSubSub(id3), - createOafImplSubSub(id3)); - Dataset rowDS = - spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); + @Test + public void shouldRunProperly() { + // given + String id1 = "id1"; + String id2 = "id2"; + String id3 = "id3"; + List rowData = Arrays + .asList( + createOafImplSubSub(id1), + createOafImplSubSub(id2), + createOafImplSubSub(id2), + createOafImplSubSub(id3), + createOafImplSubSub(id3), + createOafImplSubSub(id3)); + Dataset rowDS = spark.createDataset(rowData, Encoders.bean(OafImplSubSub.class)); - SerializableSupplier> rowIdFn = () -> OafImplRoot::getId; - SerializableSupplier> mergeAndGetFn = - () -> - (x, y) -> { - x.merge(y); - return x; - }; - SerializableSupplier zeroFn = OafImplSubSub::new; - SerializableSupplier> isNotZeroFn = - () -> x -> Objects.nonNull(x.getId()); + SerializableSupplier> rowIdFn = () -> OafImplRoot::getId; + SerializableSupplier> mergeAndGetFn = () -> (x, + y) -> { + x.merge(y); + return x; + }; + SerializableSupplier zeroFn = OafImplSubSub::new; + SerializableSupplier> isNotZeroFn = () -> x -> Objects.nonNull(x.getId()); - // when - List results = - PromoteActionPayloadFunctions.groupGraphTableByIdAndMerge( - rowDS, rowIdFn, mergeAndGetFn, zeroFn, isNotZeroFn, OafImplSubSub.class) - .collectAsList(); + // when + List results = PromoteActionPayloadFunctions + .groupGraphTableByIdAndMerge( + rowDS, rowIdFn, mergeAndGetFn, zeroFn, isNotZeroFn, OafImplSubSub.class) + .collectAsList(); - // then - assertEquals(3, results.size()); - assertEquals(1, results.stream().filter(x -> x.getId().equals(id1)).count()); - assertEquals(1, results.stream().filter(x -> x.getId().equals(id2)).count()); - assertEquals(1, results.stream().filter(x -> x.getId().equals(id3)).count()); + 
// then + assertEquals(3, results.size()); + assertEquals(1, results.stream().filter(x -> x.getId().equals(id1)).count()); + assertEquals(1, results.stream().filter(x -> x.getId().equals(id2)).count()); + assertEquals(1, results.stream().filter(x -> x.getId().equals(id3)).count()); - results.forEach( - result -> { - switch (result.getId()) { - case "id1": - assertEquals(1, result.getMerged()); - break; - case "id2": - assertEquals(2, result.getMerged()); - break; - case "id3": - assertEquals(3, result.getMerged()); - break; - default: - throw new RuntimeException(); - } - }); - } - } + results + .forEach( + result -> { + switch (result.getId()) { + case "id1": + assertEquals(1, result.getMerged()); + break; + case "id2": + assertEquals(2, result.getMerged()); + break; + case "id3": + assertEquals(3, result.getMerged()); + break; + default: + throw new RuntimeException(); + } + }); + } + } - public static class OafImplRoot extends Oaf { - private String id; - private int merged = 1; + public static class OafImplRoot extends Oaf { + private String id; + private int merged = 1; - public void merge(OafImplRoot e) { - merged += e.merged; - } + public void merge(OafImplRoot e) { + merged += e.merged; + } - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public int getMerged() { - return merged; - } + public int getMerged() { + return merged; + } - public void setMerged(int merged) { - this.merged = merged; - } - } + public void setMerged(int merged) { + this.merged = merged; + } + } - public static class OafImplSub extends OafImplRoot { + public static class OafImplSub extends OafImplRoot { - @Override - public void merge(OafImplRoot e) { - super.merge(e); - } - } + @Override + public void merge(OafImplRoot e) { + super.merge(e); + } + } - private static OafImplSub createOafImplSub(String id) { - OafImplSub x = new OafImplSub(); - x.setId(id); - return x; - } + private static OafImplSub createOafImplSub(String id) { + OafImplSub x = new OafImplSub(); + x.setId(id); + return x; + } - public static class OafImplSubSub extends OafImplSub { + public static class OafImplSubSub extends OafImplSub { - @Override - public void merge(OafImplRoot e) { - super.merge(e); - } - } + @Override + public void merge(OafImplRoot e) { + super.merge(e); + } + } - private static OafImplSubSub createOafImplSubSub(String id) { - OafImplSubSub x = new OafImplSubSub(); - x.setId(id); - return x; - } + private static OafImplSubSub createOafImplSubSub(String id) { + OafImplSubSub x = new OafImplSubSub(); + x.setId(id); + return x; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java index 526bff2e1..9811fb707 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/GenerateNativeStoreSparkJob.java @@ -1,17 +1,12 @@ + package eu.dnetlib.dhp.collection; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.model.mdstore.Provenance; -import eu.dnetlib.message.Message; -import eu.dnetlib.message.MessageManager; -import 
eu.dnetlib.message.MessageType; import java.io.ByteArrayInputStream; import java.nio.charset.StandardCharsets; import java.util.HashMap; import java.util.Map; import java.util.Objects; + import org.apache.commons.cli.*; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -29,127 +24,138 @@ import org.dom4j.Document; import org.dom4j.Node; import org.dom4j.io.SAXReader; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.model.mdstore.Provenance; +import eu.dnetlib.message.Message; +import eu.dnetlib.message.MessageManager; +import eu.dnetlib.message.MessageType; + public class GenerateNativeStoreSparkJob { - public static MetadataRecord parseRecord( - final String input, - final String xpath, - final String encoding, - final Provenance provenance, - final Long dateOfCollection, - final LongAccumulator totalItems, - final LongAccumulator invalidRecords) { + public static MetadataRecord parseRecord( + final String input, + final String xpath, + final String encoding, + final Provenance provenance, + final Long dateOfCollection, + final LongAccumulator totalItems, + final LongAccumulator invalidRecords) { - if (totalItems != null) totalItems.add(1); - try { - SAXReader reader = new SAXReader(); - Document document = - reader.read(new ByteArrayInputStream(input.getBytes(StandardCharsets.UTF_8))); - Node node = document.selectSingleNode(xpath); - final String originalIdentifier = node.getText(); - if (StringUtils.isBlank(originalIdentifier)) { - if (invalidRecords != null) invalidRecords.add(1); - return null; - } - return new MetadataRecord(originalIdentifier, encoding, provenance, input, dateOfCollection); - } catch (Throwable e) { - if (invalidRecords != null) invalidRecords.add(1); - e.printStackTrace(); - return null; - } - } + if (totalItems != null) + totalItems.add(1); + try { + SAXReader reader = new SAXReader(); + Document document = reader.read(new ByteArrayInputStream(input.getBytes(StandardCharsets.UTF_8))); + Node node = document.selectSingleNode(xpath); + final String originalIdentifier = node.getText(); + if (StringUtils.isBlank(originalIdentifier)) { + if (invalidRecords != null) + invalidRecords.add(1); + return null; + } + return new MetadataRecord(originalIdentifier, encoding, provenance, input, dateOfCollection); + } catch (Throwable e) { + if (invalidRecords != null) + invalidRecords.add(1); + e.printStackTrace(); + return null; + } + } - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - GenerateNativeStoreSparkJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/collection/collection_input_parameters.json"))); - parser.parseArgument(args); - final ObjectMapper jsonMapper = new ObjectMapper(); - final Provenance provenance = jsonMapper.readValue(parser.get("provenance"), Provenance.class); - final long dateOfCollection = new Long(parser.get("dateOfCollection")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GenerateNativeStoreSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/collection/collection_input_parameters.json"))); + parser.parseArgument(args); + final ObjectMapper jsonMapper = new ObjectMapper(); + final Provenance provenance = 
jsonMapper.readValue(parser.get("provenance"), Provenance.class); + final long dateOfCollection = new Long(parser.get("dateOfCollection")); - final SparkSession spark = - SparkSession.builder() - .appName("GenerateNativeStoreSparkJob") - .master(parser.get("master")) - .getOrCreate(); + final SparkSession spark = SparkSession + .builder() + .appName("GenerateNativeStoreSparkJob") + .master(parser.get("master")) + .getOrCreate(); - final Map ongoingMap = new HashMap<>(); - final Map reportMap = new HashMap<>(); + final Map ongoingMap = new HashMap<>(); + final Map reportMap = new HashMap<>(); - final boolean test = - parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest")); + final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest")); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final JavaPairRDD inputRDD = - sc.sequenceFile(parser.get("input"), IntWritable.class, Text.class); + final JavaPairRDD inputRDD = sc + .sequenceFile(parser.get("input"), IntWritable.class, Text.class); - final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems"); + final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems"); - final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords"); + final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords"); - final MessageManager manager = - new MessageManager( - parser.get("rabbitHost"), - parser.get("rabbitUser"), - parser.get("rabbitPassword"), - false, - false, - null); + final MessageManager manager = new MessageManager( + parser.get("rabbitHost"), + parser.get("rabbitUser"), + parser.get("rabbitPassword"), + false, + false, + null); - final JavaRDD mappeRDD = - inputRDD - .map( - item -> - parseRecord( - item._2().toString(), - parser.get("xpath"), - parser.get("encoding"), - provenance, - dateOfCollection, - totalItems, - invalidRecords)) - .filter(Objects::nonNull) - .distinct(); + final JavaRDD mappeRDD = inputRDD + .map( + item -> parseRecord( + item._2().toString(), + parser.get("xpath"), + parser.get("encoding"), + provenance, + dateOfCollection, + totalItems, + invalidRecords)) + .filter(Objects::nonNull) + .distinct(); - ongoingMap.put("ongoing", "0"); - if (!test) { - manager.sendMessage( - new Message( - parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap), - parser.get("rabbitOngoingQueue"), - true, - false); - } + ongoingMap.put("ongoing", "0"); + if (!test) { + manager + .sendMessage( + new Message( + parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap), + parser.get("rabbitOngoingQueue"), + true, + false); + } - final Encoder encoder = Encoders.bean(MetadataRecord.class); - final Dataset mdstore = spark.createDataset(mappeRDD.rdd(), encoder); - final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords"); - mdStoreRecords.add(mdstore.count()); - ongoingMap.put("ongoing", "" + totalItems.value()); - if (!test) { - manager.sendMessage( - new Message( - parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap), - parser.get("rabbitOngoingQueue"), - true, - false); - } - mdstore.write().format("parquet").save(parser.get("output")); - reportMap.put("inputItem", "" + totalItems.value()); - reportMap.put("invalidRecords", "" + invalidRecords.value()); - reportMap.put("mdStoreSize", "" + mdStoreRecords.value()); - if (!test) { - 
manager.sendMessage( - new Message(parser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), - parser.get("rabbitReportQueue"), - true, - false); - manager.close(); - } - } + final Encoder encoder = Encoders.bean(MetadataRecord.class); + final Dataset mdstore = spark.createDataset(mappeRDD.rdd(), encoder); + final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords"); + mdStoreRecords.add(mdstore.count()); + ongoingMap.put("ongoing", "" + totalItems.value()); + if (!test) { + manager + .sendMessage( + new Message( + parser.get("workflowId"), "DataFrameCreation", MessageType.ONGOING, ongoingMap), + parser.get("rabbitOngoingQueue"), + true, + false); + } + mdstore.write().format("parquet").save(parser.get("output")); + reportMap.put("inputItem", "" + totalItems.value()); + reportMap.put("invalidRecords", "" + invalidRecords.value()); + reportMap.put("mdStoreSize", "" + mdStoreRecords.value()); + if (!test) { + manager + .sendMessage( + new Message(parser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), + parser.get("rabbitReportQueue"), + true, + false); + manager.close(); + } + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index 82f28afe6..4a0c70c45 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -1,10 +1,12 @@ + package eu.dnetlib.dhp.collection.plugin; +import java.util.stream.Stream; + import eu.dnetlib.collector.worker.model.ApiDescriptor; import eu.dnetlib.dhp.collection.worker.DnetCollectorException; -import java.util.stream.Stream; public interface CollectorPlugin { - Stream collect(ApiDescriptor api) throws DnetCollectorException; + Stream collect(ApiDescriptor api) throws DnetCollectorException; } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java index 415102a1a..7f71f401d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiCollectorPlugin.java @@ -1,11 +1,6 @@ + package eu.dnetlib.dhp.collection.plugin.oai; -import com.google.common.base.Splitter; -import com.google.common.collect.Iterators; -import com.google.common.collect.Lists; -import eu.dnetlib.collector.worker.model.ApiDescriptor; -import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -14,65 +9,74 @@ import java.util.Spliterators; import java.util.stream.Stream; import java.util.stream.StreamSupport; +import com.google.common.base.Splitter; +import com.google.common.collect.Iterators; +import com.google.common.collect.Lists; + +import eu.dnetlib.collector.worker.model.ApiDescriptor; +import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import eu.dnetlib.dhp.collection.worker.DnetCollectorException; + public class OaiCollectorPlugin implements CollectorPlugin { - private static final String FORMAT_PARAM = "format"; - private static final String OAI_SET_PARAM = "set"; 
- private static final Object OAI_FROM_DATE_PARAM = "fromDate"; - private static final Object OAI_UNTIL_DATE_PARAM = "untilDate"; + private static final String FORMAT_PARAM = "format"; + private static final String OAI_SET_PARAM = "set"; + private static final Object OAI_FROM_DATE_PARAM = "fromDate"; + private static final Object OAI_UNTIL_DATE_PARAM = "untilDate"; - private OaiIteratorFactory oaiIteratorFactory; + private OaiIteratorFactory oaiIteratorFactory; - @Override - public Stream collect(final ApiDescriptor api) throws DnetCollectorException { - final String baseUrl = api.getBaseUrl(); - final String mdFormat = api.getParams().get(FORMAT_PARAM); - final String setParam = api.getParams().get(OAI_SET_PARAM); - final String fromDate = api.getParams().get(OAI_FROM_DATE_PARAM); - final String untilDate = api.getParams().get(OAI_UNTIL_DATE_PARAM); + @Override + public Stream collect(final ApiDescriptor api) throws DnetCollectorException { + final String baseUrl = api.getBaseUrl(); + final String mdFormat = api.getParams().get(FORMAT_PARAM); + final String setParam = api.getParams().get(OAI_SET_PARAM); + final String fromDate = api.getParams().get(OAI_FROM_DATE_PARAM); + final String untilDate = api.getParams().get(OAI_UNTIL_DATE_PARAM); - final List sets = new ArrayList<>(); - if (setParam != null) { - sets.addAll( - Lists.newArrayList(Splitter.on(",").omitEmptyStrings().trimResults().split(setParam))); - } - if (sets.isEmpty()) { - // If no set is defined, ALL the sets must be harvested - sets.add(""); - } + final List sets = new ArrayList<>(); + if (setParam != null) { + sets + .addAll( + Lists.newArrayList(Splitter.on(",").omitEmptyStrings().trimResults().split(setParam))); + } + if (sets.isEmpty()) { + // If no set is defined, ALL the sets must be harvested + sets.add(""); + } - if (baseUrl == null || baseUrl.isEmpty()) { - throw new DnetCollectorException("Param 'baseurl' is null or empty"); - } + if (baseUrl == null || baseUrl.isEmpty()) { + throw new DnetCollectorException("Param 'baseurl' is null or empty"); + } - if (mdFormat == null || mdFormat.isEmpty()) { - throw new DnetCollectorException("Param 'mdFormat' is null or empty"); - } + if (mdFormat == null || mdFormat.isEmpty()) { + throw new DnetCollectorException("Param 'mdFormat' is null or empty"); + } - if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { - throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate); - } + if (fromDate != null && !fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + fromDate); + } - if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { - throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate); - } + if (untilDate != null && !untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + throw new DnetCollectorException("Invalid date (YYYY-MM-DD): " + untilDate); + } - final Iterator> iters = - sets.stream() - .map( - set -> - getOaiIteratorFactory() - .newIterator(baseUrl, mdFormat, set, fromDate, untilDate)) - .iterator(); + final Iterator> iters = sets + .stream() + .map( + set -> getOaiIteratorFactory() + .newIterator(baseUrl, mdFormat, set, fromDate, untilDate)) + .iterator(); - return StreamSupport.stream( - Spliterators.spliteratorUnknownSize(Iterators.concat(iters), Spliterator.ORDERED), false); - } + return StreamSupport + .stream( + Spliterators.spliteratorUnknownSize(Iterators.concat(iters), Spliterator.ORDERED), false); + } - public OaiIteratorFactory 
getOaiIteratorFactory() { - if (oaiIteratorFactory == null) { - oaiIteratorFactory = new OaiIteratorFactory(); - } - return oaiIteratorFactory; - } + public OaiIteratorFactory getOaiIteratorFactory() { + if (oaiIteratorFactory == null) { + oaiIteratorFactory = new OaiIteratorFactory(); + } + return oaiIteratorFactory; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java index 13c40de06..d61f13fb5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/oai/OaiIterator.java @@ -1,14 +1,13 @@ + package eu.dnetlib.dhp.collection.plugin.oai; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; -import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; -import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner; import java.io.StringReader; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.Iterator; import java.util.Queue; import java.util.concurrent.PriorityBlockingQueue; + import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -17,160 +16,162 @@ import org.dom4j.DocumentException; import org.dom4j.Node; import org.dom4j.io.SAXReader; +import eu.dnetlib.dhp.collection.worker.DnetCollectorException; +import eu.dnetlib.dhp.collection.worker.utils.HttpConnector; +import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner; + public class OaiIterator implements Iterator { - private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on - // 11/24/08 5:02 PM + private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on + // 11/24/08 5:02 PM - private final Queue queue = new PriorityBlockingQueue<>(); - private final SAXReader reader = new SAXReader(); + private final Queue queue = new PriorityBlockingQueue<>(); + private final SAXReader reader = new SAXReader(); - private final String baseUrl; - private final String set; - private final String mdFormat; - private final String fromDate; - private final String untilDate; - private String token; - private boolean started; - private final HttpConnector httpConnector; + private final String baseUrl; + private final String set; + private final String mdFormat; + private final String fromDate; + private final String untilDate; + private String token; + private boolean started; + private final HttpConnector httpConnector; - public OaiIterator( - final String baseUrl, - final String mdFormat, - final String set, - final String fromDate, - final String untilDate, - final HttpConnector httpConnector) { - this.baseUrl = baseUrl; - this.mdFormat = mdFormat; - this.set = set; - this.fromDate = fromDate; - this.untilDate = untilDate; - this.started = false; - this.httpConnector = httpConnector; - } + public OaiIterator( + final String baseUrl, + final String mdFormat, + final String set, + final String fromDate, + final String untilDate, + final HttpConnector httpConnector) { + this.baseUrl = baseUrl; + this.mdFormat = mdFormat; + this.set = set; + this.fromDate = fromDate; + this.untilDate = untilDate; + this.started = false; + this.httpConnector = httpConnector; + } - private void verifyStarted() { - if (!this.started) { - this.started = true; - try { - this.token = firstPage(); - } 
catch (final DnetCollectorException e) { - throw new RuntimeException(e); - } - } - } + private void verifyStarted() { + if (!this.started) { + this.started = true; + try { + this.token = firstPage(); + } catch (final DnetCollectorException e) { + throw new RuntimeException(e); + } + } + } - @Override - public boolean hasNext() { - synchronized (queue) { - verifyStarted(); - return !queue.isEmpty(); - } - } + @Override + public boolean hasNext() { + synchronized (queue) { + verifyStarted(); + return !queue.isEmpty(); + } + } - @Override - public String next() { - synchronized (queue) { - verifyStarted(); - final String res = queue.poll(); - while (queue.isEmpty() && token != null && !token.isEmpty()) { - try { - token = otherPages(token); - } catch (final DnetCollectorException e) { - throw new RuntimeException(e); - } - } - return res; - } - } + @Override + public String next() { + synchronized (queue) { + verifyStarted(); + final String res = queue.poll(); + while (queue.isEmpty() && token != null && !token.isEmpty()) { + try { + token = otherPages(token); + } catch (final DnetCollectorException e) { + throw new RuntimeException(e); + } + } + return res; + } + } - @Override - public void remove() {} + @Override + public void remove() { + } - private String firstPage() throws DnetCollectorException { - try { - String url = - baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8"); - if (set != null && !set.isEmpty()) { - url += "&set=" + URLEncoder.encode(set, "UTF-8"); - } - if (fromDate != null && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { - url += "&from=" + URLEncoder.encode(fromDate, "UTF-8"); - } - if (untilDate != null && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { - url += "&until=" + URLEncoder.encode(untilDate, "UTF-8"); - } - log.info("Start harvesting using url: " + url); + private String firstPage() throws DnetCollectorException { + try { + String url = baseUrl + "?verb=ListRecords&metadataPrefix=" + URLEncoder.encode(mdFormat, "UTF-8"); + if (set != null && !set.isEmpty()) { + url += "&set=" + URLEncoder.encode(set, "UTF-8"); + } + if (fromDate != null && fromDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + url += "&from=" + URLEncoder.encode(fromDate, "UTF-8"); + } + if (untilDate != null && untilDate.matches("\\d{4}-\\d{2}-\\d{2}")) { + url += "&until=" + URLEncoder.encode(untilDate, "UTF-8"); + } + log.info("Start harvesting using url: " + url); - return downloadPage(url); - } catch (final UnsupportedEncodingException e) { - throw new DnetCollectorException(e); - } - } + return downloadPage(url); + } catch (final UnsupportedEncodingException e) { + throw new DnetCollectorException(e); + } + } - private String extractResumptionToken(final String xml) { + private String extractResumptionToken(final String xml) { - final String s = StringUtils.substringAfter(xml, "", "", " newIterator( - final String baseUrl, - final String mdFormat, - final String set, - final String fromDate, - final String untilDate) { - return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector()); - } + public Iterator newIterator( + final String baseUrl, + final String mdFormat, + final String set, + final String fromDate, + final String untilDate) { + return new OaiIterator(baseUrl, mdFormat, set, fromDate, untilDate, getHttpConnector()); + } - private HttpConnector getHttpConnector() { - if (httpConnector == null) httpConnector = new HttpConnector(); - return httpConnector; - } + private HttpConnector getHttpConnector() { + if (httpConnector == null) 
+ httpConnector = new HttpConnector(); + return httpConnector; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java index 320f735b3..f40962c21 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java @@ -1,31 +1,32 @@ + package eu.dnetlib.dhp.collection.worker; public class DnetCollectorException extends Exception { - /** */ - private static final long serialVersionUID = -290723075076039757L; + /** */ + private static final long serialVersionUID = -290723075076039757L; - public DnetCollectorException() { - super(); - } + public DnetCollectorException() { + super(); + } - public DnetCollectorException( - final String message, - final Throwable cause, - final boolean enableSuppression, - final boolean writableStackTrace) { - super(message, cause, enableSuppression, writableStackTrace); - } + public DnetCollectorException( + final String message, + final Throwable cause, + final boolean enableSuppression, + final boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } - public DnetCollectorException(final String message, final Throwable cause) { - super(message, cause); - } + public DnetCollectorException(final String message, final Throwable cause) { + super(message, cause); + } - public DnetCollectorException(final String message) { - super(message); - } + public DnetCollectorException(final String message) { + super(message); + } - public DnetCollectorException(final Throwable cause) { - super(cause); - } + public DnetCollectorException(final Throwable cause) { + super(cause); + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java index d76ec8e37..e686ad518 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java @@ -1,18 +1,12 @@ + package eu.dnetlib.dhp.collection.worker; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.collector.worker.model.ApiDescriptor; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; -import eu.dnetlib.message.Message; -import eu.dnetlib.message.MessageManager; -import eu.dnetlib.message.MessageType; import java.io.IOException; import java.net.URI; import java.util.HashMap; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; + import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -22,111 +16,124 @@ import org.apache.hadoop.io.Text; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.collector.worker.model.ApiDescriptor; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; +import 
eu.dnetlib.message.Message; +import eu.dnetlib.message.MessageManager; +import eu.dnetlib.message.MessageType; + public class DnetCollectorWorker { - private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorker.class); + private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorker.class); - private final CollectorPluginFactory collectorPluginFactory; + private final CollectorPluginFactory collectorPluginFactory; - private final ArgumentApplicationParser argumentParser; + private final ArgumentApplicationParser argumentParser; - private final MessageManager manager; + private final MessageManager manager; - public DnetCollectorWorker( - final CollectorPluginFactory collectorPluginFactory, - final ArgumentApplicationParser argumentParser, - final MessageManager manager) - throws DnetCollectorException { - this.collectorPluginFactory = collectorPluginFactory; - this.argumentParser = argumentParser; - this.manager = manager; - } + public DnetCollectorWorker( + final CollectorPluginFactory collectorPluginFactory, + final ArgumentApplicationParser argumentParser, + final MessageManager manager) + throws DnetCollectorException { + this.collectorPluginFactory = collectorPluginFactory; + this.argumentParser = argumentParser; + this.manager = manager; + } - public void collect() throws DnetCollectorException { - try { - final ObjectMapper jsonMapper = new ObjectMapper(); - final ApiDescriptor api = - jsonMapper.readValue(argumentParser.get("apidescriptor"), ApiDescriptor.class); + public void collect() throws DnetCollectorException { + try { + final ObjectMapper jsonMapper = new ObjectMapper(); + final ApiDescriptor api = jsonMapper.readValue(argumentParser.get("apidescriptor"), ApiDescriptor.class); - final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol()); + final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol()); - final String hdfsuri = argumentParser.get("namenode"); + final String hdfsuri = argumentParser.get("namenode"); - // ====== Init HDFS File System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", hdfsuri); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + // ====== Init HDFS File System Object + Configuration conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsuri); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - System.setProperty("HADOOP_USER_NAME", argumentParser.get("userHDFS")); - System.setProperty("hadoop.home.dir", "/"); - // Get the filesystem - HDFS - FileSystem.get(URI.create(hdfsuri), conf); - Path hdfswritepath = new Path(argumentParser.get("hdfsPath")); + System.setProperty("HADOOP_USER_NAME", argumentParser.get("userHDFS")); + System.setProperty("hadoop.home.dir", "/"); + // Get the filesystem - HDFS + FileSystem.get(URI.create(hdfsuri), conf); + Path hdfswritepath = new Path(argumentParser.get("hdfsPath")); - log.info("Created path " + hdfswritepath.toString()); + log.info("Created path " + hdfswritepath.toString()); - final Map ongoingMap = new HashMap<>(); - final Map reportMap = new HashMap<>(); - final AtomicInteger counter = new AtomicInteger(0); - try (SequenceFile.Writer writer = - 
SequenceFile.createWriter( - conf, - SequenceFile.Writer.file(hdfswritepath), - SequenceFile.Writer.keyClass(IntWritable.class), - SequenceFile.Writer.valueClass(Text.class))) { - final IntWritable key = new IntWritable(counter.get()); - final Text value = new Text(); - plugin - .collect(api) - .forEach( - content -> { - key.set(counter.getAndIncrement()); - value.set(content); - if (counter.get() % 10 == 0) { - try { - ongoingMap.put("ongoing", "" + counter.get()); - log.debug( - "Sending message: " - + manager.sendMessage( - new Message( - argumentParser.get("workflowId"), - "Collection", - MessageType.ONGOING, - ongoingMap), - argumentParser.get("rabbitOngoingQueue"), - true, - false)); - } catch (Exception e) { - log.error("Error on sending message ", e); - } - } - try { - writer.append(key, value); - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - } - ongoingMap.put("ongoing", "" + counter.get()); - manager.sendMessage( - new Message( - argumentParser.get("workflowId"), "Collection", MessageType.ONGOING, ongoingMap), - argumentParser.get("rabbitOngoingQueue"), - true, - false); - reportMap.put("collected", "" + counter.get()); - manager.sendMessage( - new Message( - argumentParser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), - argumentParser.get("rabbitOngoingQueue"), - true, - false); - manager.close(); - } catch (Throwable e) { - throw new DnetCollectorException("Error on collecting ", e); - } - } + final Map ongoingMap = new HashMap<>(); + final Map reportMap = new HashMap<>(); + final AtomicInteger counter = new AtomicInteger(0); + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(hdfswritepath), + SequenceFile.Writer.keyClass(IntWritable.class), + SequenceFile.Writer.valueClass(Text.class))) { + final IntWritable key = new IntWritable(counter.get()); + final Text value = new Text(); + plugin + .collect(api) + .forEach( + content -> { + key.set(counter.getAndIncrement()); + value.set(content); + if (counter.get() % 10 == 0) { + try { + ongoingMap.put("ongoing", "" + counter.get()); + log + .debug( + "Sending message: " + + manager + .sendMessage( + new Message( + argumentParser.get("workflowId"), + "Collection", + MessageType.ONGOING, + ongoingMap), + argumentParser.get("rabbitOngoingQueue"), + true, + false)); + } catch (Exception e) { + log.error("Error on sending message ", e); + } + } + try { + writer.append(key, value); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + ongoingMap.put("ongoing", "" + counter.get()); + manager + .sendMessage( + new Message( + argumentParser.get("workflowId"), "Collection", MessageType.ONGOING, ongoingMap), + argumentParser.get("rabbitOngoingQueue"), + true, + false); + reportMap.put("collected", "" + counter.get()); + manager + .sendMessage( + new Message( + argumentParser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), + argumentParser.get("rabbitOngoingQueue"), + true, + false); + manager.close(); + } catch (Throwable e) { + throw new DnetCollectorException("Error on collecting ", e); + } + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java index 7ff61d677..cda07d151 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java +++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java @@ -1,48 +1,49 @@ + package eu.dnetlib.dhp.collection.worker; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; -import eu.dnetlib.message.MessageManager; import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; +import eu.dnetlib.message.MessageManager; + /** - * DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into - * HDFS. This module will be executed on the hadoop cluster and taking in input some parameters that - * tells it which is the right collector plugin to use and where store the data into HDFS path + * DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. This module + * will be executed on the hadoop cluster and taking in input some parameters that tells it which is the right collector + * plugin to use and where store the data into HDFS path * * @author Sandro La Bruzzo */ public class DnetCollectorWorkerApplication { - private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class); + private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class); - private static CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory(); + private static CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory(); - private static ArgumentApplicationParser argumentParser; + private static ArgumentApplicationParser argumentParser; - /** @param args */ - public static void main(final String[] args) throws Exception { + /** @param args */ + public static void main(final String[] args) throws Exception { - argumentParser = - new ArgumentApplicationParser( - IOUtils.toString( - DnetCollectorWorker.class.getResourceAsStream( - "/eu/dnetlib/collector/worker/collector_parameter.json"))); - argumentParser.parseArgument(args); - log.info("hdfsPath =" + argumentParser.get("hdfsPath")); - log.info("json = " + argumentParser.get("apidescriptor")); - final MessageManager manager = - new MessageManager( - argumentParser.get("rabbitHost"), - argumentParser.get("rabbitUser"), - argumentParser.get("rabbitPassword"), - false, - false, - null); - final DnetCollectorWorker worker = - new DnetCollectorWorker(collectorPluginFactory, argumentParser, manager); - worker.collect(); - } + argumentParser = new ArgumentApplicationParser( + IOUtils + .toString( + DnetCollectorWorker.class + .getResourceAsStream( + "/eu/dnetlib/collector/worker/collector_parameter.json"))); + argumentParser.parseArgument(args); + log.info("hdfsPath =" + argumentParser.get("hdfsPath")); + log.info("json = " + argumentParser.get("apidescriptor")); + final MessageManager manager = new MessageManager( + argumentParser.get("rabbitHost"), + argumentParser.get("rabbitUser"), + argumentParser.get("rabbitPassword"), + false, + false, + null); + final DnetCollectorWorker worker = new DnetCollectorWorker(collectorPluginFactory, argumentParser, manager); + worker.collect(); + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java index 27d982796..6ee8a8b49 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java @@ -1,18 +1,19 @@ + package eu.dnetlib.dhp.collection.worker.utils; import java.util.LinkedList; public class CollectorPluginErrorLogList extends LinkedList { - private static final long serialVersionUID = -6925786561303289704L; + private static final long serialVersionUID = -6925786561303289704L; - @Override - public String toString() { - String log = new String(); - int index = 0; - for (final String errorMessage : this) { - log += String.format("Retry #%s: %s / ", index++, errorMessage); - } - return log; - } + @Override + public String toString() { + String log = new String(); + int index = 0; + for (final String errorMessage : this) { + log += String.format("Retry #%s: %s / ", index++, errorMessage); + } + return log; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java index 857200119..7a0028e79 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.collection.worker.utils; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; @@ -6,13 +7,14 @@ import eu.dnetlib.dhp.collection.worker.DnetCollectorException; public class CollectorPluginFactory { - public CollectorPlugin getPluginByProtocol(final String protocol) throws DnetCollectorException { - if (protocol == null) throw new DnetCollectorException("protocol cannot be null"); - switch (protocol.toLowerCase().trim()) { - case "oai": - return new OaiCollectorPlugin(); - default: - throw new DnetCollectorException("UNknown protocol"); - } - } + public CollectorPlugin getPluginByProtocol(final String protocol) throws DnetCollectorException { + if (protocol == null) + throw new DnetCollectorException("protocol cannot be null"); + switch (protocol.toLowerCase().trim()) { + case "oai": + return new OaiCollectorPlugin(); + default: + throw new DnetCollectorException("UNknown protocol"); + } + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java index 36b08008a..5d6108fad 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java @@ -1,6 +1,6 @@ + package eu.dnetlib.dhp.collection.worker.utils; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; import java.io.IOException; import java.io.InputStream; import java.net.*; @@ -8,226 +8,237 @@ import java.security.GeneralSecurityException; import java.security.cert.X509Certificate; import java.util.List; import java.util.Map; + import javax.net.ssl.HttpsURLConnection; import javax.net.ssl.SSLContext; import javax.net.ssl.TrustManager; import 
javax.net.ssl.X509TrustManager; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang.math.NumberUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import eu.dnetlib.dhp.collection.worker.DnetCollectorException; + public class HttpConnector { - private static final Log log = LogFactory.getLog(HttpConnector.class); + private static final Log log = LogFactory.getLog(HttpConnector.class); - private int maxNumberOfRetry = 6; - private int defaultDelay = 120; // seconds - private int readTimeOut = 120; // seconds + private int maxNumberOfRetry = 6; + private int defaultDelay = 120; // seconds + private int readTimeOut = 120; // seconds - private String responseType = null; + private String responseType = null; - private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; + private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; - public HttpConnector() { - CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); - } + public HttpConnector() { + CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); + } - /** - * Given the URL returns the content via HTTP GET - * - * @param requestUrl the URL - * @return the content of the downloaded resource - * @throws DnetCollectorException when retrying more than maxNumberOfRetry times - */ - public String getInputSource(final String requestUrl) throws DnetCollectorException { - return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList()); - } + /** + * Given the URL returns the content via HTTP GET + * + * @param requestUrl the URL + * @return the content of the downloaded resource + * @throws DnetCollectorException when retrying more than maxNumberOfRetry times + */ + public String getInputSource(final String requestUrl) throws DnetCollectorException { + return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList()); + } - /** - * Given the URL returns the content as a stream via HTTP GET - * - * @param requestUrl the URL - * @return the content of the downloaded resource as InputStream - * @throws DnetCollectorException when retrying more than maxNumberOfRetry times - */ - public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException { - return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - } + /** + * Given the URL returns the content as a stream via HTTP GET + * + * @param requestUrl the URL + * @return the content of the downloaded resource as InputStream + * @throws DnetCollectorException when retrying more than maxNumberOfRetry times + */ + public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException { + return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); + } - private String attemptDownlaodAsString( - final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) - throws DnetCollectorException { - try { - final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - try { - return IOUtils.toString(s); - } catch (final IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList); - } finally { - IOUtils.closeQuietly(s); - } - } catch (final InterruptedException e) { - throw new 
DnetCollectorException(e); - } - } + private String attemptDownlaodAsString( + final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) + throws DnetCollectorException { + try { + final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); + try { + return IOUtils.toString(s); + } catch (final IOException e) { + log.error("error while retrieving from http-connection occured: " + requestUrl, e); + Thread.sleep(defaultDelay * 1000); + errorList.add(e.getMessage()); + return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList); + } finally { + IOUtils.closeQuietly(s); + } + } catch (final InterruptedException e) { + throw new DnetCollectorException(e); + } + } - private InputStream attemptDownload( - final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) - throws DnetCollectorException { + private InputStream attemptDownload( + final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) + throws DnetCollectorException { - if (retryNumber > maxNumberOfRetry) { - throw new DnetCollectorException("Max number of retries exceeded. Cause: \n " + errorList); - } + if (retryNumber > maxNumberOfRetry) { + throw new DnetCollectorException("Max number of retries exceeded. Cause: \n " + errorList); + } - log.debug("Downloading " + requestUrl + " - try: " + retryNumber); - try { - InputStream input = null; + log.debug("Downloading " + requestUrl + " - try: " + retryNumber); + try { + InputStream input = null; - try { - final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); - urlConn.setInstanceFollowRedirects(false); - urlConn.setReadTimeout(readTimeOut * 1000); - urlConn.addRequestProperty("User-Agent", userAgent); + try { + final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); + urlConn.setInstanceFollowRedirects(false); + urlConn.setReadTimeout(readTimeOut * 1000); + urlConn.addRequestProperty("User-Agent", userAgent); - if (log.isDebugEnabled()) { - logHeaderFields(urlConn); - } + if (log.isDebugEnabled()) { + logHeaderFields(urlConn); + } - final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); - if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) { - log.warn("waiting and repeating request after " + retryAfter + " sec."); - Thread.sleep(retryAfter * 1000); - errorList.add("503 Service Unavailable"); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM - || urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) { - final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); - log.debug("The requested url has been moved to " + newUrl); - errorList.add( - String.format( - "%s %s. 
Moved to: %s", - urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl)); - urlConn.disconnect(); - return attemptDownload(newUrl, retryNumber + 1, errorList); - } else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) { - log.error( - String.format( - "HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - Thread.sleep(defaultDelay * 1000); - errorList.add( - String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else { - input = urlConn.getInputStream(); - responseType = urlConn.getContentType(); - return input; - } - } catch (final IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } - } catch (final InterruptedException e) { - throw new DnetCollectorException(e); - } - } + final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); + if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) { + log.warn("waiting and repeating request after " + retryAfter + " sec."); + Thread.sleep(retryAfter * 1000); + errorList.add("503 Service Unavailable"); + urlConn.disconnect(); + return attemptDownload(requestUrl, retryNumber + 1, errorList); + } else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM + || urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) { + final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); + log.debug("The requested url has been moved to " + newUrl); + errorList + .add( + String + .format( + "%s %s. Moved to: %s", + urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl)); + urlConn.disconnect(); + return attemptDownload(newUrl, retryNumber + 1, errorList); + } else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) { + log + .error( + String + .format( + "HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); + Thread.sleep(defaultDelay * 1000); + errorList + .add( + String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); + urlConn.disconnect(); + return attemptDownload(requestUrl, retryNumber + 1, errorList); + } else { + input = urlConn.getInputStream(); + responseType = urlConn.getContentType(); + return input; + } + } catch (final IOException e) { + log.error("error while retrieving from http-connection occured: " + requestUrl, e); + Thread.sleep(defaultDelay * 1000); + errorList.add(e.getMessage()); + return attemptDownload(requestUrl, retryNumber + 1, errorList); + } + } catch (final InterruptedException e) { + throw new DnetCollectorException(e); + } + } - private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { - log.debug("StatusCode: " + urlConn.getResponseMessage()); + private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { + log.debug("StatusCode: " + urlConn.getResponseMessage()); - for (final Map.Entry> e : urlConn.getHeaderFields().entrySet()) { - if (e.getKey() != null) { - for (final String v : e.getValue()) { - log.debug(" key: " + e.getKey() + " - value: " + v); - } - } - } - } + for (final Map.Entry> e : urlConn.getHeaderFields().entrySet()) { + if (e.getKey() != null) { + for (final String v : e.getValue()) { + log.debug(" key: " + e.getKey() + " - value: " + v); + } + } + } + } - private int 
obtainRetryAfter(final Map> headerMap) { - for (final String key : headerMap.keySet()) { - if (key != null - && key.toLowerCase().equals("retry-after") - && headerMap.get(key).size() > 0 - && NumberUtils.isNumber(headerMap.get(key).get(0))) { - return Integer.parseInt(headerMap.get(key).get(0)) + 10; - } - } - return -1; - } + private int obtainRetryAfter(final Map> headerMap) { + for (final String key : headerMap.keySet()) { + if (key != null + && key.toLowerCase().equals("retry-after") + && headerMap.get(key).size() > 0 + && NumberUtils.isNumber(headerMap.get(key).get(0))) { + return Integer.parseInt(headerMap.get(key).get(0)) + 10; + } + } + return -1; + } - private String obtainNewLocation(final Map> headerMap) - throws DnetCollectorException { - for (final String key : headerMap.keySet()) { - if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) { - return headerMap.get(key).get(0); - } - } - throw new DnetCollectorException( - "The requested url has been MOVED, but 'location' param is MISSING"); - } + private String obtainNewLocation(final Map> headerMap) + throws DnetCollectorException { + for (final String key : headerMap.keySet()) { + if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) { + return headerMap.get(key).get(0); + } + } + throw new DnetCollectorException( + "The requested url has been MOVED, but 'location' param is MISSING"); + } - /** - * register for https scheme; this is a workaround and not intended for the use in trusted - * environments - */ - public void initTrustManager() { - final X509TrustManager tm = - new X509TrustManager() { + /** + * register for https scheme; this is a workaround and not intended for the use in trusted environments + */ + public void initTrustManager() { + final X509TrustManager tm = new X509TrustManager() { - @Override - public void checkClientTrusted(final X509Certificate[] xcs, final String string) {} + @Override + public void checkClientTrusted(final X509Certificate[] xcs, final String string) { + } - @Override - public void checkServerTrusted(final X509Certificate[] xcs, final String string) {} + @Override + public void checkServerTrusted(final X509Certificate[] xcs, final String string) { + } - @Override - public X509Certificate[] getAcceptedIssuers() { - return null; - } - }; - try { - final SSLContext ctx = SSLContext.getInstance("TLS"); - ctx.init(null, new TrustManager[] {tm}, null); - HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory()); - } catch (final GeneralSecurityException e) { - log.fatal(e); - throw new IllegalStateException(e); - } - } + @Override + public X509Certificate[] getAcceptedIssuers() { + return null; + } + }; + try { + final SSLContext ctx = SSLContext.getInstance("TLS"); + ctx.init(null, new TrustManager[] { + tm + }, null); + HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory()); + } catch (final GeneralSecurityException e) { + log.fatal(e); + throw new IllegalStateException(e); + } + } - public int getMaxNumberOfRetry() { - return maxNumberOfRetry; - } + public int getMaxNumberOfRetry() { + return maxNumberOfRetry; + } - public void setMaxNumberOfRetry(final int maxNumberOfRetry) { - this.maxNumberOfRetry = maxNumberOfRetry; - } + public void setMaxNumberOfRetry(final int maxNumberOfRetry) { + this.maxNumberOfRetry = maxNumberOfRetry; + } - public int getDefaultDelay() { - return defaultDelay; - } + public int getDefaultDelay() { + return defaultDelay; + } - public void setDefaultDelay(final int 
defaultDelay) { - this.defaultDelay = defaultDelay; - } + public void setDefaultDelay(final int defaultDelay) { + this.defaultDelay = defaultDelay; + } - public int getReadTimeOut() { - return readTimeOut; - } + public int getReadTimeOut() { + return readTimeOut; + } - public void setReadTimeOut(final int readTimeOut) { - this.readTimeOut = readTimeOut; - } + public void setReadTimeOut(final int readTimeOut) { + this.readTimeOut = readTimeOut; + } - public String getResponseType() { - return responseType; - } + public String getResponseType() { + return responseType; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java index 6a9afd591..32eeeab4b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.collection.worker.utils; import java.util.HashMap; @@ -9,376 +10,374 @@ import java.util.regex.Pattern; /** @author jochen, Andreas Czerniak */ public class XmlCleaner { - /** Pattern for numeric entities. */ - private static Pattern validCharacterEntityPattern = - Pattern.compile("^&#x?\\d{2,4};"); // $NON-NLS-1$ - // private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};"); - // //$NON-NLS-1$ + /** Pattern for numeric entities. */ + private static Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); // $NON-NLS-1$ + // private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};"); + // //$NON-NLS-1$ - // see https://www.w3.org/TR/REC-xml/#charsets , not only limited to - private static Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];"); + // see https://www.w3.org/TR/REC-xml/#charsets , not only limited to + private static Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];"); - /** - * Pattern that negates the allowable XML 4 byte unicode characters. Valid are: #x9 | #xA | #xD | - * [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] - */ - private static Pattern invalidCharacterPattern = - Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); // $NON-NLS-1$ + /** + * Pattern that negates the allowable XML 4 byte unicode characters. 
Valid are: #x9 | #xA | #xD | [#x20-#xD7FF] | + * [#xE000-#xFFFD] | [#x10000-#x10FFFF] + */ + private static Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); // $NON-NLS-1$ - // Map entities to their unicode equivalent - private static Set goodEntities = new HashSet<>(); - private static Map badEntities = new HashMap<>(); + // Map entities to their unicode equivalent + private static Set goodEntities = new HashSet<>(); + private static Map badEntities = new HashMap<>(); - static { - // pre-defined XML entities - goodEntities.add("""); // $NON-NLS-1$ // quotation mark - goodEntities.add("&"); // $NON-NLS-1$ // ampersand - goodEntities.add("<"); // $NON-NLS-1$ // less-than sign - goodEntities.add(">"); // $NON-NLS-1$ // greater-than sign - // control entities - // badEntities.put(" ", ""); - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("€", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‚", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("ƒ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("„", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("…", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("†", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‡", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("ˆ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‰", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Š", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‹", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Œ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Ž", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‘", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("’", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("“", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("”", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("•", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("–", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("—", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("˜", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("™", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("š", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("›", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("œ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("ž", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Ÿ", " "); 
// $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - // misc entities - badEntities.put("€", "\u20AC"); // $NON-NLS-1$ //$NON-NLS-2$ // euro - badEntities.put("‘", "\u2018"); // $NON-NLS-1$ //$NON-NLS-2$ // left single quotation - // mark - badEntities.put("’", "\u2019"); // $NON-NLS-1$ //$NON-NLS-2$ // right single quotation - // mark - // Latin 1 entities - badEntities.put(" ", "\u00A0"); // $NON-NLS-1$ //$NON-NLS-2$ // no-break space - badEntities.put("¡", "\u00A1"); // $NON-NLS-1$ //$NON-NLS-2$ // inverted exclamation - // mark - badEntities.put("¢", "\u00A2"); // $NON-NLS-1$ //$NON-NLS-2$ // cent sign - badEntities.put("£", "\u00A3"); // $NON-NLS-1$ //$NON-NLS-2$ // pound sign - badEntities.put("¤", "\u00A4"); // $NON-NLS-1$ //$NON-NLS-2$ // currency sign - badEntities.put("¥", "\u00A5"); // $NON-NLS-1$ //$NON-NLS-2$ // yen sign - badEntities.put("¦", "\u00A6"); // $NON-NLS-1$ //$NON-NLS-2$ // broken vertical bar - badEntities.put("§", "\u00A7"); // $NON-NLS-1$ //$NON-NLS-2$ // section sign - badEntities.put("¨", "\u00A8"); // $NON-NLS-1$ //$NON-NLS-2$ // diaeresis - badEntities.put("©", "\u00A9"); // $NON-NLS-1$ //$NON-NLS-2$ // copyright sign - badEntities.put("ª", "\u00AA"); // $NON-NLS-1$ //$NON-NLS-2$ // feminine ordinal - // indicator - badEntities.put("«", "\u00AB"); // $NON-NLS-2$ // left-pointing double angle - // quotation mark - badEntities.put("¬", "\u00AC"); // $NON-NLS-1$ //$NON-NLS-2$ // not sign - badEntities.put("­", "\u00AD"); // $NON-NLS-1$ //$NON-NLS-2$ // soft hyphen - badEntities.put("®", "\u00AE"); // $NON-NLS-1$ //$NON-NLS-2$ // registered sign - badEntities.put("¯", "\u00AF"); // $NON-NLS-1$ //$NON-NLS-2$ // macron - badEntities.put("°", "\u00B0"); // $NON-NLS-1$ //$NON-NLS-2$ // degree sign - badEntities.put("±", "\u00B1"); // $NON-NLS-1$ //$NON-NLS-2$ // plus-minus sign - badEntities.put("²", "\u00B2"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript two - badEntities.put("³", "\u00B3"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript three - badEntities.put("´", "\u00B4"); // $NON-NLS-1$ //$NON-NLS-2$ // acute accent - badEntities.put("µ", "\u00B5"); // $NON-NLS-1$ //$NON-NLS-2$ // micro sign - badEntities.put("¶", "\u00B6"); // $NON-NLS-1$ //$NON-NLS-2$ // pilcrow sign - badEntities.put("·", "\u00B7"); // $NON-NLS-1$ //$NON-NLS-2$ // middle dot - badEntities.put("¸", "\u00B8"); // $NON-NLS-1$ //$NON-NLS-2$ // cedilla - badEntities.put("¹", "\u00B9"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript one - badEntities.put("º", "\u00BA"); // $NON-NLS-1$ //$NON-NLS-2$ // masculine ordinal - // indicator - badEntities.put("»", "\u00BB"); // $NON-NLS-2$ // right-pointing double - // angle quotation - // mark - badEntities.put("¼", "\u00BC"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one - // quarter - badEntities.put("½", "\u00BD"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one - // half - badEntities.put("¾", "\u00BE"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction three - // quarters - badEntities.put("¿", "\u00BF"); // $NON-NLS-1$ //$NON-NLS-2$ // inverted question - // mark - badEntities.put("À", "\u00C0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // A - // with grave - badEntities.put("Á", "\u00C1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // A - // with acute - badEntities.put("Â", "\u00C2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A - // with circumflex - badEntities.put("Ã", "\u00C3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // A - // with tilde - badEntities.put("Ä", "\u00C4"); // $NON-NLS-1$ 
//$NON-NLS-2$ // latin capital letter A - // with - // diaeresis - badEntities.put("Å", "\u00C5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A - // with ring above - badEntities.put("Æ", "\u00C6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // AE - badEntities.put("Ç", "\u00C7"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // C - // with cedilla - badEntities.put("È", "\u00C8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // E - // with grave - badEntities.put("É", "\u00C9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // E - // with acute - badEntities.put("Ê", "\u00CA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E - // with circumflex - badEntities.put("Ë", "\u00CB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E - // with - // diaeresis - badEntities.put("Ì", "\u00CC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // I - // with grave - badEntities.put("Í", "\u00CD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // I - // with acute - badEntities.put("Î", "\u00CE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I - // with circumflex - badEntities.put("Ï", "\u00CF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I - // with - // diaeresis - badEntities.put("Ð", "\u00D0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter ETH - badEntities.put("Ñ", "\u00D1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // N - // with tilde - badEntities.put("Ò", "\u00D2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // O - // with grave - badEntities.put("Ó", "\u00D3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // O - // with acute - badEntities.put("Ô", "\u00D4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O - // with circumflex - badEntities.put("Õ", "\u00D5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // O - // with tilde - badEntities.put("Ö", "\u00D6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O - // with - // diaeresis - badEntities.put("×", "\u00D7"); // $NON-NLS-1$ //$NON-NLS-2$ // multiplication sign - badEntities.put("Ø", "\u00D8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // O - // with stroke - badEntities.put("Ù", "\u00D9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // U - // with grave - badEntities.put("Ú", "\u00DA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // U - // with acute - badEntities.put("Û", "\u00DB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U - // with circumflex - badEntities.put("Ü", "\u00DC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U - // with - // diaeresis - badEntities.put("Ý", "\u00DD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // Y - // with acute - badEntities.put("Þ", "\u00DE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // THORN - badEntities.put("ß", "\u00DF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter - // sharp s - badEntities.put("à", "\u00E0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // grave - badEntities.put("á", "\u00E1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // acute - badEntities.put("â", "\u00E2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // circumflex - badEntities.put("ã", "\u00E3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // tilde - badEntities.put("ä", "\u00E4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // diaeresis - badEntities.put("å", "\u00E5"); 
// $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // ring above - badEntities.put("æ", "\u00E6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter ae - badEntities.put("ç", "\u00E7"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter c - // with - // cedilla - badEntities.put("è", "\u00E8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e - // with - // grave - badEntities.put("é", "\u00E9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e - // with - // acute - badEntities.put("ê", "\u00EA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e - // with - // circumflex - badEntities.put("ë", "\u00EB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e - // with - // diaeresis - badEntities.put("ì", "\u00EC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i - // with - // grave - badEntities.put("í", "\u00ED"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i - // with - // acute - badEntities.put("î", "\u00EE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i - // with - // circumflex - badEntities.put("ï", "\u00EF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i - // with - // diaeresis - badEntities.put("ð", "\u00F0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter eth - badEntities.put("ñ", "\u00F1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter n - // with - // tilde - badEntities.put("ò", "\u00F2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // grave - badEntities.put("ó", "\u00F3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // acute - badEntities.put("ô", "\u00F4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // circumflex - badEntities.put("õ", "\u00F5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // tilde - badEntities.put("ö", "\u00F6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // diaeresis - badEntities.put("÷", "\u00F7"); // $NON-NLS-1$ //$NON-NLS-2$ // division sign - badEntities.put("ø", "\u00F8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // stroke - badEntities.put("ù", "\u00F9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u - // with - // grave - badEntities.put("ú", "\u00FA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u - // with - // acute - badEntities.put("û", "\u00FB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u - // with - // circumflex - badEntities.put("ü", "\u00FC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u - // with - // diaeresis - badEntities.put("ý", "\u00FD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter y - // with - // acute - badEntities.put("þ", "\u00FE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter - // thorn - badEntities.put("ÿ", "\u00FF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter y - // with - // diaeresis - } + static { + // pre-defined XML entities + goodEntities.add("""); // $NON-NLS-1$ // quotation mark + goodEntities.add("&"); // $NON-NLS-1$ // ampersand + goodEntities.add("<"); // $NON-NLS-1$ // less-than sign + goodEntities.add(">"); // $NON-NLS-1$ // greater-than sign + // control entities + // badEntities.put(" ", ""); + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("€", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‚", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("ƒ", " "); // 
$NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("„", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("…", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("†", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‡", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("ˆ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‰", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Š", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‹", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Œ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Ž", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("‘", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("’", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("“", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("”", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("•", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("–", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("—", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("˜", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("™", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("š", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("›", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("œ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("ž", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + badEntities.put("Ÿ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character + // misc entities + badEntities.put("€", "\u20AC"); // $NON-NLS-1$ //$NON-NLS-2$ // euro + badEntities.put("‘", "\u2018"); // $NON-NLS-1$ //$NON-NLS-2$ // left single quotation + // mark + badEntities.put("’", "\u2019"); // $NON-NLS-1$ //$NON-NLS-2$ // right single quotation + // mark + // Latin 1 entities + badEntities.put(" ", "\u00A0"); // $NON-NLS-1$ //$NON-NLS-2$ // no-break space + badEntities.put("¡", "\u00A1"); // $NON-NLS-1$ //$NON-NLS-2$ // inverted exclamation + // mark + badEntities.put("¢", "\u00A2"); // $NON-NLS-1$ //$NON-NLS-2$ // cent sign + badEntities.put("£", "\u00A3"); // $NON-NLS-1$ //$NON-NLS-2$ // pound sign + badEntities.put("¤", "\u00A4"); // $NON-NLS-1$ //$NON-NLS-2$ // currency sign + badEntities.put("¥", "\u00A5"); // $NON-NLS-1$ //$NON-NLS-2$ // yen sign + badEntities.put("¦", "\u00A6"); // $NON-NLS-1$ //$NON-NLS-2$ // broken vertical bar + badEntities.put("§", "\u00A7"); // $NON-NLS-1$ //$NON-NLS-2$ // section sign + badEntities.put("¨", "\u00A8"); // $NON-NLS-1$ //$NON-NLS-2$ // diaeresis + badEntities.put("©", "\u00A9"); // $NON-NLS-1$ //$NON-NLS-2$ // copyright sign + badEntities.put("ª", "\u00AA"); 
// $NON-NLS-1$ //$NON-NLS-2$ // feminine ordinal + // indicator + badEntities.put("«", "\u00AB"); // $NON-NLS-2$ // left-pointing double angle + // quotation mark + badEntities.put("¬", "\u00AC"); // $NON-NLS-1$ //$NON-NLS-2$ // not sign + badEntities.put("­", "\u00AD"); // $NON-NLS-1$ //$NON-NLS-2$ // soft hyphen + badEntities.put("®", "\u00AE"); // $NON-NLS-1$ //$NON-NLS-2$ // registered sign + badEntities.put("¯", "\u00AF"); // $NON-NLS-1$ //$NON-NLS-2$ // macron + badEntities.put("°", "\u00B0"); // $NON-NLS-1$ //$NON-NLS-2$ // degree sign + badEntities.put("±", "\u00B1"); // $NON-NLS-1$ //$NON-NLS-2$ // plus-minus sign + badEntities.put("²", "\u00B2"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript two + badEntities.put("³", "\u00B3"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript three + badEntities.put("´", "\u00B4"); // $NON-NLS-1$ //$NON-NLS-2$ // acute accent + badEntities.put("µ", "\u00B5"); // $NON-NLS-1$ //$NON-NLS-2$ // micro sign + badEntities.put("¶", "\u00B6"); // $NON-NLS-1$ //$NON-NLS-2$ // pilcrow sign + badEntities.put("·", "\u00B7"); // $NON-NLS-1$ //$NON-NLS-2$ // middle dot + badEntities.put("¸", "\u00B8"); // $NON-NLS-1$ //$NON-NLS-2$ // cedilla + badEntities.put("¹", "\u00B9"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript one + badEntities.put("º", "\u00BA"); // $NON-NLS-1$ //$NON-NLS-2$ // masculine ordinal + // indicator + badEntities.put("»", "\u00BB"); // $NON-NLS-2$ // right-pointing double + // angle quotation + // mark + badEntities.put("¼", "\u00BC"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one + // quarter + badEntities.put("½", "\u00BD"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one + // half + badEntities.put("¾", "\u00BE"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction three + // quarters + badEntities.put("¿", "\u00BF"); // $NON-NLS-1$ //$NON-NLS-2$ // inverted question + // mark + badEntities.put("À", "\u00C0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // A + // with grave + badEntities.put("Á", "\u00C1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // A + // with acute + badEntities.put("Â", "\u00C2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A + // with circumflex + badEntities.put("Ã", "\u00C3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // A + // with tilde + badEntities.put("Ä", "\u00C4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A + // with + // diaeresis + badEntities.put("Å", "\u00C5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A + // with ring above + badEntities.put("Æ", "\u00C6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // AE + badEntities.put("Ç", "\u00C7"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // C + // with cedilla + badEntities.put("È", "\u00C8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // E + // with grave + badEntities.put("É", "\u00C9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // E + // with acute + badEntities.put("Ê", "\u00CA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E + // with circumflex + badEntities.put("Ë", "\u00CB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E + // with + // diaeresis + badEntities.put("Ì", "\u00CC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // I + // with grave + badEntities.put("Í", "\u00CD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // I + // with acute + badEntities.put("Î", "\u00CE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I + // with circumflex + badEntities.put("Ï", "\u00CF"); // $NON-NLS-1$ 
//$NON-NLS-2$ // latin capital letter I + // with + // diaeresis + badEntities.put("Ð", "\u00D0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter ETH + badEntities.put("Ñ", "\u00D1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // N + // with tilde + badEntities.put("Ò", "\u00D2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // O + // with grave + badEntities.put("Ó", "\u00D3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // O + // with acute + badEntities.put("Ô", "\u00D4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O + // with circumflex + badEntities.put("Õ", "\u00D5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // O + // with tilde + badEntities.put("Ö", "\u00D6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O + // with + // diaeresis + badEntities.put("×", "\u00D7"); // $NON-NLS-1$ //$NON-NLS-2$ // multiplication sign + badEntities.put("Ø", "\u00D8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // O + // with stroke + badEntities.put("Ù", "\u00D9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // U + // with grave + badEntities.put("Ú", "\u00DA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // U + // with acute + badEntities.put("Û", "\u00DB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U + // with circumflex + badEntities.put("Ü", "\u00DC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U + // with + // diaeresis + badEntities.put("Ý", "\u00DD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // Y + // with acute + badEntities.put("Þ", "\u00DE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter + // THORN + badEntities.put("ß", "\u00DF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter + // sharp s + badEntities.put("à", "\u00E0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a + // with + // grave + badEntities.put("á", "\u00E1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a + // with + // acute + badEntities.put("â", "\u00E2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a + // with + // circumflex + badEntities.put("ã", "\u00E3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a + // with + // tilde + badEntities.put("ä", "\u00E4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a + // with + // diaeresis + badEntities.put("å", "\u00E5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a + // with + // ring above + badEntities.put("æ", "\u00E6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter ae + badEntities.put("ç", "\u00E7"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter c + // with + // cedilla + badEntities.put("è", "\u00E8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e + // with + // grave + badEntities.put("é", "\u00E9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e + // with + // acute + badEntities.put("ê", "\u00EA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e + // with + // circumflex + badEntities.put("ë", "\u00EB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e + // with + // diaeresis + badEntities.put("ì", "\u00EC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i + // with + // grave + badEntities.put("í", "\u00ED"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i + // with + // acute + badEntities.put("î", "\u00EE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i + // with + // circumflex + badEntities.put("ï", "\u00EF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i + // with + // diaeresis + badEntities.put("ð", "\u00F0"); // 
$NON-NLS-1$ //$NON-NLS-2$ // latin small letter eth + badEntities.put("ñ", "\u00F1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter n + // with + // tilde + badEntities.put("ò", "\u00F2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o + // with + // grave + badEntities.put("ó", "\u00F3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o + // with + // acute + badEntities.put("ô", "\u00F4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o + // with + // circumflex + badEntities.put("õ", "\u00F5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o + // with + // tilde + badEntities.put("ö", "\u00F6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o + // with + // diaeresis + badEntities.put("÷", "\u00F7"); // $NON-NLS-1$ //$NON-NLS-2$ // division sign + badEntities.put("ø", "\u00F8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o + // with + // stroke + badEntities.put("ù", "\u00F9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u + // with + // grave + badEntities.put("ú", "\u00FA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u + // with + // acute + badEntities.put("û", "\u00FB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u + // with + // circumflex + badEntities.put("ü", "\u00FC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u + // with + // diaeresis + badEntities.put("ý", "\u00FD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter y + // with + // acute + badEntities.put("þ", "\u00FE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter + // thorn + badEntities.put("ÿ", "\u00FF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter y + // with + // diaeresis + } - /** - * For each entity in the input that is not allowed in XML, replace the entity with its unicode - * equivalent or remove it. For each instance of a bare {@literal &}, replace it with {@literal - * &
} XML only allows 4 entities: {@literal &amp;}, {@literal &quot;}, {@literal - * &lt;} and {@literal &gt;}. - * - * @param broken the string to handle entities - * @return the string with entities appropriately fixed up - */ - public static String cleanAllEntities(final String broken) { - if (broken == null) { - return null; - } + /** + * For each entity in the input that is not allowed in XML, replace the entity with its unicode equivalent or remove + * it. For each instance of a bare {@literal &}, replace it with {@literal + * &
+ } XML only allows 4 entities: {@literal &amp;}, {@literal &quot;}, {@literal + * &lt;} and {@literal &gt;}. + * + * @param broken the string to handle entities + * @return the string with entities appropriately fixed up + */ + public static String cleanAllEntities(final String broken) { + if (broken == null) { + return null; + } - String working = invalidControlCharPattern.matcher(broken).replaceAll(""); - working = invalidCharacterPattern.matcher(working).replaceAll(""); + String working = invalidControlCharPattern.matcher(broken).replaceAll(""); + working = invalidCharacterPattern.matcher(working).replaceAll(""); - int cleanfrom = 0; + int cleanfrom = 0; - while (true) { - int amp = working.indexOf('&', cleanfrom); - // If there are no more amps then we are done - if (amp == -1) { - break; - } - // Skip references of the kind &#ddd; - if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) { - cleanfrom = working.indexOf(';', amp) + 1; - continue; - } - int i = amp + 1; - while (true) { - // if we are at the end of the string then just escape the '&'; - if (i >= working.length()) { - return working.substring(0, amp) + "&" + working.substring(amp + 1); // $NON-NLS-1$ - } - // if we have come to a ; then we have an entity - // If it is something that xml can't handle then replace it. - final char c = working.charAt(i); - if (c == ';') { - final String entity = working.substring(amp, i + 1); - final String replace = handleEntity(entity); - working = working.substring(0, amp) + replace + working.substring(i + 1); - break; - } - // Did we end an entity without finding a closing ; - // Then treat it as an '&' that needs to be replaced with & - if (!Character.isLetterOrDigit(c)) { - working = working.substring(0, amp) + "&" + working.substring(amp + 1); // $NON-NLS-1$ - amp = i + 4; // account for the 4 extra characters - break; - } - i++; - } - cleanfrom = amp + 1; - } + while (true) { + int amp = working.indexOf('&', cleanfrom); + // If there are no more amps then we are done + if (amp == -1) { + break; + } + // Skip references of the kind &#ddd; + if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) { + cleanfrom = working.indexOf(';', amp) + 1; + continue; + } + int i = amp + 1; + while (true) { + // if we are at the end of the string then just escape the '&'; + if (i >= working.length()) { + return working.substring(0, amp) + "&" + working.substring(amp + 1); // $NON-NLS-1$ + } + // if we have come to a ; then we have an entity + // If it is something that xml can't handle then replace it. 
+ final char c = working.charAt(i); + if (c == ';') { + final String entity = working.substring(amp, i + 1); + final String replace = handleEntity(entity); + working = working.substring(0, amp) + replace + working.substring(i + 1); + break; + } + // Did we end an entity without finding a closing ; + // Then treat it as an '&' that needs to be replaced with & + if (!Character.isLetterOrDigit(c)) { + working = working.substring(0, amp) + "&" + working.substring(amp + 1); // $NON-NLS-1$ + amp = i + 4; // account for the 4 extra characters + break; + } + i++; + } + cleanfrom = amp + 1; + } - if (Pattern.compile("<<").matcher(working).find()) { - working = working.replaceAll("<<", "<<"); - } + if (Pattern.compile("<<").matcher(working).find()) { + working = working.replaceAll("<<", "<<"); + } - if (Pattern.compile(">>").matcher(working).find()) { - working = working.replaceAll(">>", ">>"); - } + if (Pattern.compile(">>").matcher(working).find()) { + working = working.replaceAll(">>", ">>"); + } - return working; - } + return working; + } - /** - * Replace entity with its unicode equivalent, if it is not a valid XML entity. Otherwise strip it - * out. XML only allows 4 entities: &amp;, &quot;, &lt; and &gt;. - * - * @param entity the entity to be replaced - * @return the substitution for the entity, either itself, the unicode equivalent or an empty - * string. - */ - private static String handleEntity(final String entity) { - if (goodEntities.contains(entity)) { - return entity; - } + /** + * Replace entity with its unicode equivalent, if it is not a valid XML entity. Otherwise strip it out. XML only + * allows 4 entities: &amp;, &quot;, &lt; and &gt;. + * + * @param entity the entity to be replaced + * @return the substitution for the entity, either itself, the unicode equivalent or an empty string. + */ + private static String handleEntity(final String entity) { + if (goodEntities.contains(entity)) { + return entity; + } - final String replace = badEntities.get(entity); - if (replace != null) { - return replace; - } + final String replace = badEntities.get(entity); + if (replace != null) { + return replace; + } - return replace != null ? replace : ""; - } + return replace != null ? 
replace : ""; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java index c568714de..f4bf78e18 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java @@ -1,71 +1,74 @@ + package eu.dnetlib.dhp.transformation; +import java.io.ByteArrayInputStream; +import java.io.StringWriter; +import java.util.Map; + +import javax.xml.transform.stream.StreamSource; + +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.util.LongAccumulator; + import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.transformation.functions.Cleaner; import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; -import java.io.ByteArrayInputStream; -import java.io.StringWriter; -import java.util.Map; -import javax.xml.transform.stream.StreamSource; import net.sf.saxon.s9api.*; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.util.LongAccumulator; public class TransformFunction implements MapFunction { - private final LongAccumulator totalItems; - private final LongAccumulator errorItems; - private final LongAccumulator transformedItems; - private final String transformationRule; - private final Cleaner cleanFunction; + private final LongAccumulator totalItems; + private final LongAccumulator errorItems; + private final LongAccumulator transformedItems; + private final String transformationRule; + private final Cleaner cleanFunction; - private final long dateOfTransformation; + private final long dateOfTransformation; - public TransformFunction( - LongAccumulator totalItems, - LongAccumulator errorItems, - LongAccumulator transformedItems, - final String transformationRule, - long dateOfTransformation, - final Map vocabularies) - throws Exception { - this.totalItems = totalItems; - this.errorItems = errorItems; - this.transformedItems = transformedItems; - this.transformationRule = transformationRule; - this.dateOfTransformation = dateOfTransformation; - cleanFunction = new Cleaner(vocabularies); - } + public TransformFunction( + LongAccumulator totalItems, + LongAccumulator errorItems, + LongAccumulator transformedItems, + final String transformationRule, + long dateOfTransformation, + final Map vocabularies) + throws Exception { + this.totalItems = totalItems; + this.errorItems = errorItems; + this.transformedItems = transformedItems; + this.transformationRule = transformationRule; + this.dateOfTransformation = dateOfTransformation; + cleanFunction = new Cleaner(vocabularies); + } - @Override - public MetadataRecord call(MetadataRecord value) { - totalItems.add(1); - try { - Processor processor = new Processor(false); - processor.registerExtensionFunction(cleanFunction); - final XsltCompiler comp = processor.newXsltCompiler(); - XsltExecutable xslt = - comp.compile(new StreamSource(new ByteArrayInputStream(transformationRule.getBytes()))); - XdmNode source = - processor - .newDocumentBuilder() - .build(new StreamSource(new ByteArrayInputStream(value.getBody().getBytes()))); - XsltTransformer trans = xslt.load(); - trans.setInitialContextNode(source); - final StringWriter output = new StringWriter(); - Serializer out = processor.newSerializer(output); - out.setOutputProperty(Serializer.Property.METHOD, "xml"); - 
out.setOutputProperty(Serializer.Property.INDENT, "yes"); - trans.setDestination(out); - trans.transform(); - final String xml = output.toString(); - value.setBody(xml); - value.setDateOfTransformation(dateOfTransformation); - transformedItems.add(1); - return value; - } catch (Throwable e) { - errorItems.add(1); - return null; - } - } + @Override + public MetadataRecord call(MetadataRecord value) { + totalItems.add(1); + try { + Processor processor = new Processor(false); + processor.registerExtensionFunction(cleanFunction); + final XsltCompiler comp = processor.newXsltCompiler(); + XsltExecutable xslt = comp + .compile(new StreamSource(new ByteArrayInputStream(transformationRule.getBytes()))); + XdmNode source = processor + .newDocumentBuilder() + .build(new StreamSource(new ByteArrayInputStream(value.getBody().getBytes()))); + XsltTransformer trans = xslt.load(); + trans.setInitialContextNode(source); + final StringWriter output = new StringWriter(); + Serializer out = processor.newSerializer(output); + out.setOutputProperty(Serializer.Property.METHOD, "xml"); + out.setOutputProperty(Serializer.Property.INDENT, "yes"); + trans.setDestination(out); + trans.transform(); + final String xml = output.toString(); + value.setBody(xml); + value.setDateOfTransformation(dateOfTransformation); + transformedItems.add(1); + return value; + } catch (Throwable e) { + errorItems.add(1); + return null; + } + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index 550136247..5f39717d0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -1,17 +1,11 @@ + package eu.dnetlib.dhp.transformation; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; -import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper; -import eu.dnetlib.dhp.utils.DHPUtils; -import eu.dnetlib.message.Message; -import eu.dnetlib.message.MessageManager; -import eu.dnetlib.message.MessageType; import java.io.ByteArrayInputStream; import java.util.HashMap; import java.util.Map; import java.util.Objects; + import org.apache.commons.cli.*; import org.apache.commons.io.IOUtils; import org.apache.spark.sql.Dataset; @@ -24,78 +18,87 @@ import org.dom4j.DocumentException; import org.dom4j.Node; import org.dom4j.io.SAXReader; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; +import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper; +import eu.dnetlib.dhp.utils.DHPUtils; +import eu.dnetlib.message.Message; +import eu.dnetlib.message.MessageManager; +import eu.dnetlib.message.MessageType; + public class TransformSparkJobNode { - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - TransformSparkJobNode.class.getResourceAsStream( - "/eu/dnetlib/dhp/transformation/transformation_input_parameters.json"))); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + 
.toString( + TransformSparkJobNode.class + .getResourceAsStream( + "/eu/dnetlib/dhp/transformation/transformation_input_parameters.json"))); - parser.parseArgument(args); + parser.parseArgument(args); - final String inputPath = parser.get("input"); - final String outputPath = parser.get("output"); - final String workflowId = parser.get("workflowId"); - final String trasformationRule = - extractXSLTFromTR( - Objects.requireNonNull(DHPUtils.decompressString(parser.get("transformationRule")))); - final String master = parser.get("master"); - final String rabbitUser = parser.get("rabbitUser"); - final String rabbitPassword = parser.get("rabbitPassword"); - final String rabbitHost = parser.get("rabbitHost"); - final String rabbitReportQueue = parser.get("rabbitReportQueue"); - final long dateOfCollection = new Long(parser.get("dateOfCollection")); - final boolean test = - parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest")); + final String inputPath = parser.get("input"); + final String outputPath = parser.get("output"); + final String workflowId = parser.get("workflowId"); + final String trasformationRule = extractXSLTFromTR( + Objects.requireNonNull(DHPUtils.decompressString(parser.get("transformationRule")))); + final String master = parser.get("master"); + final String rabbitUser = parser.get("rabbitUser"); + final String rabbitPassword = parser.get("rabbitPassword"); + final String rabbitHost = parser.get("rabbitHost"); + final String rabbitReportQueue = parser.get("rabbitReportQueue"); + final long dateOfCollection = new Long(parser.get("dateOfCollection")); + final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest")); - final SparkSession spark = - SparkSession.builder().appName("TransformStoreSparkJob").master(master).getOrCreate(); + final SparkSession spark = SparkSession + .builder() + .appName("TransformStoreSparkJob") + .master(master) + .getOrCreate(); - final Encoder encoder = Encoders.bean(MetadataRecord.class); - final Dataset mdstoreInput = - spark.read().format("parquet").load(inputPath).as(encoder); - final LongAccumulator totalItems = spark.sparkContext().longAccumulator("TotalItems"); - final LongAccumulator errorItems = spark.sparkContext().longAccumulator("errorItems"); - final LongAccumulator transformedItems = - spark.sparkContext().longAccumulator("transformedItems"); - final Map vocabularies = new HashMap<>(); - vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); - final TransformFunction transformFunction = - new TransformFunction( - totalItems, - errorItems, - transformedItems, - trasformationRule, - dateOfCollection, - vocabularies); - mdstoreInput.map(transformFunction, encoder).write().format("parquet").save(outputPath); - if (rabbitHost != null) { - System.out.println("SEND FINAL REPORT"); - final Map reportMap = new HashMap<>(); - reportMap.put("inputItem", "" + totalItems.value()); - reportMap.put("invalidRecords", "" + errorItems.value()); - reportMap.put("mdStoreSize", "" + transformedItems.value()); - System.out.println(new Message(workflowId, "Transform", MessageType.REPORT, reportMap)); - if (!test) { - final MessageManager manager = - new MessageManager(rabbitHost, rabbitUser, rabbitPassword, false, false, null); - manager.sendMessage( - new Message(workflowId, "Transform", MessageType.REPORT, reportMap), - rabbitReportQueue, - true, - false); - manager.close(); - } - } - } + final Encoder encoder = Encoders.bean(MetadataRecord.class); + final Dataset 
mdstoreInput = spark.read().format("parquet").load(inputPath).as(encoder); + final LongAccumulator totalItems = spark.sparkContext().longAccumulator("TotalItems"); + final LongAccumulator errorItems = spark.sparkContext().longAccumulator("errorItems"); + final LongAccumulator transformedItems = spark.sparkContext().longAccumulator("transformedItems"); + final Map vocabularies = new HashMap<>(); + vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); + final TransformFunction transformFunction = new TransformFunction( + totalItems, + errorItems, + transformedItems, + trasformationRule, + dateOfCollection, + vocabularies); + mdstoreInput.map(transformFunction, encoder).write().format("parquet").save(outputPath); + if (rabbitHost != null) { + System.out.println("SEND FINAL REPORT"); + final Map reportMap = new HashMap<>(); + reportMap.put("inputItem", "" + totalItems.value()); + reportMap.put("invalidRecords", "" + errorItems.value()); + reportMap.put("mdStoreSize", "" + transformedItems.value()); + System.out.println(new Message(workflowId, "Transform", MessageType.REPORT, reportMap)); + if (!test) { + final MessageManager manager = new MessageManager(rabbitHost, rabbitUser, rabbitPassword, false, false, + null); + manager + .sendMessage( + new Message(workflowId, "Transform", MessageType.REPORT, reportMap), + rabbitReportQueue, + true, + false); + manager.close(); + } + } + } - private static String extractXSLTFromTR(final String tr) throws DocumentException { - SAXReader reader = new SAXReader(); - Document document = reader.read(new ByteArrayInputStream(tr.getBytes())); - Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); - return node.asXML(); - } + private static String extractXSLTFromTR(final String tr) throws DocumentException { + SAXReader reader = new SAXReader(); + Document document = reader.read(new ByteArrayInputStream(tr.getBytes())); + Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); + return node.asXML(); + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java index 09e77613c..7f9b6646c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java @@ -1,48 +1,52 @@ + package eu.dnetlib.dhp.transformation.functions; +import java.util.Map; +import java.util.Optional; + import eu.dnetlib.dhp.transformation.vocabulary.Term; import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; -import java.util.Map; -import java.util.Optional; import net.sf.saxon.s9api.*; import scala.Serializable; public class Cleaner implements ExtensionFunction, Serializable { - private final Map vocabularies; + private final Map vocabularies; - public Cleaner(Map vocabularies) { - this.vocabularies = vocabularies; - } + public Cleaner(Map vocabularies) { + this.vocabularies = vocabularies; + } - @Override - public QName getName() { - return new QName("http://eu/dnetlib/trasform/extension", "clean"); - } + @Override + public QName getName() { + return new QName("http://eu/dnetlib/trasform/extension", "clean"); + } - @Override - public SequenceType getResultType() { - return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE_OR_MORE); - } + @Override + public SequenceType getResultType() { 
+ return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE_OR_MORE); + } - @Override - public SequenceType[] getArgumentTypes() { - return new SequenceType[] { - SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE), - SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE) - }; - } + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { + SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE), + SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE) + }; + } - @Override - public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { - final String currentValue = xdmValues[0].itemAt(0).getStringValue(); - final String vocabularyName = xdmValues[1].itemAt(0).getStringValue(); - Optional cleanedValue = - vocabularies.get(vocabularyName).getTerms().stream() - .filter(it -> it.getNativeName().equalsIgnoreCase(currentValue)) - .findAny(); + @Override + public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { + final String currentValue = xdmValues[0].itemAt(0).getStringValue(); + final String vocabularyName = xdmValues[1].itemAt(0).getStringValue(); + Optional cleanedValue = vocabularies + .get(vocabularyName) + .getTerms() + .stream() + .filter(it -> it.getNativeName().equalsIgnoreCase(currentValue)) + .findAny(); - return new XdmAtomicValue( - cleanedValue.isPresent() ? cleanedValue.get().getCode() : currentValue); - } + return new XdmAtomicValue( + cleanedValue.isPresent() ? cleanedValue.get().getCode() : currentValue); + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java index 813a77941..b5ac18169 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java @@ -1,52 +1,53 @@ + package eu.dnetlib.dhp.transformation.vocabulary; import java.io.Serializable; public class Term implements Serializable { - private String englishName; - private String nativeName; - private String encoding; - private String code; - private String synonyms; + private String englishName; + private String nativeName; + private String encoding; + private String code; + private String synonyms; - public String getEnglishName() { - return englishName; - } + public String getEnglishName() { + return englishName; + } - public void setEnglishName(String englishName) { - this.englishName = englishName; - } + public void setEnglishName(String englishName) { + this.englishName = englishName; + } - public String getNativeName() { - return nativeName; - } + public String getNativeName() { + return nativeName; + } - public void setNativeName(String nativeName) { - this.nativeName = nativeName; - } + public void setNativeName(String nativeName) { + this.nativeName = nativeName; + } - public String getEncoding() { - return encoding; - } + public String getEncoding() { + return encoding; + } - public void setEncoding(String encoding) { - this.encoding = encoding; - } + public void setEncoding(String encoding) { + this.encoding = encoding; + } - public String getCode() { - return code; - } + public String getCode() { + return code; + } - public void setCode(String code) { - this.code = code; - } + public void setCode(String code) { + this.code = code; + } - public String getSynonyms() { - return 
synonyms; - } + public String getSynonyms() { + return synonyms; + } - public void setSynonyms(String synonyms) { - this.synonyms = synonyms; - } + public void setSynonyms(String synonyms) { + this.synonyms = synonyms; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java index 0579c8244..a9da6b725 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.transformation.vocabulary; import java.io.Serializable; @@ -5,49 +6,49 @@ import java.util.List; public class Vocabulary implements Serializable { - private String id; - private String name; - private String description; - private String code; - private List terms; + private String id; + private String name; + private String description; + private String code; + private List terms; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) { + this.name = name; + } - public String getDescription() { - return description; - } + public String getDescription() { + return description; + } - public void setDescription(String description) { - this.description = description; - } + public void setDescription(String description) { + this.description = description; + } - public String getCode() { - return code; - } + public String getCode() { + return code; + } - public void setCode(String code) { - this.code = code; - } + public void setCode(String code) { + this.code = code; + } - public List getTerms() { - return terms; - } + public List getTerms() { + return terms; + } - public void setTerms(List terms) { - this.terms = terms; - } + public void setTerms(List terms) { + this.terms = terms; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java index 349fc53de..10e959be0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java @@ -1,21 +1,24 @@ + package eu.dnetlib.dhp.transformation.vocabulary; -import com.fasterxml.jackson.databind.ObjectMapper; import java.io.Serializable; import java.net.URL; import java.nio.charset.Charset; + import org.apache.commons.io.IOUtils; +import com.fasterxml.jackson.databind.ObjectMapper; + public class VocabularyHelper implements Serializable { - private static final String OPENAIRE_URL = "http://api.openaire.eu/vocabularies/%s.json"; + private static final String OPENAIRE_URL = "http://api.openaire.eu/vocabularies/%s.json"; - public static Vocabulary getVocabularyFromAPI(final String vocabularyName) throws Exception { - final URL url = new URL(String.format(OPENAIRE_URL, vocabularyName)); + public static Vocabulary getVocabularyFromAPI(final String vocabularyName) throws Exception 
{ + final URL url = new URL(String.format(OPENAIRE_URL, vocabularyName)); - final String response = IOUtils.toString(url, Charset.defaultCharset()); - final ObjectMapper jsonMapper = new ObjectMapper(); - final Vocabulary vocabulary = jsonMapper.readValue(response, Vocabulary.class); - return vocabulary; - } + final String response = IOUtils.toString(url, Charset.defaultCharset()); + final ObjectMapper jsonMapper = new ObjectMapper(); + final Vocabulary vocabulary = jsonMapper.readValue(response, Vocabulary.class); + return vocabulary; + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java index cbf0cfd01..44364b30a 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/CollectionJobTest.java @@ -1,116 +1,121 @@ + package eu.dnetlib.dhp.collection; import static org.junit.jupiter.api.Assertions.*; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.model.mdstore.Provenance; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; + import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.model.mdstore.Provenance; + public class CollectionJobTest { - private Path testDir; + private Path testDir; - @BeforeEach - public void setup() throws IOException { - testDir = Files.createTempDirectory("dhp-collection"); - } + @BeforeEach + public void setup() throws IOException { + testDir = Files.createTempDirectory("dhp-collection"); + } - @AfterEach - public void teadDown() throws IOException { - FileUtils.deleteDirectory(testDir.toFile()); - } + @AfterEach + public void teadDown() throws IOException { + FileUtils.deleteDirectory(testDir.toFile()); + } - @Test - public void tesCollection() throws Exception { - final Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix"); - GenerateNativeStoreSparkJob.main( - new String[] { - "-mt", - "local", - "-w", - "wid", - "-e", - "XML", - "-d", - "" + System.currentTimeMillis(), - "-p", - new ObjectMapper().writeValueAsString(provenance), - "-x", - "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "-i", - this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(), - "-o", - testDir.toString() + "/store", - "-t", - "true", - "-ru", - "", - "-rp", - "", - "-rh", - "", - "-ro", - "", - "-rr", - "" - }); - System.out.println(new ObjectMapper().writeValueAsString(provenance)); - } + @Test + public void tesCollection() throws Exception { + final Provenance provenance = new Provenance("pippo", "puppa", "ns_prefix"); + GenerateNativeStoreSparkJob + .main( + new String[] { + "-mt", + "local", + "-w", + "wid", + "-e", + "XML", + "-d", + "" + System.currentTimeMillis(), + "-p", + new ObjectMapper().writeValueAsString(provenance), + "-x", + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "-i", + this.getClass().getResource("/eu/dnetlib/dhp/collection/native.seq").toString(), + "-o", + testDir.toString() + "/store", + "-t", + 
"true", + "-ru", + "", + "-rp", + "", + "-rh", + "", + "-ro", + "", + "-rr", + "" + }); + System.out.println(new ObjectMapper().writeValueAsString(provenance)); + } - @Test - public void testGenerationMetadataRecord() throws Exception { + @Test + public void testGenerationMetadataRecord() throws Exception { - final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); + final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); - final MetadataRecord record = - GenerateNativeStoreSparkJob.parseRecord( - xml, - "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "XML", - new Provenance("foo", "bar", "ns_prefix"), - System.currentTimeMillis(), - null, - null); + final MetadataRecord record = GenerateNativeStoreSparkJob + .parseRecord( + xml, + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "XML", + new Provenance("foo", "bar", "ns_prefix"), + System.currentTimeMillis(), + null, + null); - assert record != null; - System.out.println(record.getId()); - System.out.println(record.getOriginalId()); - } + assert record != null; + System.out.println(record.getId()); + System.out.println(record.getOriginalId()); + } - @Test - public void TestEquals() throws IOException { + @Test + public void TestEquals() throws IOException { - final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); - final MetadataRecord record = - GenerateNativeStoreSparkJob.parseRecord( - xml, - "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "XML", - new Provenance("foo", "bar", "ns_prefix"), - System.currentTimeMillis(), - null, - null); - final MetadataRecord record1 = - GenerateNativeStoreSparkJob.parseRecord( - xml, - "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", - "XML", - new Provenance("foo", "bar", "ns_prefix"), - System.currentTimeMillis(), - null, - null); - assert record != null; - record.setBody("ciao"); - assert record1 != null; - record1.setBody("mondo"); - assertEquals(record, record1); - } + final String xml = IOUtils.toString(this.getClass().getResourceAsStream("./record.xml")); + final MetadataRecord record = GenerateNativeStoreSparkJob + .parseRecord( + xml, + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "XML", + new Provenance("foo", "bar", "ns_prefix"), + System.currentTimeMillis(), + null, + null); + final MetadataRecord record1 = GenerateNativeStoreSparkJob + .parseRecord( + xml, + "./*[local-name()='record']/*[local-name()='header']/*[local-name()='identifier']", + "XML", + new Provenance("foo", "bar", "ns_prefix"), + System.currentTimeMillis(), + null, + null); + assert record != null; + record.setBody("ciao"); + assert record1 != null; + record1.setBody("mondo"); + assertEquals(record, record1); + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java index a524d75e7..1a4fafb66 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collector/worker/DnetCollectorWorkerApplicationTests.java @@ -1,88 +1,92 @@ + package eu.dnetlib.dhp.collector.worker; import static 
org.junit.jupiter.api.Assertions.assertNotNull; import static org.mockito.Mockito.*; +import java.io.File; + +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + import com.fasterxml.jackson.databind.ObjectMapper; + import eu.dnetlib.collector.worker.model.ApiDescriptor; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.collection.worker.DnetCollectorWorker; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; import eu.dnetlib.message.Message; import eu.dnetlib.message.MessageManager; -import java.io.File; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; public class DnetCollectorWorkerApplicationTests { - private ArgumentApplicationParser argumentParser = mock(ArgumentApplicationParser.class); - private MessageManager messageManager = mock(MessageManager.class); + private ArgumentApplicationParser argumentParser = mock(ArgumentApplicationParser.class); + private MessageManager messageManager = mock(MessageManager.class); - private DnetCollectorWorker worker; + private DnetCollectorWorker worker; - @BeforeEach - public void setup() throws Exception { - ObjectMapper mapper = new ObjectMapper(); - final String apiJson = mapper.writeValueAsString(getApi()); - when(argumentParser.get("apidescriptor")).thenReturn(apiJson); - when(argumentParser.get("namenode")).thenReturn("file://tmp/test.seq"); - when(argumentParser.get("hdfsPath")).thenReturn("/tmp/file.seq"); - when(argumentParser.get("userHDFS")).thenReturn("sandro"); - when(argumentParser.get("workflowId")).thenReturn("sandro"); - when(argumentParser.get("rabbitOngoingQueue")).thenReturn("sandro"); + @BeforeEach + public void setup() throws Exception { + ObjectMapper mapper = new ObjectMapper(); + final String apiJson = mapper.writeValueAsString(getApi()); + when(argumentParser.get("apidescriptor")).thenReturn(apiJson); + when(argumentParser.get("namenode")).thenReturn("file://tmp/test.seq"); + when(argumentParser.get("hdfsPath")).thenReturn("/tmp/file.seq"); + when(argumentParser.get("userHDFS")).thenReturn("sandro"); + when(argumentParser.get("workflowId")).thenReturn("sandro"); + when(argumentParser.get("rabbitOngoingQueue")).thenReturn("sandro"); - when(messageManager.sendMessage(any(Message.class), anyString(), anyBoolean(), anyBoolean())) - .thenAnswer( - a -> { - System.out.println("sent message: " + a.getArguments()[0]); - return true; - }); - when(messageManager.sendMessage(any(Message.class), anyString())) - .thenAnswer( - a -> { - System.out.println("Called"); - return true; - }); - worker = new DnetCollectorWorker(new CollectorPluginFactory(), argumentParser, messageManager); - } + when(messageManager.sendMessage(any(Message.class), anyString(), anyBoolean(), anyBoolean())) + .thenAnswer( + a -> { + System.out.println("sent message: " + a.getArguments()[0]); + return true; + }); + when(messageManager.sendMessage(any(Message.class), anyString())) + .thenAnswer( + a -> { + System.out.println("Called"); + return true; + }); + worker = new DnetCollectorWorker(new CollectorPluginFactory(), argumentParser, messageManager); + } - @AfterEach - public void dropDown() { - File f = new File("/tmp/file.seq"); - f.delete(); - } + @AfterEach + public void dropDown() { + File f = new File("/tmp/file.seq"); + f.delete(); + } - @Test - public void testFindPlugin() throws Exception { - final CollectorPluginFactory collectorPluginEnumerator = new 
CollectorPluginFactory(); - assertNotNull(collectorPluginEnumerator.getPluginByProtocol("oai")); - assertNotNull(collectorPluginEnumerator.getPluginByProtocol("OAI")); - } + @Test + public void testFindPlugin() throws Exception { + final CollectorPluginFactory collectorPluginEnumerator = new CollectorPluginFactory(); + assertNotNull(collectorPluginEnumerator.getPluginByProtocol("oai")); + assertNotNull(collectorPluginEnumerator.getPluginByProtocol("OAI")); + } - @Test - public void testCollectionOAI() throws Exception { - final ApiDescriptor api = new ApiDescriptor(); - api.setId("oai"); - api.setProtocol("oai"); - api.setBaseUrl("http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai"); - api.getParams().put("format", "oai_dc"); - ObjectMapper mapper = new ObjectMapper(); - assertNotNull(mapper.writeValueAsString(api)); - } + @Test + public void testCollectionOAI() throws Exception { + final ApiDescriptor api = new ApiDescriptor(); + api.setId("oai"); + api.setProtocol("oai"); + api.setBaseUrl("http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai"); + api.getParams().put("format", "oai_dc"); + ObjectMapper mapper = new ObjectMapper(); + assertNotNull(mapper.writeValueAsString(api)); + } - @Test - public void testFeeding() throws Exception { - worker.collect(); - } + @Test + public void testFeeding() throws Exception { + worker.collect(); + } - private ApiDescriptor getApi() { - final ApiDescriptor api = new ApiDescriptor(); - api.setId("oai"); - api.setProtocol("oai"); - api.setBaseUrl("http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai"); - api.getParams().put("format", "oai_dc"); - return api; - } + private ApiDescriptor getApi() { + final ApiDescriptor api = new ApiDescriptor(); + api.setId("oai"); + api.setProtocol("oai"); + api.setBaseUrl("http://www.revista.vocesdelaeducacion.com.mx/index.php/index/oai"); + api.getParams().put("format", "oai_dc"); + return api; + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java index 12a89053e..01c9e3103 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/TransformationJobTest.java @@ -1,19 +1,16 @@ + package eu.dnetlib.dhp.transformation; import static org.junit.jupiter.api.Assertions.assertNotNull; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.transformation.functions.Cleaner; -import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; -import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper; -import eu.dnetlib.dhp.utils.DHPUtils; import java.io.StringWriter; import java.nio.file.Files; import java.nio.file.Path; import java.util.HashMap; import java.util.Map; + import javax.xml.transform.stream.StreamSource; -import net.sf.saxon.s9api.*; + import org.apache.commons.io.IOUtils; import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; @@ -26,127 +23,133 @@ import org.junit.jupiter.api.io.TempDir; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import eu.dnetlib.dhp.model.mdstore.MetadataRecord; +import eu.dnetlib.dhp.transformation.functions.Cleaner; +import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; +import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper; +import eu.dnetlib.dhp.utils.DHPUtils; +import 
net.sf.saxon.s9api.*; + @ExtendWith(MockitoExtension.class) public class TransformationJobTest { - @Mock private LongAccumulator accumulator; + @Mock + private LongAccumulator accumulator; - @Test - public void testTransformSaxonHE() throws Exception { + @Test + public void testTransformSaxonHE() throws Exception { - Map vocabularies = new HashMap<>(); - vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); - Cleaner cleanFunction = new Cleaner(vocabularies); - Processor proc = new Processor(false); - proc.registerExtensionFunction(cleanFunction); - final XsltCompiler comp = proc.newXsltCompiler(); - XsltExecutable exp = - comp.compile( - new StreamSource( - this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/ext_simple.xsl"))); - XdmNode source = - proc.newDocumentBuilder() - .build( - new StreamSource( - this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml"))); - XsltTransformer trans = exp.load(); - trans.setInitialContextNode(source); - final StringWriter output = new StringWriter(); - Serializer out = proc.newSerializer(output); - out.setOutputProperty(Serializer.Property.METHOD, "xml"); - out.setOutputProperty(Serializer.Property.INDENT, "yes"); - trans.setDestination(out); - trans.transform(); - System.out.println(output.toString()); - } + Map vocabularies = new HashMap<>(); + vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); + Cleaner cleanFunction = new Cleaner(vocabularies); + Processor proc = new Processor(false); + proc.registerExtensionFunction(cleanFunction); + final XsltCompiler comp = proc.newXsltCompiler(); + XsltExecutable exp = comp + .compile( + new StreamSource( + this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/ext_simple.xsl"))); + XdmNode source = proc + .newDocumentBuilder() + .build( + new StreamSource( + this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml"))); + XsltTransformer trans = exp.load(); + trans.setInitialContextNode(source); + final StringWriter output = new StringWriter(); + Serializer out = proc.newSerializer(output); + out.setOutputProperty(Serializer.Property.METHOD, "xml"); + out.setOutputProperty(Serializer.Property.INDENT, "yes"); + trans.setDestination(out); + trans.transform(); + System.out.println(output.toString()); + } - @DisplayName("Test TransformSparkJobNode.main") - @Test - public void transformTest(@TempDir Path testDir) throws Exception { - final String mdstore_input = - this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); - final String mdstore_output = testDir.toString() + "/version"; - final String xslt = - DHPUtils.compressString( - IOUtils.toString( - this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml"))); - TransformSparkJobNode.main( - new String[] { - "-mt", - "local", - "-i", - mdstore_input, - "-o", - mdstore_output, - "-d", - "1", - "-w", - "1", - "-tr", - xslt, - "-t", - "true", - "-ru", - "", - "-rp", - "", - "-rh", - "", - "-ro", - "", - "-rr", - "" - }); - } + @DisplayName("Test TransformSparkJobNode.main") + @Test + public void transformTest(@TempDir Path testDir) throws Exception { + final String mdstore_input = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); + final String mdstore_output = testDir.toString() + "/version"; + final String xslt = DHPUtils + .compressString( + IOUtils + .toString( + this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml"))); + 
TransformSparkJobNode + .main( + new String[] { + "-mt", + "local", + "-i", + mdstore_input, + "-o", + mdstore_output, + "-d", + "1", + "-w", + "1", + "-tr", + xslt, + "-t", + "true", + "-ru", + "", + "-rp", + "", + "-rh", + "", + "-ro", + "", + "-rr", + "" + }); + } - @Test - public void tryLoadFolderOnCP() throws Exception { - final String path = - this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); - System.out.println("path = " + path); + @Test + public void tryLoadFolderOnCP() throws Exception { + final String path = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile(); + System.out.println("path = " + path); - Path tempDirWithPrefix = Files.createTempDirectory("mdstore_output"); + Path tempDirWithPrefix = Files.createTempDirectory("mdstore_output"); - System.out.println(tempDirWithPrefix.toFile().getAbsolutePath()); + System.out.println(tempDirWithPrefix.toFile().getAbsolutePath()); - Files.deleteIfExists(tempDirWithPrefix); - } + Files.deleteIfExists(tempDirWithPrefix); + } - @Test - public void testTransformFunction() throws Exception { - SAXReader reader = new SAXReader(); - Document document = - reader.read(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); - Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); - final String xslt = node.asXML(); - Map vocabularies = new HashMap<>(); - vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); + @Test + public void testTransformFunction() throws Exception { + SAXReader reader = new SAXReader(); + Document document = reader.read(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); + Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); + final String xslt = node.asXML(); + Map vocabularies = new HashMap<>(); + vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); - TransformFunction tf = - new TransformFunction(accumulator, accumulator, accumulator, xslt, 1, vocabularies); + TransformFunction tf = new TransformFunction(accumulator, accumulator, accumulator, xslt, 1, vocabularies); - MetadataRecord record = new MetadataRecord(); - record.setBody( - IOUtils.toString( - this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml"))); + MetadataRecord record = new MetadataRecord(); + record + .setBody( + IOUtils + .toString( + this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/input.xml"))); - final MetadataRecord result = tf.call(record); - assertNotNull(result.getBody()); + final MetadataRecord result = tf.call(record); + assertNotNull(result.getBody()); - System.out.println(result.getBody()); - } + System.out.println(result.getBody()); + } - @Test - public void extractTr() throws Exception { + @Test + public void extractTr() throws Exception { - final String xmlTr = - IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); + final String xmlTr = IOUtils.toString(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); - SAXReader reader = new SAXReader(); - Document document = - reader.read(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); - Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); + SAXReader reader = new SAXReader(); + Document document = reader.read(this.getClass().getResourceAsStream("/eu/dnetlib/dhp/transform/tr.xml")); + Node node = 
document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); - System.out.println(node.asXML()); - } + System.out.println(node.asXML()); + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java index 3732c5e82..1ae942a6b 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.transformation.vocabulary; import static org.junit.jupiter.api.Assertions.*; @@ -6,10 +7,10 @@ import org.junit.jupiter.api.Test; public class VocabularyTest { - @Test - public void testLoadVocabulary() throws Exception { + @Test + public void testLoadVocabulary() throws Exception { - final Vocabulary vocabulary = VocabularyHelper.getVocabularyFromAPI("dnet:languages"); - assertEquals("dnet:languages", vocabulary.getName()); - } + final Vocabulary vocabulary = VocabularyHelper.getVocabularyFromAPI("dnet:languages"); + assertEquals("dnet:languages", vocabulary.getName()); + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob2.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob2.java index 89106560c..7a8ae0bd0 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob2.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob2.java @@ -1,13 +1,10 @@ + package eu.dnetlib.dhp.bulktag; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.gson.Gson; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.community.*; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -18,144 +15,152 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.gson.Gson; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.community.*; +import eu.dnetlib.dhp.schema.oaf.*; + public class SparkBulkTagJob2 { - private static final Logger log = LoggerFactory.getLogger(SparkBulkTagJob2.class); + private static final Logger log = LoggerFactory.getLogger(SparkBulkTagJob2.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - SparkBulkTagJob2.class.getResourceAsStream( - "/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json")); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SparkBulkTagJob2.class + .getResourceAsStream( + "/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean 
isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - Boolean isTest = - Optional.ofNullable(parser.get("isTest")) - .map(Boolean::valueOf) - .orElse(Boolean.FALSE); - log.info("isTest: {} ", isTest); + Boolean isTest = Optional + .ofNullable(parser.get("isTest")) + .map(Boolean::valueOf) + .orElse(Boolean.FALSE); + log.info("isTest: {} ", isTest); - final String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + final String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - ProtoMap protoMappingParams = new Gson().fromJson(parser.get("protoMap"), ProtoMap.class); - ; - log.info("protoMap: {}", new Gson().toJson(protoMappingParams)); + ProtoMap protoMappingParams = new Gson().fromJson(parser.get("protoMap"), ProtoMap.class); + ; + log.info("protoMap: {}", new Gson().toJson(protoMappingParams)); - final String resultClassName = parser.get("resultTableName"); - log.info("resultTableName: {}", resultClassName); + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); - final Boolean saveGraph = - Optional.ofNullable(parser.get("saveGraph")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("saveGraph: {}", saveGraph); + final Boolean saveGraph = Optional + .ofNullable(parser.get("saveGraph")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("saveGraph: {}", saveGraph); - Class resultClazz = - (Class) Class.forName(resultClassName); + Class resultClazz = (Class) Class.forName(resultClassName); - SparkConf conf = new SparkConf(); - CommunityConfiguration cc; + SparkConf conf = new SparkConf(); + CommunityConfiguration cc; - String taggingConf = parser.get("taggingConf"); + String taggingConf = parser.get("taggingConf"); - if (isTest) { - cc = CommunityConfigurationFactory.newInstance(taggingConf); - } else { - cc = QueryInformationSystem.getCommunityConfiguration(parser.get("isLookupUrl")); - } + if (isTest) { + cc = CommunityConfigurationFactory.newInstance(taggingConf); + } else { + cc = QueryInformationSystem.getCommunityConfiguration(parser.get("isLookupUrl")); + } - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc); - }); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + execBulkTag(spark, inputPath, outputPath, protoMappingParams, resultClazz, cc); + }); - // runWithSparkSession(conf, isSparkSessionManaged, - // spark -> { - // if(isTest(parser)) { - // removeOutputDir(spark, outputPath); - // } - // if(saveGraph) - // execPropagation(spark, possibleUpdates, inputPath, outputPath, - // resultClazz); - // }); - // - // - // - // - // - // - // sc.textFile(inputPath + "/publication") - // .map(item -> new ObjectMapper().readValue(item, Publication.class)) - // .map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams)) - // .map(p -> new ObjectMapper().writeValueAsString(p)) - // 
.saveAsTextFile(outputPath+"/publication"); - // sc.textFile(inputPath + "/dataset") - // .map(item -> new ObjectMapper().readValue(item, Dataset.class)) - // .map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams)) - // .map(p -> new ObjectMapper().writeValueAsString(p)) - // .saveAsTextFile(outputPath+"/dataset"); - // sc.textFile(inputPath + "/software") - // .map(item -> new ObjectMapper().readValue(item, Software.class)) - // .map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams)) - // .map(p -> new ObjectMapper().writeValueAsString(p)) - // .saveAsTextFile(outputPath+"/software"); - // sc.textFile(inputPath + "/otherresearchproduct") - // .map(item -> new ObjectMapper().readValue(item, - // OtherResearchProduct.class)) - // .map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams)) - // .map(p -> new ObjectMapper().writeValueAsString(p)) - // .saveAsTextFile(outputPath+"/otherresearchproduct"); - // + // runWithSparkSession(conf, isSparkSessionManaged, + // spark -> { + // if(isTest(parser)) { + // removeOutputDir(spark, outputPath); + // } + // if(saveGraph) + // execPropagation(spark, possibleUpdates, inputPath, outputPath, + // resultClazz); + // }); + // + // + // + // + // + // + // sc.textFile(inputPath + "/publication") + // .map(item -> new ObjectMapper().readValue(item, Publication.class)) + // .map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams)) + // .map(p -> new ObjectMapper().writeValueAsString(p)) + // .saveAsTextFile(outputPath+"/publication"); + // sc.textFile(inputPath + "/dataset") + // .map(item -> new ObjectMapper().readValue(item, Dataset.class)) + // .map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams)) + // .map(p -> new ObjectMapper().writeValueAsString(p)) + // .saveAsTextFile(outputPath+"/dataset"); + // sc.textFile(inputPath + "/software") + // .map(item -> new ObjectMapper().readValue(item, Software.class)) + // .map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams)) + // .map(p -> new ObjectMapper().writeValueAsString(p)) + // .saveAsTextFile(outputPath+"/software"); + // sc.textFile(inputPath + "/otherresearchproduct") + // .map(item -> new ObjectMapper().readValue(item, + // OtherResearchProduct.class)) + // .map(p -> resultTagger.enrichContextCriteria(p, cc, protoMappingParams)) + // .map(p -> new ObjectMapper().writeValueAsString(p)) + // .saveAsTextFile(outputPath+"/otherresearchproduct"); + // - } + } - private static void execBulkTag( - SparkSession spark, - String inputPath, - String outputPath, - ProtoMap protoMappingParams, - Class resultClazz, - CommunityConfiguration communityConfiguration) { + private static void execBulkTag( + SparkSession spark, + String inputPath, + String outputPath, + ProtoMap protoMappingParams, + Class resultClazz, + CommunityConfiguration communityConfiguration) { - ResultTagger resultTagger = new ResultTagger(); - Dataset result = readPathEntity(spark, inputPath, resultClazz); - result.map( - value -> - resultTagger.enrichContextCriteria( - value, communityConfiguration, protoMappingParams), - Encoders.bean(resultClazz)) - .toJSON() - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .text(outputPath); - } + ResultTagger resultTagger = new ResultTagger(); + Dataset result = readPathEntity(spark, inputPath, resultClazz); + result + .map( + value -> resultTagger + .enrichContextCriteria( + value, communityConfiguration, protoMappingParams), + Encoders.bean(resultClazz)) + .toJSON() + .write() + 
.mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(outputPath); + } - private static org.apache.spark.sql.Dataset readPathEntity( - SparkSession spark, String inputEntityPath, Class resultClazz) { + private static org.apache.spark.sql.Dataset readPathEntity( + SparkSession spark, String inputEntityPath, Class resultClazz) { - return spark.read() - .textFile(inputEntityPath) - .map( - (MapFunction) - value -> OBJECT_MAPPER.readValue(value, resultClazz), - Encoders.bean(resultClazz)); - } + return spark + .read() + .textFile(inputEntityPath) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, resultClazz), + Encoders.bean(resultClazz)); + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Community.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Community.java index b0c213c12..a73ff4d3e 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Community.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Community.java @@ -1,62 +1,65 @@ + package eu.dnetlib.dhp.community; -import com.google.gson.Gson; import java.io.Serializable; import java.util.ArrayList; import java.util.List; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import com.google.gson.Gson; + /** Created by miriam on 01/08/2018. */ public class Community implements Serializable { - private static final Log log = LogFactory.getLog(Community.class); + private static final Log log = LogFactory.getLog(Community.class); - private String id; - private List subjects = new ArrayList<>(); - private List datasources = new ArrayList<>(); - private List zenodoCommunities = new ArrayList<>(); + private String id; + private List subjects = new ArrayList<>(); + private List datasources = new ArrayList<>(); + private List zenodoCommunities = new ArrayList<>(); - public String toJson() { - final Gson g = new Gson(); - return g.toJson(this); - } + public String toJson() { + final Gson g = new Gson(); + return g.toJson(this); + } - public boolean isValid() { - return !getSubjects().isEmpty() - || !getDatasources().isEmpty() - || !getZenodoCommunities().isEmpty(); - } + public boolean isValid() { + return !getSubjects().isEmpty() + || !getDatasources().isEmpty() + || !getZenodoCommunities().isEmpty(); + } - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public List getSubjects() { - return subjects; - } + public List getSubjects() { + return subjects; + } - public void setSubjects(List subjects) { - this.subjects = subjects; - } + public void setSubjects(List subjects) { + this.subjects = subjects; + } - public List getDatasources() { - return datasources; - } + public List getDatasources() { + return datasources; + } - public void setDatasources(List datasources) { - this.datasources = datasources; - } + public void setDatasources(List datasources) { + this.datasources = datasources; + } - public List getZenodoCommunities() { - return zenodoCommunities; - } + public List getZenodoCommunities() { + return zenodoCommunities; + } - public void setZenodoCommunities(List zenodoCommunities) { - this.zenodoCommunities = zenodoCommunities; - } + public void setZenodoCommunities(List zenodoCommunities) { + this.zenodoCommunities = zenodoCommunities; + } } diff --git 
a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfiguration.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfiguration.java index 1fd5bedd4..c5bbb66eb 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfiguration.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfiguration.java @@ -1,189 +1,196 @@ + package eu.dnetlib.dhp.community; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.gson.Gson; -import com.google.gson.GsonBuilder; -import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter; -import eu.dnetlib.dhp.selectioncriteria.Selection; import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.Collectors; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; + +import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter; +import eu.dnetlib.dhp.selectioncriteria.Selection; + /** Created by miriam on 02/08/2018. */ public class CommunityConfiguration implements Serializable { - private static final Log log = LogFactory.getLog(CommunityConfiguration.class); + private static final Log log = LogFactory.getLog(CommunityConfiguration.class); - private Map communities; + private Map communities; - // map subject -> communityid - private Map>> subjectMap = new HashMap<>(); - // map datasourceid -> communityid - private Map>> datasourceMap = new HashMap<>(); - // map zenodocommunityid -> communityid - private Map>> zenodocommunityMap = - new HashMap<>(); + // map subject -> communityid + private Map>> subjectMap = new HashMap<>(); + // map datasourceid -> communityid + private Map>> datasourceMap = new HashMap<>(); + // map zenodocommunityid -> communityid + private Map>> zenodocommunityMap = new HashMap<>(); - public Map>> getSubjectMap() { - return subjectMap; - } + public Map>> getSubjectMap() { + return subjectMap; + } - public void setSubjectMap(Map>> subjectMap) { - this.subjectMap = subjectMap; - } + public void setSubjectMap(Map>> subjectMap) { + this.subjectMap = subjectMap; + } - public Map>> getDatasourceMap() { - return datasourceMap; - } + public Map>> getDatasourceMap() { + return datasourceMap; + } - public void setDatasourceMap( - Map>> datasourceMap) { - this.datasourceMap = datasourceMap; - } + public void setDatasourceMap( + Map>> datasourceMap) { + this.datasourceMap = datasourceMap; + } - public Map>> getZenodocommunityMap() { - return zenodocommunityMap; - } + public Map>> getZenodocommunityMap() { + return zenodocommunityMap; + } - public void setZenodocommunityMap( - Map>> zenodocommunityMap) { - this.zenodocommunityMap = zenodocommunityMap; - } + public void setZenodocommunityMap( + Map>> zenodocommunityMap) { + this.zenodocommunityMap = zenodocommunityMap; + } - CommunityConfiguration(final Map communities) { - this.communities = communities; - init(); - } + CommunityConfiguration(final Map communities) { + this.communities = communities; + init(); + } - void init() { + void init() { - if (subjectMap == null) { - subjectMap = Maps.newHashMap(); - } - if (datasourceMap == null) { - datasourceMap = Maps.newHashMap(); - } - if (zenodocommunityMap == null) { - zenodocommunityMap = Maps.newHashMap(); - } + if (subjectMap 
== null) { + subjectMap = Maps.newHashMap(); + } + if (datasourceMap == null) { + datasourceMap = Maps.newHashMap(); + } + if (zenodocommunityMap == null) { + zenodocommunityMap = Maps.newHashMap(); + } - for (Community c : getCommunities().values()) { - // get subjects - final String id = c.getId(); - for (String sbj : c.getSubjects()) { - Pair p = new Pair<>(id, new SelectionConstraints()); - add(sbj.toLowerCase().trim(), p, subjectMap); - } - // get datasources - for (Datasource d : c.getDatasources()) { + for (Community c : getCommunities().values()) { + // get subjects + final String id = c.getId(); + for (String sbj : c.getSubjects()) { + Pair p = new Pair<>(id, new SelectionConstraints()); + add(sbj.toLowerCase().trim(), p, subjectMap); + } + // get datasources + for (Datasource d : c.getDatasources()) { - add(d.getOpenaireId(), new Pair<>(id, d.getSelectionConstraints()), datasourceMap); - } - // get zenodo communities - for (ZenodoCommunity zc : c.getZenodoCommunities()) { - add( - zc.getZenodoCommunityId(), - new Pair<>(id, zc.getSelCriteria()), - zenodocommunityMap); - } - } - } + add(d.getOpenaireId(), new Pair<>(id, d.getSelectionConstraints()), datasourceMap); + } + // get zenodo communities + for (ZenodoCommunity zc : c.getZenodoCommunities()) { + add( + zc.getZenodoCommunityId(), + new Pair<>(id, zc.getSelCriteria()), + zenodocommunityMap); + } + } + } - private void add( - String key, - Pair value, - Map>> map) { - List> values = map.get(key); + private void add( + String key, + Pair value, + Map>> map) { + List> values = map.get(key); - if (values == null) { - values = new ArrayList<>(); - map.put(key, values); - } - values.add(value); - } + if (values == null) { + values = new ArrayList<>(); + map.put(key, values); + } + values.add(value); + } - public List> getCommunityForSubject(String sbj) { - return subjectMap.get(sbj); - } + public List> getCommunityForSubject(String sbj) { + return subjectMap.get(sbj); + } - public List> getCommunityForDatasource(String dts) { - return datasourceMap.get(dts); - } + public List> getCommunityForDatasource(String dts) { + return datasourceMap.get(dts); + } - public List getCommunityForDatasource( - final String dts, final Map> param) { - List> lp = datasourceMap.get(dts); - if (lp == null) return Lists.newArrayList(); + public List getCommunityForDatasource( + final String dts, final Map> param) { + List> lp = datasourceMap.get(dts); + if (lp == null) + return Lists.newArrayList(); - return lp.stream() - .map( - p -> { - if (p.getSnd() == null) return p.getFst(); - if (((SelectionConstraints) p.getSnd()).verifyCriteria(param)) - return p.getFst(); - else return null; - }) - .filter(st -> (st != null)) - .collect(Collectors.toList()); - } + return lp + .stream() + .map( + p -> { + if (p.getSnd() == null) + return p.getFst(); + if (((SelectionConstraints) p.getSnd()).verifyCriteria(param)) + return p.getFst(); + else + return null; + }) + .filter(st -> (st != null)) + .collect(Collectors.toList()); + } - public List> getCommunityForZenodoCommunity(String zc) { - return zenodocommunityMap.get(zc); - } + public List> getCommunityForZenodoCommunity(String zc) { + return zenodocommunityMap.get(zc); + } - public List getCommunityForSubjectValue(String value) { + public List getCommunityForSubjectValue(String value) { - return getContextIds(subjectMap.get(value)); - } + return getContextIds(subjectMap.get(value)); + } - public List getCommunityForDatasourceValue(String value) { + public List getCommunityForDatasourceValue(String value) { - 
return getContextIds(datasourceMap.get(value.toLowerCase())); - } + return getContextIds(datasourceMap.get(value.toLowerCase())); + } - public List getCommunityForZenodoCommunityValue(String value) { + public List getCommunityForZenodoCommunityValue(String value) { - return getContextIds(zenodocommunityMap.get(value.toLowerCase())); - } + return getContextIds(zenodocommunityMap.get(value.toLowerCase())); + } - private List getContextIds(List> list) { - if (list != null) { - return list.stream().map(p -> p.getFst()).collect(Collectors.toList()); - } - return Lists.newArrayList(); - } + private List getContextIds(List> list) { + if (list != null) { + return list.stream().map(p -> p.getFst()).collect(Collectors.toList()); + } + return Lists.newArrayList(); + } - public Map getCommunities() { - return communities; - } + public Map getCommunities() { + return communities; + } - public void setCommunities(Map communities) { - this.communities = communities; - } + public void setCommunities(Map communities) { + this.communities = communities; + } - public String toJson() { - GsonBuilder builder = new GsonBuilder(); - builder.registerTypeAdapter(Selection.class, new InterfaceAdapter()); - Gson gson = builder.create(); + public String toJson() { + GsonBuilder builder = new GsonBuilder(); + builder.registerTypeAdapter(Selection.class, new InterfaceAdapter()); + Gson gson = builder.create(); - return gson.toJson(this); - } + return gson.toJson(this); + } - public int size() { - return communities.keySet().size(); - } + public int size() { + return communities.keySet().size(); + } - public Community getCommunityById(String id) { - return communities.get(id); - } + public Community getCommunityById(String id) { + return communities.get(id); + } - public List getCommunityList() { - return Lists.newLinkedList(communities.values()); - } + public List getCommunityList() { + return Lists.newLinkedList(communities.values()); + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfigurationFactory.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfigurationFactory.java index 866ec28a1..508f0663d 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfigurationFactory.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/CommunityConfigurationFactory.java @@ -1,17 +1,11 @@ + package eu.dnetlib.dhp.community; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.gson.Gson; -import com.google.gson.GsonBuilder; -import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter; -import eu.dnetlib.dhp.selectioncriteria.Selection; -import eu.dnetlib.dhp.selectioncriteria.VerbResolver; -import eu.dnetlib.dhp.selectioncriteria.VerbResolverFactory; import java.io.StringReader; import java.util.ArrayList; import java.util.List; import java.util.Map; + import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -20,114 +14,125 @@ import org.dom4j.DocumentException; import org.dom4j.Node; import org.dom4j.io.SAXReader; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; + +import eu.dnetlib.dhp.selectioncriteria.InterfaceAdapter; +import eu.dnetlib.dhp.selectioncriteria.Selection; +import eu.dnetlib.dhp.selectioncriteria.VerbResolver; +import 
eu.dnetlib.dhp.selectioncriteria.VerbResolverFactory; + /** Created by miriam on 03/08/2018. */ public class CommunityConfigurationFactory { - private static final Log log = LogFactory.getLog(CommunityConfigurationFactory.class); + private static final Log log = LogFactory.getLog(CommunityConfigurationFactory.class); - private static VerbResolver resolver = VerbResolverFactory.newInstance(); + private static VerbResolver resolver = VerbResolverFactory.newInstance(); - public static CommunityConfiguration newInstance(final String xml) throws DocumentException { + public static CommunityConfiguration newInstance(final String xml) throws DocumentException { - log.debug(String.format("parsing community configuration from:\n%s", xml)); + log.debug(String.format("parsing community configuration from:\n%s", xml)); - final Document doc = new SAXReader().read(new StringReader(xml)); + final Document doc = new SAXReader().read(new StringReader(xml)); - final Map communities = Maps.newHashMap(); + final Map communities = Maps.newHashMap(); - for (final Object o : doc.selectNodes("//community")) { + for (final Object o : doc.selectNodes("//community")) { - final Node node = (Node) o; + final Node node = (Node) o; - final Community community = parseCommunity(node); + final Community community = parseCommunity(node); - if (community.isValid()) { - communities.put(community.getId(), community); - } - } + if (community.isValid()) { + communities.put(community.getId(), community); + } + } - log.info(String.format("loaded %s community configuration profiles", communities.size())); - log.debug(String.format("loaded community configuration:\n%s", communities.toString())); + log.info(String.format("loaded %s community configuration profiles", communities.size())); + log.debug(String.format("loaded community configuration:\n%s", communities.toString())); - return new CommunityConfiguration(communities); - } + return new CommunityConfiguration(communities); + } - public static CommunityConfiguration fromJson(final String json) { - GsonBuilder builder = new GsonBuilder(); - builder.registerTypeAdapter(Selection.class, new InterfaceAdapter()); - Gson gson = builder.create(); - final CommunityConfiguration conf = gson.fromJson(json, CommunityConfiguration.class); - log.info(String.format("loaded %s community configuration profiles", conf.size())); - conf.init(); - log.info("created inverse maps"); + public static CommunityConfiguration fromJson(final String json) { + GsonBuilder builder = new GsonBuilder(); + builder.registerTypeAdapter(Selection.class, new InterfaceAdapter()); + Gson gson = builder.create(); + final CommunityConfiguration conf = gson.fromJson(json, CommunityConfiguration.class); + log.info(String.format("loaded %s community configuration profiles", conf.size())); + conf.init(); + log.info("created inverse maps"); - return conf; - } + return conf; + } - private static Community parseCommunity(final Node node) { + private static Community parseCommunity(final Node node) { - final Community c = new Community(); + final Community c = new Community(); - c.setId(node.valueOf("./@id")); + c.setId(node.valueOf("./@id")); - log.info(String.format("community id: %s", c.getId())); + log.info(String.format("community id: %s", c.getId())); - c.setSubjects(parseSubjects(node)); - c.setDatasources(parseDatasources(node)); - c.setZenodoCommunities(parseZenodoCommunities(node)); - return c; - } + c.setSubjects(parseSubjects(node)); + c.setDatasources(parseDatasources(node)); + 
c.setZenodoCommunities(parseZenodoCommunities(node)); + return c; + } - private static List parseSubjects(final Node node) { + private static List parseSubjects(final Node node) { - final List subjects = Lists.newArrayList(); + final List subjects = Lists.newArrayList(); - final List list = node.selectNodes("./subjects/subject"); + final List list = node.selectNodes("./subjects/subject"); - for (Node n : list) { - log.debug("text of the node " + n.getText()); - subjects.add(StringUtils.trim(n.getText())); - } - log.info("size of the subject list " + subjects.size()); - return subjects; - } + for (Node n : list) { + log.debug("text of the node " + n.getText()); + subjects.add(StringUtils.trim(n.getText())); + } + log.info("size of the subject list " + subjects.size()); + return subjects; + } - private static List parseDatasources(final Node node) { - final List list = node.selectNodes("./datasources/datasource"); - final List datasourceList = new ArrayList<>(); - for (Node n : list) { - Datasource d = new Datasource(); - d.setOpenaireId(n.selectSingleNode("./openaireId").getText()); - d.setSelCriteria(n.selectSingleNode("./selcriteria"), resolver); - datasourceList.add(d); - } - log.info("size of the datasource list " + datasourceList.size()); - return datasourceList; - } + private static List parseDatasources(final Node node) { + final List list = node.selectNodes("./datasources/datasource"); + final List datasourceList = new ArrayList<>(); + for (Node n : list) { + Datasource d = new Datasource(); + d.setOpenaireId(n.selectSingleNode("./openaireId").getText()); + d.setSelCriteria(n.selectSingleNode("./selcriteria"), resolver); + datasourceList.add(d); + } + log.info("size of the datasource list " + datasourceList.size()); + return datasourceList; + } - private static List parseZenodoCommunities(final Node node) { - final Node oacommunitynode = node.selectSingleNode("./oacommunity"); - String oacommunity = null; - if (oacommunitynode != null) { - String tmp = oacommunitynode.getText(); - if (StringUtils.isNotBlank(tmp)) oacommunity = tmp; - } + private static List parseZenodoCommunities(final Node node) { + final Node oacommunitynode = node.selectSingleNode("./oacommunity"); + String oacommunity = null; + if (oacommunitynode != null) { + String tmp = oacommunitynode.getText(); + if (StringUtils.isNotBlank(tmp)) + oacommunity = tmp; + } - final List list = node.selectNodes("./zenodocommunities/zenodocommunity"); - final List zenodoCommunityList = new ArrayList<>(); - for (Node n : list) { - ZenodoCommunity zc = new ZenodoCommunity(); - zc.setZenodoCommunityId(n.selectSingleNode("./zenodoid").getText()); - zc.setSelCriteria(n.selectSingleNode("./selcriteria")); + final List list = node.selectNodes("./zenodocommunities/zenodocommunity"); + final List zenodoCommunityList = new ArrayList<>(); + for (Node n : list) { + ZenodoCommunity zc = new ZenodoCommunity(); + zc.setZenodoCommunityId(n.selectSingleNode("./zenodoid").getText()); + zc.setSelCriteria(n.selectSingleNode("./selcriteria")); - zenodoCommunityList.add(zc); - } - if (oacommunity != null) { - ZenodoCommunity zc = new ZenodoCommunity(); - zc.setZenodoCommunityId(oacommunity); - zenodoCommunityList.add(zc); - } - log.info("size of the zenodo community list " + zenodoCommunityList.size()); - return zenodoCommunityList; - } + zenodoCommunityList.add(zc); + } + if (oacommunity != null) { + ZenodoCommunity zc = new ZenodoCommunity(); + zc.setZenodoCommunityId(oacommunity); + zenodoCommunityList.add(zc); + } + log.info("size of the zenodo 
community list " + zenodoCommunityList.size()); + return zenodoCommunityList; + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraint.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraint.java index 491f3de05..54f381d4a 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraint.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraint.java @@ -1,53 +1,56 @@ + package eu.dnetlib.dhp.community; -import eu.dnetlib.dhp.selectioncriteria.Selection; -import eu.dnetlib.dhp.selectioncriteria.VerbResolver; import java.io.Serializable; import java.lang.reflect.InvocationTargetException; +import eu.dnetlib.dhp.selectioncriteria.Selection; +import eu.dnetlib.dhp.selectioncriteria.VerbResolver; + public class Constraint implements Serializable { - private String verb; - private String field; - private String value; - private Selection selection; + private String verb; + private String field; + private String value; + private Selection selection; - public Constraint() {} + public Constraint() { + } - public String getVerb() { - return verb; - } + public String getVerb() { + return verb; + } - public void setVerb(String verb) { - this.verb = verb; - } + public void setVerb(String verb) { + this.verb = verb; + } - public String getField() { - return field; - } + public String getField() { + return field; + } - public void setField(String field) { - this.field = field; - } + public void setField(String field) { + this.field = field; + } - public String getValue() { - return value; - } + public String getValue() { + return value; + } - public void setValue(String value) { - this.value = value; - } + public void setValue(String value) { + this.value = value; + } - public void setSelection(Selection sel) { - selection = sel; - } + public void setSelection(Selection sel) { + selection = sel; + } - public void setSelection(VerbResolver resolver) - throws InvocationTargetException, NoSuchMethodException, InstantiationException, - IllegalAccessException { - selection = resolver.getSelectionCriteria(verb, value); - } + public void setSelection(VerbResolver resolver) + throws InvocationTargetException, NoSuchMethodException, InstantiationException, + IllegalAccessException { + selection = resolver.getSelectionCriteria(verb, value); + } - public boolean verifyCriteria(String metadata) { - return selection.apply(metadata); - } + public boolean verifyCriteria(String metadata) { + return selection.apply(metadata); + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraints.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraints.java index 9b2974c27..af095c513 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraints.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Constraints.java @@ -1,67 +1,74 @@ + package eu.dnetlib.dhp.community; -import com.google.gson.Gson; -import com.google.gson.reflect.TypeToken; -import eu.dnetlib.dhp.selectioncriteria.VerbResolver; import java.io.Serializable; import java.lang.reflect.InvocationTargetException; import java.lang.reflect.Type; import java.util.Collection; import java.util.List; import java.util.Map; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import com.google.gson.Gson; +import com.google.gson.reflect.TypeToken; + +import eu.dnetlib.dhp.selectioncriteria.VerbResolver; + /** Created by 
miriam on 02/08/2018. */ public class Constraints implements Serializable { - private static final Log log = LogFactory.getLog(Constraints.class); - // private ConstraintEncapsulator ce; - private List constraint; + private static final Log log = LogFactory.getLog(Constraints.class); + // private ConstraintEncapsulator ce; + private List constraint; - public Constraints() {} + public Constraints() { + } - public List getConstraint() { - return constraint; - } + public List getConstraint() { + return constraint; + } - public void setConstraint(List constraint) { - this.constraint = constraint; - } + public void setConstraint(List constraint) { + this.constraint = constraint; + } - public void setSc(String json) { - Type collectionType = new TypeToken>() {}.getType(); - constraint = new Gson().fromJson(json, collectionType); - } + public void setSc(String json) { + Type collectionType = new TypeToken>() { + }.getType(); + constraint = new Gson().fromJson(json, collectionType); + } - void setSelection(VerbResolver resolver) { - for (Constraint st : constraint) { + void setSelection(VerbResolver resolver) { + for (Constraint st : constraint) { - try { - st.setSelection(resolver); - } catch (NoSuchMethodException e) { - log.error(e.getMessage()); - } catch (IllegalAccessException e) { - log.error(e.getMessage()); - } catch (InvocationTargetException e) { - log.error(e.getMessage()); - } catch (InstantiationException e) { - log.error(e.getMessage()); - } - } - } + try { + st.setSelection(resolver); + } catch (NoSuchMethodException e) { + log.error(e.getMessage()); + } catch (IllegalAccessException e) { + log.error(e.getMessage()); + } catch (InvocationTargetException e) { + log.error(e.getMessage()); + } catch (InstantiationException e) { + log.error(e.getMessage()); + } + } + } - // Constraint in and - public boolean verifyCriteria(final Map> param) { + // Constraint in and + public boolean verifyCriteria(final Map> param) { - for (Constraint sc : constraint) { - boolean verified = false; - for (String value : param.get(sc.getField())) { - if (sc.verifyCriteria(value.trim())) { - verified = true; - } - } - if (!verified) return verified; - } - return true; - } + for (Constraint sc : constraint) { + boolean verified = false; + for (String value : param.get(sc.getField())) { + if (sc.verifyCriteria(value.trim())) { + verified = true; + } + } + if (!verified) + return verified; + } + return true; + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Datasource.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Datasource.java index 5acba31d6..a3d343087 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Datasource.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Datasource.java @@ -1,57 +1,61 @@ + package eu.dnetlib.dhp.community; -import com.google.gson.Gson; -import eu.dnetlib.dhp.selectioncriteria.VerbResolver; import java.io.Serializable; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.dom4j.Node; +import com.google.gson.Gson; + +import eu.dnetlib.dhp.selectioncriteria.VerbResolver; + /** Created by miriam on 01/08/2018. 
*/ public class Datasource implements Serializable { - private static final Log log = LogFactory.getLog(Datasource.class); + private static final Log log = LogFactory.getLog(Datasource.class); - private String openaireId; + private String openaireId; - private SelectionConstraints selectionConstraints; + private SelectionConstraints selectionConstraints; - public SelectionConstraints getSelCriteria() { - return selectionConstraints; - } + public SelectionConstraints getSelCriteria() { + return selectionConstraints; + } - public SelectionConstraints getSelectionConstraints() { - return selectionConstraints; - } + public SelectionConstraints getSelectionConstraints() { + return selectionConstraints; + } - public void setSelectionConstraints(SelectionConstraints selectionConstraints) { - this.selectionConstraints = selectionConstraints; - } + public void setSelectionConstraints(SelectionConstraints selectionConstraints) { + this.selectionConstraints = selectionConstraints; + } - public void setSelCriteria(SelectionConstraints selCriteria) { - this.selectionConstraints = selCriteria; - } + public void setSelCriteria(SelectionConstraints selCriteria) { + this.selectionConstraints = selCriteria; + } - public String getOpenaireId() { - return openaireId; - } + public String getOpenaireId() { + return openaireId; + } - public void setOpenaireId(String openaireId) { - this.openaireId = openaireId; - } + public void setOpenaireId(String openaireId) { + this.openaireId = openaireId; + } - private void setSelCriteria(String json, VerbResolver resolver) { - log.info("Selection constraints for datasource = " + json); - selectionConstraints = new Gson().fromJson(json, SelectionConstraints.class); + private void setSelCriteria(String json, VerbResolver resolver) { + log.info("Selection constraints for datasource = " + json); + selectionConstraints = new Gson().fromJson(json, SelectionConstraints.class); - selectionConstraints.setSelection(resolver); - } + selectionConstraints.setSelection(resolver); + } - public void setSelCriteria(Node n, VerbResolver resolver) { - try { - setSelCriteria(n.getText(), resolver); - } catch (Exception e) { - log.info("not set selection criteria... "); - selectionConstraints = null; - } - } + public void setSelCriteria(Node n, VerbResolver resolver) { + try { + setSelCriteria(n.getText(), resolver); + } catch (Exception e) { + log.info("not set selection criteria... "); + selectionConstraints = null; + } + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Pair.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Pair.java index 78ffe860d..01cd3ce22 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Pair.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/Pair.java @@ -1,37 +1,39 @@ + package eu.dnetlib.dhp.community; -import com.google.gson.Gson; import java.io.Serializable; +import com.google.gson.Gson; + /** Created by miriam on 03/08/2018. 
*/ public class Pair implements Serializable { - private A fst; - private B snd; + private A fst; + private B snd; - public A getFst() { - return fst; - } + public A getFst() { + return fst; + } - public Pair setFst(A fst) { - this.fst = fst; - return this; - } + public Pair setFst(A fst) { + this.fst = fst; + return this; + } - public B getSnd() { - return snd; - } + public B getSnd() { + return snd; + } - public Pair setSnd(B snd) { - this.snd = snd; - return this; - } + public Pair setSnd(B snd) { + this.snd = snd; + return this; + } - public Pair(A a, B b) { - fst = a; - snd = b; - } + public Pair(A a, B b) { + fst = a; + snd = b; + } - public String toJson() { - return new Gson().toJson(this); - } + public String toJson() { + return new Gson().toJson(this); + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ProtoMap.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ProtoMap.java index 773955d4a..d48dce2c6 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ProtoMap.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ProtoMap.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.community; import java.io.Serializable; @@ -5,7 +6,7 @@ import java.util.HashMap; public class ProtoMap extends HashMap implements Serializable { - public ProtoMap() { - super(); - } + public ProtoMap() { + super(); + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/QueryInformationSystem.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/QueryInformationSystem.java index 7dd9339ef..2c18392c7 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/QueryInformationSystem.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/QueryInformationSystem.java @@ -1,62 +1,65 @@ + package eu.dnetlib.dhp.community; +import java.util.List; + +import org.dom4j.DocumentException; + import com.google.common.base.Joiner; + import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import java.util.List; -import org.dom4j.DocumentException; public class QueryInformationSystem { - private static final String XQUERY = - "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') " - + " let $subj := $x//CONFIGURATION/context/param[./@name='subject']/text() " - + " let $datasources := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::contentproviders')]/concept " - + " let $organizations := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::resultorganizations')]/concept " - + " let $communities := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::zenodocommunities')]/concept " - + " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] " - + " return " - + " " - + " { $x//CONFIGURATION/context/@id} " - + " " - + " {for $y in tokenize($subj,',') " - + " return " - + " {$y}} " - + " " - + " " - + " {for $d in $datasources " - + " where $d/param[./@name='enabled']/text()='true' " - + " return " - + " " - + " " - + " {$d//param[./@name='openaireId']/text()} " - + " " - + " " - + " {$d/param[./@name='selcriteria']/text()} " - + " " - + " } " - + " " - + " " - + " {for $zc in $communities " - + " return " - + " " - + " " - + " {$zc/param[./@name='zenodoid']/text()} " - + " " - + " " - + " 
{$zc/param[./@name='selcriteria']/text()} " - + " " - + " } " - + " " - + " "; + private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType') " + + " let $subj := $x//CONFIGURATION/context/param[./@name='subject']/text() " + + " let $datasources := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::contentproviders')]/concept " + + " let $organizations := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::resultorganizations')]/concept " + + " let $communities := $x//CONFIGURATION/context/category[./@id=concat($x//CONFIGURATION/context/@id,'::zenodocommunities')]/concept " + + " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri'] " + + " return " + + " " + + " { $x//CONFIGURATION/context/@id} " + + " " + + " {for $y in tokenize($subj,',') " + + " return " + + " {$y}} " + + " " + + " " + + " {for $d in $datasources " + + " where $d/param[./@name='enabled']/text()='true' " + + " return " + + " " + + " " + + " {$d//param[./@name='openaireId']/text()} " + + " " + + " " + + " {$d/param[./@name='selcriteria']/text()} " + + " " + + " } " + + " " + + " " + + " {for $zc in $communities " + + " return " + + " " + + " " + + " {$zc/param[./@name='zenodoid']/text()} " + + " " + + " " + + " {$zc/param[./@name='selcriteria']/text()} " + + " " + + " } " + + " " + + " "; - public static CommunityConfiguration getCommunityConfiguration(final String isLookupUrl) - throws ISLookUpException, DocumentException { - ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); - final List res = isLookUp.quickSearchProfile(XQUERY); + public static CommunityConfiguration getCommunityConfiguration(final String isLookupUrl) + throws ISLookUpException, DocumentException { + ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); + final List res = isLookUp.quickSearchProfile(XQUERY); - final String xmlConf = "" + Joiner.on(" ").join(res) + ""; + final String xmlConf = "" + Joiner.on(" ").join(res) + ""; - return CommunityConfigurationFactory.newInstance(xmlConf); - } + return CommunityConfigurationFactory.newInstance(xmlConf); + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ResultTagger.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ResultTagger.java index 8752a4c57..eb531c6b1 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ResultTagger.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ResultTagger.java @@ -1,224 +1,246 @@ + package eu.dnetlib.dhp.community; import static eu.dnetlib.dhp.community.TagginConstants.*; -import com.google.gson.Gson; -import com.jayway.jsonpath.DocumentContext; -import com.jayway.jsonpath.JsonPath; -import eu.dnetlib.dhp.schema.oaf.*; import java.io.Serializable; import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; + import org.apache.commons.lang3.StringUtils; +import com.google.gson.Gson; +import com.jayway.jsonpath.DocumentContext; +import com.jayway.jsonpath.JsonPath; + +import eu.dnetlib.dhp.schema.oaf.*; + /** Created by miriam on 02/08/2018. 
*/ public class ResultTagger implements Serializable { - private String trust = "0.8"; + private String trust = "0.8"; - private boolean clearContext(Result result) { - int tmp = result.getContext().size(); - List clist = - result.getContext().stream() - .filter(c -> (!c.getId().contains(ZENODO_COMMUNITY_INDICATOR))) - .collect(Collectors.toList()); - result.setContext(clist); - return (tmp != clist.size()); - } + private boolean clearContext(Result result) { + int tmp = result.getContext().size(); + List clist = result + .getContext() + .stream() + .filter(c -> (!c.getId().contains(ZENODO_COMMUNITY_INDICATOR))) + .collect(Collectors.toList()); + result.setContext(clist); + return (tmp != clist.size()); + } - private Map> getParamMap(final Result result, Map params) { - Map> param = new HashMap<>(); - String json = new Gson().toJson(result, Result.class); - DocumentContext jsonContext = JsonPath.parse(json); - if (params == null) { - params = new HashMap<>(); - } - for (String key : params.keySet()) { - try { - param.put(key, jsonContext.read(params.get(key))); - } catch (com.jayway.jsonpath.PathNotFoundException e) { - param.put(key, new ArrayList<>()); - // throw e; - } - } - return param; - } + private Map> getParamMap(final Result result, Map params) { + Map> param = new HashMap<>(); + String json = new Gson().toJson(result, Result.class); + DocumentContext jsonContext = JsonPath.parse(json); + if (params == null) { + params = new HashMap<>(); + } + for (String key : params.keySet()) { + try { + param.put(key, jsonContext.read(params.get(key))); + } catch (com.jayway.jsonpath.PathNotFoundException e) { + param.put(key, new ArrayList<>()); + // throw e; + } + } + return param; + } - public R enrichContextCriteria( - final R result, final CommunityConfiguration conf, final Map criteria) { + public R enrichContextCriteria( + final R result, final CommunityConfiguration conf, final Map criteria) { - // } - // public Result enrichContextCriteria(final Result result, final CommunityConfiguration - // conf, final Map criteria) { - final Map> param = getParamMap(result, criteria); + // } + // public Result enrichContextCriteria(final Result result, final CommunityConfiguration + // conf, final Map criteria) { + final Map> param = getParamMap(result, criteria); - // Verify if the entity is deletedbyinference. In case verify if to clean the context list - // from all the zenodo communities - if (result.getDataInfo().getDeletedbyinference()) { - clearContext(result); - return result; - } + // Verify if the entity is deletedbyinference. 
In case verify if to clean the context list + // from all the zenodo communities + if (result.getDataInfo().getDeletedbyinference()) { + clearContext(result); + return result; + } - // communities contains all the communities to be added as context for the result - final Set communities = new HashSet<>(); + // communities contains all the communities to be added as context for the result + final Set communities = new HashSet<>(); - // tagging for Subject - final Set subjects = new HashSet<>(); - Optional> oresultsubj = Optional.ofNullable(result.getSubject()); - if (oresultsubj.isPresent()) { - oresultsubj.get().stream() - .map(subject -> subject.getValue()) - .filter(StringUtils::isNotBlank) - .map(String::toLowerCase) - .map(String::trim) - .collect(Collectors.toCollection(HashSet::new)) - .forEach(s -> subjects.addAll(conf.getCommunityForSubjectValue(s))); - } + // tagging for Subject + final Set subjects = new HashSet<>(); + Optional> oresultsubj = Optional.ofNullable(result.getSubject()); + if (oresultsubj.isPresent()) { + oresultsubj + .get() + .stream() + .map(subject -> subject.getValue()) + .filter(StringUtils::isNotBlank) + .map(String::toLowerCase) + .map(String::trim) + .collect(Collectors.toCollection(HashSet::new)) + .forEach(s -> subjects.addAll(conf.getCommunityForSubjectValue(s))); + } - communities.addAll(subjects); + communities.addAll(subjects); - // Tagging for datasource - final Set datasources = new HashSet<>(); - final Set tmp = new HashSet<>(); + // Tagging for datasource + final Set datasources = new HashSet<>(); + final Set tmp = new HashSet<>(); - Optional> oresultinstance = Optional.ofNullable(result.getInstance()); - if (oresultinstance.isPresent()) { - for (Instance i : oresultinstance.get()) { - tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(), "|")); - tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(), "|")); - } + Optional> oresultinstance = Optional.ofNullable(result.getInstance()); + if (oresultinstance.isPresent()) { + for (Instance i : oresultinstance.get()) { + tmp.add(StringUtils.substringAfter(i.getCollectedfrom().getKey(), "|")); + tmp.add(StringUtils.substringAfter(i.getHostedby().getKey(), "|")); + } - oresultinstance.get().stream() - .map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey())) - .flatMap(p -> Stream.of(p.getFst(), p.getSnd())) - .map(s -> StringUtils.substringAfter(s, "|")) - .collect(Collectors.toCollection(HashSet::new)) - .forEach( - dsId -> - datasources.addAll( - conf.getCommunityForDatasource(dsId, param))); - } + oresultinstance + .get() + .stream() + .map(i -> new Pair<>(i.getCollectedfrom().getKey(), i.getHostedby().getKey())) + .flatMap(p -> Stream.of(p.getFst(), p.getSnd())) + .map(s -> StringUtils.substringAfter(s, "|")) + .collect(Collectors.toCollection(HashSet::new)) + .forEach( + dsId -> datasources + .addAll( + conf.getCommunityForDatasource(dsId, param))); + } - communities.addAll(datasources); + communities.addAll(datasources); - /*Tagging for Zenodo Communities*/ - final Set czenodo = new HashSet<>(); + /* Tagging for Zenodo Communities */ + final Set czenodo = new HashSet<>(); - Optional> oresultcontext = Optional.ofNullable(result.getContext()); - if (oresultcontext.isPresent()) { - oresultcontext.get().stream() - .filter(c -> c.getId().contains(ZENODO_COMMUNITY_INDICATOR)) - .collect(Collectors.toList()) - .forEach( - c -> - czenodo.addAll( - conf.getCommunityForZenodoCommunityValue( - c.getId() - .substring( - c.getId().lastIndexOf("/") + 1) - .trim()))); - } + 
Optional> oresultcontext = Optional.ofNullable(result.getContext()); + if (oresultcontext.isPresent()) { + oresultcontext + .get() + .stream() + .filter(c -> c.getId().contains(ZENODO_COMMUNITY_INDICATOR)) + .collect(Collectors.toList()) + .forEach( + c -> czenodo + .addAll( + conf + .getCommunityForZenodoCommunityValue( + c + .getId() + .substring( + c.getId().lastIndexOf("/") + 1) + .trim()))); + } - communities.addAll(czenodo); + communities.addAll(czenodo); - clearContext(result); + clearContext(result); - /*Verify if there is something to bulktag*/ - if (communities.isEmpty()) { - return result; - } + /* Verify if there is something to bulktag */ + if (communities.isEmpty()) { + return result; + } - result.getContext().stream() - .map( - c -> { - if (communities.contains(c.getId())) { - Optional> opt_dataInfoList = - Optional.ofNullable(c.getDataInfo()); - List dataInfoList; - if (opt_dataInfoList.isPresent()) - dataInfoList = opt_dataInfoList.get(); - else { - dataInfoList = new ArrayList<>(); - c.setDataInfo(dataInfoList); - } - if (subjects.contains(c.getId())) - dataInfoList.add( - getDataInfo( - BULKTAG_DATA_INFO_TYPE, - CLASS_ID_SUBJECT, - CLASS_NAME_BULKTAG_SUBJECT)); - if (datasources.contains(c.getId())) - dataInfoList.add( - getDataInfo( - BULKTAG_DATA_INFO_TYPE, - CLASS_ID_DATASOURCE, - CLASS_NAME_BULKTAG_DATASOURCE)); - if (czenodo.contains(c.getId())) - dataInfoList.add( - getDataInfo( - BULKTAG_DATA_INFO_TYPE, - CLASS_ID_CZENODO, - CLASS_NAME_BULKTAG_ZENODO)); - } - return c; - }) - .collect(Collectors.toList()); + result + .getContext() + .stream() + .map( + c -> { + if (communities.contains(c.getId())) { + Optional> opt_dataInfoList = Optional.ofNullable(c.getDataInfo()); + List dataInfoList; + if (opt_dataInfoList.isPresent()) + dataInfoList = opt_dataInfoList.get(); + else { + dataInfoList = new ArrayList<>(); + c.setDataInfo(dataInfoList); + } + if (subjects.contains(c.getId())) + dataInfoList + .add( + getDataInfo( + BULKTAG_DATA_INFO_TYPE, + CLASS_ID_SUBJECT, + CLASS_NAME_BULKTAG_SUBJECT)); + if (datasources.contains(c.getId())) + dataInfoList + .add( + getDataInfo( + BULKTAG_DATA_INFO_TYPE, + CLASS_ID_DATASOURCE, + CLASS_NAME_BULKTAG_DATASOURCE)); + if (czenodo.contains(c.getId())) + dataInfoList + .add( + getDataInfo( + BULKTAG_DATA_INFO_TYPE, + CLASS_ID_CZENODO, + CLASS_NAME_BULKTAG_ZENODO)); + } + return c; + }) + .collect(Collectors.toList()); - communities.removeAll( - result.getContext().stream().map(c -> c.getId()).collect(Collectors.toSet())); + communities + .removeAll( + result.getContext().stream().map(c -> c.getId()).collect(Collectors.toSet())); - if (communities.isEmpty()) return result; + if (communities.isEmpty()) + return result; - List toaddcontext = - communities.stream() - .map( - c -> { - Context context = new Context(); - context.setId(c); - List dataInfoList = new ArrayList<>(); - if (subjects.contains(c)) - dataInfoList.add( - getDataInfo( - BULKTAG_DATA_INFO_TYPE, - CLASS_ID_SUBJECT, - CLASS_NAME_BULKTAG_SUBJECT)); - if (datasources.contains(c)) - dataInfoList.add( - getDataInfo( - BULKTAG_DATA_INFO_TYPE, - CLASS_ID_DATASOURCE, - CLASS_NAME_BULKTAG_DATASOURCE)); - if (czenodo.contains(c)) - dataInfoList.add( - getDataInfo( - BULKTAG_DATA_INFO_TYPE, - CLASS_ID_CZENODO, - CLASS_NAME_BULKTAG_ZENODO)); - context.setDataInfo(dataInfoList); - return context; - }) - .collect(Collectors.toList()); + List toaddcontext = communities + .stream() + .map( + c -> { + Context context = new Context(); + context.setId(c); + List dataInfoList = new 
ArrayList<>(); + if (subjects.contains(c)) + dataInfoList + .add( + getDataInfo( + BULKTAG_DATA_INFO_TYPE, + CLASS_ID_SUBJECT, + CLASS_NAME_BULKTAG_SUBJECT)); + if (datasources.contains(c)) + dataInfoList + .add( + getDataInfo( + BULKTAG_DATA_INFO_TYPE, + CLASS_ID_DATASOURCE, + CLASS_NAME_BULKTAG_DATASOURCE)); + if (czenodo.contains(c)) + dataInfoList + .add( + getDataInfo( + BULKTAG_DATA_INFO_TYPE, + CLASS_ID_CZENODO, + CLASS_NAME_BULKTAG_ZENODO)); + context.setDataInfo(dataInfoList); + return context; + }) + .collect(Collectors.toList()); - result.getContext().addAll(toaddcontext); - return result; - } + result.getContext().addAll(toaddcontext); + return result; + } - public static DataInfo getDataInfo( - String inference_provenance, String inference_class_id, String inference_class_name) { - DataInfo di = new DataInfo(); - di.setInferred(true); - di.setInferenceprovenance(inference_provenance); - di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name)); - return di; - } + public static DataInfo getDataInfo( + String inference_provenance, String inference_class_id, String inference_class_name) { + DataInfo di = new DataInfo(); + di.setInferred(true); + di.setInferenceprovenance(inference_provenance); + di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name)); + return di; + } - public static Qualifier getQualifier(String inference_class_id, String inference_class_name) { - Qualifier pa = new Qualifier(); - pa.setClassid(inference_class_id); - pa.setClassname(inference_class_name); - pa.setSchemeid(DNET_SCHEMA_ID); - pa.setSchemename(DNET_SCHEMA_NAME); - return pa; - } + public static Qualifier getQualifier(String inference_class_id, String inference_class_name) { + Qualifier pa = new Qualifier(); + pa.setClassid(inference_class_id); + pa.setClassname(inference_class_name); + pa.setSchemeid(DNET_SCHEMA_ID); + pa.setSchemename(DNET_SCHEMA_NAME); + return pa; + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/SelectionConstraints.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/SelectionConstraints.java index 530861425..802e2f5d6 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/SelectionConstraints.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/SelectionConstraints.java @@ -1,46 +1,51 @@ + package eu.dnetlib.dhp.community; -import com.google.gson.Gson; -import com.google.gson.reflect.TypeToken; -import eu.dnetlib.dhp.selectioncriteria.VerbResolver; import java.io.Serializable; import java.lang.reflect.Type; import java.util.Collection; import java.util.List; import java.util.Map; +import com.google.gson.Gson; +import com.google.gson.reflect.TypeToken; + +import eu.dnetlib.dhp.selectioncriteria.VerbResolver; + public class SelectionConstraints implements Serializable { - private List criteria; + private List criteria; - public SelectionConstraints() {} + public SelectionConstraints() { + } - public List getCriteria() { - return criteria; - } + public List getCriteria() { + return criteria; + } - public void setCriteria(List criteria) { - this.criteria = criteria; - } + public void setCriteria(List criteria) { + this.criteria = criteria; + } - public void setSc(String json) { - Type collectionType = new TypeToken>() {}.getType(); - criteria = new Gson().fromJson(json, collectionType); - } + public void setSc(String json) { + Type collectionType = new TypeToken>() { + }.getType(); + criteria = new Gson().fromJson(json, collectionType); + 
} - // Constraints in or - public boolean verifyCriteria(final Map> param) { - for (Constraints selc : criteria) { - if (selc.verifyCriteria(param)) { - return true; - } - } - return false; - } + // Constraints in or + public boolean verifyCriteria(final Map> param) { + for (Constraints selc : criteria) { + if (selc.verifyCriteria(param)) { + return true; + } + } + return false; + } - public void setSelection(VerbResolver resolver) { + public void setSelection(VerbResolver resolver) { - for (Constraints cs : criteria) { - cs.setSelection(resolver); - } - } + for (Constraints cs : criteria) { + cs.setSelection(resolver); + } + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/TagginConstants.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/TagginConstants.java index 9f681472a..92d37d089 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/TagginConstants.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/TagginConstants.java @@ -1,23 +1,23 @@ + package eu.dnetlib.dhp.community; public class TagginConstants { - public static final String BULKTAG_DATA_INFO_TYPE = "bulktagging"; + public static final String BULKTAG_DATA_INFO_TYPE = "bulktagging"; - public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions"; - public static final String DNET_SCHEMA_ID = "dnet:provenanceActions"; + public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions"; + public static final String DNET_SCHEMA_ID = "dnet:provenanceActions"; - public static final String CLASS_ID_SUBJECT = "community:subject"; - public static final String CLASS_ID_DATASOURCE = "community:datasource"; - public static final String CLASS_ID_CZENODO = "community:zenodocommunity"; + public static final String CLASS_ID_SUBJECT = "community:subject"; + public static final String CLASS_ID_DATASOURCE = "community:datasource"; + public static final String CLASS_ID_CZENODO = "community:zenodocommunity"; - public static final String SCHEMA_ID = "dnet:provenanceActions"; - public static final String COUNTER_GROUP = "Bulk Tagging"; + public static final String SCHEMA_ID = "dnet:provenanceActions"; + public static final String COUNTER_GROUP = "Bulk Tagging"; - public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/"; + public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/"; - public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject"; - public static final String CLASS_NAME_BULKTAG_DATASOURCE = - "Bulktagging for Community - Datasource"; - public static final String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo"; + public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject"; + public static final String CLASS_NAME_BULKTAG_DATASOURCE = "Bulktagging for Community - Datasource"; + public static final String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo"; } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ZenodoCommunity.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ZenodoCommunity.java index 19d97d221..e1492f6a5 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ZenodoCommunity.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/community/ZenodoCommunity.java @@ -1,42 +1,45 @@ + package eu.dnetlib.dhp.community; -import com.google.gson.Gson; import java.io.Serializable; + import 
org.dom4j.Node; +import com.google.gson.Gson; + /** Created by miriam on 01/08/2018. */ public class ZenodoCommunity implements Serializable { - private String zenodoCommunityId; + private String zenodoCommunityId; - private SelectionConstraints selCriteria; + private SelectionConstraints selCriteria; - public String getZenodoCommunityId() { - return zenodoCommunityId; - } + public String getZenodoCommunityId() { + return zenodoCommunityId; + } - public void setZenodoCommunityId(String zenodoCommunityId) { - this.zenodoCommunityId = zenodoCommunityId; - } + public void setZenodoCommunityId(String zenodoCommunityId) { + this.zenodoCommunityId = zenodoCommunityId; + } - public SelectionConstraints getSelCriteria() { - return selCriteria; - } + public SelectionConstraints getSelCriteria() { + return selCriteria; + } - public void setSelCriteria(SelectionConstraints selCriteria) { - this.selCriteria = selCriteria; - } + public void setSelCriteria(SelectionConstraints selCriteria) { + this.selCriteria = selCriteria; + } - private void setSelCriteria(String json) { - // Type collectionType = new TypeToken>(){}.getType(); - selCriteria = new Gson().fromJson(json, SelectionConstraints.class); - } + private void setSelCriteria(String json) { + // Type collectionType = new TypeToken>(){}.getType(); + selCriteria = new Gson().fromJson(json, SelectionConstraints.class); + } - public void setSelCriteria(Node n) { - if (n == null) { - selCriteria = null; - } else { - setSelCriteria(n.getText()); - } - } + public void setSelCriteria(Node n) { + if (n == null) { + selCriteria = null; + } else { + setSelCriteria(n.getText()); + } + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerb.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerb.java index fc6456a8c..a6ef2d908 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerb.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerb.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.selectioncriteria; import java.io.Serializable; @@ -5,24 +6,25 @@ import java.io.Serializable; @VerbClass("contains") public class ContainsVerb implements Selection, Serializable { - private String param; + private String param; - public ContainsVerb() {} + public ContainsVerb() { + } - public ContainsVerb(final String param) { - this.param = param; - } + public ContainsVerb(final String param) { + this.param = param; + } - @Override - public boolean apply(String value) { - return value.contains(param); - } + @Override + public boolean apply(String value) { + return value.contains(param); + } - public String getParam() { - return param; - } + public String getParam() { + return param; + } - public void setParam(String param) { - this.param = param; - } + public void setParam(String param) { + this.param = param; + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerbIgnoreCase.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerbIgnoreCase.java index d5651e5b8..b8b0262e9 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerbIgnoreCase.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/ContainsVerbIgnoreCase.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.selectioncriteria; import java.io.Serializable; @@ -5,24 +6,25 @@ import java.io.Serializable; 
@VerbClass("contains_ignorecase") public class ContainsVerbIgnoreCase implements Selection, Serializable { - private String param; + private String param; - public ContainsVerbIgnoreCase() {} + public ContainsVerbIgnoreCase() { + } - public ContainsVerbIgnoreCase(final String param) { - this.param = param; - } + public ContainsVerbIgnoreCase(final String param) { + this.param = param; + } - @Override - public boolean apply(String value) { - return value.toLowerCase().contains(param.toLowerCase()); - } + @Override + public boolean apply(String value) { + return value.toLowerCase().contains(param.toLowerCase()); + } - public String getParam() { - return param; - } + public String getParam() { + return param; + } - public void setParam(String param) { - this.param = param; - } + public void setParam(String param) { + this.param = param; + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerb.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerb.java index cc793f8f8..3f17a6bb3 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerb.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerb.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.selectioncriteria; import java.io.Serializable; @@ -5,24 +6,25 @@ import java.io.Serializable; @VerbClass("equals") public class EqualVerb implements Selection, Serializable { - private String param; + private String param; - public EqualVerb() {} + public EqualVerb() { + } - public EqualVerb(final String param) { - this.param = param; - } + public EqualVerb(final String param) { + this.param = param; + } - @Override - public boolean apply(String value) { - return value.equals(param); - } + @Override + public boolean apply(String value) { + return value.equals(param); + } - public String getParam() { - return param; - } + public String getParam() { + return param; + } - public void setParam(String param) { - this.param = param; - } + public void setParam(String param) { + this.param = param; + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerbIgnoreCase.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerbIgnoreCase.java index 26c636090..934406859 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerbIgnoreCase.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/EqualVerbIgnoreCase.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.selectioncriteria; import java.io.Serializable; @@ -5,24 +6,25 @@ import java.io.Serializable; @VerbClass("equals_ignorecase") public class EqualVerbIgnoreCase implements Selection, Serializable { - private String param; + private String param; - public EqualVerbIgnoreCase() {} + public EqualVerbIgnoreCase() { + } - public EqualVerbIgnoreCase(final String param) { - this.param = param; - } + public EqualVerbIgnoreCase(final String param) { + this.param = param; + } - @Override - public boolean apply(String value) { - return value.equalsIgnoreCase(param); - } + @Override + public boolean apply(String value) { + return value.equalsIgnoreCase(param); + } - public String getParam() { - return param; - } + public String getParam() { + return param; + } - public void setParam(String param) { - this.param = param; - } + public void setParam(String param) { + this.param = param; + } } diff --git 
a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/InterfaceAdapter.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/InterfaceAdapter.java index a4a1494b3..9ef3bd60c 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/InterfaceAdapter.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/InterfaceAdapter.java @@ -1,40 +1,43 @@ + package eu.dnetlib.dhp.selectioncriteria; -import com.google.gson.*; import java.lang.reflect.Type; +import com.google.gson.*; + public class InterfaceAdapter implements JsonSerializer, JsonDeserializer { - private static final String CLASSNAME = "CLASSNAME"; - private static final String DATA = "DATA"; + private static final String CLASSNAME = "CLASSNAME"; + private static final String DATA = "DATA"; - public Object deserialize( - JsonElement jsonElement, - Type type, - JsonDeserializationContext jsonDeserializationContext) - throws JsonParseException { + public Object deserialize( + JsonElement jsonElement, + Type type, + JsonDeserializationContext jsonDeserializationContext) + throws JsonParseException { - JsonObject jsonObject = jsonElement.getAsJsonObject(); - JsonPrimitive prim = (JsonPrimitive) jsonObject.get(CLASSNAME); - String className = prim.getAsString(); - Class klass = getObjectClass(className); - return jsonDeserializationContext.deserialize(jsonObject.get(DATA), klass); - } + JsonObject jsonObject = jsonElement.getAsJsonObject(); + JsonPrimitive prim = (JsonPrimitive) jsonObject.get(CLASSNAME); + String className = prim.getAsString(); + Class klass = getObjectClass(className); + return jsonDeserializationContext.deserialize(jsonObject.get(DATA), klass); + } - public JsonElement serialize( - Object jsonElement, Type type, JsonSerializationContext jsonSerializationContext) { - JsonObject jsonObject = new JsonObject(); - jsonObject.addProperty(CLASSNAME, jsonElement.getClass().getName()); - jsonObject.add(DATA, jsonSerializationContext.serialize(jsonElement)); - return jsonObject; - } - /** **** Helper method to get the className of the object to be deserialized **** */ - public Class getObjectClass(String className) { - try { - return Class.forName(className); - } catch (ClassNotFoundException e) { - // e.printStackTrace(); - throw new JsonParseException(e.getMessage()); - } - } + public JsonElement serialize( + Object jsonElement, Type type, JsonSerializationContext jsonSerializationContext) { + JsonObject jsonObject = new JsonObject(); + jsonObject.addProperty(CLASSNAME, jsonElement.getClass().getName()); + jsonObject.add(DATA, jsonSerializationContext.serialize(jsonElement)); + return jsonObject; + } + + /** **** Helper method to get the className of the object to be deserialized **** */ + public Class getObjectClass(String className) { + try { + return Class.forName(className); + } catch (ClassNotFoundException e) { + // e.printStackTrace(); + throw new JsonParseException(e.getMessage()); + } + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerb.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerb.java index f07540d35..eb83b256e 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerb.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerb.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.selectioncriteria; import java.io.Serializable; @@ -5,24 +6,25 
@@ import java.io.Serializable; @VerbClass("not_contains") public class NotContainsVerb implements Selection, Serializable { - private String param; + private String param; - public NotContainsVerb() {} + public NotContainsVerb() { + } - public NotContainsVerb(final String param) { - this.param = param; - } + public NotContainsVerb(final String param) { + this.param = param; + } - @Override - public boolean apply(String value) { - return !value.contains(param); - } + @Override + public boolean apply(String value) { + return !value.contains(param); + } - public String getParam() { - return param; - } + public String getParam() { + return param; + } - public void setParam(String param) { - this.param = param; - } + public void setParam(String param) { + this.param = param; + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerbIgnoreCase.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerbIgnoreCase.java index e26b4a1a3..fab3efef3 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerbIgnoreCase.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotContainsVerbIgnoreCase.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.selectioncriteria; import java.io.Serializable; @@ -5,24 +6,25 @@ import java.io.Serializable; @VerbClass("not_contains_ignorecase") public class NotContainsVerbIgnoreCase implements Selection, Serializable { - private String param; + private String param; - public NotContainsVerbIgnoreCase() {} + public NotContainsVerbIgnoreCase() { + } - public NotContainsVerbIgnoreCase(final String param) { - this.param = param; - } + public NotContainsVerbIgnoreCase(final String param) { + this.param = param; + } - @Override - public boolean apply(String value) { - return !(value.toLowerCase().contains(param.toLowerCase())); - } + @Override + public boolean apply(String value) { + return !(value.toLowerCase().contains(param.toLowerCase())); + } - public String getParam() { - return param; - } + public String getParam() { + return param; + } - public void setParam(String param) { - this.param = param; - } + public void setParam(String param) { + this.param = param; + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerb.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerb.java index 1946812f5..2311c2987 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerb.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerb.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.selectioncriteria; import java.io.Serializable; @@ -5,24 +6,25 @@ import java.io.Serializable; @VerbClass("not_equals") public class NotEqualVerb implements Selection, Serializable { - private String param; + private String param; - public NotEqualVerb(final String param) { - this.param = param; - } + public NotEqualVerb(final String param) { + this.param = param; + } - public NotEqualVerb() {} + public NotEqualVerb() { + } - public String getParam() { - return param; - } + public String getParam() { + return param; + } - public void setParam(String param) { - this.param = param; - } + public void setParam(String param) { + this.param = param; + } - @Override - public boolean apply(String value) { - return !value.equals(param); - } + @Override + public boolean apply(String value) { + return 
!value.equals(param); + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerbIgnoreCase.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerbIgnoreCase.java index d669fa6ac..de2f682a5 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerbIgnoreCase.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/NotEqualVerbIgnoreCase.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.selectioncriteria; import java.io.Serializable; @@ -5,24 +6,25 @@ import java.io.Serializable; @VerbClass("not_equals_ignorecase") public class NotEqualVerbIgnoreCase implements Selection, Serializable { - private String param; + private String param; - public NotEqualVerbIgnoreCase(final String param) { - this.param = param; - } + public NotEqualVerbIgnoreCase(final String param) { + this.param = param; + } - public NotEqualVerbIgnoreCase() {} + public NotEqualVerbIgnoreCase() { + } - public String getParam() { - return param; - } + public String getParam() { + return param; + } - public void setParam(String param) { - this.param = param; - } + public void setParam(String param) { + this.param = param; + } - @Override - public boolean apply(String value) { - return !value.equalsIgnoreCase(param); - } + @Override + public boolean apply(String value) { + return !value.equalsIgnoreCase(param); + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/Selection.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/Selection.java index c9b30790f..b488bda01 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/Selection.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/Selection.java @@ -1,6 +1,7 @@ + package eu.dnetlib.dhp.selectioncriteria; public interface Selection { - boolean apply(String value); + boolean apply(String value); } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbClass.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbClass.java index 9a5fe4e8a..d467f934f 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbClass.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbClass.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.selectioncriteria; import java.lang.annotation.ElementType; @@ -9,5 +10,5 @@ import java.lang.annotation.Target; @Target(ElementType.TYPE) @interface VerbClass { - String value(); + String value(); } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolver.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolver.java index c2a9b4544..6a8ceebc3 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolver.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolver.java @@ -1,57 +1,56 @@ + package eu.dnetlib.dhp.selectioncriteria; -import io.github.classgraph.ClassGraph; -import io.github.classgraph.ClassInfo; -import io.github.classgraph.ClassInfoList; -import io.github.classgraph.ScanResult; import java.io.Serializable; import java.lang.reflect.InvocationTargetException; import java.util.Map; import java.util.stream.Collectors; +import io.github.classgraph.ClassGraph; +import io.github.classgraph.ClassInfo; +import 
io.github.classgraph.ClassInfoList; +import io.github.classgraph.ScanResult; + public class VerbResolver implements Serializable { - private Map> map = null; // = new HashMap<>(); - private final ClassGraph classgraph = new ClassGraph(); + private Map> map = null; // = new HashMap<>(); + private final ClassGraph classgraph = new ClassGraph(); - public VerbResolver() { + public VerbResolver() { - try (ScanResult scanResult = // Assign scanResult in try-with-resources - classgraph // Create a new ClassGraph instance - .verbose() // If you want to enable logging to stderr - .enableAllInfo() // Scan classes, methods, fields, annotations - .whitelistPackages( - "eu.dnetlib.dhp.selectioncriteria") // Scan com.xyz and subpackages - .scan()) { // Perform the scan and return a ScanResult + try (ScanResult scanResult = // Assign scanResult in try-with-resources + classgraph // Create a new ClassGraph instance + .verbose() // If you want to enable logging to stderr + .enableAllInfo() // Scan classes, methods, fields, annotations + .whitelistPackages( + "eu.dnetlib.dhp.selectioncriteria") // Scan com.xyz and subpackages + .scan()) { // Perform the scan and return a ScanResult - ClassInfoList routeClassInfoList = - scanResult.getClassesWithAnnotation( - "eu.dnetlib.dhp.selectioncriteria.VerbClass"); + ClassInfoList routeClassInfoList = scanResult + .getClassesWithAnnotation( + "eu.dnetlib.dhp.selectioncriteria.VerbClass"); - this.map = - routeClassInfoList.stream() - .collect( - Collectors.toMap( - value -> - (String) - ((ClassInfo) value) - .getAnnotationInfo() - .get(0) - .getParameterValues() - .get(0) - .getValue(), - value -> - (Class) - ((ClassInfo) value).loadClass())); - } catch (Exception e) { - e.printStackTrace(); - } - } + this.map = routeClassInfoList + .stream() + .collect( + Collectors + .toMap( + value -> (String) ((ClassInfo) value) + .getAnnotationInfo() + .get(0) + .getParameterValues() + .get(0) + .getValue(), + value -> (Class) ((ClassInfo) value).loadClass())); + } catch (Exception e) { + e.printStackTrace(); + } + } - public Selection getSelectionCriteria(String name, String param) - throws NoSuchMethodException, IllegalAccessException, InvocationTargetException, - InstantiationException { + public Selection getSelectionCriteria(String name, String param) + throws NoSuchMethodException, IllegalAccessException, InvocationTargetException, + InstantiationException { - // return Class.forName(tmp_map.get(name)). - return map.get(name).getDeclaredConstructor((String.class)).newInstance(param); - } + // return Class.forName(tmp_map.get(name)). 
+ return map.get(name).getDeclaredConstructor((String.class)).newInstance(param); + } } diff --git a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolverFactory.java b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolverFactory.java index c878c7f58..58bf60d42 100644 --- a/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolverFactory.java +++ b/dhp-workflows/dhp-bulktag/src/main/java/eu/dnetlib/dhp/selectioncriteria/VerbResolverFactory.java @@ -1,9 +1,10 @@ + package eu.dnetlib.dhp.selectioncriteria; public class VerbResolverFactory { - public static VerbResolver newInstance() { + public static VerbResolver newInstance() { - return new VerbResolver(); - } + return new VerbResolver(); + } } diff --git a/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/BulkTagJobTest.java b/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/BulkTagJobTest.java index 04c8816df..6873f2df9 100644 --- a/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/BulkTagJobTest.java +++ b/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/BulkTagJobTest.java @@ -1,16 +1,12 @@ + package eu.dnetlib.dhp; import static eu.dnetlib.dhp.community.TagginConstants.ZENODO_COMMUNITY_INDICATOR; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.bulktag.SparkBulkTagJob2; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Software; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; + import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -26,778 +22,832 @@ import org.mortbay.util.IO; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.bulktag.SparkBulkTagJob2; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Software; + public class BulkTagJobTest { - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - - private static final ClassLoader cl = eu.dnetlib.dhp.BulkTagJobTest.class.getClassLoader(); - - private static SparkSession spark; - - private static Path workingDir; - private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.dhp.BulkTagJobTest.class); - - private static String taggingConf = ""; - - static { - try { - taggingConf = - IO.toString( - BulkTagJobTest.class.getResourceAsStream( - "/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml")); - } catch (IOException e) { - e.printStackTrace(); - } - } - - @BeforeAll - public static void beforeAll() throws IOException { - workingDir = Files.createTempDirectory(eu.dnetlib.dhp.BulkTagJobTest.class.getSimpleName()); - log.info("using work dir {}", workingDir); - - SparkConf conf = new SparkConf(); - conf.setAppName(eu.dnetlib.dhp.BulkTagJobTest.class.getSimpleName()); - - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - - spark = - SparkSession.builder() - .appName(BulkTagJobTest.class.getSimpleName()) - .config(conf) - 
.getOrCreate(); - } - - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } - - @Test - public void noUpdatesTest() throws Exception { - SparkBulkTagJob2.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass().getResource("/eu/dnetlib/dhp/sample/dataset/no_updates").getPath(), - "-taggingConf", - taggingConf, - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Dataset", - "-outputPath", - workingDir.toString() + "/dataset", - "-isLookupUrl", - "http://beta.services.openaire.eu:8280/is/services/isLookUp", - "-protoMap", - "{ \"author\" : \"$['author'][*]['fullname']\"," - + " \"title\" : \"$['title'][*]['value']\"," - + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," - + " \"contributor\" : \"$['contributor'][*]['value']\"," - + " \"description\" : \"$['description'][*]['value']\"}" - // "-preparedInfoPath", - // getClass().getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo").getPath() - }); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/dataset") - .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); - - Assertions.assertEquals(10, tmp.count()); - org.apache.spark.sql.Dataset verificationDataset = - spark.createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); - - verificationDataset.createOrReplaceTempView("dataset"); - - String query = - "select id, MyT.id community " - + "from dataset " - + "lateral view explode(context) c as MyT " - + "lateral view explode(MyT.datainfo) d as MyD " - + "where MyD.inferenceprovenance = 'bulktagging'"; - - Assertions.assertEquals(0, spark.sql(query).count()); - } - - @Test - public void bulktagBySubjectNoPreviousContextTest() throws Exception { - SparkBulkTagJob2.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource("/eu/dnetlib/dhp/sample/dataset/update_subject/nocontext") - .getPath(), - "-taggingConf", - taggingConf, - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Dataset", - "-outputPath", - workingDir.toString() + "/dataset", - "-isLookupUrl", - "http://beta.services.openaire.eu:8280/is/services/isLookUp", - "-protoMap", - "{ \"author\" : \"$['author'][*]['fullname']\"," - + " \"title\" : \"$['title'][*]['value']\"," - + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," - + " \"contributor\" : \"$['contributor'][*]['value']\"," - + " \"description\" : \"$['description'][*]['value']\"}" - }); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/dataset") - .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); - - Assertions.assertEquals(10, tmp.count()); - org.apache.spark.sql.Dataset verificationDataset = - spark.createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); - - verificationDataset.createOrReplaceTempView("dataset"); - - String query = - "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " - + "from dataset " - + "lateral view explode(context) c as MyT " - + "lateral view explode(MyT.datainfo) d as MyD " - + "where MyD.inferenceprovenance = 'bulktagging'"; - - Assertions.assertEquals(5, spark.sql(query).count()); - - org.apache.spark.sql.Dataset 
idExplodeCommunity = spark.sql(query); - Assertions.assertEquals( - 5, idExplodeCommunity.filter("provenance = 'community:subject'").count()); - Assertions.assertEquals( - 5, - idExplodeCommunity.filter("name = 'Bulktagging for Community - Subject'").count()); - - Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'covid-19'").count()); - Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'mes'").count()); - Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'fam'").count()); - Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'aginfra'").count()); - - Assertions.assertEquals( - 1, - idExplodeCommunity - .filter("id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529'") - .count()); - Assertions.assertEquals( - 1, - idExplodeCommunity - .filter( - "community = 'covid-19' and id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529'") - .count()); - - Assertions.assertEquals( - 2, - idExplodeCommunity - .filter("id = '50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b'") - .count()); - Assertions.assertEquals( - 2, - idExplodeCommunity - .filter( - "(community = 'covid-19' or community = 'aginfra') and id = '50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b'") - .count()); - - Assertions.assertEquals( - 2, - idExplodeCommunity - .filter("id = '50|od______3989::0f89464c4ac4c398fe0c71433b175a62'") - .count()); - Assertions.assertEquals( - 2, - idExplodeCommunity - .filter( - "(community = 'mes' or community = 'fam') and id = '50|od______3989::0f89464c4ac4c398fe0c71433b175a62'") - .count()); - } - - @Test - public void bulktagBySubjectPreviousContextNoProvenanceTest() throws Exception { - SparkBulkTagJob2.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/sample/dataset/update_subject/contextnoprovenance") - .getPath(), - "-taggingConf", - taggingConf, - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Dataset", - "-outputPath", - workingDir.toString() + "/dataset", - "-isLookupUrl", - "http://beta.services.openaire.eu:8280/is/services/isLookUp", - "-protoMap", - "{ \"author\" : \"$['author'][*]['fullname']\"," - + " \"title\" : \"$['title'][*]['value']\"," - + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," - + " \"contributor\" : \"$['contributor'][*]['value']\"," - + " \"description\" : \"$['description'][*]['value']\"}" - }); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/dataset") - .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); - - Assertions.assertEquals(10, tmp.count()); - org.apache.spark.sql.Dataset verificationDataset = - spark.createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); - - verificationDataset.createOrReplaceTempView("dataset"); - - String query = - "select id, MyT.id community, MyD.provenanceaction.classid provenance " - + "from dataset " - + "lateral view explode(context) c as MyT " - + "lateral view explode(MyT.datainfo) d as MyD " - + "where MyT.id = 'covid-19' "; - - Assertions.assertEquals(3, spark.sql(query).count()); - - org.apache.spark.sql.Dataset communityContext = spark.sql(query); - - Assertions.assertEquals( - 2, - communityContext - .filter("id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529'") - .count()); - Assertions.assertEquals( - 1, - communityContext - .filter( - "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and 
provenance = 'community:subject'") - .count()); - Assertions.assertEquals( - 1, - communityContext - .filter( - "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and provenance = 'propagation:community:productsthroughsemrel'") - .count()); - - query = - "select id, MyT.id community, size(MyT.datainfo) datainfosize " - + "from dataset " - + "lateral view explode (context) as MyT " - + "where size(MyT.datainfo) > 0"; - - Assertions.assertEquals( - 2, - spark.sql(query) - .select("datainfosize") - .where( - "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' a" - + "nd community = 'covid-19'") - .collectAsList() - .get(0) - .getInt(0)); - } - - @Test - public void bulktagByDatasourceTest() throws Exception { - SparkBulkTagJob2.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource("/eu/dnetlib/dhp/sample/publication/update_datasource") - .getPath(), - "-taggingConf", - taggingConf, - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Publication", - "-outputPath", - workingDir.toString() + "/publication", - "-isLookupUrl", - "http://beta.services.openaire.eu:8280/is/services/isLookUp", - "-protoMap", - "{ \"author\" : \"$['author'][*]['fullname']\"," - + " \"title\" : \"$['title'][*]['value']\"," - + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," - + " \"contributor\" : \"$['contributor'][*]['value']\"," - + " \"description\" : \"$['description'][*]['value']\"}" - }); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/publication") - .map(item -> OBJECT_MAPPER.readValue(item, Publication.class)); - - Assertions.assertEquals(10, tmp.count()); - org.apache.spark.sql.Dataset verificationDataset = - spark.createDataset(tmp.rdd(), Encoders.bean(Publication.class)); - - verificationDataset.createOrReplaceTempView("publication"); - - String query = - "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " - + "from publication " - + "lateral view explode(context) c as MyT " - + "lateral view explode(MyT.datainfo) d as MyD " - + "where MyD.inferenceprovenance = 'bulktagging'"; - - org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query); - - Assertions.assertEquals(5, idExplodeCommunity.count()); - Assertions.assertEquals( - 5, idExplodeCommunity.filter("provenance = 'community:datasource'").count()); - Assertions.assertEquals( - 5, - idExplodeCommunity - .filter("name = 'Bulktagging for Community - Datasource'") - .count()); - - Assertions.assertEquals(3, idExplodeCommunity.filter("community = 'fam'").count()); - Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'aginfra'").count()); - - Assertions.assertEquals( - 3, - idExplodeCommunity - .filter( - "community = 'fam' and (id = '50|ec_fp7health::000085c89f4b96dc2269bd37edb35306' " - + "or id = '50|ec_fp7health::000b9e61f83f5a4b0c35777b7bccdf38' " - + "or id = '50|ec_fp7health::0010eb63e181e3e91b8b6dc6b3e1c798')") - .count()); - - Assertions.assertEquals( - 2, - idExplodeCommunity - .filter( - "community = 'aginfra' and (id = '50|ec_fp7health::000c8195edd542e4e64ebb32172cbf89' " - + "or id = '50|ec_fp7health::0010eb63e181e3e91b8b6dc6b3e1c798')") - .count()); - } - - @Test - public void bulktagByZenodoCommunityTest() throws Exception { - SparkBulkTagJob2.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", 
- Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/sample/otherresearchproduct/update_zenodocommunity") - .getPath(), - "-taggingConf", - taggingConf, - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct", - "-outputPath", - workingDir.toString() + "/orp", - "-isLookupUrl", - "http://beta.services.openaire.eu:8280/is/services/isLookUp", - "-protoMap", - "{ \"author\" : \"$['author'][*]['fullname']\"," - + " \"title\" : \"$['title'][*]['value']\"," - + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," - + " \"contributor\" : \"$['contributor'][*]['value']\"," - + " \"description\" : \"$['description'][*]['value']\"}" - }); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/orp") - .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)); - - Assertions.assertEquals(10, tmp.count()); - org.apache.spark.sql.Dataset verificationDataset = - spark.createDataset(tmp.rdd(), Encoders.bean(OtherResearchProduct.class)); - - verificationDataset.createOrReplaceTempView("orp"); - - String query = - "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " - + "from orp " - + "lateral view explode(context) c as MyT " - + "lateral view explode(MyT.datainfo) d as MyD " - + "where MyD.inferenceprovenance = 'bulktagging'"; - - org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query); - Assertions.assertEquals(8, idExplodeCommunity.count()); - - Assertions.assertEquals( - 8, idExplodeCommunity.filter("provenance = 'community:zenodocommunity'").count()); - Assertions.assertEquals( - 8, - idExplodeCommunity.filter("name = 'Bulktagging for Community - Zenodo'").count()); - - Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'covid-19'").count()); - Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'aginfra'").count()); - Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'beopen'").count()); - Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'fam'").count()); - Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'mes'").count()); - - Assertions.assertEquals( - 1, - idExplodeCommunity - .filter( - "id = '50|od______2017::0750a4d0782265873d669520f5e33c07' " - + "and community = 'covid-19'") - .count()); - Assertions.assertEquals( - 3, - idExplodeCommunity - .filter( - "id = '50|od______2017::1bd97baef19dbd2db3203b112bb83bc5' and " - + "(community = 'aginfra' or community = 'mes' or community = 'fam')") - .count()); - Assertions.assertEquals( - 1, - idExplodeCommunity - .filter( - "id = '50|od______2017::1e400f1747487fd15998735c41a55c72' " - + "and community = 'beopen'") - .count()); - Assertions.assertEquals( - 3, - idExplodeCommunity - .filter( - "id = '50|od______2017::210281c5bc1c739a11ccceeeca806396' and " - + "(community = 'beopen' or community = 'fam' or community = 'mes')") - .count()); - - query = - "select id, MyT.id community, size(MyT.datainfo) datainfosize " - + "from orp " - + "lateral view explode (context) as MyT " - + "where size(MyT.datainfo) > 0"; - - Assertions.assertEquals( - 2, - spark.sql(query) - .select("datainfosize") - .where( - "id = '50|od______2017::210281c5bc1c739a11ccceeeca806396' a" - + "nd community = 'beopen'") - .collectAsList() - .get(0) - .getInt(0)); - - // verify the zenodo community context is not present anymore in the records - query = - "select id, 
MyT.id community " - + "from orp " - + "lateral view explode(context) c as MyT " - + "lateral view explode(MyT.datainfo) d as MyD "; - - org.apache.spark.sql.Dataset tmp2 = spark.sql(query); - - Assertions.assertEquals( - 0, - tmp2.select("community") - .where(tmp2.col("community").contains(ZENODO_COMMUNITY_INDICATOR)) - .count()); - } - - @Test - public void bulktagBySubjectDatasourceTest() throws Exception { - SparkBulkTagJob2.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource("/eu/dnetlib/dhp/sample/dataset/update_subject_datasource") - .getPath(), - "-taggingConf", - taggingConf, - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Dataset", - "-outputPath", - workingDir.toString() + "/dataset", - "-isLookupUrl", - "http://beta.services.openaire.eu:8280/is/services/isLookUp", - "-protoMap", - "{ \"author\" : \"$['author'][*]['fullname']\"," - + " \"title\" : \"$['title'][*]['value']\"," - + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," - + " \"contributor\" : \"$['contributor'][*]['value']\"," - + " \"description\" : \"$['description'][*]['value']\"}" - }); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/dataset") - .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); - - Assertions.assertEquals(10, tmp.count()); - org.apache.spark.sql.Dataset verificationDataset = - spark.createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); - - verificationDataset.createOrReplaceTempView("dataset"); - - String query = - "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " - + "from dataset " - + "lateral view explode(context) c as MyT " - + "lateral view explode(MyT.datainfo) d as MyD " - + "where MyD.inferenceprovenance = 'bulktagging'"; - - org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query); - Assertions.assertEquals(7, idExplodeCommunity.count()); - - Assertions.assertEquals( - 5, idExplodeCommunity.filter("provenance = 'community:subject'").count()); - Assertions.assertEquals( - 2, idExplodeCommunity.filter("provenance = 'community:datasource'").count()); - Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'covid-19'").count()); - Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'fam'").count()); - Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'aginfra'").count()); - Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'mes'").count()); - - query = - "select id, MyT.id community, size(MyT.datainfo) datainfosize " - + "from dataset " - + "lateral view explode (context) as MyT " - + "where size(MyT.datainfo) > 0"; - - org.apache.spark.sql.Dataset tmp2 = spark.sql(query); - - Assertions.assertEquals( - 2, - tmp2.select("datainfosize") - .where( - "id = '50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b' and " - + "community = 'aginfra'") - .collectAsList() - .get(0) - .getInt(0)); - - Assertions.assertEquals( - 1, - tmp2.select("datainfosize") - .where( - "id = '50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b' and " - + "community = 'covid-19'") - .collectAsList() - .get(0) - .getInt(0)); - - Assertions.assertEquals( - 2, - tmp2.select("datainfosize") - .where( - "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and " - + "community = 'fam'") - .collectAsList() - .get(0) - .getInt(0)); - Assertions.assertEquals( - 2, - 
tmp2.select("datainfosize") - .where( - "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and " - + "community = 'covid-19'") - .collectAsList() - .get(0) - .getInt(0)); - - Assertions.assertEquals( - 1, - tmp2.select("datainfosize") - .where( - "id = '50|od______3989::0f89464c4ac4c398fe0c71433b175a62' and " - + "community = 'fam'") - .collectAsList() - .get(0) - .getInt(0)); - Assertions.assertEquals( - 1, - tmp2.select("datainfosize") - .where( - "id = '50|od______3989::0f89464c4ac4c398fe0c71433b175a62' and " - + "community = 'mes'") - .collectAsList() - .get(0) - .getInt(0)); - } - - @Test - public void bulktagBySubjectDatasourceZenodoCommunityTest() throws Exception { - - SparkBulkTagJob2.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass().getResource("/eu/dnetlib/dhp/sample/software/").getPath(), - "-taggingConf", - taggingConf, - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Software", - "-outputPath", - workingDir.toString() + "/software", - "-isLookupUrl", - "http://beta.services.openaire.eu:8280/is/services/isLookUp", - "-protoMap", - "{ \"author\" : \"$['author'][*]['fullname']\"," - + " \"title\" : \"$['title'][*]['value']\"," - + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," - + " \"contributor\" : \"$['contributor'][*]['value']\"," - + " \"description\" : \"$['description'][*]['value']\"}" - }); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/software") - .map(item -> OBJECT_MAPPER.readValue(item, Software.class)); - - Assertions.assertEquals(10, tmp.count()); - org.apache.spark.sql.Dataset verificationDataset = - spark.createDataset(tmp.rdd(), Encoders.bean(Software.class)); - - verificationDataset.createOrReplaceTempView("software"); - - String query = - "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " - + "from software " - + "lateral view explode(context) c as MyT " - + "lateral view explode(MyT.datainfo) d as MyD " - + "where MyD.inferenceprovenance = 'bulktagging'"; - - org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query); - Assertions.assertEquals(10, idExplodeCommunity.count()); - - idExplodeCommunity.show(false); - Assertions.assertEquals( - 3, idExplodeCommunity.filter("provenance = 'community:subject'").count()); - Assertions.assertEquals( - 3, idExplodeCommunity.filter("provenance = 'community:datasource'").count()); - Assertions.assertEquals( - 4, idExplodeCommunity.filter("provenance = 'community:zenodocommunity'").count()); - - Assertions.assertEquals(3, idExplodeCommunity.filter("community = 'covid-19'").count()); - Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'dh-ch'").count()); - Assertions.assertEquals(4, idExplodeCommunity.filter("community = 'aginfra'").count()); - Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'dariah'").count()); - Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'fam'").count()); - - Assertions.assertEquals( - 2, - idExplodeCommunity - .filter( - "provenance = 'community:zenodocommunity' and " - + "id = '50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4' and (" - + "community = 'dh-ch' or community = 'dariah')") - .count()); - - query = - "select id, MyT.id community, size(MyT.datainfo) datainfosize " - + "from software " - + "lateral view explode (context) as MyT " - + "where 
size(MyT.datainfo) > 0"; - - org.apache.spark.sql.Dataset tmp2 = spark.sql(query); - - Assertions.assertEquals( - 2, - tmp2.select("datainfosize") - .where( - "id = '50|od______1582::501b25d420f808c8eddcd9b16e917f11' and " - + "community = 'covid-19'") - .collectAsList() - .get(0) - .getInt(0)); - - Assertions.assertEquals( - 3, - tmp2.select("datainfosize") - .where( - "id = '50|od______1582::581621232a561b7e8b4952b18b8b0e56' and " - + "community = 'aginfra'") - .collectAsList() - .get(0) - .getInt(0)); - } - - @Test - public void bulktagDatasourcewithConstraintsTest() throws Exception { - - SparkBulkTagJob2.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/sample/dataset/update_datasourcewithconstraints") - .getPath(), - "-taggingConf", - taggingConf, - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Dataset", - "-outputPath", - workingDir.toString() + "/dataset", - "-isLookupUrl", - "http://beta.services.openaire.eu:8280/is/services/isLookUp", - "-protoMap", - "{ \"author\" : \"$['author'][*]['fullname']\"," - + " \"title\" : \"$['title'][*]['value']\"," - + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," - + " \"contributor\" : \"$['contributor'][*]['value']\"," - + " \"description\" : \"$['description'][*]['value']\"}" - }); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/dataset") - .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); - - Assertions.assertEquals(10, tmp.count()); - org.apache.spark.sql.Dataset verificationDataset = - spark.createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); - - verificationDataset.createOrReplaceTempView("dataset"); - String query = - "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " - + "from dataset " - + "lateral view explode(context) c as MyT " - + "lateral view explode(MyT.datainfo) d as MyD " - + "where MyD.inferenceprovenance = 'bulktagging'"; - - org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query); - - idExplodeCommunity.show(false); - Assertions.assertEquals(3, idExplodeCommunity.count()); - - Assertions.assertEquals( - 3, idExplodeCommunity.filter("provenance = 'community:datasource'").count()); - } + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static final ClassLoader cl = eu.dnetlib.dhp.BulkTagJobTest.class.getClassLoader(); + + private static SparkSession spark; + + private static Path workingDir; + private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.dhp.BulkTagJobTest.class); + + private static String taggingConf = ""; + + static { + try { + taggingConf = IO + .toString( + BulkTagJobTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml")); + } catch (IOException e) { + e.printStackTrace(); + } + } + + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(eu.dnetlib.dhp.BulkTagJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); + + SparkConf conf = new SparkConf(); + conf.setAppName(eu.dnetlib.dhp.BulkTagJobTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", 
workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + + spark = SparkSession + .builder() + .appName(BulkTagJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } + + @Test + public void noUpdatesTest() throws Exception { + SparkBulkTagJob2 + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass().getResource("/eu/dnetlib/dhp/sample/dataset/no_updates").getPath(), + "-taggingConf", + taggingConf, + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", + workingDir.toString() + "/dataset", + "-isLookupUrl", + "http://beta.services.openaire.eu:8280/is/services/isLookUp", + "-protoMap", + "{ \"author\" : \"$['author'][*]['fullname']\"," + + " \"title\" : \"$['title'][*]['value']\"," + + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," + + " \"contributor\" : \"$['contributor'][*]['value']\"," + + " \"description\" : \"$['description'][*]['value']\"}" + // "-preparedInfoPath", + // getClass().getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo").getPath() + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + + verificationDataset.createOrReplaceTempView("dataset"); + + String query = "select id, MyT.id community " + + "from dataset " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'bulktagging'"; + + Assertions.assertEquals(0, spark.sql(query).count()); + } + + @Test + public void bulktagBySubjectNoPreviousContextTest() throws Exception { + SparkBulkTagJob2 + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass() + .getResource("/eu/dnetlib/dhp/sample/dataset/update_subject/nocontext") + .getPath(), + "-taggingConf", + taggingConf, + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", + workingDir.toString() + "/dataset", + "-isLookupUrl", + "http://beta.services.openaire.eu:8280/is/services/isLookUp", + "-protoMap", + "{ \"author\" : \"$['author'][*]['fullname']\"," + + " \"title\" : \"$['title'][*]['value']\"," + + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," + + " \"contributor\" : \"$['contributor'][*]['value']\"," + + " \"description\" : \"$['description'][*]['value']\"}" + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + + verificationDataset.createOrReplaceTempView("dataset"); + + String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " + + "from dataset " + + "lateral view 
explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'bulktagging'"; + + Assertions.assertEquals(5, spark.sql(query).count()); + + org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query); + Assertions + .assertEquals( + 5, idExplodeCommunity.filter("provenance = 'community:subject'").count()); + Assertions + .assertEquals( + 5, + idExplodeCommunity.filter("name = 'Bulktagging for Community - Subject'").count()); + + Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'covid-19'").count()); + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'mes'").count()); + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'fam'").count()); + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'aginfra'").count()); + + Assertions + .assertEquals( + 1, + idExplodeCommunity + .filter("id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529'") + .count()); + Assertions + .assertEquals( + 1, + idExplodeCommunity + .filter( + "community = 'covid-19' and id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529'") + .count()); + + Assertions + .assertEquals( + 2, + idExplodeCommunity + .filter("id = '50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b'") + .count()); + Assertions + .assertEquals( + 2, + idExplodeCommunity + .filter( + "(community = 'covid-19' or community = 'aginfra') and id = '50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b'") + .count()); + + Assertions + .assertEquals( + 2, + idExplodeCommunity + .filter("id = '50|od______3989::0f89464c4ac4c398fe0c71433b175a62'") + .count()); + Assertions + .assertEquals( + 2, + idExplodeCommunity + .filter( + "(community = 'mes' or community = 'fam') and id = '50|od______3989::0f89464c4ac4c398fe0c71433b175a62'") + .count()); + } + + @Test + public void bulktagBySubjectPreviousContextNoProvenanceTest() throws Exception { + SparkBulkTagJob2 + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass() + .getResource( + "/eu/dnetlib/dhp/sample/dataset/update_subject/contextnoprovenance") + .getPath(), + "-taggingConf", + taggingConf, + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", + workingDir.toString() + "/dataset", + "-isLookupUrl", + "http://beta.services.openaire.eu:8280/is/services/isLookUp", + "-protoMap", + "{ \"author\" : \"$['author'][*]['fullname']\"," + + " \"title\" : \"$['title'][*]['value']\"," + + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," + + " \"contributor\" : \"$['contributor'][*]['value']\"," + + " \"description\" : \"$['description'][*]['value']\"}" + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + + verificationDataset.createOrReplaceTempView("dataset"); + + String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance " + + "from dataset " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyT.id = 'covid-19' "; + + Assertions.assertEquals(3, spark.sql(query).count()); + + org.apache.spark.sql.Dataset communityContext = spark.sql(query); + + Assertions + 
.assertEquals( + 2, + communityContext + .filter("id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529'") + .count()); + Assertions + .assertEquals( + 1, + communityContext + .filter( + "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and provenance = 'community:subject'") + .count()); + Assertions + .assertEquals( + 1, + communityContext + .filter( + "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and provenance = 'propagation:community:productsthroughsemrel'") + .count()); + + query = "select id, MyT.id community, size(MyT.datainfo) datainfosize " + + "from dataset " + + "lateral view explode (context) as MyT " + + "where size(MyT.datainfo) > 0"; + + Assertions + .assertEquals( + 2, + spark + .sql(query) + .select("datainfosize") + .where( + "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' a" + + "nd community = 'covid-19'") + .collectAsList() + .get(0) + .getInt(0)); + } + + @Test + public void bulktagByDatasourceTest() throws Exception { + SparkBulkTagJob2 + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass() + .getResource("/eu/dnetlib/dhp/sample/publication/update_datasource") + .getPath(), + "-taggingConf", + taggingConf, + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.Publication", + "-outputPath", + workingDir.toString() + "/publication", + "-isLookupUrl", + "http://beta.services.openaire.eu:8280/is/services/isLookUp", + "-protoMap", + "{ \"author\" : \"$['author'][*]['fullname']\"," + + " \"title\" : \"$['title'][*]['value']\"," + + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," + + " \"contributor\" : \"$['contributor'][*]['value']\"," + + " \"description\" : \"$['description'][*]['value']\"}" + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/publication") + .map(item -> OBJECT_MAPPER.readValue(item, Publication.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Publication.class)); + + verificationDataset.createOrReplaceTempView("publication"); + + String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " + + "from publication " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'bulktagging'"; + + org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query); + + Assertions.assertEquals(5, idExplodeCommunity.count()); + Assertions + .assertEquals( + 5, idExplodeCommunity.filter("provenance = 'community:datasource'").count()); + Assertions + .assertEquals( + 5, + idExplodeCommunity + .filter("name = 'Bulktagging for Community - Datasource'") + .count()); + + Assertions.assertEquals(3, idExplodeCommunity.filter("community = 'fam'").count()); + Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'aginfra'").count()); + + Assertions + .assertEquals( + 3, + idExplodeCommunity + .filter( + "community = 'fam' and (id = '50|ec_fp7health::000085c89f4b96dc2269bd37edb35306' " + + "or id = '50|ec_fp7health::000b9e61f83f5a4b0c35777b7bccdf38' " + + "or id = '50|ec_fp7health::0010eb63e181e3e91b8b6dc6b3e1c798')") + .count()); + + Assertions + .assertEquals( + 2, + idExplodeCommunity + .filter( + "community = 'aginfra' and (id = 
'50|ec_fp7health::000c8195edd542e4e64ebb32172cbf89' " + + "or id = '50|ec_fp7health::0010eb63e181e3e91b8b6dc6b3e1c798')") + .count()); + } + + @Test + public void bulktagByZenodoCommunityTest() throws Exception { + SparkBulkTagJob2 + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass() + .getResource( + "/eu/dnetlib/dhp/sample/otherresearchproduct/update_zenodocommunity") + .getPath(), + "-taggingConf", + taggingConf, + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct", + "-outputPath", + workingDir.toString() + "/orp", + "-isLookupUrl", + "http://beta.services.openaire.eu:8280/is/services/isLookUp", + "-protoMap", + "{ \"author\" : \"$['author'][*]['fullname']\"," + + " \"title\" : \"$['title'][*]['value']\"," + + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," + + " \"contributor\" : \"$['contributor'][*]['value']\"," + + " \"description\" : \"$['description'][*]['value']\"}" + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/orp") + .map(item -> OBJECT_MAPPER.readValue(item, OtherResearchProduct.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(OtherResearchProduct.class)); + + verificationDataset.createOrReplaceTempView("orp"); + + String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " + + "from orp " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'bulktagging'"; + + org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query); + Assertions.assertEquals(8, idExplodeCommunity.count()); + + Assertions + .assertEquals( + 8, idExplodeCommunity.filter("provenance = 'community:zenodocommunity'").count()); + Assertions + .assertEquals( + 8, + idExplodeCommunity.filter("name = 'Bulktagging for Community - Zenodo'").count()); + + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'covid-19'").count()); + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'aginfra'").count()); + Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'beopen'").count()); + Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'fam'").count()); + Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'mes'").count()); + + Assertions + .assertEquals( + 1, + idExplodeCommunity + .filter( + "id = '50|od______2017::0750a4d0782265873d669520f5e33c07' " + + "and community = 'covid-19'") + .count()); + Assertions + .assertEquals( + 3, + idExplodeCommunity + .filter( + "id = '50|od______2017::1bd97baef19dbd2db3203b112bb83bc5' and " + + "(community = 'aginfra' or community = 'mes' or community = 'fam')") + .count()); + Assertions + .assertEquals( + 1, + idExplodeCommunity + .filter( + "id = '50|od______2017::1e400f1747487fd15998735c41a55c72' " + + "and community = 'beopen'") + .count()); + Assertions + .assertEquals( + 3, + idExplodeCommunity + .filter( + "id = '50|od______2017::210281c5bc1c739a11ccceeeca806396' and " + + "(community = 'beopen' or community = 'fam' or community = 'mes')") + .count()); + + query = "select id, MyT.id community, size(MyT.datainfo) datainfosize " + + "from orp " + + "lateral view explode (context) as MyT " + + "where size(MyT.datainfo) > 
0"; + + Assertions + .assertEquals( + 2, + spark + .sql(query) + .select("datainfosize") + .where( + "id = '50|od______2017::210281c5bc1c739a11ccceeeca806396' a" + + "nd community = 'beopen'") + .collectAsList() + .get(0) + .getInt(0)); + + // verify the zenodo community context is not present anymore in the records + query = "select id, MyT.id community " + + "from orp " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD "; + + org.apache.spark.sql.Dataset tmp2 = spark.sql(query); + + Assertions + .assertEquals( + 0, + tmp2 + .select("community") + .where(tmp2.col("community").contains(ZENODO_COMMUNITY_INDICATOR)) + .count()); + } + + @Test + public void bulktagBySubjectDatasourceTest() throws Exception { + SparkBulkTagJob2 + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass() + .getResource("/eu/dnetlib/dhp/sample/dataset/update_subject_datasource") + .getPath(), + "-taggingConf", + taggingConf, + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", + workingDir.toString() + "/dataset", + "-isLookupUrl", + "http://beta.services.openaire.eu:8280/is/services/isLookUp", + "-protoMap", + "{ \"author\" : \"$['author'][*]['fullname']\"," + + " \"title\" : \"$['title'][*]['value']\"," + + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," + + " \"contributor\" : \"$['contributor'][*]['value']\"," + + " \"description\" : \"$['description'][*]['value']\"}" + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + + verificationDataset.createOrReplaceTempView("dataset"); + + String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " + + "from dataset " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'bulktagging'"; + + org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query); + Assertions.assertEquals(7, idExplodeCommunity.count()); + + Assertions + .assertEquals( + 5, idExplodeCommunity.filter("provenance = 'community:subject'").count()); + Assertions + .assertEquals( + 2, idExplodeCommunity.filter("provenance = 'community:datasource'").count()); + Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'covid-19'").count()); + Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'fam'").count()); + Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'aginfra'").count()); + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'mes'").count()); + + query = "select id, MyT.id community, size(MyT.datainfo) datainfosize " + + "from dataset " + + "lateral view explode (context) as MyT " + + "where size(MyT.datainfo) > 0"; + + org.apache.spark.sql.Dataset tmp2 = spark.sql(query); + + Assertions + .assertEquals( + 2, + tmp2 + .select("datainfosize") + .where( + "id = '50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b' and " + + "community = 'aginfra'") + .collectAsList() + .get(0) + .getInt(0)); + + Assertions + .assertEquals( + 1, + tmp2 + .select("datainfosize") + .where( + "id = 
'50|od______3989::05d8c751462f9bb8d2b06956dfbc5c7b' and " + + "community = 'covid-19'") + .collectAsList() + .get(0) + .getInt(0)); + + Assertions + .assertEquals( + 2, + tmp2 + .select("datainfosize") + .where( + "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and " + + "community = 'fam'") + .collectAsList() + .get(0) + .getInt(0)); + Assertions + .assertEquals( + 2, + tmp2 + .select("datainfosize") + .where( + "id = '50|od______3989::02dd5d2c222191b0b9bd4f33c8e96529' and " + + "community = 'covid-19'") + .collectAsList() + .get(0) + .getInt(0)); + + Assertions + .assertEquals( + 1, + tmp2 + .select("datainfosize") + .where( + "id = '50|od______3989::0f89464c4ac4c398fe0c71433b175a62' and " + + "community = 'fam'") + .collectAsList() + .get(0) + .getInt(0)); + Assertions + .assertEquals( + 1, + tmp2 + .select("datainfosize") + .where( + "id = '50|od______3989::0f89464c4ac4c398fe0c71433b175a62' and " + + "community = 'mes'") + .collectAsList() + .get(0) + .getInt(0)); + } + + @Test + public void bulktagBySubjectDatasourceZenodoCommunityTest() throws Exception { + + SparkBulkTagJob2 + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass().getResource("/eu/dnetlib/dhp/sample/software/").getPath(), + "-taggingConf", + taggingConf, + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.Software", + "-outputPath", + workingDir.toString() + "/software", + "-isLookupUrl", + "http://beta.services.openaire.eu:8280/is/services/isLookUp", + "-protoMap", + "{ \"author\" : \"$['author'][*]['fullname']\"," + + " \"title\" : \"$['title'][*]['value']\"," + + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," + + " \"contributor\" : \"$['contributor'][*]['value']\"," + + " \"description\" : \"$['description'][*]['value']\"}" + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/software") + .map(item -> OBJECT_MAPPER.readValue(item, Software.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Software.class)); + + verificationDataset.createOrReplaceTempView("software"); + + String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " + + "from software " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'bulktagging'"; + + org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query); + Assertions.assertEquals(10, idExplodeCommunity.count()); + + idExplodeCommunity.show(false); + Assertions + .assertEquals( + 3, idExplodeCommunity.filter("provenance = 'community:subject'").count()); + Assertions + .assertEquals( + 3, idExplodeCommunity.filter("provenance = 'community:datasource'").count()); + Assertions + .assertEquals( + 4, idExplodeCommunity.filter("provenance = 'community:zenodocommunity'").count()); + + Assertions.assertEquals(3, idExplodeCommunity.filter("community = 'covid-19'").count()); + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'dh-ch'").count()); + Assertions.assertEquals(4, idExplodeCommunity.filter("community = 'aginfra'").count()); + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'dariah'").count()); + Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'fam'").count()); + + 
Assertions + .assertEquals( + 2, + idExplodeCommunity + .filter( + "provenance = 'community:zenodocommunity' and " + + "id = '50|od______1582::4132f5ec9496f0d6adc7b00a50a56ff4' and (" + + "community = 'dh-ch' or community = 'dariah')") + .count()); + + query = "select id, MyT.id community, size(MyT.datainfo) datainfosize " + + "from software " + + "lateral view explode (context) as MyT " + + "where size(MyT.datainfo) > 0"; + + org.apache.spark.sql.Dataset tmp2 = spark.sql(query); + + Assertions + .assertEquals( + 2, + tmp2 + .select("datainfosize") + .where( + "id = '50|od______1582::501b25d420f808c8eddcd9b16e917f11' and " + + "community = 'covid-19'") + .collectAsList() + .get(0) + .getInt(0)); + + Assertions + .assertEquals( + 3, + tmp2 + .select("datainfosize") + .where( + "id = '50|od______1582::581621232a561b7e8b4952b18b8b0e56' and " + + "community = 'aginfra'") + .collectAsList() + .get(0) + .getInt(0)); + } + + @Test + public void bulktagDatasourcewithConstraintsTest() throws Exception { + + SparkBulkTagJob2 + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass() + .getResource( + "/eu/dnetlib/dhp/sample/dataset/update_datasourcewithconstraints") + .getPath(), + "-taggingConf", + taggingConf, + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", + workingDir.toString() + "/dataset", + "-isLookupUrl", + "http://beta.services.openaire.eu:8280/is/services/isLookUp", + "-protoMap", + "{ \"author\" : \"$['author'][*]['fullname']\"," + + " \"title\" : \"$['title'][*]['value']\"," + + " \"orcid\" : \"$['author'][*]['pid'][*][?(@['key']=='ORCID')]['value']\"," + + " \"contributor\" : \"$['contributor'][*]['value']\"," + + " \"description\" : \"$['description'][*]['value']\"}" + }); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + + verificationDataset.createOrReplaceTempView("dataset"); + String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name " + + "from dataset " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'bulktagging'"; + + org.apache.spark.sql.Dataset idExplodeCommunity = spark.sql(query); + + idExplodeCommunity.show(false); + Assertions.assertEquals(3, idExplodeCommunity.count()); + + Assertions + .assertEquals( + 3, idExplodeCommunity.filter("provenance = 'community:datasource'").count()); + } } diff --git a/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/CommunityConfigurationFactoryTest.java b/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/CommunityConfigurationFactoryTest.java index a04395337..3aae9ebee 100644 --- a/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/CommunityConfigurationFactoryTest.java +++ b/dhp-workflows/dhp-bulktag/src/test/java/eu/dnetlib/dhp/CommunityConfigurationFactoryTest.java @@ -1,155 +1,166 @@ + package eu.dnetlib.dhp; -import com.google.gson.Gson; -import eu.dnetlib.dhp.community.CommunityConfiguration; -import eu.dnetlib.dhp.community.CommunityConfigurationFactory; -import eu.dnetlib.dhp.community.Constraint; -import 
eu.dnetlib.dhp.community.SelectionConstraints; -import eu.dnetlib.dhp.selectioncriteria.VerbResolver; import java.io.IOException; import java.lang.reflect.InvocationTargetException; import java.util.*; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.dom4j.DocumentException; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import com.google.gson.Gson; + +import eu.dnetlib.dhp.community.CommunityConfiguration; +import eu.dnetlib.dhp.community.CommunityConfigurationFactory; +import eu.dnetlib.dhp.community.Constraint; +import eu.dnetlib.dhp.community.SelectionConstraints; +import eu.dnetlib.dhp.selectioncriteria.VerbResolver; + /** Created by miriam on 03/08/2018. */ public class CommunityConfigurationFactoryTest { - private final VerbResolver resolver = new VerbResolver(); + private final VerbResolver resolver = new VerbResolver(); - @Test - public void parseTest() throws DocumentException, IOException { - String xml = - IOUtils.toString( - getClass() - .getResourceAsStream( - "/eu/dnetlib/dhp/communityconfiguration/community_configuration.xml")); - final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml); - Assertions.assertEquals(5, cc.size()); - cc.getCommunityList() - .forEach(c -> Assertions.assertTrue(StringUtils.isNoneBlank(c.getId()))); - } + @Test + public void parseTest() throws DocumentException, IOException { + String xml = IOUtils + .toString( + getClass() + .getResourceAsStream( + "/eu/dnetlib/dhp/communityconfiguration/community_configuration.xml")); + final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml); + Assertions.assertEquals(5, cc.size()); + cc + .getCommunityList() + .forEach(c -> Assertions.assertTrue(StringUtils.isNoneBlank(c.getId()))); + } - @Test - public void applyVerb() - throws InvocationTargetException, IllegalAccessException, NoSuchMethodException, - InstantiationException { - Constraint sc = new Constraint(); - sc.setVerb("not_contains"); - sc.setField("contributor"); - sc.setValue("DARIAH"); - sc.setSelection(resolver.getSelectionCriteria(sc.getVerb(), sc.getValue())); - String metadata = "This work has been partially supported by DARIAH-EU infrastructure"; - Assertions.assertFalse(sc.verifyCriteria(metadata)); - } + @Test + public void applyVerb() + throws InvocationTargetException, IllegalAccessException, NoSuchMethodException, + InstantiationException { + Constraint sc = new Constraint(); + sc.setVerb("not_contains"); + sc.setField("contributor"); + sc.setValue("DARIAH"); + sc.setSelection(resolver.getSelectionCriteria(sc.getVerb(), sc.getValue())); + String metadata = "This work has been partially supported by DARIAH-EU infrastructure"; + Assertions.assertFalse(sc.verifyCriteria(metadata)); + } - @Test - public void loadSelCriteriaTest() throws DocumentException, IOException { - String xml = - IOUtils.toString( - getClass() - .getResourceAsStream( - "/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.xml")); - final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml); - Map> param = new HashMap<>(); - param.put("author", new ArrayList<>(Collections.singletonList("Pippo Pippi"))); - param.put( - "description", - new ArrayList<>( - Collections.singletonList( - "This work has been partially supported by DARIAH-EU infrastructure"))); - param.put( - "contributor", - new ArrayList<>( - Collections.singletonList( - "Pallino ha aiutato a scrivere il paper. 
Pallino lavora per DARIAH"))); - List comm = - cc.getCommunityForDatasource( - "openaire____::1cfdb2e14977f31a98e0118283401f32", param); - Assertions.assertEquals(1, comm.size()); - Assertions.assertEquals("dariah", comm.get(0)); - } + @Test + public void loadSelCriteriaTest() throws DocumentException, IOException { + String xml = IOUtils + .toString( + getClass() + .getResourceAsStream( + "/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.xml")); + final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml); + Map> param = new HashMap<>(); + param.put("author", new ArrayList<>(Collections.singletonList("Pippo Pippi"))); + param + .put( + "description", + new ArrayList<>( + Collections + .singletonList( + "This work has been partially supported by DARIAH-EU infrastructure"))); + param + .put( + "contributor", + new ArrayList<>( + Collections + .singletonList( + "Pallino ha aiutato a scrivere il paper. Pallino lavora per DARIAH"))); + List comm = cc + .getCommunityForDatasource( + "openaire____::1cfdb2e14977f31a98e0118283401f32", param); + Assertions.assertEquals(1, comm.size()); + Assertions.assertEquals("dariah", comm.get(0)); + } - @Test - public void test4() throws DocumentException, IOException { - final CommunityConfiguration cc = - CommunityConfigurationFactory.fromJson( - IOUtils.toString( - getClass() - .getResourceAsStream( - "/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.json"))); - cc.toString(); - } + @Test + public void test4() throws DocumentException, IOException { + final CommunityConfiguration cc = CommunityConfigurationFactory + .fromJson( + IOUtils + .toString( + getClass() + .getResourceAsStream( + "/eu/dnetlib/dhp/communityconfiguration/community_configuration_selcrit.json"))); + cc.toString(); + } - @Test - public void test5() throws IOException, DocumentException { + @Test + public void test5() throws IOException, DocumentException { - // final CommunityConfiguration cc = - // CommunityConfigurationFactory.newInstance(IOUtils.toString(getClass().getResourceAsStream("test.xml"))); - final CommunityConfiguration cc = - CommunityConfigurationFactory.fromJson( - IOUtils.toString( - getClass() - .getResourceAsStream( - "/eu/dnetlib/dhp/communityconfiguration/community_configuration.json"))); + // final CommunityConfiguration cc = + // CommunityConfigurationFactory.newInstance(IOUtils.toString(getClass().getResourceAsStream("test.xml"))); + final CommunityConfiguration cc = CommunityConfigurationFactory + .fromJson( + IOUtils + .toString( + getClass() + .getResourceAsStream( + "/eu/dnetlib/dhp/communityconfiguration/community_configuration.json"))); - System.out.println(cc.toJson()); - } + System.out.println(cc.toJson()); + } - @Test - public void test6() { - String json = - "{\"criteria\":[{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}]}"; + @Test + public void test6() { + String json = "{\"criteria\":[{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}]}"; - String step1 = "{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}"; + String step1 = "{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}"; - Constraint c = new Gson().fromJson(step1, Constraint.class); - // - // String step2 = - // "{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}"; - // - // ConstraintEncapsulator ce = new - // Gson().fromJson(step2,ConstraintEncapsulator.class); - // - // 
- // String step3 = - // "{\"ce\":{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}}"; - // - // Constraints cons = new Gson().fromJson(step3,Constraints.class); - // - // String step4 = - // "{\"criteria\":[{\"ce\":{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}}]}"; - // - // ConstraintsList cl = new Gson().fromJson(step4,ConstraintsList.class); - // - // String step5 = - // "{\"cl\":{\"criteria\":[{\"ce\":{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}}]}}"; - SelectionConstraints sl = new Gson().fromJson(json, SelectionConstraints.class); - } + Constraint c = new Gson().fromJson(step1, Constraint.class); + // + // String step2 = + // "{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}"; + // + // ConstraintEncapsulator ce = new + // Gson().fromJson(step2,ConstraintEncapsulator.class); + // + // + // String step3 = + // "{\"ce\":{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}}"; + // + // Constraints cons = new Gson().fromJson(step3,Constraints.class); + // + // String step4 = + // "{\"criteria\":[{\"ce\":{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}}]}"; + // + // ConstraintsList cl = new Gson().fromJson(step4,ConstraintsList.class); + // + // String step5 = + // "{\"cl\":{\"criteria\":[{\"ce\":{\"constraint\":[{\"verb\":\"contains\",\"field\":\"contributor\",\"value\":\"DARIAH\"}]}}]}}"; + SelectionConstraints sl = new Gson().fromJson(json, SelectionConstraints.class); + } - @Test - public void test7() throws IOException { - final CommunityConfiguration cc = - CommunityConfigurationFactory.fromJson( - IOUtils.toString( - getClass() - .getResourceAsStream( - "/eu/dnetlib/dhp/communityconfiguration/tagging_conf.json"))); + @Test + public void test7() throws IOException { + final CommunityConfiguration cc = CommunityConfigurationFactory + .fromJson( + IOUtils + .toString( + getClass() + .getResourceAsStream( + "/eu/dnetlib/dhp/communityconfiguration/tagging_conf.json"))); - System.out.println(cc.toJson()); - } + System.out.println(cc.toJson()); + } - @Test - public void temporaneo() throws Exception { - String xml = - IOUtils.toString( - getClass() - .getResourceAsStream( - "/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml")); - final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml); - System.out.println(cc.toJson()); - } + @Test + public void temporaneo() throws Exception { + String xml = IOUtils + .toString( + getClass() + .getResourceAsStream( + "/eu/dnetlib/dhp/communityconfiguration/tagging_conf.xml")); + final CommunityConfiguration cc = CommunityConfigurationFactory.newInstance(xml); + System.out.println(cc.toJson()); + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java index 0c4a77be9..2120da080 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java @@ -1,18 +1,12 @@ + package eu.dnetlib.dhp.oa.dedup; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import 
eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.pace.config.DedupConfig; import java.io.IOException; import java.io.Serializable; import java.io.StringReader; import java.util.ArrayList; import java.util.List; + import org.apache.spark.SparkConf; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.SaveMode; @@ -22,72 +16,82 @@ import org.dom4j.DocumentException; import org.dom4j.Element; import org.dom4j.io.SAXReader; +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.config.DedupConfig; + abstract class AbstractSparkAction implements Serializable { - protected static final ObjectMapper OBJECT_MAPPER = - new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - public ArgumentApplicationParser parser; // parameters for the spark action - public SparkSession spark; // the spark session + public ArgumentApplicationParser parser; // parameters for the spark action + public SparkSession spark; // the spark session - public AbstractSparkAction(ArgumentApplicationParser parser, SparkSession spark) { + public AbstractSparkAction(ArgumentApplicationParser parser, SparkSession spark) { - this.parser = parser; - this.spark = spark; - } + this.parser = parser; + this.spark = spark; + } - public List getConfigurations(ISLookUpService isLookUpService, String orchestrator) - throws ISLookUpException, DocumentException, IOException { + public List getConfigurations(ISLookUpService isLookUpService, String orchestrator) + throws ISLookUpException, DocumentException, IOException { - final String xquery = - String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator); + final String xquery = String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator); - String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery); + String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery); - final Document doc = new SAXReader().read(new StringReader(orchestratorProfile)); + final Document doc = new SAXReader().read(new StringReader(orchestratorProfile)); - final String actionSetId = doc.valueOf("//DEDUPLICATION/ACTION_SET/@id"); + final String actionSetId = doc.valueOf("//DEDUPLICATION/ACTION_SET/@id"); - final List configurations = new ArrayList<>(); + final List configurations = new ArrayList<>(); - for (final Object o : doc.selectNodes("//SCAN_SEQUENCE/SCAN")) { - configurations.add(loadConfig(isLookUpService, actionSetId, o)); - } + for (final Object o : doc.selectNodes("//SCAN_SEQUENCE/SCAN")) { + configurations.add(loadConfig(isLookUpService, actionSetId, o)); + } - return configurations; - } + return configurations; + } - private DedupConfig loadConfig( - final ISLookUpService isLookUpService, final String actionSetId, final Object o) - throws ISLookUpException, IOException { - final Element s = (Element) o; - final String 
configProfileId = s.attributeValue("id"); - final String conf = - isLookUpService.getResourceProfileByQuery( - String.format( - "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", - configProfileId)); + private DedupConfig loadConfig( + final ISLookUpService isLookUpService, final String actionSetId, final Object o) + throws ISLookUpException, IOException { + final Element s = (Element) o; + final String configProfileId = s.attributeValue("id"); + final String conf = isLookUpService + .getResourceProfileByQuery( + String + .format( + "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", + configProfileId)); - DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class); - dedupConfig.getPace().initModel(); - dedupConfig.getPace().initTranslationMap(); - dedupConfig.getWf().setConfigurationId(actionSetId); + DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class); + dedupConfig.getPace().initModel(); + dedupConfig.getPace().initTranslationMap(); + dedupConfig.getWf().setConfigurationId(actionSetId); - return dedupConfig; - } + return dedupConfig; + } - abstract void run(ISLookUpService isLookUpService) - throws DocumentException, IOException, ISLookUpException; + abstract void run(ISLookUpService isLookUpService) + throws DocumentException, IOException, ISLookUpException; - protected static SparkSession getSparkSession(SparkConf conf) { - return SparkSession.builder().config(conf).getOrCreate(); - } + protected static SparkSession getSparkSession(SparkConf conf) { + return SparkSession.builder().config(conf).getOrCreate(); + } - protected static void save(Dataset dataset, String outPath, SaveMode mode) { - dataset.write().option("compression", "gzip").mode(mode).json(outPath); - } + protected static void save(Dataset dataset, String outPath, SaveMode mode) { + dataset.write().option("compression", "gzip").mode(mode).json(outPath); + } - protected static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + protected static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java index 098d024f4..70fb2cc5b 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DatePicker.java @@ -1,113 +1,121 @@ + package eu.dnetlib.dhp.oa.dedup; import static java.util.Collections.reverseOrder; import static java.util.Map.Entry.comparingByValue; import static java.util.stream.Collectors.toMap; + import static org.apache.commons.lang.StringUtils.endsWith; import static org.apache.commons.lang.StringUtils.substringBefore; -import eu.dnetlib.dhp.schema.oaf.Field; import java.time.Year; import java.util.*; import java.util.stream.Collectors; + import org.apache.commons.lang.StringUtils; +import eu.dnetlib.dhp.schema.oaf.Field; + public class DatePicker { - private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}"; - private static final String DATE_DEFAULT_SUFFIX = "01-01"; - private static final int YEAR_LB = 1300; - private static final int YEAR_UB = Year.now().getValue() + 5; + private static final String 
DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}"; + private static final String DATE_DEFAULT_SUFFIX = "01-01"; + private static final int YEAR_LB = 1300; + private static final int YEAR_UB = Year.now().getValue() + 5; - public static Field pick(final Collection dateofacceptance) { + public static Field pick(final Collection dateofacceptance) { - final Map frequencies = - dateofacceptance - .parallelStream() - .filter(StringUtils::isNotBlank) - .collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum)); + final Map frequencies = dateofacceptance + .parallelStream() + .filter(StringUtils::isNotBlank) + .collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum)); - if (frequencies.isEmpty()) { - return new Field<>(); - } + if (frequencies.isEmpty()) { + return new Field<>(); + } - final Field date = new Field<>(); - date.setValue(frequencies.keySet().iterator().next()); + final Field date = new Field<>(); + date.setValue(frequencies.keySet().iterator().next()); - // let's sort this map by values first, filtering out invalid dates - final Map sorted = - frequencies.entrySet().stream() - .filter(d -> StringUtils.isNotBlank(d.getKey())) - .filter(d -> d.getKey().matches(DATE_PATTERN)) - .filter(d -> inRange(d.getKey())) - .sorted(reverseOrder(comparingByValue())) - .collect( - toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new)); + // let's sort this map by values first, filtering out invalid dates + final Map sorted = frequencies + .entrySet() + .stream() + .filter(d -> StringUtils.isNotBlank(d.getKey())) + .filter(d -> d.getKey().matches(DATE_PATTERN)) + .filter(d -> inRange(d.getKey())) + .sorted(reverseOrder(comparingByValue())) + .collect( + toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new)); - // shortcut - if (sorted.size() == 0) { - return date; - } + // shortcut + if (sorted.size() == 0) { + return date; + } - // voting method (1/3 + 1) wins - if (sorted.size() >= 3) { - final int acceptThreshold = (sorted.size() / 3) + 1; - final List accepted = - sorted.entrySet().stream() - .filter(e -> e.getValue() >= acceptThreshold) - .map(e -> e.getKey()) - .collect(Collectors.toList()); + // voting method (1/3 + 1) wins + if (sorted.size() >= 3) { + final int acceptThreshold = (sorted.size() / 3) + 1; + final List accepted = sorted + .entrySet() + .stream() + .filter(e -> e.getValue() >= acceptThreshold) + .map(e -> e.getKey()) + .collect(Collectors.toList()); - // cannot find strong majority - if (accepted.isEmpty()) { - final int max = sorted.values().iterator().next(); - Optional first = - sorted.entrySet().stream() - .filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) - .map(Map.Entry::getKey) - .findFirst(); - if (first.isPresent()) { - date.setValue(first.get()); - return date; - } + // cannot find strong majority + if (accepted.isEmpty()) { + final int max = sorted.values().iterator().next(); + Optional first = sorted + .entrySet() + .stream() + .filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) + .map(Map.Entry::getKey) + .findFirst(); + if (first.isPresent()) { + date.setValue(first.get()); + return date; + } - date.setValue(sorted.keySet().iterator().next()); - return date; - } + date.setValue(sorted.keySet().iterator().next()); + return date; + } - if (accepted.size() == 1) { - date.setValue(accepted.get(0)); - return date; - } else { - final Optional first = - accepted.stream().filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX)).findFirst(); - if (first.isPresent()) { 
- date.setValue(first.get()); - return date; - } + if (accepted.size() == 1) { + date.setValue(accepted.get(0)); + return date; + } else { + final Optional first = accepted + .stream() + .filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX)) + .findFirst(); + if (first.isPresent()) { + date.setValue(first.get()); + return date; + } - return date; - } + return date; + } - // 1st non YYYY-01-01 is returned - } else { - if (sorted.size() == 2) { - for (Map.Entry e : sorted.entrySet()) { - if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) { - date.setValue(e.getKey()); - return date; - } - } - } + // 1st non YYYY-01-01 is returned + } else { + if (sorted.size() == 2) { + for (Map.Entry e : sorted.entrySet()) { + if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) { + date.setValue(e.getKey()); + return date; + } + } + } - // none of the dates seems good enough, return the 1st one - date.setValue(sorted.keySet().iterator().next()); - return date; - } - } + // none of the dates seems good enough, return the 1st one + date.setValue(sorted.keySet().iterator().next()); + return date; + } + } - private static boolean inRange(final String date) { - final int year = Integer.parseInt(substringBefore(date, "-")); - return year >= YEAR_LB && year <= YEAR_UB; - } + private static boolean inRange(final String date) { + final int year = Integer.parseInt(substringBefore(date, "-")); + return year >= YEAR_LB && year <= YEAR_UB; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java index 5a806c0a0..fa06424d7 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java @@ -1,12 +1,9 @@ + package eu.dnetlib.dhp.oa.dedup; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Lists; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.Collection; import java.util.Iterator; + import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapGroupsFunction; import org.apache.spark.sql.Dataset; @@ -14,92 +11,96 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; + +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; public class DedupRecordFactory { - private static final Logger log = LoggerFactory.getLogger(DedupRecordFactory.class); + private static final Logger log = LoggerFactory.getLogger(DedupRecordFactory.class); - protected static final ObjectMapper OBJECT_MAPPER = - new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - public static Dataset createDedupRecord( - final SparkSession spark, - final DataInfo dataInfo, - final String mergeRelsInputPath, - final String entitiesInputPath, - final Class clazz) { + public static Dataset createDedupRecord( + final SparkSession 
spark, + final DataInfo dataInfo, + final String mergeRelsInputPath, + final String entitiesInputPath, + final Class clazz) { - long ts = System.currentTimeMillis(); + long ts = System.currentTimeMillis(); - // - Dataset> entities = - spark - .read() - .textFile(entitiesInputPath) - .map( - (MapFunction>) - it -> { - T entity = OBJECT_MAPPER.readValue(it, clazz); - return new Tuple2<>(entity.getId(), entity); - }, - Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + // + Dataset> entities = spark + .read() + .textFile(entitiesInputPath) + .map( + (MapFunction>) it -> { + T entity = OBJECT_MAPPER.readValue(it, clazz); + return new Tuple2<>(entity.getId(), entity); + }, + Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); - // : source is the dedup_id, target is the id of the mergedIn - Dataset> mergeRels = - spark - .read() - .load(mergeRelsInputPath) - .as(Encoders.bean(Relation.class)) - .where("relClass == 'merges'") - .map( - (MapFunction>) - r -> new Tuple2<>(r.getSource(), r.getTarget()), - Encoders.tuple(Encoders.STRING(), Encoders.STRING())); + // : source is the dedup_id, target is the id of the mergedIn + Dataset> mergeRels = spark + .read() + .load(mergeRelsInputPath) + .as(Encoders.bean(Relation.class)) + .where("relClass == 'merges'") + .map( + (MapFunction>) r -> new Tuple2<>(r.getSource(), r.getTarget()), + Encoders.tuple(Encoders.STRING(), Encoders.STRING())); - return mergeRels - .joinWith(entities, mergeRels.col("_2").equalTo(entities.col("_1")), "inner") - .map( - (MapFunction, Tuple2>, Tuple2>) - value -> new Tuple2<>(value._1()._1(), value._2()._2()), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))) - .groupByKey( - (MapFunction, String>) entity -> entity._1(), Encoders.STRING()) - .mapGroups( - (MapGroupsFunction, T>) - (key, values) -> entityMerger(key, values, ts, dataInfo), - Encoders.bean(clazz)); - } + return mergeRels + .joinWith(entities, mergeRels.col("_2").equalTo(entities.col("_1")), "inner") + .map( + (MapFunction, Tuple2>, Tuple2>) value -> new Tuple2<>( + value._1()._1(), value._2()._2()), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))) + .groupByKey( + (MapFunction, String>) entity -> entity._1(), Encoders.STRING()) + .mapGroups( + (MapGroupsFunction, T>) (key, + values) -> entityMerger(key, values, ts, dataInfo), + Encoders.bean(clazz)); + } - private static T entityMerger( - String id, Iterator> entities, long ts, DataInfo dataInfo) { + private static T entityMerger( + String id, Iterator> entities, long ts, DataInfo dataInfo) { - T entity = entities.next()._2(); + T entity = entities.next()._2(); - final Collection dates = Lists.newArrayList(); - entities.forEachRemaining( - t -> { - T duplicate = t._2(); - entity.mergeFrom(duplicate); - if (ModelSupport.isSubClass(duplicate, Result.class)) { - Result r1 = (Result) duplicate; - Result er = (Result) entity; - er.setAuthor(DedupUtility.mergeAuthor(er.getAuthor(), r1.getAuthor())); + final Collection dates = Lists.newArrayList(); + entities + .forEachRemaining( + t -> { + T duplicate = t._2(); + entity.mergeFrom(duplicate); + if (ModelSupport.isSubClass(duplicate, Result.class)) { + Result r1 = (Result) duplicate; + Result er = (Result) entity; + er.setAuthor(DedupUtility.mergeAuthor(er.getAuthor(), r1.getAuthor())); - if (r1.getDateofacceptance() != null) { - dates.add(r1.getDateofacceptance().getValue()); - } - } - }); + if (r1.getDateofacceptance() != null) { + dates.add(r1.getDateofacceptance().getValue()); + } + } + }); - if (ModelSupport.isSubClass(entity, 
Result.class)) { - ((Result) entity).setDateofacceptance(DatePicker.pick(dates)); - } + if (ModelSupport.isSubClass(entity, Result.class)) { + ((Result) entity).setDateofacceptance(DatePicker.pick(dates)); + } - entity.setId(id); - entity.setLastupdatetimestamp(ts); - entity.setDataInfo(dataInfo); + entity.setId(id); + entity.setLastupdatetimestamp(ts); + entity.setDataInfo(dataInfo); - return entity; - } + return entity; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java index f239e072f..4f797f7f7 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupUtility.java @@ -1,7 +1,24 @@ + package eu.dnetlib.dhp.oa.dedup; +import java.io.StringReader; +import java.security.MessageDigest; +import java.text.Normalizer; +import java.util.*; +import java.util.stream.Collectors; + +import org.apache.commons.codec.binary.Hex; +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.SparkContext; +import org.apache.spark.util.LongAccumulator; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Element; +import org.dom4j.io.SAXReader; + import com.google.common.collect.Sets; import com.wcohen.ss.JaroWinkler; + import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.utils.ISLookupClientFactory; @@ -11,230 +28,222 @@ import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.Person; -import java.io.StringReader; -import java.security.MessageDigest; -import java.text.Normalizer; -import java.util.*; -import java.util.stream.Collectors; -import org.apache.commons.codec.binary.Hex; -import org.apache.commons.lang3.StringUtils; -import org.apache.spark.SparkContext; -import org.apache.spark.util.LongAccumulator; -import org.dom4j.Document; -import org.dom4j.DocumentException; -import org.dom4j.Element; -import org.dom4j.io.SAXReader; import scala.Tuple2; public class DedupUtility { - private static final Double THRESHOLD = 0.95; + private static final Double THRESHOLD = 0.95; - public static Map constructAccumulator( - final DedupConfig dedupConf, final SparkContext context) { + public static Map constructAccumulator( + final DedupConfig dedupConf, final SparkContext context) { - Map accumulators = new HashMap<>(); + Map accumulators = new HashMap<>(); - String acc1 = - String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); - accumulators.put(acc1, context.longAccumulator(acc1)); - String acc2 = - String.format( - "%s::%s", - dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()); - accumulators.put(acc2, context.longAccumulator(acc2)); - String acc3 = - String.format( - "%s::%s", - dedupConf.getWf().getEntityType(), - String.format( - "Skipped records for count(%s) >= %s", - dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())); - accumulators.put(acc3, context.longAccumulator(acc3)); - String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"); - accumulators.put(acc4, context.longAccumulator(acc4)); - String acc5 = - String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); - 
accumulators.put(acc5, context.longAccumulator(acc5)); - String acc6 = - String.format( - "%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()); - accumulators.put(acc6, context.longAccumulator(acc6)); + String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); + accumulators.put(acc1, context.longAccumulator(acc1)); + String acc2 = String + .format( + "%s::%s", + dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()); + accumulators.put(acc2, context.longAccumulator(acc2)); + String acc3 = String + .format( + "%s::%s", + dedupConf.getWf().getEntityType(), + String + .format( + "Skipped records for count(%s) >= %s", + dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())); + accumulators.put(acc3, context.longAccumulator(acc3)); + String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"); + accumulators.put(acc4, context.longAccumulator(acc4)); + String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); + accumulators.put(acc5, context.longAccumulator(acc5)); + String acc6 = String + .format( + "%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()); + accumulators.put(acc6, context.longAccumulator(acc6)); - return accumulators; - } + return accumulators; + } - static Set getGroupingKeys(DedupConfig conf, MapDocument doc) { - return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); - } + static Set getGroupingKeys(DedupConfig conf, MapDocument doc) { + return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); + } - public static String md5(final String s) { - try { - final MessageDigest md = MessageDigest.getInstance("MD5"); - md.update(s.getBytes("UTF-8")); - return new String(Hex.encodeHex(md.digest())); - } catch (final Exception e) { - System.err.println("Error creating id"); - return null; - } - } + public static String md5(final String s) { + try { + final MessageDigest md = MessageDigest.getInstance("MD5"); + md.update(s.getBytes("UTF-8")); + return new String(Hex.encodeHex(md.digest())); + } catch (final Exception e) { + System.err.println("Error creating id"); + return null; + } + } - public static List mergeAuthor(final List a, final List b) { - int pa = countAuthorsPids(a); - int pb = countAuthorsPids(b); - List base, enrich; - int sa = authorsSize(a); - int sb = authorsSize(b); + public static List mergeAuthor(final List a, final List b) { + int pa = countAuthorsPids(a); + int pb = countAuthorsPids(b); + List base, enrich; + int sa = authorsSize(a); + int sb = authorsSize(b); - if (pa == pb) { - base = sa > sb ? a : b; - enrich = sa > sb ? b : a; - } else { - base = pa > pb ? a : b; - enrich = pa > pb ? b : a; - } - enrichPidFromList(base, enrich); - return base; - } + if (pa == pb) { + base = sa > sb ? a : b; + enrich = sa > sb ? b : a; + } else { + base = pa > pb ? a : b; + enrich = pa > pb ? 
b : a; + } + enrichPidFromList(base, enrich); + return base; + } - private static void enrichPidFromList(List base, List enrich) { - if (base == null || enrich == null) return; - final Map basePidAuthorMap = - base.stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap(a -> a.getPid().stream().map(p -> new Tuple2<>(p.toComparableString(), a))) - .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); + private static void enrichPidFromList(List base, List enrich) { + if (base == null || enrich == null) + return; + final Map basePidAuthorMap = base + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap(a -> a.getPid().stream().map(p -> new Tuple2<>(p.toComparableString(), a))) + .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); - final List> pidToEnrich = - enrich.stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap( - a -> - a.getPid().stream() - .filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())) - .map(p -> new Tuple2<>(p, a))) - .collect(Collectors.toList()); + final List> pidToEnrich = enrich + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap( + a -> a + .getPid() + .stream() + .filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())) + .map(p -> new Tuple2<>(p, a))) + .collect(Collectors.toList()); - pidToEnrich.forEach( - a -> { - Optional> simAuhtor = - base.stream() - .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) - .max(Comparator.comparing(Tuple2::_1)); - if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) { - Author r = simAuhtor.get()._2(); - r.getPid().add(a._1()); - } - }); - } + pidToEnrich + .forEach( + a -> { + Optional> simAuhtor = base + .stream() + .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) + .max(Comparator.comparing(Tuple2::_1)); + if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) { + Author r = simAuhtor.get()._2(); + r.getPid().add(a._1()); + } + }); + } - public static String createDedupRecordPath( - final String basePath, final String actionSetId, final String entityType) { - return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType); - } + public static String createDedupRecordPath( + final String basePath, final String actionSetId, final String entityType) { + return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType); + } - public static String createEntityPath(final String basePath, final String entityType) { - return String.format("%s/%s", basePath, entityType); - } + public static String createEntityPath(final String basePath, final String entityType) { + return String.format("%s/%s", basePath, entityType); + } - public static String createSimRelPath( - final String basePath, final String actionSetId, final String entityType) { - return String.format("%s/%s/%s_simrel", basePath, actionSetId, entityType); - } + public static String createSimRelPath( + final String basePath, final String actionSetId, final String entityType) { + return String.format("%s/%s/%s_simrel", basePath, actionSetId, entityType); + } - public static String createMergeRelPath( - final String basePath, final String actionSetId, final String entityType) { - return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType); - } + public static String createMergeRelPath( + final String basePath, final String actionSetId, final String entityType) { + return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType); + } - private static 
Double sim(Author a, Author b) { + private static Double sim(Author a, Author b) { - final Person pa = parse(a); - final Person pb = parse(b); + final Person pa = parse(a); + final Person pb = parse(b); - if (pa.isAccurate() & pb.isAccurate()) { - return new JaroWinkler() - .score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())); - } else { - return new JaroWinkler() - .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); - } - } + if (pa.isAccurate() & pb.isAccurate()) { + return new JaroWinkler() + .score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())); + } else { + return new JaroWinkler() + .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); + } + } - private static String normalize(final String s) { - return nfd(s) - .toLowerCase() - // do not compact the regexes in a single expression, would cause StackOverflowError - // in case - // of large input strings - .replaceAll("(\\W)+", " ") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") - .trim(); - } + private static String normalize(final String s) { + return nfd(s) + .toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError + // in case + // of large input strings + .replaceAll("(\\W)+", " ") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } - private static String nfd(final String s) { - return Normalizer.normalize(s, Normalizer.Form.NFD); - } + private static String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } - private static Person parse(Author author) { - if (StringUtils.isNotBlank(author.getSurname())) { - return new Person(author.getSurname() + ", " + author.getName(), false); - } else { - return new Person(author.getFullname(), false); - } - } + private static Person parse(Author author) { + if (StringUtils.isNotBlank(author.getSurname())) { + return new Person(author.getSurname() + ", " + author.getName(), false); + } else { + return new Person(author.getFullname(), false); + } + } - private static int countAuthorsPids(List authors) { - if (authors == null) return 0; + private static int countAuthorsPids(List authors) { + if (authors == null) + return 0; - return (int) authors.stream().filter(DedupUtility::hasPid).count(); - } + return (int) authors.stream().filter(DedupUtility::hasPid).count(); + } - private static int authorsSize(List authors) { - if (authors == null) return 0; - return authors.size(); - } + private static int authorsSize(List authors) { + if (authors == null) + return 0; + return authors.size(); + } - private static boolean hasPid(Author a) { - if (a == null || a.getPid() == null || a.getPid().size() == 0) return false; - return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); - } + private static boolean hasPid(Author a) { + if (a == null || a.getPid() == null || a.getPid().size() == 0) + return false; + return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); + } - public static List getConfigurations(String isLookUpUrl, String orchestrator) - throws ISLookUpException, DocumentException { - final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl); + public static List getConfigurations(String 
isLookUpUrl, String orchestrator) + throws ISLookUpException, DocumentException { + final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl); - final String xquery = - String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator); + final String xquery = String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator); - String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery); + String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery); - final Document doc = new SAXReader().read(new StringReader(orchestratorProfile)); + final Document doc = new SAXReader().read(new StringReader(orchestratorProfile)); - final String actionSetId = doc.valueOf("//DEDUPLICATION/ACTION_SET/@id"); - final List configurations = new ArrayList<>(); + final String actionSetId = doc.valueOf("//DEDUPLICATION/ACTION_SET/@id"); + final List configurations = new ArrayList<>(); - for (final Object o : doc.selectNodes("//SCAN_SEQUENCE/SCAN")) { - configurations.add(loadConfig(isLookUpService, actionSetId, o)); - } + for (final Object o : doc.selectNodes("//SCAN_SEQUENCE/SCAN")) { + configurations.add(loadConfig(isLookUpService, actionSetId, o)); + } - return configurations; - } + return configurations; + } - private static DedupConfig loadConfig( - final ISLookUpService isLookUpService, final String actionSetId, final Object o) - throws ISLookUpException { - final Element s = (Element) o; - final String configProfileId = s.attributeValue("id"); - final String conf = - isLookUpService.getResourceProfileByQuery( - String.format( - "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", - configProfileId)); - final DedupConfig dedupConfig = DedupConfig.load(conf); - dedupConfig.getWf().setConfigurationId(actionSetId); - return dedupConfig; - } + private static DedupConfig loadConfig( + final ISLookUpService isLookUpService, final String actionSetId, final Object o) + throws ISLookUpException { + final Element s = (Element) o; + final String configProfileId = s.attributeValue("id"); + final String conf = isLookUpService + .getResourceProfileByQuery( + String + .format( + "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", + configProfileId)); + final DedupConfig dedupConfig = DedupConfig.load(conf); + dedupConfig.getWf().setConfigurationId(actionSetId); + return dedupConfig; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java index a7515d575..c72940deb 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/Deduper.java @@ -1,54 +1,57 @@ + package eu.dnetlib.dhp.oa.dedup; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.util.LongAccumulator; + import eu.dnetlib.dhp.oa.dedup.model.Block; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.BlockProcessor; -import java.util.Map; -import java.util.stream.Collectors; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.util.LongAccumulator; import 
scala.Serializable; import scala.Tuple2; public class Deduper implements Serializable { - public static JavaPairRDD computeRelations( - JavaSparkContext context, JavaPairRDD blocks, DedupConfig config) { - Map accumulators = - DedupUtility.constructAccumulator(config, context.sc()); + public static JavaPairRDD computeRelations( + JavaSparkContext context, JavaPairRDD blocks, DedupConfig config) { + Map accumulators = DedupUtility.constructAccumulator(config, context.sc()); - return blocks - .flatMapToPair( - it -> { - final SparkReporter reporter = new SparkReporter(accumulators); - new BlockProcessor(config) - .processSortedBlock(it._1(), it._2().getDocuments(), reporter); - return reporter.getRelations().iterator(); - }) - .mapToPair(it -> new Tuple2<>(it._1() + it._2(), it)) - .reduceByKey((a, b) -> a) - .mapToPair(Tuple2::_2); - } + return blocks + .flatMapToPair( + it -> { + final SparkReporter reporter = new SparkReporter(accumulators); + new BlockProcessor(config) + .processSortedBlock(it._1(), it._2().getDocuments(), reporter); + return reporter.getRelations().iterator(); + }) + .mapToPair(it -> new Tuple2<>(it._1() + it._2(), it)) + .reduceByKey((a, b) -> a) + .mapToPair(Tuple2::_2); + } - public static JavaPairRDD createSortedBlocks( - JavaPairRDD mapDocs, DedupConfig config) { - final String of = config.getWf().getOrderField(); - final int maxQueueSize = config.getWf().getGroupMaxSize(); + public static JavaPairRDD createSortedBlocks( + JavaPairRDD mapDocs, DedupConfig config) { + final String of = config.getWf().getOrderField(); + final int maxQueueSize = config.getWf().getGroupMaxSize(); - return mapDocs - // the reduce is just to be sure that we haven't document with same id - .reduceByKey((a, b) -> a) - .map(Tuple2::_2) - // Clustering: from to List - .flatMap( - a -> - DedupUtility.getGroupingKeys(config, a).stream() - .map(it -> Block.from(it, a)) - .collect(Collectors.toList()) - .iterator()) - .mapToPair(block -> new Tuple2<>(block.getKey(), block)) - .reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize)); - } + return mapDocs + // the reduce is just to be sure that we haven't document with same id + .reduceByKey((a, b) -> a) + .map(Tuple2::_2) + // Clustering: from to List + .flatMap( + a -> DedupUtility + .getGroupingKeys(config, a) + .stream() + .map(it -> Block.from(it, a)) + .collect(Collectors.toList()) + .iterator()) + .mapToPair(block -> new Tuple2<>(block.getKey(), block)) + .reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize)); + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java index b47b880e9..d870f6256 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java @@ -1,5 +1,16 @@ + package eu.dnetlib.dhp.oa.dedup; +import java.io.IOException; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.dom4j.DocumentException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.ModelSupport; @@ -10,92 +21,84 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory; 
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; -import java.io.IOException; -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.dom4j.DocumentException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class SparkCreateDedupRecord extends AbstractSparkAction { - private static final Logger log = LoggerFactory.getLogger(SparkCreateDedupRecord.class); + private static final Logger log = LoggerFactory.getLogger(SparkCreateDedupRecord.class); - public static final String ROOT_TRUST = "0.8"; - public static final String PROVENANCE_ACTION_CLASS = "sysimport:dedup"; - public static final String PROVENANCE_ACTIONS = "dnet:provenanceActions"; + public static final String ROOT_TRUST = "0.8"; + public static final String PROVENANCE_ACTION_CLASS = "sysimport:dedup"; + public static final String PROVENANCE_ACTIONS = "dnet:provenanceActions"; - public SparkCreateDedupRecord(ArgumentApplicationParser parser, SparkSession spark) { - super(parser, spark); - } + public SparkCreateDedupRecord(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } - public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateSimRels.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json"))); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateSimRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json"))); + parser.parseArgument(args); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - new SparkCreateDedupRecord(parser, getSparkSession(conf)) - .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); - } + new SparkCreateDedupRecord(parser, getSparkSession(conf)) + .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); + } - @Override - public void run(ISLookUpService isLookUpService) - throws ISLookUpException, DocumentException, IOException { + @Override + public void run(ISLookUpService isLookUpService) + throws ISLookUpException, DocumentException, IOException { - final String graphBasePath = parser.get("graphBasePath"); - final String isLookUpUrl = parser.get("isLookUpUrl"); - final String actionSetId = parser.get("actionSetId"); - final String workingPath = parser.get("workingPath"); + final String graphBasePath = parser.get("graphBasePath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + final String workingPath = parser.get("workingPath"); - log.info("graphBasePath: '{}'", graphBasePath); - log.info("isLookUpUrl: '{}'", isLookUpUrl); - log.info("actionSetId: '{}'", actionSetId); - log.info("workingPath: '{}'", workingPath); + log.info("graphBasePath: '{}'", graphBasePath); + log.info("isLookUpUrl: '{}'", isLookUpUrl); + 
log.info("actionSetId: '{}'", actionSetId); + log.info("workingPath: '{}'", workingPath); - for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { - String subEntity = dedupConf.getWf().getSubEntityValue(); - log.info("Creating deduprecords for: '{}'", subEntity); + for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { + String subEntity = dedupConf.getWf().getSubEntityValue(); + log.info("Creating deduprecords for: '{}'", subEntity); - final String outputPath = - DedupUtility.createDedupRecordPath(workingPath, actionSetId, subEntity); - removeOutputDir(spark, outputPath); + final String outputPath = DedupUtility.createDedupRecordPath(workingPath, actionSetId, subEntity); + removeOutputDir(spark, outputPath); - final String mergeRelPath = - DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity); - final String entityPath = DedupUtility.createEntityPath(graphBasePath, subEntity); + final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity); + final String entityPath = DedupUtility.createEntityPath(graphBasePath, subEntity); - final Class clazz = ModelSupport.entityTypes.get(EntityType.valueOf(subEntity)); - final DataInfo dataInfo = getDataInfo(dedupConf); - DedupRecordFactory.createDedupRecord(spark, dataInfo, mergeRelPath, entityPath, clazz) - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .json(outputPath); - } - } + final Class clazz = ModelSupport.entityTypes.get(EntityType.valueOf(subEntity)); + final DataInfo dataInfo = getDataInfo(dedupConf); + DedupRecordFactory + .createDedupRecord(spark, dataInfo, mergeRelPath, entityPath, clazz) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath); + } + } - private static DataInfo getDataInfo(DedupConfig dedupConf) { - DataInfo info = new DataInfo(); - info.setDeletedbyinference(false); - info.setInferred(true); - info.setInvisible(false); - info.setTrust(ROOT_TRUST); - info.setInferenceprovenance(dedupConf.getWf().getConfigurationId()); - Qualifier provenance = new Qualifier(); - provenance.setClassid(PROVENANCE_ACTION_CLASS); - provenance.setClassname(PROVENANCE_ACTION_CLASS); - provenance.setSchemeid(PROVENANCE_ACTIONS); - provenance.setSchemename(PROVENANCE_ACTIONS); - info.setProvenanceaction(provenance); - return info; - } + private static DataInfo getDataInfo(DedupConfig dedupConf) { + DataInfo info = new DataInfo(); + info.setDeletedbyinference(false); + info.setInferred(true); + info.setInvisible(false); + info.setTrust(ROOT_TRUST); + info.setInferenceprovenance(dedupConf.getWf().getConfigurationId()); + Qualifier provenance = new Qualifier(); + provenance.setClassid(PROVENANCE_ACTION_CLASS); + provenance.setClassname(PROVENANCE_ACTION_CLASS); + provenance.setSchemeid(PROVENANCE_ACTIONS); + provenance.setSchemename(PROVENANCE_ACTIONS); + info.setProvenanceaction(provenance); + return info; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java index 229379a53..a44650823 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java @@ -1,22 +1,11 @@ + package eu.dnetlib.dhp.oa.dedup; -import com.google.common.hash.Hashing; -import 
eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.dedup.graph.ConnectedComponent; -import eu.dnetlib.dhp.oa.dedup.graph.GraphProcessor; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.util.MapDocumentUtil; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; @@ -31,132 +20,149 @@ import org.apache.spark.sql.SparkSession; import org.dom4j.DocumentException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.google.common.hash.Hashing; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.dedup.graph.ConnectedComponent; +import eu.dnetlib.dhp.oa.dedup.graph.GraphProcessor; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.util.MapDocumentUtil; import scala.Tuple2; public class SparkCreateMergeRels extends AbstractSparkAction { - public static final String PROVENANCE_ACTION_CLASS = "sysimport:dedup"; - private static final Logger log = LoggerFactory.getLogger(SparkCreateMergeRels.class); - public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions"; + public static final String PROVENANCE_ACTION_CLASS = "sysimport:dedup"; + private static final Logger log = LoggerFactory.getLogger(SparkCreateMergeRels.class); + public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions"; - public SparkCreateMergeRels(ArgumentApplicationParser parser, SparkSession spark) { - super(parser, spark); - } + public SparkCreateMergeRels(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } - public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateSimRels.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"))); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateSimRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"))); + parser.parseArgument(args); - final String isLookUpUrl = parser.get("isLookUpUrl"); - log.info("isLookupUrl {}", isLookUpUrl); + final String isLookUpUrl = parser.get("isLookUpUrl"); + log.info("isLookupUrl {}", isLookUpUrl); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + 
conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - new SparkCreateMergeRels(parser, getSparkSession(conf)) - .run(ISLookupClientFactory.getLookUpService(isLookUpUrl)); - } + new SparkCreateMergeRels(parser, getSparkSession(conf)) + .run(ISLookupClientFactory.getLookUpService(isLookUpUrl)); + } - @Override - public void run(ISLookUpService isLookUpService) - throws ISLookUpException, DocumentException, IOException { + @Override + public void run(ISLookUpService isLookUpService) + throws ISLookUpException, DocumentException, IOException { - final String graphBasePath = parser.get("graphBasePath"); - final String workingPath = parser.get("workingPath"); - final String isLookUpUrl = parser.get("isLookUpUrl"); - final String actionSetId = parser.get("actionSetId"); + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); - log.info("graphBasePath: '{}'", graphBasePath); - log.info("isLookUpUrl: '{}'", isLookUpUrl); - log.info("actionSetId: '{}'", actionSetId); - log.info("workingPath: '{}'", workingPath); + log.info("graphBasePath: '{}'", graphBasePath); + log.info("isLookUpUrl: '{}'", isLookUpUrl); + log.info("actionSetId: '{}'", actionSetId); + log.info("workingPath: '{}'", workingPath); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { - final String subEntity = dedupConf.getWf().getSubEntityValue(); + for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { + final String subEntity = dedupConf.getWf().getSubEntityValue(); - log.info("Creating mergerels for: '{}'", subEntity); + log.info("Creating mergerels for: '{}'", subEntity); - final int maxIterations = dedupConf.getWf().getMaxIterations(); - log.info("Max iterations {}", maxIterations); + final int maxIterations = dedupConf.getWf().getMaxIterations(); + log.info("Max iterations {}", maxIterations); - final String mergeRelPath = - DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity); + final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity); - final JavaPairRDD vertexes = - sc.textFile(graphBasePath + "/" + subEntity) - .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) - .mapToPair((PairFunction) s -> new Tuple2<>(hash(s), s)); + final JavaPairRDD vertexes = sc + .textFile(graphBasePath + "/" + subEntity) + .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) + .mapToPair((PairFunction) s -> new Tuple2<>(hash(s), s)); - final RDD> edgeRdd = - spark - .read() - .load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)) - .as(Encoders.bean(Relation.class)) - .javaRDD() - .map(it -> new Edge<>(hash(it.getSource()), hash(it.getTarget()), it.getRelClass())) - .rdd(); + final RDD> edgeRdd = spark + .read() + .load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)) + .as(Encoders.bean(Relation.class)) + .javaRDD() + .map(it -> new Edge<>(hash(it.getSource()), hash(it.getTarget()), it.getRelClass())) + .rdd(); - final Dataset mergeRels = - spark.createDataset( - GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, maxIterations) - .toJavaRDD() - .filter(k -> k.getDocIds().size() > 1) - .flatMap(cc -> 
ccToMergeRel(cc, dedupConf)) - .rdd(), - Encoders.bean(Relation.class)); + final Dataset mergeRels = spark + .createDataset( + GraphProcessor + .findCCs(vertexes.rdd(), edgeRdd, maxIterations) + .toJavaRDD() + .filter(k -> k.getDocIds().size() > 1) + .flatMap(cc -> ccToMergeRel(cc, dedupConf)) + .rdd(), + Encoders.bean(Relation.class)); - mergeRels.write().mode(SaveMode.Append).parquet(mergeRelPath); - } - } + mergeRels.write().mode(SaveMode.Append).parquet(mergeRelPath); + } + } - public Iterator ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf) { - return cc.getDocIds().stream() - .flatMap( - id -> { - List tmp = new ArrayList<>(); + public Iterator ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf) { + return cc + .getDocIds() + .stream() + .flatMap( + id -> { + List tmp = new ArrayList<>(); - tmp.add(rel(cc.getCcId(), id, "merges", dedupConf)); - tmp.add(rel(id, cc.getCcId(), "isMergedIn", dedupConf)); + tmp.add(rel(cc.getCcId(), id, "merges", dedupConf)); + tmp.add(rel(id, cc.getCcId(), "isMergedIn", dedupConf)); - return tmp.stream(); - }) - .iterator(); - } + return tmp.stream(); + }) + .iterator(); + } - private Relation rel(String source, String target, String relClass, DedupConfig dedupConf) { - Relation r = new Relation(); - r.setSource(source); - r.setTarget(target); - r.setRelClass(relClass); - r.setSubRelType("dedup"); + private Relation rel(String source, String target, String relClass, DedupConfig dedupConf) { + Relation r = new Relation(); + r.setSource(source); + r.setTarget(target); + r.setRelClass(relClass); + r.setSubRelType("dedup"); - DataInfo info = new DataInfo(); - info.setDeletedbyinference(false); - info.setInferred(true); - info.setInvisible(false); - info.setInferenceprovenance(dedupConf.getWf().getConfigurationId()); - Qualifier provenanceAction = new Qualifier(); - provenanceAction.setClassid(PROVENANCE_ACTION_CLASS); - provenanceAction.setClassname(PROVENANCE_ACTION_CLASS); - provenanceAction.setSchemeid(DNET_PROVENANCE_ACTIONS); - provenanceAction.setSchemename(DNET_PROVENANCE_ACTIONS); - info.setProvenanceaction(provenanceAction); + DataInfo info = new DataInfo(); + info.setDeletedbyinference(false); + info.setInferred(true); + info.setInvisible(false); + info.setInferenceprovenance(dedupConf.getWf().getConfigurationId()); + Qualifier provenanceAction = new Qualifier(); + provenanceAction.setClassid(PROVENANCE_ACTION_CLASS); + provenanceAction.setClassname(PROVENANCE_ACTION_CLASS); + provenanceAction.setSchemeid(DNET_PROVENANCE_ACTIONS); + provenanceAction.setSchemename(DNET_PROVENANCE_ACTIONS); + info.setProvenanceaction(provenanceAction); - // TODO calculate the trust value based on the similarity score of the elements in the CC - // info.setTrust(); + // TODO calculate the trust value based on the similarity score of the elements in the CC + // info.setTrust(); - r.setDataInfo(info); - return r; - } + r.setDataInfo(info); + return r; + } - public static long hash(final String id) { - return Hashing.murmur3_128().hashString(id).asLong(); - } + public static long hash(final String id) { + return Hashing.murmur3_128().hashString(id).asLong(); + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java index c5a1d768c..2cfe2e080 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java +++ 
b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateSimRels.java @@ -1,5 +1,21 @@ + package eu.dnetlib.dhp.oa.dedup; +import java.io.IOException; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.dom4j.DocumentException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.oa.dedup.model.Block; import eu.dnetlib.dhp.schema.oaf.DataInfo; @@ -12,117 +28,107 @@ import eu.dnetlib.pace.model.FieldListImpl; import eu.dnetlib.pace.model.FieldValueImpl; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.MapDocumentUtil; -import java.io.IOException; -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; -import org.dom4j.DocumentException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import scala.Tuple2; public class SparkCreateSimRels extends AbstractSparkAction { - private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class); + private static final Logger log = LoggerFactory.getLogger(SparkCreateSimRels.class); - public SparkCreateSimRels(ArgumentApplicationParser parser, SparkSession spark) { - super(parser, spark); - } + public SparkCreateSimRels(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } - public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateSimRels.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json"))); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateSimRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json"))); + parser.parseArgument(args); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses( - new Class[] {MapDocument.class, FieldListImpl.class, FieldValueImpl.class, Block.class}); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf + .registerKryoClasses( + new Class[] { + MapDocument.class, FieldListImpl.class, FieldValueImpl.class, Block.class + }); - new SparkCreateSimRels(parser, getSparkSession(conf)) - .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); - } + new SparkCreateSimRels(parser, getSparkSession(conf)) + .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); + } - @Override - public void run(ISLookUpService isLookUpService) - throws DocumentException, IOException, ISLookUpException { + @Override + public void run(ISLookUpService isLookUpService) + throws 
DocumentException, IOException, ISLookUpException { - // read oozie parameters - final String graphBasePath = parser.get("graphBasePath"); - final String isLookUpUrl = parser.get("isLookUpUrl"); - final String actionSetId = parser.get("actionSetId"); - final String workingPath = parser.get("workingPath"); + // read oozie parameters + final String graphBasePath = parser.get("graphBasePath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + final String workingPath = parser.get("workingPath"); - log.info("graphBasePath: '{}'", graphBasePath); - log.info("isLookUpUrl: '{}'", isLookUpUrl); - log.info("actionSetId: '{}'", actionSetId); - log.info("workingPath: '{}'", workingPath); + log.info("graphBasePath: '{}'", graphBasePath); + log.info("isLookUpUrl: '{}'", isLookUpUrl); + log.info("actionSetId: '{}'", actionSetId); + log.info("workingPath: '{}'", workingPath); - // for each dedup configuration - for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { + // for each dedup configuration + for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { - final String entity = dedupConf.getWf().getEntityType(); - final String subEntity = dedupConf.getWf().getSubEntityValue(); - log.info("Creating simrels for: '{}'", subEntity); + final String entity = dedupConf.getWf().getEntityType(); + final String subEntity = dedupConf.getWf().getSubEntityValue(); + log.info("Creating simrels for: '{}'", subEntity); - final String outputPath = DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity); - removeOutputDir(spark, outputPath); + final String outputPath = DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity); + removeOutputDir(spark, outputPath); - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaPairRDD mapDocuments = - sc.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) - .mapToPair( - (PairFunction) - s -> { - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); - return new Tuple2<>(d.getIdentifier(), d); - }); + JavaPairRDD mapDocuments = sc + .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) + .mapToPair( + (PairFunction) s -> { + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); + return new Tuple2<>(d.getIdentifier(), d); + }); - // create blocks for deduplication - JavaPairRDD blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf); + // create blocks for deduplication + JavaPairRDD blocks = Deduper.createSortedBlocks(mapDocuments, dedupConf); - // create relations by comparing only elements in the same group - JavaRDD relations = - Deduper.computeRelations(sc, blocks, dedupConf) - .map(t -> createSimRel(t._1(), t._2(), entity)); + // create relations by comparing only elements in the same group + JavaRDD relations = Deduper + .computeRelations(sc, blocks, dedupConf) + .map(t -> createSimRel(t._1(), t._2(), entity)); - // save the simrel in the workingdir - spark - .createDataset(relations.rdd(), Encoders.bean(Relation.class)) - .write() - .mode(SaveMode.Append) - .save(outputPath); - } - } + // save the simrel in the workingdir + spark + .createDataset(relations.rdd(), Encoders.bean(Relation.class)) + .write() + .mode(SaveMode.Append) + .save(outputPath); + } + } - private Relation createSimRel(String source, String target, String entity) { - final Relation r = new 
Relation(); - r.setSource(source); - r.setTarget(target); - r.setSubRelType("dedupSimilarity"); - r.setRelClass("isSimilarTo"); - r.setDataInfo(new DataInfo()); + private Relation createSimRel(String source, String target, String entity) { + final Relation r = new Relation(); + r.setSource(source); + r.setTarget(target); + r.setSubRelType("dedupSimilarity"); + r.setRelClass("isSimilarTo"); + r.setDataInfo(new DataInfo()); - switch (entity) { - case "result": - r.setRelType("resultResult"); - break; - case "organization": - r.setRelType("organizationOrganization"); - break; - default: - throw new IllegalArgumentException("unmanaged entity type: " + entity); - } - return r; - } + switch (entity) { + case "result": + r.setRelType("resultResult"); + break; + case "organization": + r.setRelType("organizationOrganization"); + break; + default: + throw new IllegalArgumentException("unmanaged entity type: " + entity); + } + return r; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index d829a9a03..34611db8e 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -1,183 +1,178 @@ + package eu.dnetlib.dhp.oa.dedup; import static org.apache.spark.sql.functions.col; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import org.apache.commons.io.IOUtils; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.*; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import scala.Tuple2; public class SparkPropagateRelation extends AbstractSparkAction { - private static final Logger log = LoggerFactory.getLogger(SparkPropagateRelation.class); + private static final Logger log = LoggerFactory.getLogger(SparkPropagateRelation.class); - enum FieldType { - SOURCE, - TARGET - } + enum FieldType { + SOURCE, TARGET + } - public SparkPropagateRelation(ArgumentApplicationParser parser, SparkSession spark) - throws Exception { - super(parser, spark); - } + public SparkPropagateRelation(ArgumentApplicationParser parser, SparkSession spark) + throws Exception { + super(parser, spark); + } - public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateSimRels.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"))); + public static void main(String[] args) throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateSimRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"))); - parser.parseArgument(args); + parser.parseArgument(args); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", 
"org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - new SparkPropagateRelation(parser, getSparkSession(conf)) - .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); - } + new SparkPropagateRelation(parser, getSparkSession(conf)) + .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); + } - @Override - public void run(ISLookUpService isLookUpService) { + @Override + public void run(ISLookUpService isLookUpService) { - final String graphBasePath = parser.get("graphBasePath"); - final String workingPath = parser.get("workingPath"); - final String dedupGraphPath = parser.get("dedupGraphPath"); + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String dedupGraphPath = parser.get("dedupGraphPath"); - log.info("graphBasePath: '{}'", graphBasePath); - log.info("workingPath: '{}'", workingPath); - log.info("dedupGraphPath: '{}'", dedupGraphPath); + log.info("graphBasePath: '{}'", graphBasePath); + log.info("workingPath: '{}'", workingPath); + log.info("dedupGraphPath: '{}'", dedupGraphPath); - final String outputRelationPath = DedupUtility.createEntityPath(dedupGraphPath, "relation"); - removeOutputDir(spark, outputRelationPath); + final String outputRelationPath = DedupUtility.createEntityPath(dedupGraphPath, "relation"); + removeOutputDir(spark, outputRelationPath); - Dataset mergeRels = - spark - .read() - .load(DedupUtility.createMergeRelPath(workingPath, "*", "*")) - .as(Encoders.bean(Relation.class)); + Dataset mergeRels = spark + .read() + .load(DedupUtility.createMergeRelPath(workingPath, "*", "*")) + .as(Encoders.bean(Relation.class)); - Dataset> mergedIds = - mergeRels - .where(col("relClass").equalTo("merges")) - .select(col("source"), col("target")) - .distinct() - .map( - (MapFunction>) - r -> new Tuple2<>(r.getString(1), r.getString(0)), - Encoders.tuple(Encoders.STRING(), Encoders.STRING())) - .cache(); + Dataset> mergedIds = mergeRels + .where(col("relClass").equalTo("merges")) + .select(col("source"), col("target")) + .distinct() + .map( + (MapFunction>) r -> new Tuple2<>(r.getString(1), r.getString(0)), + Encoders.tuple(Encoders.STRING(), Encoders.STRING())) + .cache(); - final String relationPath = DedupUtility.createEntityPath(graphBasePath, "relation"); + final String relationPath = DedupUtility.createEntityPath(graphBasePath, "relation"); - Dataset rels = - spark.read().textFile(relationPath).map(patchRelFn(), Encoders.bean(Relation.class)); + Dataset rels = spark.read().textFile(relationPath).map(patchRelFn(), Encoders.bean(Relation.class)); - Dataset newRels = - processDataset( - processDataset(rels, mergedIds, FieldType.SOURCE, getFixRelFn(FieldType.SOURCE)), - mergedIds, - FieldType.TARGET, - getFixRelFn(FieldType.TARGET)) - .filter(SparkPropagateRelation::containsDedup); + Dataset newRels = processDataset( + processDataset(rels, mergedIds, FieldType.SOURCE, getFixRelFn(FieldType.SOURCE)), + mergedIds, + FieldType.TARGET, + getFixRelFn(FieldType.TARGET)) + .filter(SparkPropagateRelation::containsDedup); - Dataset updated = - processDataset( - processDataset(rels, mergedIds, FieldType.SOURCE, getDeletedFn()), - mergedIds, - FieldType.TARGET, - getDeletedFn()); + Dataset updated = processDataset( + processDataset(rels, mergedIds, 
FieldType.SOURCE, getDeletedFn()), + mergedIds, + FieldType.TARGET, + getDeletedFn()); - save(newRels.union(updated), outputRelationPath, SaveMode.Overwrite); - } + save(newRels.union(updated), outputRelationPath, SaveMode.Overwrite); + } - private static Dataset processDataset( - Dataset rels, - Dataset> mergedIds, - FieldType type, - MapFunction, Tuple2>, Relation> mapFn) { - final Dataset> mapped = - rels.map( - (MapFunction>) r -> new Tuple2<>(getId(r, type), r), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class))); - return mapped - .joinWith(mergedIds, mapped.col("_1").equalTo(mergedIds.col("_1")), "left_outer") - .map(mapFn, Encoders.bean(Relation.class)); - } + private static Dataset processDataset( + Dataset rels, + Dataset> mergedIds, + FieldType type, + MapFunction, Tuple2>, Relation> mapFn) { + final Dataset> mapped = rels + .map( + (MapFunction>) r -> new Tuple2<>(getId(r, type), r), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(Relation.class))); + return mapped + .joinWith(mergedIds, mapped.col("_1").equalTo(mergedIds.col("_1")), "left_outer") + .map(mapFn, Encoders.bean(Relation.class)); + } - private static MapFunction patchRelFn() { - return value -> { - final Relation rel = OBJECT_MAPPER.readValue(value, Relation.class); - if (rel.getDataInfo() == null) { - rel.setDataInfo(new DataInfo()); - } - return rel; - }; - } + private static MapFunction patchRelFn() { + return value -> { + final Relation rel = OBJECT_MAPPER.readValue(value, Relation.class); + if (rel.getDataInfo() == null) { + rel.setDataInfo(new DataInfo()); + } + return rel; + }; + } - private static String getId(Relation r, FieldType type) { - switch (type) { - case SOURCE: - return r.getSource(); - case TARGET: - return r.getTarget(); - default: - throw new IllegalArgumentException(""); - } - } + private static String getId(Relation r, FieldType type) { + switch (type) { + case SOURCE: + return r.getSource(); + case TARGET: + return r.getTarget(); + default: + throw new IllegalArgumentException(""); + } + } - private static MapFunction, Tuple2>, Relation> - getFixRelFn(FieldType type) { - return value -> { - if (value._2() != null) { - Relation r = value._1()._2(); - String id = value._2()._2(); - if (r.getDataInfo() == null) { - r.setDataInfo(new DataInfo()); - } - r.getDataInfo().setDeletedbyinference(false); - switch (type) { - case SOURCE: - r.setSource(id); - return r; - case TARGET: - r.setTarget(id); - return r; - default: - throw new IllegalArgumentException(""); - } - } - return value._1()._2(); - }; - } + private static MapFunction, Tuple2>, Relation> getFixRelFn( + FieldType type) { + return value -> { + if (value._2() != null) { + Relation r = value._1()._2(); + String id = value._2()._2(); + if (r.getDataInfo() == null) { + r.setDataInfo(new DataInfo()); + } + r.getDataInfo().setDeletedbyinference(false); + switch (type) { + case SOURCE: + r.setSource(id); + return r; + case TARGET: + r.setTarget(id); + return r; + default: + throw new IllegalArgumentException(""); + } + } + return value._1()._2(); + }; + } - private static MapFunction, Tuple2>, Relation> - getDeletedFn() { - return value -> { - if (value._2() != null) { - Relation r = value._1()._2(); - if (r.getDataInfo() == null) { - r.setDataInfo(new DataInfo()); - } - r.getDataInfo().setDeletedbyinference(true); - return r; - } - return value._1()._2(); - }; - } + private static MapFunction, Tuple2>, Relation> getDeletedFn() { + return value -> { + if (value._2() != null) { + Relation r = value._1()._2(); + if 
(r.getDataInfo() == null) { + r.setDataInfo(new DataInfo()); + } + r.getDataInfo().setDeletedbyinference(true); + return r; + } + return value._1()._2(); + }; + } - private static boolean containsDedup(final Relation r) { - return r.getSource().toLowerCase().contains("dedup") - || r.getTarget().toLowerCase().contains("dedup"); - } + private static boolean containsDedup(final Relation r) { + return r.getSource().toLowerCase().contains("dedup") + || r.getTarget().toLowerCase().contains("dedup"); + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java index 2ffd982b1..7100c9037 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkReporter.java @@ -1,47 +1,50 @@ + package eu.dnetlib.dhp.oa.dedup; -import eu.dnetlib.pace.util.Reporter; import java.util.ArrayList; import java.util.List; import java.util.Map; + import org.apache.spark.util.LongAccumulator; + +import eu.dnetlib.pace.util.Reporter; import scala.Serializable; import scala.Tuple2; public class SparkReporter implements Serializable, Reporter { - private final List> relations = new ArrayList<>(); + private final List> relations = new ArrayList<>(); - private Map accumulators; + private Map accumulators; - public SparkReporter(Map accumulators) { - this.accumulators = accumulators; - } + public SparkReporter(Map accumulators) { + this.accumulators = accumulators; + } - public void incrementCounter( - String counterGroup, - String counterName, - long delta, - Map accumulators) { + public void incrementCounter( + String counterGroup, + String counterName, + long delta, + Map accumulators) { - final String accumulatorName = String.format("%s::%s", counterGroup, counterName); - if (accumulators.containsKey(accumulatorName)) { - accumulators.get(accumulatorName).add(delta); - } - } + final String accumulatorName = String.format("%s::%s", counterGroup, counterName); + if (accumulators.containsKey(accumulatorName)) { + accumulators.get(accumulatorName).add(delta); + } + } - @Override - public void incrementCounter(String counterGroup, String counterName, long delta) { + @Override + public void incrementCounter(String counterGroup, String counterName, long delta) { - incrementCounter(counterGroup, counterName, delta, accumulators); - } + incrementCounter(counterGroup, counterName, delta, accumulators); + } - @Override - public void emit(String type, String from, String to) { - relations.add(new Tuple2<>(from, to)); - } + @Override + public void emit(String type, String from, String to) { + relations.add(new Tuple2<>(from, to)); + } - public List> getRelations() { - return relations; - } + public List> getRelations() { + return relations; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java index 56dec79cf..779fb91d6 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java @@ -1,15 +1,8 @@ + package eu.dnetlib.dhp.oa.dedup; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import 
eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.pace.util.MapDocumentUtil; import java.io.IOException; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; @@ -27,123 +20,133 @@ import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.util.MapDocumentUtil; import scala.Tuple2; public class SparkUpdateEntity extends AbstractSparkAction { - private static final Logger log = LoggerFactory.getLogger(SparkUpdateEntity.class); + private static final Logger log = LoggerFactory.getLogger(SparkUpdateEntity.class); - private static final String IDJSONPATH = "$.id"; + private static final String IDJSONPATH = "$.id"; - public SparkUpdateEntity(ArgumentApplicationParser parser, SparkSession spark) { - super(parser, spark); - } + public SparkUpdateEntity(ArgumentApplicationParser parser, SparkSession spark) { + super(parser, spark); + } - public static void main(String[] args) throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkUpdateEntity.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json"))); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkUpdateEntity.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json"))); + parser.parseArgument(args); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - new SparkUpdateEntity(parser, getSparkSession(conf)) - .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); - } + new SparkUpdateEntity(parser, getSparkSession(conf)) + .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); + } - public void run(ISLookUpService isLookUpService) throws IOException { + public void run(ISLookUpService isLookUpService) throws IOException { - final String graphBasePath = parser.get("graphBasePath"); - final String workingPath = parser.get("workingPath"); - final String dedupGraphPath = parser.get("dedupGraphPath"); + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String dedupGraphPath = parser.get("dedupGraphPath"); - log.info("graphBasePath: '{}'", graphBasePath); - log.info("workingPath: '{}'", workingPath); - log.info("dedupGraphPath: '{}'", dedupGraphPath); + log.info("graphBasePath: '{}'", graphBasePath); + 
log.info("workingPath: '{}'", workingPath); + log.info("dedupGraphPath: '{}'", dedupGraphPath); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - // for each entity - ModelSupport.entityTypes.forEach( - (type, clazz) -> { - final String outputPath = dedupGraphPath + "/" + type; - removeOutputDir(spark, outputPath); + // for each entity + ModelSupport.entityTypes + .forEach( + (type, clazz) -> { + final String outputPath = dedupGraphPath + "/" + type; + removeOutputDir(spark, outputPath); - JavaRDD sourceEntity = - sc.textFile(DedupUtility.createEntityPath(graphBasePath, type.toString())); + JavaRDD sourceEntity = sc + .textFile(DedupUtility.createEntityPath(graphBasePath, type.toString())); - if (mergeRelExists(workingPath, type.toString())) { + if (mergeRelExists(workingPath, type.toString())) { - final String mergeRelPath = - DedupUtility.createMergeRelPath(workingPath, "*", type.toString()); - final String dedupRecordPath = - DedupUtility.createDedupRecordPath(workingPath, "*", type.toString()); + final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, "*", type.toString()); + final String dedupRecordPath = DedupUtility + .createDedupRecordPath(workingPath, "*", type.toString()); - final Dataset rel = - spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class)); + final Dataset rel = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class)); - final JavaPairRDD mergedIds = - rel.where("relClass == 'merges'") - .select(rel.col("target")) - .distinct() - .toJavaRDD() - .mapToPair( - (PairFunction) r -> new Tuple2<>(r.getString(0), "d")); + final JavaPairRDD mergedIds = rel + .where("relClass == 'merges'") + .select(rel.col("target")) + .distinct() + .toJavaRDD() + .mapToPair( + (PairFunction) r -> new Tuple2<>(r.getString(0), "d")); - JavaPairRDD entitiesWithId = - sourceEntity.mapToPair( - (PairFunction) - s -> new Tuple2<>(MapDocumentUtil.getJPathString(IDJSONPATH, s), s)); - JavaRDD map = - entitiesWithId - .leftOuterJoin(mergedIds) - .map( - k -> - k._2()._2().isPresent() - ? updateDeletedByInference(k._2()._1(), clazz) - : k._2()._1()); + JavaPairRDD entitiesWithId = sourceEntity + .mapToPair( + (PairFunction) s -> new Tuple2<>( + MapDocumentUtil.getJPathString(IDJSONPATH, s), s)); + JavaRDD map = entitiesWithId + .leftOuterJoin(mergedIds) + .map( + k -> k._2()._2().isPresent() + ? 
updateDeletedByInference(k._2()._1(), clazz) + : k._2()._1()); - sourceEntity = map.union(sc.textFile(dedupRecordPath)); - } + sourceEntity = map.union(sc.textFile(dedupRecordPath)); + } - sourceEntity.saveAsTextFile(outputPath, GzipCodec.class); - }); - } + sourceEntity.saveAsTextFile(outputPath, GzipCodec.class); + }); + } - public boolean mergeRelExists(String basePath, String entity) { + public boolean mergeRelExists(String basePath, String entity) { - boolean result = false; - try { - FileSystem fileSystem = FileSystem.get(new Configuration()); + boolean result = false; + try { + FileSystem fileSystem = FileSystem.get(new Configuration()); - FileStatus[] fileStatuses = fileSystem.listStatus(new Path(basePath)); + FileStatus[] fileStatuses = fileSystem.listStatus(new Path(basePath)); - for (FileStatus fs : fileStatuses) { - if (fs.isDirectory()) - if (fileSystem.exists( - new Path(DedupUtility.createMergeRelPath(basePath, fs.getPath().getName(), entity)))) - result = true; - } + for (FileStatus fs : fileStatuses) { + if (fs.isDirectory()) + if (fileSystem + .exists( + new Path(DedupUtility.createMergeRelPath(basePath, fs.getPath().getName(), entity)))) + result = true; + } - return result; - } catch (IOException e) { - throw new RuntimeException(e); - } - } + return result; + } catch (IOException e) { + throw new RuntimeException(e); + } + } - private static String updateDeletedByInference( - final String json, final Class clazz) { - try { - Oaf entity = OBJECT_MAPPER.readValue(json, clazz); - if (entity.getDataInfo() == null) entity.setDataInfo(new DataInfo()); - entity.getDataInfo().setDeletedbyinference(true); - return OBJECT_MAPPER.writeValueAsString(entity); - } catch (IOException e) { - throw new RuntimeException("Unable to convert json", e); - } - } + private static String updateDeletedByInference( + final String json, final Class clazz) { + try { + Oaf entity = OBJECT_MAPPER.readValue(json, clazz); + if (entity.getDataInfo() == null) + entity.setDataInfo(new DataInfo()); + entity.getDataInfo().setDeletedbyinference(true); + return OBJECT_MAPPER.writeValueAsString(entity); + } catch (IOException e) { + throw new RuntimeException("Unable to convert json", e); + } + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java index f4370a79c..bfd2c25e2 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java @@ -1,78 +1,84 @@ + package eu.dnetlib.dhp.oa.dedup.graph; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.oa.dedup.DedupUtility; -import eu.dnetlib.pace.util.PaceException; import java.io.IOException; import java.io.Serializable; import java.util.Set; + import org.apache.commons.lang.StringUtils; import org.codehaus.jackson.annotate.JsonIgnore; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.oa.dedup.DedupUtility; +import eu.dnetlib.pace.util.PaceException; + public class ConnectedComponent implements Serializable { - private Set docIds; - private String ccId; + private Set docIds; + private String ccId; - public ConnectedComponent() {} + public ConnectedComponent() { + } - public ConnectedComponent(Set docIds) { - this.docIds = docIds; - createID(); - } + public ConnectedComponent(Set docIds) { + 
this.docIds = docIds; + createID(); + } - public String createID() { - if (docIds.size() > 1) { - final String s = getMin(); - String prefix = s.split("\\|")[0]; - ccId = prefix + "|dedup_wf_001::" + DedupUtility.md5(s); - return ccId; - } else { - return docIds.iterator().next(); - } - } + public String createID() { + if (docIds.size() > 1) { + final String s = getMin(); + String prefix = s.split("\\|")[0]; + ccId = prefix + "|dedup_wf_001::" + DedupUtility.md5(s); + return ccId; + } else { + return docIds.iterator().next(); + } + } - @JsonIgnore - public String getMin() { + @JsonIgnore + public String getMin() { - final StringBuilder min = new StringBuilder(); - docIds.forEach( - i -> { - if (StringUtils.isBlank(min.toString())) { - min.append(i); - } else { - if (min.toString().compareTo(i) > 0) { - min.setLength(0); - min.append(i); - } - } - }); - return min.toString(); - } + final StringBuilder min = new StringBuilder(); + docIds + .forEach( + i -> { + if (StringUtils.isBlank(min.toString())) { + min.append(i); + } else { + if (min.toString().compareTo(i) > 0) { + min.setLength(0); + min.append(i); + } + } + }); + return min.toString(); + } - @Override - public String toString() { - ObjectMapper mapper = new ObjectMapper(); - try { - return mapper.writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("Failed to create Json: ", e); - } - } + @Override + public String toString() { + ObjectMapper mapper = new ObjectMapper(); + try { + return mapper.writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Failed to create Json: ", e); + } + } - public Set getDocIds() { - return docIds; - } + public Set getDocIds() { + return docIds; + } - public void setDocIds(Set docIds) { - this.docIds = docIds; - } + public void setDocIds(Set docIds) { + this.docIds = docIds; + } - public String getCcId() { - return ccId; - } + public String getCcId() { + return ccId; + } - public void setCcId(String ccId) { - this.ccId = ccId; - } + public void setCcId(String ccId) { + this.ccId = ccId; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Block.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Block.java index 10b622497..4f0d95c8f 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Block.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Block.java @@ -1,7 +1,6 @@ + package eu.dnetlib.dhp.oa.dedup.model; -import com.google.common.collect.Lists; -import eu.dnetlib.pace.model.MapDocument; import java.io.Serializable; import java.util.ArrayList; import java.util.Comparator; @@ -11,63 +10,71 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import java.util.stream.StreamSupport; +import com.google.common.collect.Lists; + +import eu.dnetlib.pace.model.MapDocument; + public class Block implements Serializable { - private String key; + private String key; - private List documents; + private List documents; - public Block() { - super(); - } + public Block() { + super(); + } - public static Block from(String key, MapDocument doc) { - Block block = new Block(); - block.setKey(key); - block.setDocuments(Lists.newArrayList(doc)); - return block; - } + public static Block from(String key, MapDocument doc) { + Block block = new Block(); + block.setKey(key); + block.setDocuments(Lists.newArrayList(doc)); + return block; + } - public static Block from(String key, Iterator blocks, String 
orderField, int maxSize) { - Block block = new Block(); - block.setKey(key); + public static Block from(String key, Iterator blocks, String orderField, int maxSize) { + Block block = new Block(); + block.setKey(key); - Iterable it = () -> blocks; + Iterable it = () -> blocks; - block.setDocuments( - StreamSupport.stream(it.spliterator(), false) - .flatMap(b -> b.getDocuments().stream()) - .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue())) - .limit(maxSize) - .collect(Collectors.toCollection(ArrayList::new))); - return block; - } + block + .setDocuments( + StreamSupport + .stream(it.spliterator(), false) + .flatMap(b -> b.getDocuments().stream()) + .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue())) + .limit(maxSize) + .collect(Collectors.toCollection(ArrayList::new))); + return block; + } - public static Block from(Block b1, Block b2, String orderField, int maxSize) { - Block block = new Block(); - block.setKey(b1.getKey()); - block.setDocuments( - Stream.concat(b1.getDocuments().stream(), b2.getDocuments().stream()) - .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue())) - .limit(maxSize) - .collect(Collectors.toCollection(ArrayList::new))); + public static Block from(Block b1, Block b2, String orderField, int maxSize) { + Block block = new Block(); + block.setKey(b1.getKey()); + block + .setDocuments( + Stream + .concat(b1.getDocuments().stream(), b2.getDocuments().stream()) + .sorted(Comparator.comparing(a -> a.getFieldMap().get(orderField).stringValue())) + .limit(maxSize) + .collect(Collectors.toCollection(ArrayList::new))); - return block; - } + return block; + } - public String getKey() { - return key; - } + public String getKey() { + return key; + } - public void setKey(String key) { - this.key = key; - } + public void setKey(String key) { + this.key = key; + } - public List getDocuments() { - return documents; - } + public List getDocuments() { + return documents; + } - public void setDocuments(List documents) { - this.documents = documents; - } + public void setDocuments(List documents) { + this.documents = documents; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java index 4236f32e3..a217a2657 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/MergeAuthorTest.java @@ -1,49 +1,54 @@ + package eu.dnetlib.dhp.oa.dedup; -import eu.dnetlib.dhp.schema.oaf.Publication; import java.io.IOException; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.codehaus.jackson.map.ObjectMapper; import org.junit.jupiter.api.BeforeEach; +import eu.dnetlib.dhp.schema.oaf.Publication; + public class MergeAuthorTest { - private List publicationsToMerge; - private final ObjectMapper mapper = new ObjectMapper(); + private List publicationsToMerge; + private final ObjectMapper mapper = new ObjectMapper(); - @BeforeEach - public void setUp() throws Exception { - final String json = - IOUtils.toString( - this.getClass().getResourceAsStream("/eu/dnetlib/dhp/dedup/json/authors_merge.json")); + @BeforeEach + public void setUp() throws Exception { + final String json = IOUtils + .toString( + 
this.getClass().getResourceAsStream("/eu/dnetlib/dhp/dedup/json/authors_merge.json")); - publicationsToMerge = - Arrays.asList(json.split("\n")).stream() - .map( - s -> { - try { - return mapper.readValue(s, Publication.class); - } catch (IOException e) { - throw new RuntimeException(e); - } - }) - .collect(Collectors.toList()); - } + publicationsToMerge = Arrays + .asList(json.split("\n")) + .stream() + .map( + s -> { + try { + return mapper.readValue(s, Publication.class); + } catch (IOException e) { + throw new RuntimeException(e); + } + }) + .collect(Collectors.toList()); + } - // FIX ME Michele DB this tests doesn't work - // @Test - public void test() throws Exception { - Publication dedup = new Publication(); + // FIX ME Michele DB this tests doesn't work + // @Test + public void test() throws Exception { + Publication dedup = new Publication(); - publicationsToMerge.forEach( - p -> { - dedup.mergeFrom(p); - dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(), p.getAuthor())); - }); + publicationsToMerge + .forEach( + p -> { + dedup.mergeFrom(p); + dedup.setAuthor(DedupUtility.mergeAuthor(dedup.getAuthor(), p.getAuthor())); + }); - System.out.println(mapper.writeValueAsString(dedup)); - } + System.out.println(mapper.writeValueAsString(dedup)); + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index 838e7188d..a0ae7bc3c 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -1,19 +1,17 @@ + package eu.dnetlib.dhp.oa.dedup; import static java.nio.file.Files.createTempDirectory; + import static org.junit.jupiter.api.Assertions.assertEquals; import static org.mockito.Mockito.lenient; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import eu.dnetlib.pace.util.MapDocumentUtil; import java.io.File; import java.io.IOException; import java.io.Serializable; import java.net.URISyntaxException; import java.nio.file.Paths; + import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; @@ -30,424 +28,435 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.util.MapDocumentUtil; import scala.Tuple2; @ExtendWith(MockitoExtension.class) @TestMethodOrder(MethodOrderer.OrderAnnotation.class) public class SparkDedupTest implements Serializable { - @Mock(serializable = true) - ISLookUpService isLookUpService; + @Mock(serializable = true) + ISLookUpService isLookUpService; - private static SparkSession spark; - private static JavaSparkContext jsc; + private static SparkSession spark; + private static JavaSparkContext jsc; - private static String testGraphBasePath; - private static String testOutputBasePath; - private static String testDedupGraphBasePath; - private static final String testActionSetId = "test-orchestrator"; + 
private static String testGraphBasePath; + private static String testOutputBasePath; + private static String testDedupGraphBasePath; + private static final String testActionSetId = "test-orchestrator"; - @BeforeAll - public static void cleanUp() throws IOException, URISyntaxException { + @BeforeAll + public static void cleanUp() throws IOException, URISyntaxException { - testGraphBasePath = - Paths.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/entities").toURI()) - .toFile() - .getAbsolutePath(); - testOutputBasePath = - createTempDirectory(SparkDedupTest.class.getSimpleName() + "-").toAbsolutePath().toString(); - testDedupGraphBasePath = - createTempDirectory(SparkDedupTest.class.getSimpleName() + "-").toAbsolutePath().toString(); + testGraphBasePath = Paths + .get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/entities").toURI()) + .toFile() + .getAbsolutePath(); + testOutputBasePath = createTempDirectory(SparkDedupTest.class.getSimpleName() + "-") + .toAbsolutePath() + .toString(); + testDedupGraphBasePath = createTempDirectory(SparkDedupTest.class.getSimpleName() + "-") + .toAbsolutePath() + .toString(); - FileUtils.deleteDirectory(new File(testOutputBasePath)); - FileUtils.deleteDirectory(new File(testDedupGraphBasePath)); + FileUtils.deleteDirectory(new File(testOutputBasePath)); + FileUtils.deleteDirectory(new File(testDedupGraphBasePath)); - spark = - SparkSession.builder() - .appName(SparkDedupTest.class.getSimpleName()) - .master("local[*]") - .config(new SparkConf()) - .getOrCreate(); + spark = SparkSession + .builder() + .appName(SparkDedupTest.class.getSimpleName()) + .master("local[*]") + .config(new SparkConf()) + .getOrCreate(); - jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - } + jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + } - @BeforeEach - public void setUp() throws IOException, ISLookUpException { + @BeforeEach + public void setUp() throws IOException, ISLookUpException { - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId))) - .thenReturn( - IOUtils.toString( - SparkDedupTest.class.getResourceAsStream( - "/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml"))); + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/profiles/mock_orchestrator.xml"))); - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("organization"))) - .thenReturn( - IOUtils.toString( - SparkDedupTest.class.getResourceAsStream( - "/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("organization"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication"))) - .thenReturn( - IOUtils.toString( - SparkDedupTest.class.getResourceAsStream( - "/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"))); + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("publication"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json"))); - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("software"))) - .thenReturn( - IOUtils.toString( - 
SparkDedupTest.class.getResourceAsStream( - "/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json"))); + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("software"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json"))); - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("dataset"))) - .thenReturn( - IOUtils.toString( - SparkDedupTest.class.getResourceAsStream( - "/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json"))); + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("dataset"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json"))); - lenient() - .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("otherresearchproduct"))) - .thenReturn( - IOUtils.toString( - SparkDedupTest.class.getResourceAsStream( - "/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json"))); - } + lenient() + .when(isLookUpService.getResourceProfileByQuery(Mockito.contains("otherresearchproduct"))) + .thenReturn( + IOUtils + .toString( + SparkDedupTest.class + .getResourceAsStream( + "/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json"))); + } - @Test - @Order(1) - public void createSimRelsTest() throws Exception { + @Test + @Order(1) + public void createSimRelsTest() throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateSimRels.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json"))); - parser.parseArgument( - new String[] { - "-i", - testGraphBasePath, - "-asi", - testActionSetId, - "-la", - "lookupurl", - "-w", - testOutputBasePath - }); + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateSimRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json"))); + parser + .parseArgument( + new String[] { + "-i", + testGraphBasePath, + "-asi", + testActionSetId, + "-la", + "lookupurl", + "-w", + testOutputBasePath + }); - new SparkCreateSimRels(parser, spark).run(isLookUpService); + new SparkCreateSimRels(parser, spark).run(isLookUpService); - long orgs_simrel = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel") - .count(); - long pubs_simrel = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel") - .count(); - long sw_simrel = - spark.read().load(testOutputBasePath + "/" + testActionSetId + "/software_simrel").count(); + long orgs_simrel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel") + .count(); + long pubs_simrel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel") + .count(); + long sw_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/software_simrel").count(); - long ds_simrel = - spark.read().load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel").count(); + long ds_simrel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel").count(); - long orp_simrel = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel") - .count(); + long orp_simrel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel") + .count(); - assertEquals(3432, orgs_simrel); - assertEquals(7152, pubs_simrel); 
- assertEquals(344, sw_simrel); - assertEquals(458, ds_simrel); - assertEquals(6750, orp_simrel); - } + assertEquals(3432, orgs_simrel); + assertEquals(7152, pubs_simrel); + assertEquals(344, sw_simrel); + assertEquals(458, ds_simrel); + assertEquals(6750, orp_simrel); + } - @Test - @Order(2) - public void createMergeRelsTest() throws Exception { + @Test + @Order(2) + public void createMergeRelsTest() throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateMergeRels.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"))); - parser.parseArgument( - new String[] { - "-i", - testGraphBasePath, - "-asi", - testActionSetId, - "-la", - "lookupurl", - "-w", - testOutputBasePath - }); + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateMergeRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"))); + parser + .parseArgument( + new String[] { + "-i", + testGraphBasePath, + "-asi", + testActionSetId, + "-la", + "lookupurl", + "-w", + testOutputBasePath + }); - new SparkCreateMergeRels(parser, spark).run(isLookUpService); + new SparkCreateMergeRels(parser, spark).run(isLookUpService); - long orgs_mergerel = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel") - .count(); - long pubs_mergerel = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel") - .count(); - long sw_mergerel = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel") - .count(); + long orgs_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel") + .count(); + long pubs_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel") + .count(); + long sw_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel") + .count(); - long ds_mergerel = - spark.read().load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel").count(); + long ds_mergerel = spark.read().load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel").count(); - long orp_mergerel = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel") - .count(); + long orp_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel") + .count(); - assertEquals(1276, orgs_mergerel); - assertEquals(1442, pubs_mergerel); - assertEquals(288, sw_mergerel); - assertEquals(472, ds_mergerel); - assertEquals(718, orp_mergerel); - } + assertEquals(1276, orgs_mergerel); + assertEquals(1442, pubs_mergerel); + assertEquals(288, sw_mergerel); + assertEquals(472, ds_mergerel); + assertEquals(718, orp_mergerel); + } - @Test - @Order(3) - public void createDedupRecordTest() throws Exception { + @Test + @Order(3) + public void createDedupRecordTest() throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateDedupRecord.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json"))); - parser.parseArgument( - new String[] { - "-i", - testGraphBasePath, - "-asi", - testActionSetId, - "-la", - "lookupurl", - "-w", - testOutputBasePath - }); + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateDedupRecord.class 
+ .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json"))); + parser + .parseArgument( + new String[] { + "-i", + testGraphBasePath, + "-asi", + testActionSetId, + "-la", + "lookupurl", + "-w", + testOutputBasePath + }); - new SparkCreateDedupRecord(parser, spark).run(isLookUpService); + new SparkCreateDedupRecord(parser, spark).run(isLookUpService); - long orgs_deduprecord = - jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/organization_deduprecord") - .count(); - long pubs_deduprecord = - jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/publication_deduprecord") - .count(); - long sw_deduprecord = - jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/software_deduprecord").count(); - long ds_deduprecord = - jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/dataset_deduprecord").count(); - long orp_deduprecord = - jsc.textFile( - testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord") - .count(); + long orgs_deduprecord = jsc + .textFile(testOutputBasePath + "/" + testActionSetId + "/organization_deduprecord") + .count(); + long pubs_deduprecord = jsc + .textFile(testOutputBasePath + "/" + testActionSetId + "/publication_deduprecord") + .count(); + long sw_deduprecord = jsc + .textFile(testOutputBasePath + "/" + testActionSetId + "/software_deduprecord") + .count(); + long ds_deduprecord = jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/dataset_deduprecord").count(); + long orp_deduprecord = jsc + .textFile( + testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord") + .count(); - assertEquals(82, orgs_deduprecord); - assertEquals(66, pubs_deduprecord); - assertEquals(51, sw_deduprecord); - assertEquals(96, ds_deduprecord); - assertEquals(89, orp_deduprecord); - } + assertEquals(82, orgs_deduprecord); + assertEquals(66, pubs_deduprecord); + assertEquals(51, sw_deduprecord); + assertEquals(96, ds_deduprecord); + assertEquals(89, orp_deduprecord); + } - @Test - @Order(4) - public void updateEntityTest() throws Exception { + @Test + @Order(4) + public void updateEntityTest() throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkUpdateEntity.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json"))); - parser.parseArgument( - new String[] { - "-i", testGraphBasePath, "-w", testOutputBasePath, "-o", testDedupGraphBasePath - }); + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkUpdateEntity.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json"))); + parser + .parseArgument( + new String[] { + "-i", testGraphBasePath, "-w", testOutputBasePath, "-o", testDedupGraphBasePath + }); - new SparkUpdateEntity(parser, spark).run(isLookUpService); + new SparkUpdateEntity(parser, spark).run(isLookUpService); - long organizations = jsc.textFile(testDedupGraphBasePath + "/organization").count(); - long publications = jsc.textFile(testDedupGraphBasePath + "/publication").count(); - long projects = jsc.textFile(testDedupGraphBasePath + "/project").count(); - long datasource = jsc.textFile(testDedupGraphBasePath + "/datasource").count(); - long softwares = jsc.textFile(testDedupGraphBasePath + "/software").count(); - long dataset = jsc.textFile(testDedupGraphBasePath + "/dataset").count(); - long otherresearchproduct = - jsc.textFile(testDedupGraphBasePath + "/otherresearchproduct").count(); + long 
organizations = jsc.textFile(testDedupGraphBasePath + "/organization").count(); + long publications = jsc.textFile(testDedupGraphBasePath + "/publication").count(); + long projects = jsc.textFile(testDedupGraphBasePath + "/project").count(); + long datasource = jsc.textFile(testDedupGraphBasePath + "/datasource").count(); + long softwares = jsc.textFile(testDedupGraphBasePath + "/software").count(); + long dataset = jsc.textFile(testDedupGraphBasePath + "/dataset").count(); + long otherresearchproduct = jsc.textFile(testDedupGraphBasePath + "/otherresearchproduct").count(); - long mergedOrgs = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel") - .as(Encoders.bean(Relation.class)) - .where("relClass=='merges'") - .javaRDD() - .map(Relation::getTarget) - .distinct() - .count(); + long mergedOrgs = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel") + .as(Encoders.bean(Relation.class)) + .where("relClass=='merges'") + .javaRDD() + .map(Relation::getTarget) + .distinct() + .count(); - long mergedPubs = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel") - .as(Encoders.bean(Relation.class)) - .where("relClass=='merges'") - .javaRDD() - .map(Relation::getTarget) - .distinct() - .count(); + long mergedPubs = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel") + .as(Encoders.bean(Relation.class)) + .where("relClass=='merges'") + .javaRDD() + .map(Relation::getTarget) + .distinct() + .count(); - long mergedSw = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel") - .as(Encoders.bean(Relation.class)) - .where("relClass=='merges'") - .javaRDD() - .map(Relation::getTarget) - .distinct() - .count(); + long mergedSw = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel") + .as(Encoders.bean(Relation.class)) + .where("relClass=='merges'") + .javaRDD() + .map(Relation::getTarget) + .distinct() + .count(); - long mergedDs = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel") - .as(Encoders.bean(Relation.class)) - .where("relClass=='merges'") - .javaRDD() - .map(Relation::getTarget) - .distinct() - .count(); + long mergedDs = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel") + .as(Encoders.bean(Relation.class)) + .where("relClass=='merges'") + .javaRDD() + .map(Relation::getTarget) + .distinct() + .count(); - long mergedOrp = - spark - .read() - .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel") - .as(Encoders.bean(Relation.class)) - .where("relClass=='merges'") - .javaRDD() - .map(Relation::getTarget) - .distinct() - .count(); + long mergedOrp = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel") + .as(Encoders.bean(Relation.class)) + .where("relClass=='merges'") + .javaRDD() + .map(Relation::getTarget) + .distinct() + .count(); - assertEquals(897, publications); - assertEquals(835, organizations); - assertEquals(100, projects); - assertEquals(100, datasource); - assertEquals(200, softwares); - assertEquals(388, dataset); - assertEquals(517, otherresearchproduct); + assertEquals(897, publications); + assertEquals(835, organizations); + assertEquals(100, projects); + assertEquals(100, datasource); + assertEquals(200, softwares); + assertEquals(388, dataset); + assertEquals(517, 
otherresearchproduct); - long deletedOrgs = - jsc.textFile(testDedupGraphBasePath + "/organization") - .filter(this::isDeletedByInference) - .count(); + long deletedOrgs = jsc + .textFile(testDedupGraphBasePath + "/organization") + .filter(this::isDeletedByInference) + .count(); - long deletedPubs = - jsc.textFile(testDedupGraphBasePath + "/publication") - .filter(this::isDeletedByInference) - .count(); + long deletedPubs = jsc + .textFile(testDedupGraphBasePath + "/publication") + .filter(this::isDeletedByInference) + .count(); - long deletedSw = - jsc.textFile(testDedupGraphBasePath + "/software") - .filter(this::isDeletedByInference) - .count(); + long deletedSw = jsc + .textFile(testDedupGraphBasePath + "/software") + .filter(this::isDeletedByInference) + .count(); - long deletedDs = - jsc.textFile(testDedupGraphBasePath + "/dataset") - .filter(this::isDeletedByInference) - .count(); + long deletedDs = jsc + .textFile(testDedupGraphBasePath + "/dataset") + .filter(this::isDeletedByInference) + .count(); - long deletedOrp = - jsc.textFile(testDedupGraphBasePath + "/otherresearchproduct") - .filter(this::isDeletedByInference) - .count(); + long deletedOrp = jsc + .textFile(testDedupGraphBasePath + "/otherresearchproduct") + .filter(this::isDeletedByInference) + .count(); - assertEquals(mergedOrgs, deletedOrgs); - assertEquals(mergedPubs, deletedPubs); - assertEquals(mergedSw, deletedSw); - assertEquals(mergedDs, deletedDs); - assertEquals(mergedOrp, deletedOrp); - } + assertEquals(mergedOrgs, deletedOrgs); + assertEquals(mergedPubs, deletedPubs); + assertEquals(mergedSw, deletedSw); + assertEquals(mergedDs, deletedDs); + assertEquals(mergedOrp, deletedOrp); + } - @Test - @Order(5) - public void propagateRelationTest() throws Exception { + @Test + @Order(5) + public void propagateRelationTest() throws Exception { - ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkPropagateRelation.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"))); - parser.parseArgument( - new String[] { - "-i", testGraphBasePath, "-w", testOutputBasePath, "-o", testDedupGraphBasePath - }); + ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkPropagateRelation.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json"))); + parser + .parseArgument( + new String[] { + "-i", testGraphBasePath, "-w", testOutputBasePath, "-o", testDedupGraphBasePath + }); - new SparkPropagateRelation(parser, spark).run(isLookUpService); + new SparkPropagateRelation(parser, spark).run(isLookUpService); - long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count(); + long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count(); - assertEquals(826, relations); + assertEquals(826, relations); - // check deletedbyinference - final Dataset mergeRels = - spark - .read() - .load(DedupUtility.createMergeRelPath(testOutputBasePath, "*", "*")) - .as(Encoders.bean(Relation.class)); - final JavaPairRDD mergedIds = - mergeRels - .where("relClass == 'merges'") - .select(mergeRels.col("target")) - .distinct() - .toJavaRDD() - .mapToPair( - (PairFunction) - r -> new Tuple2(r.getString(0), "d")); + // check deletedbyinference + final Dataset mergeRels = spark + .read() + .load(DedupUtility.createMergeRelPath(testOutputBasePath, "*", "*")) + .as(Encoders.bean(Relation.class)); + final JavaPairRDD mergedIds = mergeRels + .where("relClass == 'merges'") + 
.select(mergeRels.col("target")) + .distinct() + .toJavaRDD() + .mapToPair( + (PairFunction) r -> new Tuple2(r.getString(0), "d")); - JavaRDD toCheck = - jsc.textFile(testDedupGraphBasePath + "/relation") - .mapToPair(json -> new Tuple2<>(MapDocumentUtil.getJPathString("$.source", json), json)) - .join(mergedIds) - .map(t -> t._2()._1()) - .mapToPair(json -> new Tuple2<>(MapDocumentUtil.getJPathString("$.target", json), json)) - .join(mergedIds) - .map(t -> t._2()._1()); + JavaRDD toCheck = jsc + .textFile(testDedupGraphBasePath + "/relation") + .mapToPair(json -> new Tuple2<>(MapDocumentUtil.getJPathString("$.source", json), json)) + .join(mergedIds) + .map(t -> t._2()._1()) + .mapToPair(json -> new Tuple2<>(MapDocumentUtil.getJPathString("$.target", json), json)) + .join(mergedIds) + .map(t -> t._2()._1()); - long deletedbyinference = toCheck.filter(this::isDeletedByInference).count(); - long updated = toCheck.count(); + long deletedbyinference = toCheck.filter(this::isDeletedByInference).count(); + long updated = toCheck.count(); - assertEquals(updated, deletedbyinference); - } + assertEquals(updated, deletedbyinference); + } - @AfterAll - public static void finalCleanUp() throws IOException { - FileUtils.deleteDirectory(new File(testOutputBasePath)); - FileUtils.deleteDirectory(new File(testDedupGraphBasePath)); - } + @AfterAll + public static void finalCleanUp() throws IOException { + FileUtils.deleteDirectory(new File(testOutputBasePath)); + FileUtils.deleteDirectory(new File(testDedupGraphBasePath)); + } - public boolean isDeletedByInference(String s) { - return s.contains("\"deletedbyinference\":true"); - } + public boolean isDeletedByInference(String s) { + return s.contains("\"deletedbyinference\":true"); + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java index 7c8d937ce..9518efdb5 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/jpath/JsonPathTest.java @@ -1,291 +1,292 @@ + package eu.dnetlib.dhp.oa.dedup.jpath; +import org.junit.jupiter.api.Test; + import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.MapDocumentUtil; -import org.junit.jupiter.api.Test; public class JsonPathTest { - String json = - "{\t\"dataInfo\":{\t\t\"invisible\":false,\t\t\"inferred\":false,\t\t\"deletedbyinference\":false,\t\t\"trust\":\"0.810000002384185791\",\t\t\"inferenceprovenance\":\"\",\t\t\"provenanceaction\":{\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t}\t},\t\"lastupdatetimestamp\":1584960968152,\t\"id\":\"20|corda__h2020::9faf23721249f26ac2c16eb857ea1fb9\",\t\"originalId\":[\t\t\"corda__h2020::927957582\"\t],\t\"collectedfrom\":[\t\t{\t\t\t\"key\":\"openaire____::corda_h2020\",\t\t\t\"value\":\"CORDA - COmmon Research DAta Warehouse - Horizon 2020\",\t\t\t\"dataInfo\":null\t\t}\t],\t\"pid\":[\t],\t\"dateofcollection\":\"2016-06-05\",\t\"dateoftransformation\":\"2019-11-19\",\t\"extraInfo\":[\t],\t\"oaiprovenance\":null,\t\"legalshortname\":{\t\t\"value\":\"Comentor 
AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"legalname\":{\t\t\"value\":\"Comentor AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"alternativeNames\":[\t],\t\"websiteurl\":{\t\t\"value\":\"http://www.comentor.se\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"logourl\":null,\t\"eclegalbody\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"eclegalperson\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnonprofit\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecresearchorganization\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"echighereducation\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":
false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganizationeurinterests\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganization\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecenterprise\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecsmevalidated\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnutscode\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"country\":null}"; - DedupConfig conf = - DedupConfig.load( - "{\n" - + " \"wf\" : {\n" - + " \"threshold\" : \"0.99\",\n" - + " \"dedupRun\" : \"001\",\n" - + " \"entityType\" : \"organization\",\n" - + " \"subEntityValue\": \"organization\",\n" - + " \"orderField\" : \"legalname\",\n" - + " \"queueMaxSize\" : \"2000\",\n" - + " \"groupMaxSize\" : \"50\",\n" - + " \"slidingWindowSize\" : \"200\",\n" - + " \"idPath\":\"$.id\",\n" - + " \"rootBuilder\" : [ \"organization\", \"projectOrganization_participation_isParticipant\", \"datasourceOrganization_provision_isProvidedBy\" ],\n" - + " \"includeChildren\" : \"true\",\n" - + " \"maxIterations\": \"20\"\n" - + " },\n" - + " \"pace\" : 
{\n" - + " \"clustering\" : [\n" - + " { \"name\" : \"sortedngrampairs\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 2, \"ngramLen\" : \"3\"} },\n" - + " { \"name\" : \"suffixprefix\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 1, \"len\" : \"3\" } },\n" - + " { \"name\" : \"urlclustering\", \"fields\" : [ \"websiteurl\" ], \"params\" : { } },\n" - + " { \"name\" : \"keywordsclustering\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\": 2, \"windowSize\": 4} }\n" - + " ],\n" - + " \"decisionTree\" : {\n" - + " \"start\": {\n" - + " \"fields\": [\n" - + " {\n" - + " \"field\": \"gridid\",\n" - + " \"comparator\": \"exactMatch\",\n" - + " \"weight\": 1,\n" - + " \"countIfUndefined\": \"false\",\n" - + " \"params\": {}\n" - + " }\n" - + " ],\n" - + " \"threshold\": 1,\n" - + " \"aggregation\": \"AVG\",\n" - + " \"positive\": \"MATCH\",\n" - + " \"negative\": \"NO_MATCH\",\n" - + " \"undefined\": \"layer2\",\n" - + " \"ignoreUndefined\": \"false\"\n" - + " },\n" - + " \"layer2\": {\n" - + " \"fields\": [\n" - + " {\n" - + " \"field\": \"websiteurl\",\n" - + " \"comparator\": \"domainExactMatch\",\n" - + " \"weight\": 1,\n" - + " \"countIfUndefined\": \"false\",\n" - + " \"params\": {}\n" - + " },\n" - + " {\n" - + " \"field\": \"country\",\n" - + " \"comparator\": \"exactMatch\",\n" - + " \"weight\": 1,\n" - + " \"countIfUndefined\": \"true\",\n" - + " \"params\": {}\n" - + " },\n" - + " {\n" - + " \"field\": \"legalname\",\n" - + " \"comparator\": \"numbersMatch\",\n" - + " \"weight\": 1,\n" - + " \"countIfUndefined\": \"true\",\n" - + " \"params\": {}\n" - + " },\n" - + " {\n" - + " \"field\": \"legalname\",\n" - + " \"comparator\": \"romansMatch\",\n" - + " \"weight\": 1,\n" - + " \"countIfUndefined\": \"true\",\n" - + " \"params\": {}\n" - + " }\n" - + " ],\n" - + " \"threshold\": 1,\n" - + " \"aggregation\": \"AND\",\n" - + " \"positive\": \"layer3\",\n" - + " \"negative\": \"NO_MATCH\",\n" - + " \"undefined\": \"layer3\",\n" - + " \"ignoreUndefined\": \"true\"\n" - + " },\n" - + " \"layer3\": {\n" - + " \"fields\": [\n" - + " {\n" - + " \"field\": \"legalname\",\n" - + " \"comparator\": \"cityMatch\",\n" - + " \"weight\": 1.0,\n" - + " \"countIfUndefined\": \"true\",\n" - + " \"params\": {\n" - + " \"windowSize\": \"4\"\n" - + " }\n" - + " }\n" - + " ],\n" - + " \"threshold\": 0.1,\n" - + " \"aggregation\": \"AVG\",\n" - + " \"positive\": \"layer4\",\n" - + " \"negative\": \"NO_MATCH\",\n" - + " \"undefined\": \"NO_MATCH\",\n" - + " \"ignoreUndefined\": \"true\"\n" - + " },\n" - + " \"layer4\": {\n" - + " \"fields\": [\n" - + " {\n" - + " \"field\": \"legalname\",\n" - + " \"comparator\": \"keywordMatch\",\n" - + " \"weight\": 1.0,\n" - + " \"countIfUndefined\": \"true\",\n" - + " \"params\": {\n" - + " \"windowSize\": \"4\"\n" - + " }\n" - + " }\n" - + " ],\n" - + " \"threshold\": 0.7,\n" - + " \"aggregation\": \"AVG\",\n" - + " \"positive\": \"layer5\",\n" - + " \"negative\": \"NO_MATCH\",\n" - + " \"undefined\": \"layer5\",\n" - + " \"ignoreUndefined\": \"true\"\n" - + " },\n" - + " \"layer5\": {\n" - + " \"fields\": [\n" - + " {\n" - + " \"field\": \"legalname\",\n" - + " \"comparator\": \"jaroWinklerNormalizedName\",\n" - + " \"weight\": 0.9,\n" - + " \"countIfUndefined\": \"true\",\n" - + " \"params\": {\n" - + " \"windowSize\": \"4\"\n" - + " }\n" - + " },\n" - + " {\n" - + " \"field\": \"legalshortname\",\n" - + " \"comparator\": \"jaroWinklerNormalizedName\",\n" - + " \"weight\": 0.1,\n" - + " \"countIfUndefined\": \"false\",\n" - + " 
\"params\": {\n" - + " \"windowSize\": 4\n" - + " }\n" - + " }\n" - + " ],\n" - + " \"threshold\": 0.9,\n" - + " \"aggregation\": \"W_MEAN\",\n" - + " \"positive\": \"MATCH\",\n" - + " \"negative\": \"NO_MATCH\",\n" - + " \"undefined\": \"NO_MATCH\",\n" - + " \"ignoreUndefined\": \"true\"\n" - + " }\n" - + " },\n" - + " \"model\" : [\n" - + " { \"name\" : \"country\", \"type\" : \"String\", \"path\" : \"$.country.classid\"},\n" - + " { \"name\" : \"legalshortname\", \"type\" : \"String\", \"path\" : \"$.legalshortname.value\"},\n" - + " { \"name\" : \"legalname\", \"type\" : \"String\", \"path\" : \"$.legalname.value\" },\n" - + " { \"name\" : \"websiteurl\", \"type\" : \"URL\", \"path\" : \"$.websiteurl.value\" },\n" - + " { \"name\" : \"gridid\", \"type\" : \"String\", \"path\" : \"$.pid[?(@.qualifier.classid =='grid')].value\"},\n" - + " { \"name\" : \"originalId\", \"type\" : \"String\", \"path\" : \"$.id\" }\n" - + " ],\n" - + " \"blacklists\" : {\n" - + " \"legalname\" : []\n" - + " },\n" - + " \"synonyms\": {\n" - + " \"key::1\": [\"university\",\"università\", \"universitas\", \"università studi\",\"universitario\",\"universitaria\",\"université\", \"universite\", \"universitaire\",\"universitaires\",\"universidad\",\"universitade\",\"Universität\",\"universitaet\",\"Uniwersytet\",\"университет\",\"universiteit\",\"πανεπιστήμιο\",\"universitesi\",\"universiteti\", \"universiti\"],\n" - + " \"key::2\": [\"studies\",\"studi\",\"études\",\"estudios\",\"estudos\",\"Studien\",\"studia\",\"исследования\",\"studies\",\"σπουδές\"],\n" - + " \"key::3\": [\"advanced\",\"superiore\",\"supérieur\",\"supérieure\",\"supérieurs\",\"supérieures\",\"avancado\",\"avancados\",\"fortgeschrittene\",\"fortgeschritten\",\"zaawansowany\",\"передовой\",\"gevorderd\",\"gevorderde\",\"προχωρημένος\",\"προχωρημένη\",\"προχωρημένο\",\"προχωρημένες\",\"προχωρημένα\",\"wyzsza\"],\n" - + " \"key::4\": [\"institute\",\"istituto\",\"institut\",\"instituto\",\"instituto\",\"Institut\",\"instytut\",\"институт\",\"instituut\",\"ινστιτούτο\"],\n" - + " \"key::5\": [\"hospital\",\"ospedale\",\"hôpital\",\"hospital\",\"hospital\",\"Krankenhaus\",\"szpital\",\"больница\",\"ziekenhuis\",\"νοσοκομείο\"],\n" - + " \"key::6\": [\"research\",\"ricerca\",\"recherche\",\"investigacion\",\"pesquisa\",\"Forschung\",\"badania\",\"исследования\",\"onderzoek\",\"έρευνα\",\"erevna\",\"erevnas\"],\n" - + " \"key::7\": [\"college\",\"collegio\",\"colegio\",\"faculdade\",\"Hochschule\",\"Szkoła Wyższa\",\"Высшая школа\",\"κολλέγιο\"],\n" - + " \"key::8\": [\"foundation\",\"fondazione\",\"fondation\",\"fundación\",\"fundação\",\"Stiftung\",\"Fundacja\",\"фонд\",\"stichting\",\"ίδρυμα\",\"idryma\"],\n" - + " \"key::9\": [\"center\",\"centro\",\"centre\",\"centro\",\"centro\",\"zentrum\",\"centrum\",\"центр\",\"centrum\",\"κέντρο\"],\n" - + " \"key::10\": [\"national\",\"nazionale\",\"national\",\"nationale\",\"nationaux\",\"nationales\",\"nacional\",\"nacional\",\"national\",\"krajowy\",\"национальный\",\"nationaal\",\"nationale\",\"εθνικό\"],\n" - + " \"key::11\": [\"association\",\"associazione\",\"association\",\"asociación\",\"associação\",\"Verein\",\"verband\",\"stowarzyszenie\",\"ассоциация\",\"associatie\"],\n" - + " \"key::12\": [\"society\",\"societa\",\"société\",\"sociedad\",\"sociedade\",\"gesellschaft\",\"społeczeństwo\",\"общество\",\"maatschappij\",\"κοινωνία\"],\n" - + " \"key::13\": 
[\"international\",\"internazionale\",\"international\",\"internacional\",\"internacional\",\"international\",\"międzynarodowy\",\"Международный\",\"internationaal\",\"internationale\",\"διεθνής\",\"διεθνή\",\"διεθνές\"],\n" - + " \"key::14\": [\"community\",\"comunita\",\"communauté\",\"comunidad\",\"comunidade\",\"Gemeinschaft\",\"społeczność\",\"сообщество\",\"gemeenschap\",\"κοινότητα\"],\n" - + " \"key::15\": [\"school\",\"scuola\",\"école\",\"escuela\",\"escola\",\"schule\",\"Szkoła\",\"школа\",\"school\",\"σχολείο\"],\n" - + " \"key::16\": [\"education\",\"educazione\",\"éducation\",\"educacion\",\"Educação\",\"Bildung\",\"Edukacja\",\"образование\",\"opleiding\",\"εκπαίδευση\"],\n" - + " \"key::17\": [\"academy\",\"accademia\",\"académie\",\"academia\",\"academia\",\"Akademie\",\"akademie\",\"академия\",\"academie\",\"ακαδημία\"],\n" - + " \"key::18\": [\"public\",\"pubblico\",\"public\",\"publique\",\"publics\",\"publiques\",\"publico\",\"publico\",\"Öffentlichkeit\",\"publiczny\",\"публичный\",\"publiek\",\"publieke\",\"δημόσιος\",\"δημόσια\",\"δημόσιο\"],\n" - + " \"key::19\": [\"museum\",\"museo\",\"musée\",\"mueso\",\"museu\",\"museum\",\"muzeum\",\"музей\",\"museum\",\"μουσείο\"],\n" - + " \"key::20\": [\"group\",\"gruppo\",\"groupe\",\"grupo\",\"grupo\",\"gruppe\",\"grupa\",\"группа\",\"groep\",\"ομάδα\",\"όμιλος\"],\n" - + " \"key::21\": [\"department\",\"dipartimento\",\"département\",\"departamento\",\"departamento\",\"abteilung\",\"departament\",\"отдел\",\"afdeling\",\"τμήμα\"],\n" - + " \"key::22\": [\"council\",\"consiglio\",\"conseil\",\"Consejo\",\"conselho\",\"gesellschaft\",\"rada\",\"совет\",\"raad\",\"συμβούλιο\"],\n" - + " \"key::23\": [\"library\",\"biblioteca\",\"bibliothèque\",\"biblioteca\",\"biblioteca\",\"Bibliothek\",\"biblioteka\",\"библиотека\",\"bibliotheek\",\"βιβλιοθήκη\"],\n" - + " \"key::24\": [\"ministry\",\"ministero\",\"ministère\",\"ministerio\",\"ministério\",\"Ministerium\",\"ministerstwo\",\"министерство\",\"ministerie\",\"υπουργείο\"],\n" - + " \"key::25\": [\"services\",\"servizi\",\"services\",\"servicios\",\"Serviços\",\"Dienstleistungen\",\"usługi\",\"услуги\",\"diensten\",\"υπηρεσίες\"],\n" - + " \"key::26\": [\"central\",\"centrale\",\"central\",\"centrale\",\"centrales\",\"central\",\"central\",\"zentral\",\"centralny\",\"цетральный\",\"centraal\",\"κεντρικός\",\"κεντρική\",\"κεντρικό\",\"κεντρικά\"],\n" - + " \"key::27\": [\"general\",\"generale\",\"général\",\"générale\",\"généraux\",\"générales\",\"general\",\"geral\",\"general\",\"Allgemeines\",\"general\",\"общий\",\"algemeen\",\"algemene\",\"γενικός\",\"γενική\",\"γενικό\",\"γενικά\"],\n" - + " \"key::28\": [\"applied\",\"applicati\",\"appliqué\",\"appliquée\",\"appliqués\",\"appliquées\",\"aplicado\",\"aplicada\",\"angewendet\",\"stosowany\",\"прикладной\",\"toegepast\",\"toegepaste\",\"εφαρμοσμένος\",\"εφαρμοσμένη\",\"εφαρμοσμένο\",\"εφαρμοσμένα\"],\n" - + " \"key::29\": [\"european\",\"europee\",\"europea\",\"européen\",\"européenne\",\"européens\",\"européennes\",\"europeo\",\"europeu\",\"europäisch\",\"europejski\",\"европейский\",\"Europees\",\"Europese\",\"ευρωπαϊκός\",\"ευρωπαϊκή\",\"ευρωπαϊκό\",\"ευρωπαϊκά\"],\n" - + " \"key::30\": [\"agency\",\"agenzia\",\"agence\",\"agencia\",\"agencia\",\"agentur\",\"agencja\",\"агенция\",\"agentschap\",\"πρακτορείο\"],\n" - + " \"key::31\": [\"laboratory\",\"laboratorio\",\"laboratoire\",\"laboratorio\",\"laboratorio\",\"labor\",\"laboratorium\",\"лаборатория\",\"laboratorium\",\"εργαστήριο\"],\n" - + " \"key::32\": 
[\"industry\",\"industria\",\"industrie\",\"индустрия\",\"industrie\",\"βιομηχανία\"],\n" - + " \"key::33\": [\"industrial\",\"industriale\",\"industriel\",\"industrielle\",\"industriels\",\"industrielles\",\"индустриальный\",\"industrieel\",\"βιομηχανικός\",\"βιομηχανική\",\"βιομηχανικό\",\"βιομηχανικά\",\"βιομηχανικές\"],\n" - + " \"key::34\": [\"consortium\",\"consorzio\",\"consortium\",\"консорциум\",\"consortium\",\"κοινοπραξία\"],\n" - + " \"key::35\": [\"organization\",\"organizzazione\",\"organisation\",\"organización\",\"organização\",\"organizacja\",\"организация\",\"organisatie\",\"οργανισμός\"],\n" - + " \"key::36\": [\"authority\",\"autorità\",\"autorité\",\"авторитет\",\"autoriteit\"],\n" - + " \"key::37\": [\"federation\",\"federazione\",\"fédération\",\"федерация\",\"federatie\",\"ομοσπονδία\"],\n" - + " \"key::38\": [\"observatory\",\"osservatorio\",\"observatoire\",\"обсерватория\",\"observatorium\",\"αστεροσκοπείο\"],\n" - + " \"key::39\": [\"bureau\",\"ufficio\",\"bureau\",\"офис\",\"bureau\",\"γραφείο\"],\n" - + " \"key::40\": [\"company\",\"impresa\",\"compagnie\",\"société\",\"компания\",\"bedrijf\",\"εταιρία\"],\n" - + " \"key::41\": [\"polytechnic\",\"politecnico\",\"polytechnique\",\"политехника\",\"polytechnisch\",\"πολυτεχνείο\",\"universita politecnica\",\"polytechnic university\",\"universidad politecnica\",\"universitat politecnica\",\"politechnika\",\"politechniki\",\"university technology\",\"university science technology\"],\n" - + " \"key::42\": [\"coalition\",\"coalizione\",\"coalition\",\"коалиция\",\"coalitie\",\"συνασπισμός\"],\n" - + " \"key::43\": [\"initiative\",\"iniziativa\",\"initiative\",\"инициатива\",\"initiatief\",\"πρωτοβουλία\"],\n" - + " \"key::44\": [\"academic\",\"accademico\",\"académique\",\"universitaire\",\"акадеческий academisch\",\"ακαδημαϊκός\",\"ακαδημαϊκή\",\"ακαδημαϊκό\",\"ακαδημαϊκές\",\"ακαδημαϊκοί\"],\n" - + " \"key::45\": [\"institution\",\"istituzione\",\"institution\",\"институциональный\",\"instelling\",\"ινστιτούτο\"],\n" - + " \"key::46\": [\"division\",\"divisione\",\"division\",\"отделение\",\"divisie\",\"τμήμα\"],\n" - + " \"key::47\": [\"committee\",\"comitato\",\"comité\",\"комитет\",\"commissie\",\"επιτροπή\"],\n" - + " \"key::48\": [\"promotion\",\"promozione\",\"продвижение\",\"proothisis\",\"forderung\"],\n" - + " \"key::49\": [\"medical\",\"medicine\",\"clinical\",\"medicina\",\"clinici\",\"médico\",\"medicina\",\"clínica\",\"médico\",\"medicina\",\"clínica\",\"medizinisch\",\"Medizin\",\"klinisch\",\"medisch\",\"geneeskunde\",\"klinisch\",\"ιατρικός\",\"ιατρική\",\"ιατρικό\",\"ιατρικά\",\"κλινικός\",\"κλινική\",\"κλινικό\",\"κλινικά\",\"tıbbi\",\"tıp\",\"klinik\",\"orvosi\",\"orvostudomány\",\"klinikai\",\"zdravniški\",\"medicinski\",\"klinični\",\"meditsiini\",\"kliinik\",\"kliiniline\"],\n" - + " \"key::50\": [\"technology\",\"technological\",\"tecnologia\",\"tecnologie\",\"tecnología\",\"tecnológico\",\"tecnologia\",\"tecnológico\",\"Technologie\",\"technologisch\",\"technologie\",\"technologisch\",\"τεχνολογία\",\"τεχνολογικός\",\"τεχνολογική\",\"τεχνολογικό\",\"teknoloji\",\"teknolojik\",\"technológia\",\"technológiai\",\"tehnologija\",\"tehnološki\",\"tehnoloogia\",\"tehnoloogiline\",\"technologii\",\"technical\",\"texniki\",\"teknik\"],\n" - + " \"key::51\": 
[\"science\",\"scientific\",\"scienza\",\"scientifiche\",\"scienze\",\"ciencia\",\"científico\",\"ciência\",\"científico\",\"Wissenschaft\",\"wissenschaftlich\",\"wetenschap\",\"wetenschappelijk\",\"επιστήμη\",\"επιστημονικός\",\"επιστημονική\",\"επιστημονικό\",\"επιστημονικά\",\"bilim\",\"bilimsel\",\"tudomány\",\"tudományos\",\"znanost\",\"znanstveni\",\"teadus\",\"teaduslik\",\"\"],\n" - + " \"key::52\": [\"engineering\",\"ingegneria\",\"ingeniería\",\"engenharia\",\"Ingenieurwissenschaft\",\"ingenieurswetenschappen\",\"bouwkunde\",\"μηχανικός\",\"μηχανική\",\"μηχανικό\",\"mühendislik\",\"mérnöki\",\"Inženirstvo\",\"inseneeria\",\"inseneri\",\"\"],\n" - + " \"key::53\": [\"management\",\"gestione\",\"gestionale\",\"gestionali\",\"gestión\",\"administración\",\"gestão\",\"administração\",\"Verwaltung\",\"management\",\"διαχείριση\",\"yönetim\",\"menedzsment\",\"vodstvo\",\"upravljanje\",\"management\",\"juhtkond\",\"juhtimine\",\"haldus\",\"\"],\n" - + " \"key::54\": [\"energy\",\"energia\",\"energía\",\"energia\",\"Energie\",\"energie\",\"ενέργεια\",\"enerji\",\"energia\",\"energija\",\"energia\",\"\"],\n" - + " \"key::55\": [\"agricultural\",\"agriculture\",\"agricoltura\",\"agricole\",\"agrícola\",\"agricultura\",\"agrícola\",\"agricultura\",\"landwirtschaftlich\",\"Landwirtschaft\",\"landbouwkundig\",\"landbouw\",\"αγροτικός\",\"αγροτική\",\"αγροτικό\",\"γεωργικός\",\"γεωργική\",\"γεωργικό\",\"γεωργία\",\"tarımsal\",\"tarım\",\"mezőgazdasági\",\"mezőgazdaság\",\"poljedelski\",\"poljedelstvo\",\"põllumajandus\",\"põllumajanduslik\",\"\"],\n" - + " \"key::56\": [\"information\",\"informazione\",\"información\",\"informação\",\"Information\",\"informatie\",\"πληροφορία\",\"bilgi\",\"információ\",\"informacija\",\"informatsioon\",\"informatycznych\",\"\"],\n" - + " \"key::57\": [\"social\",\"sociali\",\"social\",\"social\",\"Sozial\",\"sociaal\",\"maatschappelijk\",\"κοινωνικός\",\"κοινωνική\",\"κοινωνικό\",\"κοινωνικά\",\"sosyal\",\"szociális\",\"družbeni\",\"sotsiaal\",\"sotsiaalne\",\"\"],\n" - + " \"key::58\": [\"environmental\",\"ambiente\",\"medioambiental\",\"ambiente\",\"medioambiente\",\"meioambiente\",\"Umwelt\",\"milieu\",\"milieuwetenschap\",\"milieukunde\",\"περιβαλλοντικός\",\"περιβαλλοντική\",\"περιβαλλοντικό\",\"περιβαλλοντικά\",\"çevre\",\"környezeti\",\"okoliški\",\"keskonna\",\"\"],\n" - + " \"key::59\": [\"business\",\"economia\",\"economiche\",\"economica\",\"negocio\",\"empresa\",\"negócio\",\"Unternehmen\",\"bedrijf\",\"bedrijfskunde\",\"επιχείρηση\",\"iş\",\"üzleti\",\"posel\",\"ettevõte/äri\",\"\"],\n" - + " \"key::60\": [\"pharmaceuticals\",\"pharmacy\",\"farmacia\",\"farmaceutica\",\"farmacéutica\",\"farmacia\",\"farmacêutica\",\"farmácia\",\"Pharmazeutika\",\"Arzneimittelkunde\",\"farmaceutica\",\"geneesmiddelen\",\"apotheek\",\"φαρμακευτικός\",\"φαρμακευτική\",\"φαρμακευτικό\",\"φαρμακευτικά\",\"φαρμακείο\",\"ilaç\",\"eczane\",\"gyógyszerészeti\",\"gyógyszertár\",\"farmacevtika\",\"lekarništvo\",\"farmaatsia\",\"farmatseutiline\",\"\"],\n" - + " \"key::61\": [\"healthcare\",\"health services\",\"salute\",\"atenciónmédica\",\"cuidadodelasalud\",\"cuidadoscomasaúde\",\"Gesundheitswesen\",\"gezondheidszorg\",\"ιατροφαρμακευτικήπερίθαλψη\",\"sağlıkhizmeti\",\"egészségügy\",\"zdravstvo\",\"tervishoid\",\"tervishoiu\",\"\"],\n" - + " \"key::62\": [\"history\",\"storia\",\"historia\",\"história\",\"Geschichte\",\"geschiedenis\",\"geschiedkunde\",\"ιστορία\",\"tarih\",\"történelem\",\"zgodovina\",\"ajalugu\",\"\"],\n" - + " \"key::63\": 
[\"materials\",\"materiali\",\"materia\",\"materiales\",\"materiais\",\"materialen\",\"υλικά\",\"τεκμήρια\",\"malzemeler\",\"anyagok\",\"materiali\",\"materjalid\",\"vahendid\",\"\"],\n" - + " \"key::64\": [\"economics\",\"economia\",\"economiche\",\"economica\",\"economía\",\"economia\",\"Wirtschaft\",\"economie\",\"οικονομικά\",\"οικονομικέςεπιστήμες\",\"ekonomi\",\"közgazdaságtan\",\"gospodarstvo\",\"ekonomija\",\"majanduslik\",\"majandus\",\"\"],\n" - + " \"key::65\": [\"therapeutics\",\"terapeutica\",\"terapéutica\",\"terapêutica\",\"therapie\",\"θεραπευτική\",\"tedavibilimi\",\"gyógykezelés\",\"terapevtika\",\"terapeutiline\",\"ravi\",\"\"],\n" - + " \"key::66\": [\"oncology\",\"oncologia\",\"oncologico\",\"oncología\",\"oncologia\",\"Onkologie\",\"oncologie\",\"ογκολογία\",\"onkoloji\",\"onkológia\",\"onkologija\",\"onkoloogia\",\"\"],\n" - + " \"key::67\": [\"natural\",\"naturali\",\"naturale\",\"natural\",\"natural\",\"natürlich\",\"natuurlijk\",\"φυσικός\",\"φυσική\",\"φυσικό\",\"φυσικά\",\"doğal\",\"természetes\",\"naraven\",\"loodus\",\"\"],\n" - + " \"key::68\": [\"educational\",\"educazione\",\"pedagogia\",\"educacional\",\"educativo\",\"educacional\",\"pädagogisch\",\"educatief\",\"εκπαιδευτικός\",\"εκπαιδευτική\",\"εκπαιδευτικό\",\"εκπαιδευτικά\",\"eğitimsel\",\"oktatási\",\"izobraževalen\",\"haridus\",\"hariduslik\",\"\"],\n" - + " \"key::69\": [\"biomedical\",\"biomedica\",\"biomédico\",\"biomédico\",\"biomedizinisch\",\"biomedisch\",\"βιοιατρικός\",\"βιοιατρική\",\"βιοιατρικό\",\"βιοιατρικά\",\"biyomedikal\",\"orvosbiológiai\",\"biomedicinski\",\"biomeditsiiniline\",\"\"],\n" - + " \"key::70\": [\"veterinary\",\"veterinaria\",\"veterinarie\",\"veterinaria\",\"veterinária\",\"tierärtzlich\",\"veterinair\",\"veeartsenijlkunde\",\"κτηνιατρικός\",\"κτηνιατρική\",\"κτηνιατρικό\",\"κτηνιατρικά\",\"veteriner\",\"állatorvosi\",\"veterinar\",\"veterinarski\",\"veterinaaria\",\"\"],\n" - + " \"key::71\": [\"chemistry\",\"chimica\",\"química\",\"química\",\"Chemie\",\"chemie\",\"scheikunde\",\"χημεία\",\"kimya\",\"kémia\",\"kemija\",\"keemia\",\"\"],\n" - + " \"key::72\": [\"security\",\"sicurezza\",\"seguridad\",\"segurança\",\"Sicherheit\",\"veiligheid\",\"ασφάλεια\",\"güvenlik\",\"biztonsági\",\"varnost\",\"turvalisus\",\"julgeolek\",\"\"],\n" - + " \"key::73\": [\"biotechnology\",\"biotecnologia\",\"biotecnologie\",\"biotecnología\",\"biotecnologia\",\"Biotechnologie\",\"biotechnologie\",\"βιοτεχνολογία\",\"biyoteknoloji\",\"biotechnológia\",\"biotehnologija\",\"biotehnoloogia\",\"\"],\n" - + " \"key::74\": [\"military\",\"militare\",\"militari\",\"militar\",\"militar\",\"Militär\",\"militair\",\"leger\",\"στρατιωτικός\",\"στρατιωτική\",\"στρατιωτικό\",\"στρατιωτικά\",\"askeri\",\"katonai\",\"vojaški\",\"vojni\",\"militaar\",\"wojskowa\",\"\"],\n" - + " \"key::75\": [\"theological\",\"teologia\",\"teologico\",\"teológico\",\"tecnológica\",\"theologisch\",\"theologisch\",\"θεολογικός\",\"θεολογική\",\"θεολογικό\",\"θεολογικά\",\"teolojik\",\"technológiai\",\"teološki\",\"teoloogia\",\"usuteadus\",\"teoloogiline\",\"\"],\n" - + " \"key::76\": [\"electronics\",\"elettronica\",\"electrónica\",\"eletrônicos\",\"Elektronik\",\"elektronica\",\"ηλεκτρονική\",\"elektronik\",\"elektronika\",\"elektronika\",\"elektroonika\",\"\"],\n" - + " \"key::77\": [\"forestry\",\"forestale\",\"forestali\",\"silvicultura\",\"forestal\",\"floresta\",\"Forstwirtschaft\",\"bosbouw\",\"δασοκομία\",\"δασολογία\",\"ormancılık\",\"erdészet\",\"gozdarstvo\",\"metsandus\",\"\"],\n" - + " \"key::78\": 
[\"maritime\",\"marittima\",\"marittime\",\"marittimo\",\"marítimo\",\"marítimo\",\"maritiem\",\"ναυτικός\",\"ναυτική\",\"ναυτικό\",\"ναυτικά\",\"ναυτιλιακός\",\"ναυτιλιακή\",\"ναυτιλιακό\",\"ναυτιλιακά\",\"θαλάσσιος\",\"θαλάσσια\",\"θαλάσσιο\",\"denizcilik\",\"tengeri\",\"morski\",\"mere\",\"merendus\",\"\"],\n" - + " \"key::79\": [\"sports\",\"sport\",\"deportes\",\"esportes\",\"Sport\",\"sport\",\"sportwetenschappen\",\"άθληση\",\"γυμναστικήδραστηριότητα\",\"spor\",\"sport\",\"šport\",\"sport\",\"spordi\",\"\"],\n" - + " \"key::80\": [\"surgery\",\"chirurgia\",\"chirurgiche\",\"cirugía\",\"cirurgia\",\"Chirurgie\",\"chirurgie\",\"heelkunde\",\"εγχείρηση\",\"επέμβαση\",\"χειρουργικήεπέμβαση\",\"cerrahi\",\"sebészet\",\"kirurgija\",\"kirurgia\",\"\"],\n" - + " \"key::81\": [\"cultural\",\"culturale\",\"culturali\",\"cultura\",\"cultural\",\"cultural\",\"kulturell\",\"cultureel\",\"πολιτιστικός\",\"πολιτιστική\",\"πολιτιστικό\",\"πολιτισμικός\",\"πολιτισμική\",\"πολιτισμικό\",\"kültürel\",\"kultúrális\",\"kulturni\",\"kultuuri\",\"kultuuriline\",\"\"],\n" - + " \"key::82\": [\"computerscience\",\"informatica\",\"ordenador\",\"computadora\",\"informática\",\"computación\",\"cienciasdelacomputación\",\"ciênciadacomputação\",\"Computer\",\"computer\",\"υπολογιστής\",\"ηλεκτρονικόςυπολογιστής\",\"bilgisayar\",\"számítógép\",\"računalnik\",\"arvuti\",\"\"],\n" - + " \"key::83\": [\"finance\",\"financial\",\"finanza\",\"finanziarie\",\"finanza\",\"financiero\",\"finanças\",\"financeiro\",\"Finanzen\",\"finanziell\",\"financiën\",\"financieel\",\"χρηματοοικονομικά\",\"χρηματοδότηση\",\"finanse\",\"finansal\",\"pénzügy\",\"pénzügyi\",\"finance\",\"finančni\",\"finants\",\"finantsiline\",\"\"],\n" - + " \"key::84\": [\"communication\",\"comunicazione\",\"comuniciación\",\"comunicação\",\"Kommunikation\",\"communication\",\"επικοινωνία\",\"iletişim\",\"kommunikáció\",\"komuniciranje\",\"kommunikatsioon\",\"\"],\n" - + " \"key::85\": [\"justice\",\"giustizia\",\"justicia\",\"justiça\",\"Recht\",\"Justiz\",\"justitie\",\"gerechtigheid\",\"δικαιοσύνη\",\"υπουργείοδικαιοσύνης\",\"δίκαιο\",\"adalet\",\"igazságügy\",\"pravo\",\"õigus\",\"\"],\n" - + " \"key::86\": [\"aerospace\",\"aerospaziale\",\"aerospaziali\",\"aeroespacio\",\"aeroespaço\",\"Luftfahrt\",\"luchtvaart\",\"ruimtevaart\",\"αεροπορικός\",\"αεροπορική\",\"αεροπορικό\",\"αεροναυπηγικός\",\"αεροναυπηγική\",\"αεροναυπηγικό\",\"αεροναυπηγικά\",\"havacılıkveuzay\",\"légtér\",\"zrakoplovstvo\",\"atmosfäär\",\"kosmos\",\"\"],\n" - + " \"key::87\": [\"dermatology\",\"dermatologia\",\"dermatología\",\"dermatologia\",\"Dermatologie\",\"dermatologie\",\"δρματολογία\",\"dermatoloji\",\"bőrgyógyászat\",\"dermatológia\",\"dermatologija\",\"dermatoloogia\",\"\"],\n" - + " \"key::88\": [\"architecture\",\"architettura\",\"arquitectura\",\"arquitetura\",\"Architektur\",\"architectuur\",\"αρχιτεκτονική\",\"mimarlık\",\"építészet\",\"arhitektura\",\"arhitektuur\",\"\"],\n" - + " \"key::89\": [\"mathematics\",\"matematica\",\"matematiche\",\"matemáticas\",\"matemáticas\",\"Mathematik\",\"wiskunde\",\"mathematica\",\"μαθηματικά\",\"matematik\",\"matematika\",\"matematika\",\"matemaatika\",\"\"],\n" - + " \"key::90\": [\"language\",\"lingue\",\"linguistica\",\"linguistiche\",\"lenguaje\",\"idioma\",\"língua\",\"idioma\",\"Sprache\",\"taal\",\"taalkunde\",\"γλώσσα\",\"dil\",\"nyelv\",\"jezik\",\"keel\",\"\"],\n" - + " \"key::91\": 
[\"neuroscience\",\"neuroscienza\",\"neurociencia\",\"neurociência\",\"Neurowissenschaft\",\"neurowetenschappen\",\"νευροεπιστήμη\",\"nörobilim\",\"idegtudomány\",\"nevroznanost\",\"neuroteadused\",\"\"],\n" - + " \"key::92\": [\"automation\",\"automazione\",\"automatización\",\"automação\",\"Automatisierung\",\"automatisering\",\"αυτοματοποίηση\",\"otomasyon\",\"automatizálás\",\"avtomatizacija\",\"automatiseeritud\",\"\"],\n" - + " \"key::93\": [\"pediatric\",\"pediatria\",\"pediatriche\",\"pediatrico\",\"pediátrico\",\"pediatría\",\"pediátrico\",\"pediatria\",\"pädiatrisch\",\"pediatrische\",\"παιδιατρική\",\"pediatrik\",\"gyermekgyógyászat\",\"pediatrija\",\"pediaatria\",\"\"],\n" - + " \"key::94\": [\"photonics\",\"fotonica\",\"fotoniche\",\"fotónica\",\"fotônica\",\"Photonik\",\"fotonica\",\"φωτονική\",\"fotonik\",\"fotonika\",\"fotonika\",\"fotoonika\",\"\"],\n" - + " \"key::95\": [\"mechanics\", \"mechanical\", \"meccanica\",\"meccaniche\",\"mecánica\",\"mecânica\",\"Mechanik\",\"Maschinenbau\",\"mechanica\",\"werktuigkunde\",\"μηχανικής\",\"mekanik\",\"gépészet\",\"mehanika\",\"mehaanika\",\"\"],\n" - + " \"key::96\": [\"psychiatrics\",\"psichiatria\",\"psichiatrica\",\"psichiatriche\",\"psiquiatría\",\"psiquiatria\",\"Psychiatrie\",\"psychiatrie\",\"ψυχιατρική\",\"psikiyatrik\",\"pszihiátria\",\"psihiatrija\",\"psühhaatria\",\"\"],\n" - + " \"key::97\": [\"psychology\",\"fisiologia\",\"psicología\",\"psicologia\",\"Psychologie\",\"psychologie\",\"ψυχολογία\",\"psikoloji\",\"pszihológia\",\"psihologija\",\"psühholoogia\",\"\"],\n" - + " \"key::98\": [\"automotive\",\"industriaautomobilistica\",\"industriadelautomóvil\",\"automotriz\",\"industriaautomotriz\",\"automotivo\",\"Automobilindustrie\",\"autoindustrie\",\"αυτοκίνητος\",\"αυτοκίνητη\",\"αυτοκίνητο\",\"αυτοκινούμενος\",\"αυτοκινούμενη\",\"αυτοκινούμενο\",\"αυτοκινητιστικός\",\"αυτοκινητιστική\",\"αυτοκινητιστικό\",\"otomotiv\",\"autóipari\",\"samogiben\",\"avtomobilskaindustrija\",\"auto-\",\"\"],\n" - + " \"key::99\": [\"neurology\",\"neurologia\",\"neurologiche\",\"neurología\",\"neurologia\",\"Neurologie\",\"neurologie\",\"zenuwleer\",\"νευρολογία\",\"nöroloji\",\"neurológia\",\"ideggyógyászat\",\"nevrologija\",\"neuroloogia\",\"\"],\n" - + " \"key::100\": [\"geology\",\"geologia\",\"geologiche\",\"geología\",\"geologia\",\"Geologie\",\"geologie\",\"aardkunde\",\"γεωλογία\",\"jeoloji\",\"geológia\",\"földtudomány\",\"geologija\",\"geoloogia\",\"\"],\n" - + " \"key::101\": [\"microbiology\",\"microbiologia\",\"micro-biologia\",\"microbiologiche\",\"microbiología\",\"microbiologia\",\"Mikrobiologie\",\"microbiologie\",\"μικροβιολογία\",\"mikrobiyoloji\",\"mikrobiológia\",\"mikrobiologija\",\"mikrobioloogia\",\"\"],\n" - + " \"key::102\": [\"informatics\",\"informatica\",\"informática\",\"informática\",\"informatica\",\"\"],\n" - + " \"key::103\": [\"forschungsgemeinschaft\",\"comunita ricerca\",\"research community\",\"research foundation\",\"research association\"],\n" - + " \"key::104\": [\"commerce\",\"ticaret\",\"ticarət\",\"commercio\",\"trade\",\"handel\",\"comercio\"],\n" - + " \"key::105\" : [\"state\", \"stato\", \"etade\", \"estado\", \"statale\", \"etat\", \"zustand\", \"estado\"],\n" - + " \"key::106\" : [\"seminary\", \"seminario\", \"seminaire\", \"seminar\"],\n" - + " \"key::107\" : [\"agricultural forestry\", \"af\", \"a f\"],\n" - + " \"key::108\" : [\"agricultural mechanical\", \"am\", \"a m\"],\n" - + " \"key::109\" : [\"catholic\", \"catholique\", \"katholische\", \"catolica\", \"cattolica\", 
\"catolico\"]\n" - + " }\n" - + " }\n" - + "}"); + String json = "{\t\"dataInfo\":{\t\t\"invisible\":false,\t\t\"inferred\":false,\t\t\"deletedbyinference\":false,\t\t\"trust\":\"0.810000002384185791\",\t\t\"inferenceprovenance\":\"\",\t\t\"provenanceaction\":{\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t}\t},\t\"lastupdatetimestamp\":1584960968152,\t\"id\":\"20|corda__h2020::9faf23721249f26ac2c16eb857ea1fb9\",\t\"originalId\":[\t\t\"corda__h2020::927957582\"\t],\t\"collectedfrom\":[\t\t{\t\t\t\"key\":\"openaire____::corda_h2020\",\t\t\t\"value\":\"CORDA - COmmon Research DAta Warehouse - Horizon 2020\",\t\t\t\"dataInfo\":null\t\t}\t],\t\"pid\":[\t],\t\"dateofcollection\":\"2016-06-05\",\t\"dateoftransformation\":\"2019-11-19\",\t\"extraInfo\":[\t],\t\"oaiprovenance\":null,\t\"legalshortname\":{\t\t\"value\":\"Comentor AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"legalname\":{\t\t\"value\":\"Comentor AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"alternativeNames\":[\t],\t\"websiteurl\":{\t\t\"value\":\"http://www.comentor.se\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"logourl\":null,\t\"eclegalbody\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"eclegalperson\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnonprofit\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\
"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecresearchorganization\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"echighereducation\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganizationeurinterests\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganization\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecenterprise\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecsmevalidated\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnutscode\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.81000000238418579
1\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"country\":null}"; + DedupConfig conf = DedupConfig + .load( + "{\n" + + " \"wf\" : {\n" + + " \"threshold\" : \"0.99\",\n" + + " \"dedupRun\" : \"001\",\n" + + " \"entityType\" : \"organization\",\n" + + " \"subEntityValue\": \"organization\",\n" + + " \"orderField\" : \"legalname\",\n" + + " \"queueMaxSize\" : \"2000\",\n" + + " \"groupMaxSize\" : \"50\",\n" + + " \"slidingWindowSize\" : \"200\",\n" + + " \"idPath\":\"$.id\",\n" + + " \"rootBuilder\" : [ \"organization\", \"projectOrganization_participation_isParticipant\", \"datasourceOrganization_provision_isProvidedBy\" ],\n" + + " \"includeChildren\" : \"true\",\n" + + " \"maxIterations\": \"20\"\n" + + " },\n" + + " \"pace\" : {\n" + + " \"clustering\" : [\n" + + " { \"name\" : \"sortedngrampairs\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 2, \"ngramLen\" : \"3\"} },\n" + + " { \"name\" : \"suffixprefix\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 1, \"len\" : \"3\" } },\n" + + " { \"name\" : \"urlclustering\", \"fields\" : [ \"websiteurl\" ], \"params\" : { } },\n" + + " { \"name\" : \"keywordsclustering\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\": 2, \"windowSize\": 4} }\n" + + " ],\n" + + " \"decisionTree\" : {\n" + + " \"start\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"gridid\",\n" + + " \"comparator\": \"exactMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {}\n" + + " }\n" + + " ],\n" + + " \"threshold\": 1,\n" + + " \"aggregation\": \"AVG\",\n" + + " \"positive\": \"MATCH\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer2\",\n" + + " \"ignoreUndefined\": \"false\"\n" + + " },\n" + + " \"layer2\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"websiteurl\",\n" + + " \"comparator\": \"domainExactMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"country\",\n" + + " \"comparator\": \"exactMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"numbersMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"romansMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " }\n" + + " ],\n" + + " \"threshold\": 1,\n" + + " \"aggregation\": \"AND\",\n" + + " \"positive\": \"layer3\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer3\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " },\n" + + " \"layer3\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"cityMatch\",\n" + + " \"weight\": 1.0,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " \"windowSize\": \"4\"\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.1,\n" + + " \"aggregation\": \"AVG\",\n" + + " \"positive\": \"layer4\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"NO_MATCH\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " 
},\n" + + " \"layer4\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"keywordMatch\",\n" + + " \"weight\": 1.0,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " \"windowSize\": \"4\"\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.7,\n" + + " \"aggregation\": \"AVG\",\n" + + " \"positive\": \"layer5\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer5\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " },\n" + + " \"layer5\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"jaroWinklerNormalizedName\",\n" + + " \"weight\": 0.9,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " \"windowSize\": \"4\"\n" + + " }\n" + + " },\n" + + " {\n" + + " \"field\": \"legalshortname\",\n" + + " \"comparator\": \"jaroWinklerNormalizedName\",\n" + + " \"weight\": 0.1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {\n" + + " \"windowSize\": 4\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.9,\n" + + " \"aggregation\": \"W_MEAN\",\n" + + " \"positive\": \"MATCH\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"NO_MATCH\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " }\n" + + " },\n" + + " \"model\" : [\n" + + " { \"name\" : \"country\", \"type\" : \"String\", \"path\" : \"$.country.classid\"},\n" + + " { \"name\" : \"legalshortname\", \"type\" : \"String\", \"path\" : \"$.legalshortname.value\"},\n" + + " { \"name\" : \"legalname\", \"type\" : \"String\", \"path\" : \"$.legalname.value\" },\n" + + " { \"name\" : \"websiteurl\", \"type\" : \"URL\", \"path\" : \"$.websiteurl.value\" },\n" + + " { \"name\" : \"gridid\", \"type\" : \"String\", \"path\" : \"$.pid[?(@.qualifier.classid =='grid')].value\"},\n" + + " { \"name\" : \"originalId\", \"type\" : \"String\", \"path\" : \"$.id\" }\n" + + " ],\n" + + " \"blacklists\" : {\n" + + " \"legalname\" : []\n" + + " },\n" + + " \"synonyms\": {\n" + + " \"key::1\": [\"university\",\"università\", \"universitas\", \"università studi\",\"universitario\",\"universitaria\",\"université\", \"universite\", \"universitaire\",\"universitaires\",\"universidad\",\"universitade\",\"Universität\",\"universitaet\",\"Uniwersytet\",\"университет\",\"universiteit\",\"πανεπιστήμιο\",\"universitesi\",\"universiteti\", \"universiti\"],\n" + + " \"key::2\": [\"studies\",\"studi\",\"études\",\"estudios\",\"estudos\",\"Studien\",\"studia\",\"исследования\",\"studies\",\"σπουδές\"],\n" + + " \"key::3\": [\"advanced\",\"superiore\",\"supérieur\",\"supérieure\",\"supérieurs\",\"supérieures\",\"avancado\",\"avancados\",\"fortgeschrittene\",\"fortgeschritten\",\"zaawansowany\",\"передовой\",\"gevorderd\",\"gevorderde\",\"προχωρημένος\",\"προχωρημένη\",\"προχωρημένο\",\"προχωρημένες\",\"προχωρημένα\",\"wyzsza\"],\n" + + " \"key::4\": [\"institute\",\"istituto\",\"institut\",\"instituto\",\"instituto\",\"Institut\",\"instytut\",\"институт\",\"instituut\",\"ινστιτούτο\"],\n" + + " \"key::5\": [\"hospital\",\"ospedale\",\"hôpital\",\"hospital\",\"hospital\",\"Krankenhaus\",\"szpital\",\"больница\",\"ziekenhuis\",\"νοσοκομείο\"],\n" + + " \"key::6\": [\"research\",\"ricerca\",\"recherche\",\"investigacion\",\"pesquisa\",\"Forschung\",\"badania\",\"исследования\",\"onderzoek\",\"έρευνα\",\"erevna\",\"erevnas\"],\n" + + " \"key::7\": [\"college\",\"collegio\",\"colegio\",\"faculdade\",\"Hochschule\",\"Szkoła Wyższa\",\"Высшая школа\",\"κολλέγιο\"],\n" + + " \"key::8\": 
[\"foundation\",\"fondazione\",\"fondation\",\"fundación\",\"fundação\",\"Stiftung\",\"Fundacja\",\"фонд\",\"stichting\",\"ίδρυμα\",\"idryma\"],\n" + + " \"key::9\": [\"center\",\"centro\",\"centre\",\"centro\",\"centro\",\"zentrum\",\"centrum\",\"центр\",\"centrum\",\"κέντρο\"],\n" + + " \"key::10\": [\"national\",\"nazionale\",\"national\",\"nationale\",\"nationaux\",\"nationales\",\"nacional\",\"nacional\",\"national\",\"krajowy\",\"национальный\",\"nationaal\",\"nationale\",\"εθνικό\"],\n" + + " \"key::11\": [\"association\",\"associazione\",\"association\",\"asociación\",\"associação\",\"Verein\",\"verband\",\"stowarzyszenie\",\"ассоциация\",\"associatie\"],\n" + + " \"key::12\": [\"society\",\"societa\",\"société\",\"sociedad\",\"sociedade\",\"gesellschaft\",\"społeczeństwo\",\"общество\",\"maatschappij\",\"κοινωνία\"],\n" + + " \"key::13\": [\"international\",\"internazionale\",\"international\",\"internacional\",\"internacional\",\"international\",\"międzynarodowy\",\"Международный\",\"internationaal\",\"internationale\",\"διεθνής\",\"διεθνή\",\"διεθνές\"],\n" + + " \"key::14\": [\"community\",\"comunita\",\"communauté\",\"comunidad\",\"comunidade\",\"Gemeinschaft\",\"społeczność\",\"сообщество\",\"gemeenschap\",\"κοινότητα\"],\n" + + " \"key::15\": [\"school\",\"scuola\",\"école\",\"escuela\",\"escola\",\"schule\",\"Szkoła\",\"школа\",\"school\",\"σχολείο\"],\n" + + " \"key::16\": [\"education\",\"educazione\",\"éducation\",\"educacion\",\"Educação\",\"Bildung\",\"Edukacja\",\"образование\",\"opleiding\",\"εκπαίδευση\"],\n" + + " \"key::17\": [\"academy\",\"accademia\",\"académie\",\"academia\",\"academia\",\"Akademie\",\"akademie\",\"академия\",\"academie\",\"ακαδημία\"],\n" + + " \"key::18\": [\"public\",\"pubblico\",\"public\",\"publique\",\"publics\",\"publiques\",\"publico\",\"publico\",\"Öffentlichkeit\",\"publiczny\",\"публичный\",\"publiek\",\"publieke\",\"δημόσιος\",\"δημόσια\",\"δημόσιο\"],\n" + + " \"key::19\": [\"museum\",\"museo\",\"musée\",\"mueso\",\"museu\",\"museum\",\"muzeum\",\"музей\",\"museum\",\"μουσείο\"],\n" + + " \"key::20\": [\"group\",\"gruppo\",\"groupe\",\"grupo\",\"grupo\",\"gruppe\",\"grupa\",\"группа\",\"groep\",\"ομάδα\",\"όμιλος\"],\n" + + " \"key::21\": [\"department\",\"dipartimento\",\"département\",\"departamento\",\"departamento\",\"abteilung\",\"departament\",\"отдел\",\"afdeling\",\"τμήμα\"],\n" + + " \"key::22\": [\"council\",\"consiglio\",\"conseil\",\"Consejo\",\"conselho\",\"gesellschaft\",\"rada\",\"совет\",\"raad\",\"συμβούλιο\"],\n" + + " \"key::23\": [\"library\",\"biblioteca\",\"bibliothèque\",\"biblioteca\",\"biblioteca\",\"Bibliothek\",\"biblioteka\",\"библиотека\",\"bibliotheek\",\"βιβλιοθήκη\"],\n" + + " \"key::24\": [\"ministry\",\"ministero\",\"ministère\",\"ministerio\",\"ministério\",\"Ministerium\",\"ministerstwo\",\"министерство\",\"ministerie\",\"υπουργείο\"],\n" + + " \"key::25\": [\"services\",\"servizi\",\"services\",\"servicios\",\"Serviços\",\"Dienstleistungen\",\"usługi\",\"услуги\",\"diensten\",\"υπηρεσίες\"],\n" + + " \"key::26\": [\"central\",\"centrale\",\"central\",\"centrale\",\"centrales\",\"central\",\"central\",\"zentral\",\"centralny\",\"цетральный\",\"centraal\",\"κεντρικός\",\"κεντρική\",\"κεντρικό\",\"κεντρικά\"],\n" + + " \"key::27\": [\"general\",\"generale\",\"général\",\"générale\",\"généraux\",\"générales\",\"general\",\"geral\",\"general\",\"Allgemeines\",\"general\",\"общий\",\"algemeen\",\"algemene\",\"γενικός\",\"γενική\",\"γενικό\",\"γενικά\"],\n" + + " \"key::28\": 
[\"applied\",\"applicati\",\"appliqué\",\"appliquée\",\"appliqués\",\"appliquées\",\"aplicado\",\"aplicada\",\"angewendet\",\"stosowany\",\"прикладной\",\"toegepast\",\"toegepaste\",\"εφαρμοσμένος\",\"εφαρμοσμένη\",\"εφαρμοσμένο\",\"εφαρμοσμένα\"],\n" + + " \"key::29\": [\"european\",\"europee\",\"europea\",\"européen\",\"européenne\",\"européens\",\"européennes\",\"europeo\",\"europeu\",\"europäisch\",\"europejski\",\"европейский\",\"Europees\",\"Europese\",\"ευρωπαϊκός\",\"ευρωπαϊκή\",\"ευρωπαϊκό\",\"ευρωπαϊκά\"],\n" + + " \"key::30\": [\"agency\",\"agenzia\",\"agence\",\"agencia\",\"agencia\",\"agentur\",\"agencja\",\"агенция\",\"agentschap\",\"πρακτορείο\"],\n" + + " \"key::31\": [\"laboratory\",\"laboratorio\",\"laboratoire\",\"laboratorio\",\"laboratorio\",\"labor\",\"laboratorium\",\"лаборатория\",\"laboratorium\",\"εργαστήριο\"],\n" + + " \"key::32\": [\"industry\",\"industria\",\"industrie\",\"индустрия\",\"industrie\",\"βιομηχανία\"],\n" + + " \"key::33\": [\"industrial\",\"industriale\",\"industriel\",\"industrielle\",\"industriels\",\"industrielles\",\"индустриальный\",\"industrieel\",\"βιομηχανικός\",\"βιομηχανική\",\"βιομηχανικό\",\"βιομηχανικά\",\"βιομηχανικές\"],\n" + + " \"key::34\": [\"consortium\",\"consorzio\",\"consortium\",\"консорциум\",\"consortium\",\"κοινοπραξία\"],\n" + + " \"key::35\": [\"organization\",\"organizzazione\",\"organisation\",\"organización\",\"organização\",\"organizacja\",\"организация\",\"organisatie\",\"οργανισμός\"],\n" + + " \"key::36\": [\"authority\",\"autorità\",\"autorité\",\"авторитет\",\"autoriteit\"],\n" + + " \"key::37\": [\"federation\",\"federazione\",\"fédération\",\"федерация\",\"federatie\",\"ομοσπονδία\"],\n" + + " \"key::38\": [\"observatory\",\"osservatorio\",\"observatoire\",\"обсерватория\",\"observatorium\",\"αστεροσκοπείο\"],\n" + + " \"key::39\": [\"bureau\",\"ufficio\",\"bureau\",\"офис\",\"bureau\",\"γραφείο\"],\n" + + " \"key::40\": [\"company\",\"impresa\",\"compagnie\",\"société\",\"компания\",\"bedrijf\",\"εταιρία\"],\n" + + " \"key::41\": [\"polytechnic\",\"politecnico\",\"polytechnique\",\"политехника\",\"polytechnisch\",\"πολυτεχνείο\",\"universita politecnica\",\"polytechnic university\",\"universidad politecnica\",\"universitat politecnica\",\"politechnika\",\"politechniki\",\"university technology\",\"university science technology\"],\n" + + " \"key::42\": [\"coalition\",\"coalizione\",\"coalition\",\"коалиция\",\"coalitie\",\"συνασπισμός\"],\n" + + " \"key::43\": [\"initiative\",\"iniziativa\",\"initiative\",\"инициатива\",\"initiatief\",\"πρωτοβουλία\"],\n" + + " \"key::44\": [\"academic\",\"accademico\",\"académique\",\"universitaire\",\"акадеческий academisch\",\"ακαδημαϊκός\",\"ακαδημαϊκή\",\"ακαδημαϊκό\",\"ακαδημαϊκές\",\"ακαδημαϊκοί\"],\n" + + " \"key::45\": [\"institution\",\"istituzione\",\"institution\",\"институциональный\",\"instelling\",\"ινστιτούτο\"],\n" + + " \"key::46\": [\"division\",\"divisione\",\"division\",\"отделение\",\"divisie\",\"τμήμα\"],\n" + + " \"key::47\": [\"committee\",\"comitato\",\"comité\",\"комитет\",\"commissie\",\"επιτροπή\"],\n" + + " \"key::48\": [\"promotion\",\"promozione\",\"продвижение\",\"proothisis\",\"forderung\"],\n" + + " \"key::49\": 
[\"medical\",\"medicine\",\"clinical\",\"medicina\",\"clinici\",\"médico\",\"medicina\",\"clínica\",\"médico\",\"medicina\",\"clínica\",\"medizinisch\",\"Medizin\",\"klinisch\",\"medisch\",\"geneeskunde\",\"klinisch\",\"ιατρικός\",\"ιατρική\",\"ιατρικό\",\"ιατρικά\",\"κλινικός\",\"κλινική\",\"κλινικό\",\"κλινικά\",\"tıbbi\",\"tıp\",\"klinik\",\"orvosi\",\"orvostudomány\",\"klinikai\",\"zdravniški\",\"medicinski\",\"klinični\",\"meditsiini\",\"kliinik\",\"kliiniline\"],\n" + + " \"key::50\": [\"technology\",\"technological\",\"tecnologia\",\"tecnologie\",\"tecnología\",\"tecnológico\",\"tecnologia\",\"tecnológico\",\"Technologie\",\"technologisch\",\"technologie\",\"technologisch\",\"τεχνολογία\",\"τεχνολογικός\",\"τεχνολογική\",\"τεχνολογικό\",\"teknoloji\",\"teknolojik\",\"technológia\",\"technológiai\",\"tehnologija\",\"tehnološki\",\"tehnoloogia\",\"tehnoloogiline\",\"technologii\",\"technical\",\"texniki\",\"teknik\"],\n" + + " \"key::51\": [\"science\",\"scientific\",\"scienza\",\"scientifiche\",\"scienze\",\"ciencia\",\"científico\",\"ciência\",\"científico\",\"Wissenschaft\",\"wissenschaftlich\",\"wetenschap\",\"wetenschappelijk\",\"επιστήμη\",\"επιστημονικός\",\"επιστημονική\",\"επιστημονικό\",\"επιστημονικά\",\"bilim\",\"bilimsel\",\"tudomány\",\"tudományos\",\"znanost\",\"znanstveni\",\"teadus\",\"teaduslik\",\"\"],\n" + + " \"key::52\": [\"engineering\",\"ingegneria\",\"ingeniería\",\"engenharia\",\"Ingenieurwissenschaft\",\"ingenieurswetenschappen\",\"bouwkunde\",\"μηχανικός\",\"μηχανική\",\"μηχανικό\",\"mühendislik\",\"mérnöki\",\"Inženirstvo\",\"inseneeria\",\"inseneri\",\"\"],\n" + + " \"key::53\": [\"management\",\"gestione\",\"gestionale\",\"gestionali\",\"gestión\",\"administración\",\"gestão\",\"administração\",\"Verwaltung\",\"management\",\"διαχείριση\",\"yönetim\",\"menedzsment\",\"vodstvo\",\"upravljanje\",\"management\",\"juhtkond\",\"juhtimine\",\"haldus\",\"\"],\n" + + " \"key::54\": [\"energy\",\"energia\",\"energía\",\"energia\",\"Energie\",\"energie\",\"ενέργεια\",\"enerji\",\"energia\",\"energija\",\"energia\",\"\"],\n" + + " \"key::55\": [\"agricultural\",\"agriculture\",\"agricoltura\",\"agricole\",\"agrícola\",\"agricultura\",\"agrícola\",\"agricultura\",\"landwirtschaftlich\",\"Landwirtschaft\",\"landbouwkundig\",\"landbouw\",\"αγροτικός\",\"αγροτική\",\"αγροτικό\",\"γεωργικός\",\"γεωργική\",\"γεωργικό\",\"γεωργία\",\"tarımsal\",\"tarım\",\"mezőgazdasági\",\"mezőgazdaság\",\"poljedelski\",\"poljedelstvo\",\"põllumajandus\",\"põllumajanduslik\",\"\"],\n" + + " \"key::56\": [\"information\",\"informazione\",\"información\",\"informação\",\"Information\",\"informatie\",\"πληροφορία\",\"bilgi\",\"információ\",\"informacija\",\"informatsioon\",\"informatycznych\",\"\"],\n" + + " \"key::57\": [\"social\",\"sociali\",\"social\",\"social\",\"Sozial\",\"sociaal\",\"maatschappelijk\",\"κοινωνικός\",\"κοινωνική\",\"κοινωνικό\",\"κοινωνικά\",\"sosyal\",\"szociális\",\"družbeni\",\"sotsiaal\",\"sotsiaalne\",\"\"],\n" + + " \"key::58\": [\"environmental\",\"ambiente\",\"medioambiental\",\"ambiente\",\"medioambiente\",\"meioambiente\",\"Umwelt\",\"milieu\",\"milieuwetenschap\",\"milieukunde\",\"περιβαλλοντικός\",\"περιβαλλοντική\",\"περιβαλλοντικό\",\"περιβαλλοντικά\",\"çevre\",\"környezeti\",\"okoliški\",\"keskonna\",\"\"],\n" + + " \"key::59\": [\"business\",\"economia\",\"economiche\",\"economica\",\"negocio\",\"empresa\",\"negócio\",\"Unternehmen\",\"bedrijf\",\"bedrijfskunde\",\"επιχείρηση\",\"iş\",\"üzleti\",\"posel\",\"ettevõte/äri\",\"\"],\n" + + " \"key::60\": 
[\"pharmaceuticals\",\"pharmacy\",\"farmacia\",\"farmaceutica\",\"farmacéutica\",\"farmacia\",\"farmacêutica\",\"farmácia\",\"Pharmazeutika\",\"Arzneimittelkunde\",\"farmaceutica\",\"geneesmiddelen\",\"apotheek\",\"φαρμακευτικός\",\"φαρμακευτική\",\"φαρμακευτικό\",\"φαρμακευτικά\",\"φαρμακείο\",\"ilaç\",\"eczane\",\"gyógyszerészeti\",\"gyógyszertár\",\"farmacevtika\",\"lekarništvo\",\"farmaatsia\",\"farmatseutiline\",\"\"],\n" + + " \"key::61\": [\"healthcare\",\"health services\",\"salute\",\"atenciónmédica\",\"cuidadodelasalud\",\"cuidadoscomasaúde\",\"Gesundheitswesen\",\"gezondheidszorg\",\"ιατροφαρμακευτικήπερίθαλψη\",\"sağlıkhizmeti\",\"egészségügy\",\"zdravstvo\",\"tervishoid\",\"tervishoiu\",\"\"],\n" + + " \"key::62\": [\"history\",\"storia\",\"historia\",\"história\",\"Geschichte\",\"geschiedenis\",\"geschiedkunde\",\"ιστορία\",\"tarih\",\"történelem\",\"zgodovina\",\"ajalugu\",\"\"],\n" + + " \"key::63\": [\"materials\",\"materiali\",\"materia\",\"materiales\",\"materiais\",\"materialen\",\"υλικά\",\"τεκμήρια\",\"malzemeler\",\"anyagok\",\"materiali\",\"materjalid\",\"vahendid\",\"\"],\n" + + " \"key::64\": [\"economics\",\"economia\",\"economiche\",\"economica\",\"economía\",\"economia\",\"Wirtschaft\",\"economie\",\"οικονομικά\",\"οικονομικέςεπιστήμες\",\"ekonomi\",\"közgazdaságtan\",\"gospodarstvo\",\"ekonomija\",\"majanduslik\",\"majandus\",\"\"],\n" + + " \"key::65\": [\"therapeutics\",\"terapeutica\",\"terapéutica\",\"terapêutica\",\"therapie\",\"θεραπευτική\",\"tedavibilimi\",\"gyógykezelés\",\"terapevtika\",\"terapeutiline\",\"ravi\",\"\"],\n" + + " \"key::66\": [\"oncology\",\"oncologia\",\"oncologico\",\"oncología\",\"oncologia\",\"Onkologie\",\"oncologie\",\"ογκολογία\",\"onkoloji\",\"onkológia\",\"onkologija\",\"onkoloogia\",\"\"],\n" + + " \"key::67\": [\"natural\",\"naturali\",\"naturale\",\"natural\",\"natural\",\"natürlich\",\"natuurlijk\",\"φυσικός\",\"φυσική\",\"φυσικό\",\"φυσικά\",\"doğal\",\"természetes\",\"naraven\",\"loodus\",\"\"],\n" + + " \"key::68\": [\"educational\",\"educazione\",\"pedagogia\",\"educacional\",\"educativo\",\"educacional\",\"pädagogisch\",\"educatief\",\"εκπαιδευτικός\",\"εκπαιδευτική\",\"εκπαιδευτικό\",\"εκπαιδευτικά\",\"eğitimsel\",\"oktatási\",\"izobraževalen\",\"haridus\",\"hariduslik\",\"\"],\n" + + " \"key::69\": [\"biomedical\",\"biomedica\",\"biomédico\",\"biomédico\",\"biomedizinisch\",\"biomedisch\",\"βιοιατρικός\",\"βιοιατρική\",\"βιοιατρικό\",\"βιοιατρικά\",\"biyomedikal\",\"orvosbiológiai\",\"biomedicinski\",\"biomeditsiiniline\",\"\"],\n" + + " \"key::70\": [\"veterinary\",\"veterinaria\",\"veterinarie\",\"veterinaria\",\"veterinária\",\"tierärtzlich\",\"veterinair\",\"veeartsenijlkunde\",\"κτηνιατρικός\",\"κτηνιατρική\",\"κτηνιατρικό\",\"κτηνιατρικά\",\"veteriner\",\"állatorvosi\",\"veterinar\",\"veterinarski\",\"veterinaaria\",\"\"],\n" + + " \"key::71\": [\"chemistry\",\"chimica\",\"química\",\"química\",\"Chemie\",\"chemie\",\"scheikunde\",\"χημεία\",\"kimya\",\"kémia\",\"kemija\",\"keemia\",\"\"],\n" + + " \"key::72\": [\"security\",\"sicurezza\",\"seguridad\",\"segurança\",\"Sicherheit\",\"veiligheid\",\"ασφάλεια\",\"güvenlik\",\"biztonsági\",\"varnost\",\"turvalisus\",\"julgeolek\",\"\"],\n" + + " \"key::73\": [\"biotechnology\",\"biotecnologia\",\"biotecnologie\",\"biotecnología\",\"biotecnologia\",\"Biotechnologie\",\"biotechnologie\",\"βιοτεχνολογία\",\"biyoteknoloji\",\"biotechnológia\",\"biotehnologija\",\"biotehnoloogia\",\"\"],\n" + + " \"key::74\": 
[\"military\",\"militare\",\"militari\",\"militar\",\"militar\",\"Militär\",\"militair\",\"leger\",\"στρατιωτικός\",\"στρατιωτική\",\"στρατιωτικό\",\"στρατιωτικά\",\"askeri\",\"katonai\",\"vojaški\",\"vojni\",\"militaar\",\"wojskowa\",\"\"],\n" + + " \"key::75\": [\"theological\",\"teologia\",\"teologico\",\"teológico\",\"tecnológica\",\"theologisch\",\"theologisch\",\"θεολογικός\",\"θεολογική\",\"θεολογικό\",\"θεολογικά\",\"teolojik\",\"technológiai\",\"teološki\",\"teoloogia\",\"usuteadus\",\"teoloogiline\",\"\"],\n" + + " \"key::76\": [\"electronics\",\"elettronica\",\"electrónica\",\"eletrônicos\",\"Elektronik\",\"elektronica\",\"ηλεκτρονική\",\"elektronik\",\"elektronika\",\"elektronika\",\"elektroonika\",\"\"],\n" + + " \"key::77\": [\"forestry\",\"forestale\",\"forestali\",\"silvicultura\",\"forestal\",\"floresta\",\"Forstwirtschaft\",\"bosbouw\",\"δασοκομία\",\"δασολογία\",\"ormancılık\",\"erdészet\",\"gozdarstvo\",\"metsandus\",\"\"],\n" + + " \"key::78\": [\"maritime\",\"marittima\",\"marittime\",\"marittimo\",\"marítimo\",\"marítimo\",\"maritiem\",\"ναυτικός\",\"ναυτική\",\"ναυτικό\",\"ναυτικά\",\"ναυτιλιακός\",\"ναυτιλιακή\",\"ναυτιλιακό\",\"ναυτιλιακά\",\"θαλάσσιος\",\"θαλάσσια\",\"θαλάσσιο\",\"denizcilik\",\"tengeri\",\"morski\",\"mere\",\"merendus\",\"\"],\n" + + " \"key::79\": [\"sports\",\"sport\",\"deportes\",\"esportes\",\"Sport\",\"sport\",\"sportwetenschappen\",\"άθληση\",\"γυμναστικήδραστηριότητα\",\"spor\",\"sport\",\"šport\",\"sport\",\"spordi\",\"\"],\n" + + " \"key::80\": [\"surgery\",\"chirurgia\",\"chirurgiche\",\"cirugía\",\"cirurgia\",\"Chirurgie\",\"chirurgie\",\"heelkunde\",\"εγχείρηση\",\"επέμβαση\",\"χειρουργικήεπέμβαση\",\"cerrahi\",\"sebészet\",\"kirurgija\",\"kirurgia\",\"\"],\n" + + " \"key::81\": [\"cultural\",\"culturale\",\"culturali\",\"cultura\",\"cultural\",\"cultural\",\"kulturell\",\"cultureel\",\"πολιτιστικός\",\"πολιτιστική\",\"πολιτιστικό\",\"πολιτισμικός\",\"πολιτισμική\",\"πολιτισμικό\",\"kültürel\",\"kultúrális\",\"kulturni\",\"kultuuri\",\"kultuuriline\",\"\"],\n" + + " \"key::82\": [\"computerscience\",\"informatica\",\"ordenador\",\"computadora\",\"informática\",\"computación\",\"cienciasdelacomputación\",\"ciênciadacomputação\",\"Computer\",\"computer\",\"υπολογιστής\",\"ηλεκτρονικόςυπολογιστής\",\"bilgisayar\",\"számítógép\",\"računalnik\",\"arvuti\",\"\"],\n" + + " \"key::83\": [\"finance\",\"financial\",\"finanza\",\"finanziarie\",\"finanza\",\"financiero\",\"finanças\",\"financeiro\",\"Finanzen\",\"finanziell\",\"financiën\",\"financieel\",\"χρηματοοικονομικά\",\"χρηματοδότηση\",\"finanse\",\"finansal\",\"pénzügy\",\"pénzügyi\",\"finance\",\"finančni\",\"finants\",\"finantsiline\",\"\"],\n" + + " \"key::84\": [\"communication\",\"comunicazione\",\"comuniciación\",\"comunicação\",\"Kommunikation\",\"communication\",\"επικοινωνία\",\"iletişim\",\"kommunikáció\",\"komuniciranje\",\"kommunikatsioon\",\"\"],\n" + + " \"key::85\": [\"justice\",\"giustizia\",\"justicia\",\"justiça\",\"Recht\",\"Justiz\",\"justitie\",\"gerechtigheid\",\"δικαιοσύνη\",\"υπουργείοδικαιοσύνης\",\"δίκαιο\",\"adalet\",\"igazságügy\",\"pravo\",\"õigus\",\"\"],\n" + + " \"key::86\": [\"aerospace\",\"aerospaziale\",\"aerospaziali\",\"aeroespacio\",\"aeroespaço\",\"Luftfahrt\",\"luchtvaart\",\"ruimtevaart\",\"αεροπορικός\",\"αεροπορική\",\"αεροπορικό\",\"αεροναυπηγικός\",\"αεροναυπηγική\",\"αεροναυπηγικό\",\"αεροναυπηγικά\",\"havacılıkveuzay\",\"légtér\",\"zrakoplovstvo\",\"atmosfäär\",\"kosmos\",\"\"],\n" + + " \"key::87\": 
[\"dermatology\",\"dermatologia\",\"dermatología\",\"dermatologia\",\"Dermatologie\",\"dermatologie\",\"δρματολογία\",\"dermatoloji\",\"bőrgyógyászat\",\"dermatológia\",\"dermatologija\",\"dermatoloogia\",\"\"],\n" + + " \"key::88\": [\"architecture\",\"architettura\",\"arquitectura\",\"arquitetura\",\"Architektur\",\"architectuur\",\"αρχιτεκτονική\",\"mimarlık\",\"építészet\",\"arhitektura\",\"arhitektuur\",\"\"],\n" + + " \"key::89\": [\"mathematics\",\"matematica\",\"matematiche\",\"matemáticas\",\"matemáticas\",\"Mathematik\",\"wiskunde\",\"mathematica\",\"μαθηματικά\",\"matematik\",\"matematika\",\"matematika\",\"matemaatika\",\"\"],\n" + + " \"key::90\": [\"language\",\"lingue\",\"linguistica\",\"linguistiche\",\"lenguaje\",\"idioma\",\"língua\",\"idioma\",\"Sprache\",\"taal\",\"taalkunde\",\"γλώσσα\",\"dil\",\"nyelv\",\"jezik\",\"keel\",\"\"],\n" + + " \"key::91\": [\"neuroscience\",\"neuroscienza\",\"neurociencia\",\"neurociência\",\"Neurowissenschaft\",\"neurowetenschappen\",\"νευροεπιστήμη\",\"nörobilim\",\"idegtudomány\",\"nevroznanost\",\"neuroteadused\",\"\"],\n" + + " \"key::92\": [\"automation\",\"automazione\",\"automatización\",\"automação\",\"Automatisierung\",\"automatisering\",\"αυτοματοποίηση\",\"otomasyon\",\"automatizálás\",\"avtomatizacija\",\"automatiseeritud\",\"\"],\n" + + " \"key::93\": [\"pediatric\",\"pediatria\",\"pediatriche\",\"pediatrico\",\"pediátrico\",\"pediatría\",\"pediátrico\",\"pediatria\",\"pädiatrisch\",\"pediatrische\",\"παιδιατρική\",\"pediatrik\",\"gyermekgyógyászat\",\"pediatrija\",\"pediaatria\",\"\"],\n" + + " \"key::94\": [\"photonics\",\"fotonica\",\"fotoniche\",\"fotónica\",\"fotônica\",\"Photonik\",\"fotonica\",\"φωτονική\",\"fotonik\",\"fotonika\",\"fotonika\",\"fotoonika\",\"\"],\n" + + " \"key::95\": [\"mechanics\", \"mechanical\", \"meccanica\",\"meccaniche\",\"mecánica\",\"mecânica\",\"Mechanik\",\"Maschinenbau\",\"mechanica\",\"werktuigkunde\",\"μηχανικής\",\"mekanik\",\"gépészet\",\"mehanika\",\"mehaanika\",\"\"],\n" + + " \"key::96\": [\"psychiatrics\",\"psichiatria\",\"psichiatrica\",\"psichiatriche\",\"psiquiatría\",\"psiquiatria\",\"Psychiatrie\",\"psychiatrie\",\"ψυχιατρική\",\"psikiyatrik\",\"pszihiátria\",\"psihiatrija\",\"psühhaatria\",\"\"],\n" + + " \"key::97\": [\"psychology\",\"fisiologia\",\"psicología\",\"psicologia\",\"Psychologie\",\"psychologie\",\"ψυχολογία\",\"psikoloji\",\"pszihológia\",\"psihologija\",\"psühholoogia\",\"\"],\n" + + " \"key::98\": [\"automotive\",\"industriaautomobilistica\",\"industriadelautomóvil\",\"automotriz\",\"industriaautomotriz\",\"automotivo\",\"Automobilindustrie\",\"autoindustrie\",\"αυτοκίνητος\",\"αυτοκίνητη\",\"αυτοκίνητο\",\"αυτοκινούμενος\",\"αυτοκινούμενη\",\"αυτοκινούμενο\",\"αυτοκινητιστικός\",\"αυτοκινητιστική\",\"αυτοκινητιστικό\",\"otomotiv\",\"autóipari\",\"samogiben\",\"avtomobilskaindustrija\",\"auto-\",\"\"],\n" + + " \"key::99\": [\"neurology\",\"neurologia\",\"neurologiche\",\"neurología\",\"neurologia\",\"Neurologie\",\"neurologie\",\"zenuwleer\",\"νευρολογία\",\"nöroloji\",\"neurológia\",\"ideggyógyászat\",\"nevrologija\",\"neuroloogia\",\"\"],\n" + + " \"key::100\": [\"geology\",\"geologia\",\"geologiche\",\"geología\",\"geologia\",\"Geologie\",\"geologie\",\"aardkunde\",\"γεωλογία\",\"jeoloji\",\"geológia\",\"földtudomány\",\"geologija\",\"geoloogia\",\"\"],\n" + + " \"key::101\": 
[\"microbiology\",\"microbiologia\",\"micro-biologia\",\"microbiologiche\",\"microbiología\",\"microbiologia\",\"Mikrobiologie\",\"microbiologie\",\"μικροβιολογία\",\"mikrobiyoloji\",\"mikrobiológia\",\"mikrobiologija\",\"mikrobioloogia\",\"\"],\n" + + " \"key::102\": [\"informatics\",\"informatica\",\"informática\",\"informática\",\"informatica\",\"\"],\n" + + " \"key::103\": [\"forschungsgemeinschaft\",\"comunita ricerca\",\"research community\",\"research foundation\",\"research association\"],\n" + + " \"key::104\": [\"commerce\",\"ticaret\",\"ticarət\",\"commercio\",\"trade\",\"handel\",\"comercio\"],\n" + + " \"key::105\" : [\"state\", \"stato\", \"etade\", \"estado\", \"statale\", \"etat\", \"zustand\", \"estado\"],\n" + + " \"key::106\" : [\"seminary\", \"seminario\", \"seminaire\", \"seminar\"],\n" + + " \"key::107\" : [\"agricultural forestry\", \"af\", \"a f\"],\n" + + " \"key::108\" : [\"agricultural mechanical\", \"am\", \"a m\"],\n" + + " \"key::109\" : [\"catholic\", \"catholique\", \"katholische\", \"catolica\", \"cattolica\", \"catolico\"]\n" + + " }\n" + + " }\n" + + "}"); - @Test - public void testJPath() throws Exception { + @Test + public void testJPath() throws Exception { - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(conf, json); + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(conf, json); - System.out.println("d = " + d); - } + System.out.println("d = " + d); + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java index 8a6cdf7dc..db55434d8 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DatePicker.java @@ -1,113 +1,121 @@ + package eu.dnetlib.dedup; import static java.util.Collections.reverseOrder; import static java.util.Map.Entry.comparingByValue; import static java.util.stream.Collectors.toMap; + import static org.apache.commons.lang.StringUtils.endsWith; import static org.apache.commons.lang.StringUtils.substringBefore; -import eu.dnetlib.dhp.schema.oaf.Field; import java.time.Year; import java.util.*; import java.util.stream.Collectors; + import org.apache.commons.lang.StringUtils; +import eu.dnetlib.dhp.schema.oaf.Field; + public class DatePicker { - private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}"; - private static final String DATE_DEFAULT_SUFFIX = "01-01"; - private static final int YEAR_LB = 1300; - private static final int YEAR_UB = Year.now().getValue() + 5; + private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}"; + private static final String DATE_DEFAULT_SUFFIX = "01-01"; + private static final int YEAR_LB = 1300; + private static final int YEAR_UB = Year.now().getValue() + 5; - public static Field pick(final Collection dateofacceptance) { + public static Field pick(final Collection dateofacceptance) { - final Map frequencies = - dateofacceptance - .parallelStream() - .filter(StringUtils::isNotBlank) - .collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum)); + final Map frequencies = dateofacceptance + .parallelStream() + .filter(StringUtils::isNotBlank) + .collect(Collectors.toConcurrentMap(w -> w, w -> 1, Integer::sum)); - if (frequencies.isEmpty()) { - return new Field<>(); - } + if (frequencies.isEmpty()) { + return new Field<>(); + } - final Field date = new Field<>(); - date.setValue(frequencies.keySet().iterator().next()); + final 
Field date = new Field<>(); + date.setValue(frequencies.keySet().iterator().next()); - // let's sort this map by values first, filtering out invalid dates - final Map sorted = - frequencies.entrySet().stream() - .filter(d -> StringUtils.isNotBlank(d.getKey())) - .filter(d -> d.getKey().matches(DATE_PATTERN)) - .filter(d -> inRange(d.getKey())) - .sorted(reverseOrder(comparingByValue())) - .collect( - toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new)); + // let's sort this map by values first, filtering out invalid dates + final Map sorted = frequencies + .entrySet() + .stream() + .filter(d -> StringUtils.isNotBlank(d.getKey())) + .filter(d -> d.getKey().matches(DATE_PATTERN)) + .filter(d -> inRange(d.getKey())) + .sorted(reverseOrder(comparingByValue())) + .collect( + toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new)); - // shortcut - if (sorted.size() == 0) { - return date; - } + // shortcut + if (sorted.size() == 0) { + return date; + } - // voting method (1/3 + 1) wins - if (sorted.size() >= 3) { - final int acceptThreshold = (sorted.size() / 3) + 1; - final List accepted = - sorted.entrySet().stream() - .filter(e -> e.getValue() >= acceptThreshold) - .map(e -> e.getKey()) - .collect(Collectors.toList()); + // voting method (1/3 + 1) wins + if (sorted.size() >= 3) { + final int acceptThreshold = (sorted.size() / 3) + 1; + final List accepted = sorted + .entrySet() + .stream() + .filter(e -> e.getValue() >= acceptThreshold) + .map(e -> e.getKey()) + .collect(Collectors.toList()); - // cannot find strong majority - if (accepted.isEmpty()) { - final int max = sorted.values().iterator().next(); - Optional first = - sorted.entrySet().stream() - .filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) - .map(Map.Entry::getKey) - .findFirst(); - if (first.isPresent()) { - date.setValue(first.get()); - return date; - } + // cannot find strong majority + if (accepted.isEmpty()) { + final int max = sorted.values().iterator().next(); + Optional first = sorted + .entrySet() + .stream() + .filter(e -> e.getValue() == max && !endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) + .map(Map.Entry::getKey) + .findFirst(); + if (first.isPresent()) { + date.setValue(first.get()); + return date; + } - date.setValue(sorted.keySet().iterator().next()); - return date; - } + date.setValue(sorted.keySet().iterator().next()); + return date; + } - if (accepted.size() == 1) { - date.setValue(accepted.get(0)); - return date; - } else { - final Optional first = - accepted.stream().filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX)).findFirst(); - if (first.isPresent()) { - date.setValue(first.get()); - return date; - } + if (accepted.size() == 1) { + date.setValue(accepted.get(0)); + return date; + } else { + final Optional first = accepted + .stream() + .filter(d -> !endsWith(d, DATE_DEFAULT_SUFFIX)) + .findFirst(); + if (first.isPresent()) { + date.setValue(first.get()); + return date; + } - return date; - } + return date; + } - // 1st non YYYY-01-01 is returned - } else { - if (sorted.size() == 2) { - for (Map.Entry e : sorted.entrySet()) { - if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) { - date.setValue(e.getKey()); - return date; - } - } - } + // 1st non YYYY-01-01 is returned + } else { + if (sorted.size() == 2) { + for (Map.Entry e : sorted.entrySet()) { + if (!endsWith(e.getKey(), DATE_DEFAULT_SUFFIX)) { + date.setValue(e.getKey()); + return date; + } + } + } - // none of the dates seems good enough, return the 1st one - 
date.setValue(sorted.keySet().iterator().next()); - return date; - } - } + // none of the dates seems good enough, return the 1st one + date.setValue(sorted.keySet().iterator().next()); + return date; + } + } - private static boolean inRange(final String date) { - final int year = Integer.parseInt(substringBefore(date, "-")); - return year >= YEAR_LB && year <= YEAR_UB; - } + private static boolean inRange(final String date) { + final int year = Integer.parseInt(substringBefore(date, "-")); + return year >= YEAR_LB && year <= YEAR_UB; + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java index 782aa174f..d03cc2589 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java @@ -1,304 +1,318 @@ + package eu.dnetlib.dedup; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Lists; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.util.MapDocumentUtil; import java.util.Collection; + import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; + +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.util.MapDocumentUtil; import scala.Tuple2; public class DedupRecordFactory { - public static JavaRDD createDedupRecord( - final JavaSparkContext sc, - final SparkSession spark, - final String mergeRelsInputPath, - final String entitiesInputPath, - final OafEntityType entityType, - final DedupConfig dedupConf) { - long ts = System.currentTimeMillis(); - // - final JavaPairRDD inputJsonEntities = - sc.textFile(entitiesInputPath) - .mapToPair( - (PairFunction) - it -> - new Tuple2( - MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it)); + public static JavaRDD createDedupRecord( + final JavaSparkContext sc, + final SparkSession spark, + final String mergeRelsInputPath, + final String entitiesInputPath, + final OafEntityType entityType, + final DedupConfig dedupConf) { + long ts = System.currentTimeMillis(); + // + final JavaPairRDD inputJsonEntities = sc + .textFile(entitiesInputPath) + .mapToPair( + (PairFunction) it -> new Tuple2( + MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it), it)); - // : source is the dedup_id, target is the id of the mergedIn - JavaPairRDD mergeRels = - spark - .read() - .load(mergeRelsInputPath) - .as(Encoders.bean(Relation.class)) - .where("relClass=='merges'") - .javaRDD() - .mapToPair( - (PairFunction) - r -> new Tuple2(r.getTarget(), r.getSource())); + // : source is the dedup_id, target is the id of the mergedIn + JavaPairRDD mergeRels = spark + .read() + .load(mergeRelsInputPath) + .as(Encoders.bean(Relation.class)) + .where("relClass=='merges'") + .javaRDD() + .mapToPair( + (PairFunction) r -> new Tuple2(r.getTarget(), r.getSource())); - // - final JavaPairRDD joinResult = - 
mergeRels - .join(inputJsonEntities) - .mapToPair( - (PairFunction>, String, String>) Tuple2::_2); + // + final JavaPairRDD joinResult = mergeRels + .join(inputJsonEntities) + .mapToPair( + (PairFunction>, String, String>) Tuple2::_2); - JavaPairRDD> sortedJoinResult = joinResult.groupByKey(); + JavaPairRDD> sortedJoinResult = joinResult.groupByKey(); - switch (entityType) { - case publication: - return sortedJoinResult.map(p -> DedupRecordFactory.publicationMerger(p, ts)); - case dataset: - return sortedJoinResult.map(d -> DedupRecordFactory.datasetMerger(d, ts)); - case project: - return sortedJoinResult.map(p -> DedupRecordFactory.projectMerger(p, ts)); - case software: - return sortedJoinResult.map(s -> DedupRecordFactory.softwareMerger(s, ts)); - case datasource: - return sortedJoinResult.map(d -> DedupRecordFactory.datasourceMerger(d, ts)); - case organization: - return sortedJoinResult.map(o -> DedupRecordFactory.organizationMerger(o, ts)); - case otherresearchproduct: - return sortedJoinResult.map(o -> DedupRecordFactory.otherresearchproductMerger(o, ts)); - default: - return null; - } - } + switch (entityType) { + case publication: + return sortedJoinResult.map(p -> DedupRecordFactory.publicationMerger(p, ts)); + case dataset: + return sortedJoinResult.map(d -> DedupRecordFactory.datasetMerger(d, ts)); + case project: + return sortedJoinResult.map(p -> DedupRecordFactory.projectMerger(p, ts)); + case software: + return sortedJoinResult.map(s -> DedupRecordFactory.softwareMerger(s, ts)); + case datasource: + return sortedJoinResult.map(d -> DedupRecordFactory.datasourceMerger(d, ts)); + case organization: + return sortedJoinResult.map(o -> DedupRecordFactory.organizationMerger(o, ts)); + case otherresearchproduct: + return sortedJoinResult.map(o -> DedupRecordFactory.otherresearchproductMerger(o, ts)); + default: + return null; + } + } - private static Publication publicationMerger(Tuple2> e, final long ts) { + private static Publication publicationMerger(Tuple2> e, final long ts) { - Publication p = new Publication(); // the result of the merge, to be returned at the end + Publication p = new Publication(); // the result of the merge, to be returned at the end - p.setId(e._1()); + p.setId(e._1()); - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - final Collection dateofacceptance = Lists.newArrayList(); + final Collection dateofacceptance = Lists.newArrayList(); - if (e._2() != null) - e._2() - .forEach( - pub -> { - try { - Publication publication = mapper.readValue(pub, Publication.class); + if (e._2() != null) + e + ._2() + .forEach( + pub -> { + try { + Publication publication = mapper.readValue(pub, Publication.class); - p.mergeFrom(publication); - p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor())); - // add to the list if they are not null - if (publication.getDateofacceptance() != null) - dateofacceptance.add(publication.getDateofacceptance().getValue()); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - p.setDateofacceptance(DatePicker.pick(dateofacceptance)); - if (p.getDataInfo() == null) p.setDataInfo(new DataInfo()); - p.getDataInfo().setTrust("0.9"); - p.setLastupdatetimestamp(ts); - return p; - } + p.mergeFrom(publication); + p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor())); + // add 
to the list if they are not null + if (publication.getDateofacceptance() != null) + dateofacceptance.add(publication.getDateofacceptance().getValue()); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); + p.setDateofacceptance(DatePicker.pick(dateofacceptance)); + if (p.getDataInfo() == null) + p.setDataInfo(new DataInfo()); + p.getDataInfo().setTrust("0.9"); + p.setLastupdatetimestamp(ts); + return p; + } - private static Dataset datasetMerger(Tuple2> e, final long ts) { + private static Dataset datasetMerger(Tuple2> e, final long ts) { - Dataset d = new Dataset(); // the result of the merge, to be returned at the end + Dataset d = new Dataset(); // the result of the merge, to be returned at the end - d.setId(e._1()); + d.setId(e._1()); - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - final Collection dateofacceptance = Lists.newArrayList(); + final Collection dateofacceptance = Lists.newArrayList(); - if (e._2() != null) - e._2() - .forEach( - dat -> { - try { - Dataset dataset = mapper.readValue(dat, Dataset.class); + if (e._2() != null) + e + ._2() + .forEach( + dat -> { + try { + Dataset dataset = mapper.readValue(dat, Dataset.class); - d.mergeFrom(dataset); - d.setAuthor(DedupUtility.mergeAuthor(d.getAuthor(), dataset.getAuthor())); - // add to the list if they are not null - if (dataset.getDateofacceptance() != null) - dateofacceptance.add(dataset.getDateofacceptance().getValue()); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - d.setDateofacceptance(DatePicker.pick(dateofacceptance)); - if (d.getDataInfo() == null) d.setDataInfo(new DataInfo()); - d.getDataInfo().setTrust("0.9"); - d.setLastupdatetimestamp(ts); - return d; - } + d.mergeFrom(dataset); + d.setAuthor(DedupUtility.mergeAuthor(d.getAuthor(), dataset.getAuthor())); + // add to the list if they are not null + if (dataset.getDateofacceptance() != null) + dateofacceptance.add(dataset.getDateofacceptance().getValue()); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); + d.setDateofacceptance(DatePicker.pick(dateofacceptance)); + if (d.getDataInfo() == null) + d.setDataInfo(new DataInfo()); + d.getDataInfo().setTrust("0.9"); + d.setLastupdatetimestamp(ts); + return d; + } - private static Project projectMerger(Tuple2> e, final long ts) { + private static Project projectMerger(Tuple2> e, final long ts) { - Project p = new Project(); // the result of the merge, to be returned at the end + Project p = new Project(); // the result of the merge, to be returned at the end - p.setId(e._1()); + p.setId(e._1()); - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - if (e._2() != null) - e._2() - .forEach( - proj -> { - try { - Project project = mapper.readValue(proj, Project.class); + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + if (e._2() != null) + e + ._2() + .forEach( + proj -> { + try { + Project project = mapper.readValue(proj, Project.class); - p.mergeFrom(project); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - if (p.getDataInfo() == null) p.setDataInfo(new DataInfo()); - p.getDataInfo().setTrust("0.9"); - p.setLastupdatetimestamp(ts); - return p; - } + 
p.mergeFrom(project); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); + if (p.getDataInfo() == null) + p.setDataInfo(new DataInfo()); + p.getDataInfo().setTrust("0.9"); + p.setLastupdatetimestamp(ts); + return p; + } - private static Software softwareMerger(Tuple2> e, final long ts) { + private static Software softwareMerger(Tuple2> e, final long ts) { - Software s = new Software(); // the result of the merge, to be returned at the end + Software s = new Software(); // the result of the merge, to be returned at the end - s.setId(e._1()); - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - final Collection dateofacceptance = Lists.newArrayList(); - if (e._2() != null) - e._2() - .forEach( - soft -> { - try { - Software software = mapper.readValue(soft, Software.class); + s.setId(e._1()); + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + final Collection dateofacceptance = Lists.newArrayList(); + if (e._2() != null) + e + ._2() + .forEach( + soft -> { + try { + Software software = mapper.readValue(soft, Software.class); - s.mergeFrom(software); - s.setAuthor(DedupUtility.mergeAuthor(s.getAuthor(), software.getAuthor())); - // add to the list if they are not null - if (software.getDateofacceptance() != null) - dateofacceptance.add(software.getDateofacceptance().getValue()); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - s.setDateofacceptance(DatePicker.pick(dateofacceptance)); - if (s.getDataInfo() == null) s.setDataInfo(new DataInfo()); - s.getDataInfo().setTrust("0.9"); - s.setLastupdatetimestamp(ts); - return s; - } + s.mergeFrom(software); + s.setAuthor(DedupUtility.mergeAuthor(s.getAuthor(), software.getAuthor())); + // add to the list if they are not null + if (software.getDateofacceptance() != null) + dateofacceptance.add(software.getDateofacceptance().getValue()); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); + s.setDateofacceptance(DatePicker.pick(dateofacceptance)); + if (s.getDataInfo() == null) + s.setDataInfo(new DataInfo()); + s.getDataInfo().setTrust("0.9"); + s.setLastupdatetimestamp(ts); + return s; + } - private static Datasource datasourceMerger(Tuple2> e, final long ts) { - Datasource d = new Datasource(); // the result of the merge, to be returned at the end - d.setId(e._1()); - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - if (e._2() != null) - e._2() - .forEach( - dat -> { - try { - Datasource datasource = mapper.readValue(dat, Datasource.class); + private static Datasource datasourceMerger(Tuple2> e, final long ts) { + Datasource d = new Datasource(); // the result of the merge, to be returned at the end + d.setId(e._1()); + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + if (e._2() != null) + e + ._2() + .forEach( + dat -> { + try { + Datasource datasource = mapper.readValue(dat, Datasource.class); - d.mergeFrom(datasource); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - if (d.getDataInfo() == null) d.setDataInfo(new DataInfo()); - d.getDataInfo().setTrust("0.9"); - d.setLastupdatetimestamp(ts); - return d; - } + d.mergeFrom(datasource); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); + if (d.getDataInfo() == null) + 
d.setDataInfo(new DataInfo()); + d.getDataInfo().setTrust("0.9"); + d.setLastupdatetimestamp(ts); + return d; + } - private static Organization organizationMerger( - Tuple2> e, final long ts) { + private static Organization organizationMerger( + Tuple2> e, final long ts) { - Organization o = new Organization(); // the result of the merge, to be returned at the end + Organization o = new Organization(); // the result of the merge, to be returned at the end - o.setId(e._1()); + o.setId(e._1()); - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - StringBuilder trust = new StringBuilder("0.0"); + StringBuilder trust = new StringBuilder("0.0"); - if (e._2() != null) - e._2() - .forEach( - pub -> { - try { - Organization organization = mapper.readValue(pub, Organization.class); + if (e._2() != null) + e + ._2() + .forEach( + pub -> { + try { + Organization organization = mapper.readValue(pub, Organization.class); - final String currentTrust = organization.getDataInfo().getTrust(); - if (!"1.0".equals(currentTrust)) { - trust.setLength(0); - trust.append(currentTrust); - } - o.mergeFrom(organization); + final String currentTrust = organization.getDataInfo().getTrust(); + if (!"1.0".equals(currentTrust)) { + trust.setLength(0); + trust.append(currentTrust); + } + o.mergeFrom(organization); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); - if (o.getDataInfo() == null) { - o.setDataInfo(new DataInfo()); - } - if (o.getDataInfo() == null) o.setDataInfo(new DataInfo()); - o.getDataInfo().setTrust("0.9"); - o.setLastupdatetimestamp(ts); + if (o.getDataInfo() == null) { + o.setDataInfo(new DataInfo()); + } + if (o.getDataInfo() == null) + o.setDataInfo(new DataInfo()); + o.getDataInfo().setTrust("0.9"); + o.setLastupdatetimestamp(ts); - return o; - } + return o; + } - private static OtherResearchProduct otherresearchproductMerger( - Tuple2> e, final long ts) { + private static OtherResearchProduct otherresearchproductMerger( + Tuple2> e, final long ts) { - OtherResearchProduct o = new OtherResearchProduct(); // the result of the merge, to be - // returned at the end + OtherResearchProduct o = new OtherResearchProduct(); // the result of the merge, to be + // returned at the end - o.setId(e._1()); + o.setId(e._1()); - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - final Collection dateofacceptance = Lists.newArrayList(); + final Collection dateofacceptance = Lists.newArrayList(); - if (e._2() != null) - e._2() - .forEach( - orp -> { - try { - OtherResearchProduct otherResearchProduct = - mapper.readValue(orp, OtherResearchProduct.class); + if (e._2() != null) + e + ._2() + .forEach( + orp -> { + try { + OtherResearchProduct otherResearchProduct = mapper + .readValue(orp, OtherResearchProduct.class); - o.mergeFrom(otherResearchProduct); - o.setAuthor( - DedupUtility.mergeAuthor(o.getAuthor(), otherResearchProduct.getAuthor())); - // add to the list if they are not null - if (otherResearchProduct.getDateofacceptance() != null) - 
dateofacceptance.add(otherResearchProduct.getDateofacceptance().getValue()); - } catch (Exception exc) { - throw new RuntimeException(exc); - } - }); - if (o.getDataInfo() == null) o.setDataInfo(new DataInfo()); - o.setDateofacceptance(DatePicker.pick(dateofacceptance)); - o.getDataInfo().setTrust("0.9"); - o.setLastupdatetimestamp(ts); - return o; - } + o.mergeFrom(otherResearchProduct); + o + .setAuthor( + DedupUtility.mergeAuthor(o.getAuthor(), otherResearchProduct.getAuthor())); + // add to the list if they are not null + if (otherResearchProduct.getDateofacceptance() != null) + dateofacceptance.add(otherResearchProduct.getDateofacceptance().getValue()); + } catch (Exception exc) { + throw new RuntimeException(exc); + } + }); + if (o.getDataInfo() == null) + o.setDataInfo(new DataInfo()); + o.setDateofacceptance(DatePicker.pick(dateofacceptance)); + o.getDataInfo().setTrust("0.9"); + o.setLastupdatetimestamp(ts); + return o; + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java index 60f0a50f7..70a2e3591 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/DedupUtility.java @@ -1,13 +1,6 @@ + package eu.dnetlib.dedup; -import com.google.common.collect.Sets; -import com.wcohen.ss.JaroWinkler; -import eu.dnetlib.dhp.schema.oaf.Author; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.model.Person; import java.io.IOException; import java.io.StringWriter; import java.nio.charset.StandardCharsets; @@ -15,6 +8,7 @@ import java.security.MessageDigest; import java.text.Normalizer; import java.util.*; import java.util.stream.Collectors; + import org.apache.commons.codec.binary.Hex; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; @@ -26,205 +20,220 @@ import org.apache.spark.SparkContext; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.util.LongAccumulator; + +import com.google.common.collect.Sets; +import com.wcohen.ss.JaroWinkler; + +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.model.Person; import scala.Tuple2; public class DedupUtility { - private static final Double THRESHOLD = 0.95; + private static final Double THRESHOLD = 0.95; - public static Map constructAccumulator( - final DedupConfig dedupConf, final SparkContext context) { + public static Map constructAccumulator( + final DedupConfig dedupConf, final SparkContext context) { - Map accumulators = new HashMap<>(); + Map accumulators = new HashMap<>(); - String acc1 = - String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); - accumulators.put(acc1, context.longAccumulator(acc1)); - String acc2 = - String.format( - "%s::%s", - dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()); - accumulators.put(acc2, context.longAccumulator(acc2)); - String acc3 = - String.format( - "%s::%s", - 
dedupConf.getWf().getEntityType(), - String.format( - "Skipped records for count(%s) >= %s", - dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())); - accumulators.put(acc3, context.longAccumulator(acc3)); - String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"); - accumulators.put(acc4, context.longAccumulator(acc4)); - String acc5 = - String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); - accumulators.put(acc5, context.longAccumulator(acc5)); - String acc6 = - String.format( - "%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()); - accumulators.put(acc6, context.longAccumulator(acc6)); + String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"); + accumulators.put(acc1, context.longAccumulator(acc1)); + String acc2 = String + .format( + "%s::%s", + dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()); + accumulators.put(acc2, context.longAccumulator(acc2)); + String acc3 = String + .format( + "%s::%s", + dedupConf.getWf().getEntityType(), + String + .format( + "Skipped records for count(%s) >= %s", + dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize())); + accumulators.put(acc3, context.longAccumulator(acc3)); + String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"); + accumulators.put(acc4, context.longAccumulator(acc4)); + String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"); + accumulators.put(acc5, context.longAccumulator(acc5)); + String acc6 = String + .format( + "%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()); + accumulators.put(acc6, context.longAccumulator(acc6)); - return accumulators; - } + return accumulators; + } - public static JavaRDD loadDataFromHDFS(String path, JavaSparkContext context) { - return context.textFile(path); - } + public static JavaRDD loadDataFromHDFS(String path, JavaSparkContext context) { + return context.textFile(path); + } - public static void deleteIfExists(String path) throws IOException { - Configuration conf = new Configuration(); - FileSystem fileSystem = FileSystem.get(conf); - if (fileSystem.exists(new Path(path))) { - fileSystem.delete(new Path(path), true); - } - } + public static void deleteIfExists(String path) throws IOException { + Configuration conf = new Configuration(); + FileSystem fileSystem = FileSystem.get(conf); + if (fileSystem.exists(new Path(path))) { + fileSystem.delete(new Path(path), true); + } + } - public static DedupConfig loadConfigFromHDFS(String path) throws IOException { + public static DedupConfig loadConfigFromHDFS(String path) throws IOException { - Configuration conf = new Configuration(); - FileSystem fileSystem = FileSystem.get(conf); - FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(new Path(path))); + Configuration conf = new Configuration(); + FileSystem fileSystem = FileSystem.get(conf); + FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(new Path(path))); - return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name())); - } + return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name())); + } - static String readFromClasspath(final String filename, final Class clazz) { - final StringWriter sw = new StringWriter(); - try { - IOUtils.copy(clazz.getResourceAsStream(filename), sw); - return sw.toString(); - 
} catch (final IOException e) { - throw new RuntimeException("cannot load resource from classpath: " + filename); - } - } + static String readFromClasspath(final String filename, final Class clazz) { + final StringWriter sw = new StringWriter(); + try { + IOUtils.copy(clazz.getResourceAsStream(filename), sw); + return sw.toString(); + } catch (final IOException e) { + throw new RuntimeException("cannot load resource from classpath: " + filename); + } + } - static Set getGroupingKeys(DedupConfig conf, MapDocument doc) { - return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); - } + static Set getGroupingKeys(DedupConfig conf, MapDocument doc) { + return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); + } - public static String md5(final String s) { - try { - final MessageDigest md = MessageDigest.getInstance("MD5"); - md.update(s.getBytes("UTF-8")); - return new String(Hex.encodeHex(md.digest())); - } catch (final Exception e) { - System.err.println("Error creating id"); - return null; - } - } + public static String md5(final String s) { + try { + final MessageDigest md = MessageDigest.getInstance("MD5"); + md.update(s.getBytes("UTF-8")); + return new String(Hex.encodeHex(md.digest())); + } catch (final Exception e) { + System.err.println("Error creating id"); + return null; + } + } - public static List mergeAuthor(final List a, final List b) { - int pa = countAuthorsPids(a); - int pb = countAuthorsPids(b); - List base, enrich; - int sa = authorsSize(a); - int sb = authorsSize(b); + public static List mergeAuthor(final List a, final List b) { + int pa = countAuthorsPids(a); + int pb = countAuthorsPids(b); + List base, enrich; + int sa = authorsSize(a); + int sb = authorsSize(b); - if (pa == pb) { - base = sa > sb ? a : b; - enrich = sa > sb ? b : a; - } else { - base = pa > pb ? a : b; - enrich = pa > pb ? b : a; - } - enrichPidFromList(base, enrich); - return base; - } + if (pa == pb) { + base = sa > sb ? a : b; + enrich = sa > sb ? b : a; + } else { + base = pa > pb ? a : b; + enrich = pa > pb ? 
b : a; + } + enrichPidFromList(base, enrich); + return base; + } - private static void enrichPidFromList(List base, List enrich) { - if (base == null || enrich == null) return; - final Map basePidAuthorMap = - base.stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap(a -> a.getPid().stream().map(p -> new Tuple2<>(p.toComparableString(), a))) - .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); + private static void enrichPidFromList(List base, List enrich) { + if (base == null || enrich == null) + return; + final Map basePidAuthorMap = base + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap(a -> a.getPid().stream().map(p -> new Tuple2<>(p.toComparableString(), a))) + .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); - final List> pidToEnrich = - enrich.stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) - .flatMap( - a -> - a.getPid().stream() - .filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())) - .map(p -> new Tuple2<>(p, a))) - .collect(Collectors.toList()); + final List> pidToEnrich = enrich + .stream() + .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .flatMap( + a -> a + .getPid() + .stream() + .filter(p -> !basePidAuthorMap.containsKey(p.toComparableString())) + .map(p -> new Tuple2<>(p, a))) + .collect(Collectors.toList()); - pidToEnrich.forEach( - a -> { - Optional> simAuhtor = - base.stream() - .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) - .max(Comparator.comparing(Tuple2::_1)); - if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) { - Author r = simAuhtor.get()._2(); - r.getPid().add(a._1()); - } - }); - } + pidToEnrich + .forEach( + a -> { + Optional> simAuhtor = base + .stream() + .map(ba -> new Tuple2<>(sim(ba, a._2()), ba)) + .max(Comparator.comparing(Tuple2::_1)); + if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) { + Author r = simAuhtor.get()._2(); + r.getPid().add(a._1()); + } + }); + } - public static String createEntityPath(final String basePath, final String entityType) { - return String.format("%s/%s", basePath, entityType); - } + public static String createEntityPath(final String basePath, final String entityType) { + return String.format("%s/%s", basePath, entityType); + } - public static String createSimRelPath(final String basePath, final String entityType) { - return String.format("%s/%s/simRel", basePath, entityType); - } + public static String createSimRelPath(final String basePath, final String entityType) { + return String.format("%s/%s/simRel", basePath, entityType); + } - public static String createMergeRelPath(final String basePath, final String entityType) { - return String.format("%s/%s/mergeRel", basePath, entityType); - } + public static String createMergeRelPath(final String basePath, final String entityType) { + return String.format("%s/%s/mergeRel", basePath, entityType); + } - private static Double sim(Author a, Author b) { + private static Double sim(Author a, Author b) { - final Person pa = parse(a); - final Person pb = parse(b); + final Person pa = parse(a); + final Person pb = parse(b); - if (pa.isAccurate() & pb.isAccurate()) { - return new JaroWinkler() - .score(normalize(pa.getSurnameString()), normalize(pb.getSurnameString())); - } else { - return new JaroWinkler() - .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); - } - } + if (pa.isAccurate() & pb.isAccurate()) { + return new JaroWinkler() + .score(normalize(pa.getSurnameString()), 
normalize(pb.getSurnameString())); + } else { + return new JaroWinkler() + .score(normalize(pa.getNormalisedFullname()), normalize(pb.getNormalisedFullname())); + } + } - private static String normalize(final String s) { - return nfd(s) - .toLowerCase() - // do not compact the regexes in a single expression, would cause StackOverflowError - // in case - // of large input strings - .replaceAll("(\\W)+", " ") - .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") - .replaceAll("(\\p{Punct})+", " ") - .replaceAll("(\\d)+", " ") - .replaceAll("(\\n)+", " ") - .trim(); - } + private static String normalize(final String s) { + return nfd(s) + .toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError + // in case + // of large input strings + .replaceAll("(\\W)+", " ") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } - private static String nfd(final String s) { - return Normalizer.normalize(s, Normalizer.Form.NFD); - } + private static String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } - private static Person parse(Author author) { - if (StringUtils.isNotBlank(author.getSurname())) { - return new Person(author.getSurname() + ", " + author.getName(), false); - } else { - return new Person(author.getFullname(), false); - } - } + private static Person parse(Author author) { + if (StringUtils.isNotBlank(author.getSurname())) { + return new Person(author.getSurname() + ", " + author.getName(), false); + } else { + return new Person(author.getFullname(), false); + } + } - private static int countAuthorsPids(List authors) { - if (authors == null) return 0; + private static int countAuthorsPids(List authors) { + if (authors == null) + return 0; - return (int) authors.stream().filter(DedupUtility::hasPid).count(); - } + return (int) authors.stream().filter(DedupUtility::hasPid).count(); + } - private static int authorsSize(List authors) { - if (authors == null) return 0; - return authors.size(); - } + private static int authorsSize(List authors) { + if (authors == null) + return 0; + return authors.size(); + } - private static boolean hasPid(Author a) { - if (a == null || a.getPid() == null || a.getPid().size() == 0) return false; - return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); - } + private static boolean hasPid(Author a) { + if (a == null || a.getPid() == null || a.getPid().size() == 0) + return false; + return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java index 681a4168a..e7d49be98 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/Deduper.java @@ -1,11 +1,9 @@ + package eu.dnetlib.dedup; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.util.BlockProcessor; -import eu.dnetlib.pace.util.MapDocumentUtil; import java.util.*; import java.util.stream.Collectors; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.spark.api.java.JavaPairRDD; @@ -15,170 +13,170 @@ import 
org.apache.spark.api.java.function.Function2; import org.apache.spark.api.java.function.PairFlatMapFunction; import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.util.LongAccumulator; + +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.BlockProcessor; +import eu.dnetlib.pace.util.MapDocumentUtil; import scala.Serializable; import scala.Tuple2; public class Deduper implements Serializable { - private static final Log log = LogFactory.getLog(Deduper.class); + private static final Log log = LogFactory.getLog(Deduper.class); - /** - * @return the list of relations generated by the deduplication - * @param: the spark context - * @param: list of JSON entities to be deduped - * @param: the dedup configuration - */ - public static JavaPairRDD dedup( - JavaSparkContext context, JavaRDD entities, DedupConfig config) { + /** + * @return the list of relations generated by the deduplication + * @param: the spark context + * @param: list of JSON entities to be deduped + * @param: the dedup configuration + */ + public static JavaPairRDD dedup( + JavaSparkContext context, JavaRDD entities, DedupConfig config) { - Map accumulators = - DedupUtility.constructAccumulator(config, context.sc()); + Map accumulators = DedupUtility.constructAccumulator(config, context.sc()); - // create vertexes of the graph: - JavaPairRDD mapDocs = mapToVertexes(context, entities, config); + // create vertexes of the graph: + JavaPairRDD mapDocs = mapToVertexes(context, entities, config); - // create blocks for deduplication - JavaPairRDD> blocks = createBlocks(context, mapDocs, config); + // create blocks for deduplication + JavaPairRDD> blocks = createBlocks(context, mapDocs, config); - // create relations by comparing only elements in the same group - return computeRelations(context, blocks, config); + // create relations by comparing only elements in the same group + return computeRelations(context, blocks, config); - // final RDD> edgeRdd = relationRDD.map(it -> new - // Edge<>(it._1().hashCode(), - // it._2().hashCode(), "equalTo")).rdd(); - // - // RDD> vertexes = - // mapDocs.mapToPair((PairFunction, Object, MapDocument>) t -> - // new - // Tuple2((long) t._1().hashCode(), t._2())).rdd(); - // accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value())); - // - // return GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD(); - } + // final RDD> edgeRdd = relationRDD.map(it -> new + // Edge<>(it._1().hashCode(), + // it._2().hashCode(), "equalTo")).rdd(); + // + // RDD> vertexes = + // mapDocs.mapToPair((PairFunction, Object, MapDocument>) t -> + // new + // Tuple2((long) t._1().hashCode(), t._2())).rdd(); + // accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value())); + // + // return GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD(); + } - /** - * @return the list of relations generated by the deduplication - * @param: the spark context - * @param: list of blocks - * @param: the dedup configuration - */ - public static JavaPairRDD computeRelations( - JavaSparkContext context, - JavaPairRDD> blocks, - DedupConfig config) { + /** + * @return the list of relations generated by the deduplication + * @param: the spark context + * @param: list of blocks + * @param: the dedup configuration + */ + public static JavaPairRDD computeRelations( + JavaSparkContext context, + JavaPairRDD> blocks, + DedupConfig config) { - Map accumulators = - DedupUtility.constructAccumulator(config, context.sc()); + 
Map accumulators = DedupUtility.constructAccumulator(config, context.sc()); - return blocks - .flatMapToPair( - (PairFlatMapFunction>, String, String>) - it -> { - final SparkReporter reporter = new SparkReporter(accumulators); - new BlockProcessor(config).process(it._1(), it._2(), reporter); - return reporter.getRelations().iterator(); - }) - .mapToPair( - (PairFunction, String, Tuple2>) - item -> new Tuple2>(item._1() + item._2(), item)) - .reduceByKey((a, b) -> a) - .mapToPair( - (PairFunction>, String, String>) Tuple2::_2); - } + return blocks + .flatMapToPair( + (PairFlatMapFunction>, String, String>) it -> { + final SparkReporter reporter = new SparkReporter(accumulators); + new BlockProcessor(config).process(it._1(), it._2(), reporter); + return reporter.getRelations().iterator(); + }) + .mapToPair( + (PairFunction, String, Tuple2>) item -> new Tuple2>( + item._1() + item._2(), item)) + .reduceByKey((a, b) -> a) + .mapToPair( + (PairFunction>, String, String>) Tuple2::_2); + } - /** - * @return the list of blocks based on clustering of dedup configuration - * @param: the spark context - * @param: list of entities: - * @param: the dedup configuration - */ - public static JavaPairRDD> createBlocks( - JavaSparkContext context, JavaPairRDD mapDocs, DedupConfig config) { - return mapDocs - // the reduce is just to be sure that we haven't document with same id - .reduceByKey((a, b) -> a) - .map(Tuple2::_2) - // Clustering: from to List - .flatMapToPair( - (PairFlatMapFunction) - a -> - DedupUtility.getGroupingKeys(config, a).stream() - .map(it -> new Tuple2<>(it, a)) - .collect(Collectors.toList()) - .iterator()) - .groupByKey(); - } + /** + * @return the list of blocks based on clustering of dedup configuration + * @param: the spark context + * @param: list of entities: + * @param: the dedup configuration + */ + public static JavaPairRDD> createBlocks( + JavaSparkContext context, JavaPairRDD mapDocs, DedupConfig config) { + return mapDocs + // the reduce is just to be sure that we haven't document with same id + .reduceByKey((a, b) -> a) + .map(Tuple2::_2) + // Clustering: from to List + .flatMapToPair( + (PairFlatMapFunction) a -> DedupUtility + .getGroupingKeys(config, a) + .stream() + .map(it -> new Tuple2<>(it, a)) + .collect(Collectors.toList()) + .iterator()) + .groupByKey(); + } - public static JavaPairRDD> createsortedBlocks( - JavaSparkContext context, JavaPairRDD mapDocs, DedupConfig config) { - final String of = config.getWf().getOrderField(); - final int maxQueueSize = config.getWf().getGroupMaxSize(); - return mapDocs - // the reduce is just to be sure that we haven't document with same id - .reduceByKey((a, b) -> a) - .map(Tuple2::_2) - // Clustering: from to List - .flatMapToPair( - (PairFlatMapFunction>) - a -> - DedupUtility.getGroupingKeys(config, a).stream() - .map( - it -> { - List tmp = new ArrayList<>(); - tmp.add(a); - return new Tuple2<>(it, tmp); - }) - .collect(Collectors.toList()) - .iterator()) - .reduceByKey( - (Function2, List, List>) - (v1, v2) -> { - v1.addAll(v2); - v1.sort(Comparator.comparing(a -> a.getFieldMap().get(of).stringValue())); - if (v1.size() > maxQueueSize) return new ArrayList<>(v1.subList(0, maxQueueSize)); - return v1; - }); - } + public static JavaPairRDD> createsortedBlocks( + JavaSparkContext context, JavaPairRDD mapDocs, DedupConfig config) { + final String of = config.getWf().getOrderField(); + final int maxQueueSize = config.getWf().getGroupMaxSize(); + return mapDocs + // the reduce is just to be sure that we haven't document with 
same id + .reduceByKey((a, b) -> a) + .map(Tuple2::_2) + // Clustering: from to List + .flatMapToPair( + (PairFlatMapFunction>) a -> DedupUtility + .getGroupingKeys(config, a) + .stream() + .map( + it -> { + List tmp = new ArrayList<>(); + tmp.add(a); + return new Tuple2<>(it, tmp); + }) + .collect(Collectors.toList()) + .iterator()) + .reduceByKey( + (Function2, List, List>) (v1, v2) -> { + v1.addAll(v2); + v1.sort(Comparator.comparing(a -> a.getFieldMap().get(of).stringValue())); + if (v1.size() > maxQueueSize) + return new ArrayList<>(v1.subList(0, maxQueueSize)); + return v1; + }); + } - /** - * @return the list of vertexes: - * @param: the spark context - * @param: list of JSON entities - * @param: the dedup configuration - */ - public static JavaPairRDD mapToVertexes( - JavaSparkContext context, JavaRDD entities, DedupConfig config) { + /** + * @return the list of vertexes: + * @param: the spark context + * @param: list of JSON entities + * @param: the dedup configuration + */ + public static JavaPairRDD mapToVertexes( + JavaSparkContext context, JavaRDD entities, DedupConfig config) { - return entities.mapToPair( - (PairFunction) - s -> { - MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, s); - return new Tuple2(mapDocument.getIdentifier(), mapDocument); - }); - } + return entities + .mapToPair( + (PairFunction) s -> { + MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(config, s); + return new Tuple2(mapDocument.getIdentifier(), mapDocument); + }); + } - public static JavaPairRDD computeRelations2( - JavaSparkContext context, JavaPairRDD> blocks, DedupConfig config) { - Map accumulators = - DedupUtility.constructAccumulator(config, context.sc()); + public static JavaPairRDD computeRelations2( + JavaSparkContext context, JavaPairRDD> blocks, DedupConfig config) { + Map accumulators = DedupUtility.constructAccumulator(config, context.sc()); - return blocks - .flatMapToPair( - (PairFlatMapFunction>, String, String>) - it -> { - try { - final SparkReporter reporter = new SparkReporter(accumulators); - new BlockProcessor(config).processSortedBlock(it._1(), it._2(), reporter); - return reporter.getRelations().iterator(); - } catch (Exception e) { - throw new RuntimeException(it._2().get(0).getIdentifier(), e); - } - }) - .mapToPair( - (PairFunction, String, Tuple2>) - item -> new Tuple2>(item._1() + item._2(), item)) - .reduceByKey((a, b) -> a) - .mapToPair( - (PairFunction>, String, String>) Tuple2::_2); - } + return blocks + .flatMapToPair( + (PairFlatMapFunction>, String, String>) it -> { + try { + final SparkReporter reporter = new SparkReporter(accumulators); + new BlockProcessor(config).processSortedBlock(it._1(), it._2(), reporter); + return reporter.getRelations().iterator(); + } catch (Exception e) { + throw new RuntimeException(it._2().get(0).getIdentifier(), e); + } + }) + .mapToPair( + (PairFunction, String, Tuple2>) item -> new Tuple2>( + item._1() + item._2(), item)) + .reduceByKey((a, b) -> a) + .mapToPair( + (PairFunction>, String, String>) Tuple2::_2); + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java index 72c771a13..bc9948190 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/OafEntityType.java @@ -1,11 +1,6 @@ + package eu.dnetlib.dedup; public enum OafEntityType 
{ - datasource, - organization, - project, - dataset, - otherresearchproduct, - software, - publication + datasource, organization, project, dataset, otherresearchproduct, software, publication } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java index 1039b8636..f86410d29 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java @@ -1,14 +1,9 @@ + package eu.dnetlib.dedup; -import com.google.common.hash.Hashing; -import eu.dnetlib.dedup.graph.ConnectedComponent; -import eu.dnetlib.dedup.graph.GraphProcessor; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.util.MapDocumentUtil; import java.util.ArrayList; import java.util.List; + import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; @@ -20,86 +15,93 @@ import org.apache.spark.rdd.RDD; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; + +import com.google.common.hash.Hashing; + +import eu.dnetlib.dedup.graph.ConnectedComponent; +import eu.dnetlib.dedup.graph.GraphProcessor; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.util.MapDocumentUtil; import scala.Tuple2; public class SparkCreateConnectedComponent { - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateConnectedComponent.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .appName(SparkCreateConnectedComponent.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateConnectedComponent.class + .getResourceAsStream( + "/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkCreateConnectedComponent.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String targetPath = parser.get("targetPath"); - // final DedupConfig dedupConf = - // DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String inputPath = parser.get("sourcePath"); + final String entity = parser.get("entity"); + final String targetPath = parser.get("targetPath"); + // final DedupConfig dedupConf = + // 
DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); + final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - final JavaPairRDD vertexes = - sc.textFile(inputPath + "/" + entity) - .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) - .mapToPair( - (PairFunction) - s -> new Tuple2(getHashcode(s), s)); + final JavaPairRDD vertexes = sc + .textFile(inputPath + "/" + entity) + .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) + .mapToPair( + (PairFunction) s -> new Tuple2(getHashcode(s), s)); - final Dataset similarityRelations = - spark - .read() - .load(DedupUtility.createSimRelPath(targetPath, entity)) - .as(Encoders.bean(Relation.class)); - final RDD> edgeRdd = - similarityRelations - .javaRDD() - .map( - it -> - new Edge<>( - getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())) - .rdd(); - final JavaRDD cc = - GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()) - .toJavaRDD(); - final Dataset mergeRelation = - spark.createDataset( - cc.filter(k -> k.getDocIds().size() > 1) - .flatMap( - (FlatMapFunction) - c -> - c.getDocIds().stream() - .flatMap( - id -> { - List tmp = new ArrayList<>(); - Relation r = new Relation(); - r.setSource(c.getCcId()); - r.setTarget(id); - r.setRelClass("merges"); - tmp.add(r); - r = new Relation(); - r.setTarget(c.getCcId()); - r.setSource(id); - r.setRelClass("isMergedIn"); - tmp.add(r); - return tmp.stream(); - }) - .iterator()) - .rdd(), - Encoders.bean(Relation.class)); - mergeRelation - .write() - .mode("overwrite") - .save(DedupUtility.createMergeRelPath(targetPath, entity)); - } + final Dataset similarityRelations = spark + .read() + .load(DedupUtility.createSimRelPath(targetPath, entity)) + .as(Encoders.bean(Relation.class)); + final RDD> edgeRdd = similarityRelations + .javaRDD() + .map( + it -> new Edge<>( + getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())) + .rdd(); + final JavaRDD cc = GraphProcessor + .findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()) + .toJavaRDD(); + final Dataset mergeRelation = spark + .createDataset( + cc + .filter(k -> k.getDocIds().size() > 1) + .flatMap( + (FlatMapFunction) c -> c + .getDocIds() + .stream() + .flatMap( + id -> { + List tmp = new ArrayList<>(); + Relation r = new Relation(); + r.setSource(c.getCcId()); + r.setTarget(id); + r.setRelClass("merges"); + tmp.add(r); + r = new Relation(); + r.setTarget(c.getCcId()); + r.setSource(id); + r.setRelClass("isMergedIn"); + tmp.add(r); + return tmp.stream(); + }) + .iterator()) + .rdd(), + Encoders.bean(Relation.class)); + mergeRelation + .write() + .mode("overwrite") + .save(DedupUtility.createMergeRelPath(targetPath, entity)); + } - public static long getHashcode(final String id) { - return Hashing.murmur3_128().hashString(id).asLong(); - } + public static long getHashcode(final String id) { + return Hashing.murmur3_128().hashString(id).asLong(); + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java index dbc97466d..d87269f03 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java @@ -1,48 
+1,52 @@ + package eu.dnetlib.dedup; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.pace.config.DedupConfig; import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.pace.config.DedupConfig; + public class SparkCreateDedupRecord { - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateDedupRecord.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/dedup/dedupRecord_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .appName(SparkCreateDedupRecord.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateDedupRecord.class + .getResourceAsStream( + "/eu/dnetlib/dhp/sx/dedup/dedupRecord_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkCreateDedupRecord.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String sourcePath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String dedupPath = parser.get("dedupPath"); - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String sourcePath = parser.get("sourcePath"); + final String entity = parser.get("entity"); + final String dedupPath = parser.get("dedupPath"); + final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - final JavaRDD dedupRecord = - DedupRecordFactory.createDedupRecord( - sc, - spark, - DedupUtility.createMergeRelPath(dedupPath, entity), - DedupUtility.createEntityPath(sourcePath, entity), - OafEntityType.valueOf(entity), - dedupConf); - dedupRecord - .map( - r -> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(r); - }) - .saveAsTextFile(dedupPath + "/" + entity + "/dedup_records"); - } + final JavaRDD dedupRecord = DedupRecordFactory + .createDedupRecord( + sc, + spark, + DedupUtility.createMergeRelPath(dedupPath, entity), + DedupUtility.createEntityPath(sourcePath, entity), + OafEntityType.valueOf(entity), + dedupConf); + dedupRecord + .map( + r -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(r); + }) + .saveAsTextFile(dedupPath + "/" + entity + "/dedup_records"); + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java index b8df49af3..41fe911e7 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java @@ -1,80 +1,83 @@ + package eu.dnetlib.dedup; -import 
eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.util.MapDocumentUtil; import java.util.List; + import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; import scala.Tuple2; /** * This Spark class creates similarity relations between entities, saving result - * - *

<p>param request: sourcePath entityType target Path + * <p>

+ * param request: sourcePath entityType target Path */ public class SparkCreateSimRels { - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkCreateSimRels.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .appName(SparkCreateSimRels.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkCreateSimRels.class + .getResourceAsStream( + "/eu/dnetlib/dhp/sx/dedup/dedup_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkCreateSimRels.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String targetPath = parser.get("targetPath"); - // final DedupConfig dedupConf = - // DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String inputPath = parser.get("sourcePath"); + final String entity = parser.get("entity"); + final String targetPath = parser.get("targetPath"); + // final DedupConfig dedupConf = + // DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); + final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - JavaPairRDD mapDocument = - sc.textFile(inputPath + "/" + entity) - .mapToPair( - s -> { - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); - return new Tuple2<>(d.getIdentifier(), d); - }); + JavaPairRDD mapDocument = sc + .textFile(inputPath + "/" + entity) + .mapToPair( + s -> { + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); + return new Tuple2<>(d.getIdentifier(), d); + }); - // create blocks for deduplication - JavaPairRDD> blocks = - Deduper.createsortedBlocks(sc, mapDocument, dedupConf); - // JavaPairRDD> blocks = Deduper.createBlocks(sc, - // mapDocument, dedupConf); + // create blocks for deduplication + JavaPairRDD> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf); + // JavaPairRDD> blocks = Deduper.createBlocks(sc, + // mapDocument, dedupConf); - // create relations by comparing only elements in the same group - final JavaPairRDD dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf); - // final JavaPairRDD dedupRels = Deduper.computeRelations(sc, blocks, - // dedupConf); + // create relations by comparing only elements in the same group + final JavaPairRDD dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf); + // final JavaPairRDD dedupRels = Deduper.computeRelations(sc, blocks, + // dedupConf); - final JavaRDD isSimilarToRDD = - dedupRels.map( - simRel -> { - final Relation r = new Relation(); - r.setSource(simRel._1()); - r.setTarget(simRel._2()); - r.setRelClass("isSimilarTo"); - return r; - }); + final JavaRDD isSimilarToRDD = dedupRels + .map( + simRel -> { + final Relation r = new Relation(); + 
r.setSource(simRel._1()); + r.setTarget(simRel._2()); + r.setRelClass("isSimilarTo"); + return r; + }); - spark - .createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)) - .write() - .mode("overwrite") - .save(DedupUtility.createSimRelPath(targetPath, entity)); - } + spark + .createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)) + .write() + .mode("overwrite") + .save(DedupUtility.createSimRelPath(targetPath, entity)); + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java index d92eef2d4..21e72b5b8 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/SparkReporter.java @@ -1,49 +1,52 @@ + package eu.dnetlib.dedup; -import eu.dnetlib.pace.util.Reporter; import java.util.ArrayList; import java.util.List; import java.util.Map; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.spark.util.LongAccumulator; + +import eu.dnetlib.pace.util.Reporter; import scala.Serializable; import scala.Tuple2; public class SparkReporter implements Serializable, Reporter { - final List> relations = new ArrayList<>(); - private static final Log log = LogFactory.getLog(SparkReporter.class); - Map accumulators; + final List> relations = new ArrayList<>(); + private static final Log log = LogFactory.getLog(SparkReporter.class); + Map accumulators; - public SparkReporter(Map accumulators) { - this.accumulators = accumulators; - } + public SparkReporter(Map accumulators) { + this.accumulators = accumulators; + } - public void incrementCounter( - String counterGroup, - String counterName, - long delta, - Map accumulators) { + public void incrementCounter( + String counterGroup, + String counterName, + long delta, + Map accumulators) { - final String accumulatorName = String.format("%s::%s", counterGroup, counterName); - if (accumulators.containsKey(accumulatorName)) { - accumulators.get(accumulatorName).add(delta); - } - } + final String accumulatorName = String.format("%s::%s", counterGroup, counterName); + if (accumulators.containsKey(accumulatorName)) { + accumulators.get(accumulatorName).add(delta); + } + } - @Override - public void incrementCounter(String counterGroup, String counterName, long delta) { + @Override + public void incrementCounter(String counterGroup, String counterName, long delta) { - incrementCounter(counterGroup, counterName, delta, accumulators); - } + incrementCounter(counterGroup, counterName, delta, accumulators); + } - @Override - public void emit(String type, String from, String to) { - relations.add(new Tuple2<>(from, to)); - } + @Override + public void emit(String type, String from, String to) { + relations.add(new Tuple2<>(from, to)); + } - public List> getRelations() { - return relations; - } + public List> getRelations() { + return relations; + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java index 444d987d8..79a3114fd 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java @@ -1,78 +1,84 @@ + package eu.dnetlib.dedup.graph; 
-import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dedup.DedupUtility; -import eu.dnetlib.pace.util.PaceException; import java.io.IOException; import java.io.Serializable; import java.util.Set; + import org.apache.commons.lang.StringUtils; import org.codehaus.jackson.annotate.JsonIgnore; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dedup.DedupUtility; +import eu.dnetlib.pace.util.PaceException; + public class ConnectedComponent implements Serializable { - private Set docIds; - private String ccId; + private Set docIds; + private String ccId; - public ConnectedComponent() {} + public ConnectedComponent() { + } - public ConnectedComponent(Set docIds) { - this.docIds = docIds; - createID(); - } + public ConnectedComponent(Set docIds) { + this.docIds = docIds; + createID(); + } - public String createID() { - if (docIds.size() > 1) { - final String s = getMin(); - String prefix = s.split("\\|")[0]; - ccId = prefix + "|dedup_wf_001::" + DedupUtility.md5(s); - return ccId; - } else { - return docIds.iterator().next(); - } - } + public String createID() { + if (docIds.size() > 1) { + final String s = getMin(); + String prefix = s.split("\\|")[0]; + ccId = prefix + "|dedup_wf_001::" + DedupUtility.md5(s); + return ccId; + } else { + return docIds.iterator().next(); + } + } - @JsonIgnore - public String getMin() { + @JsonIgnore + public String getMin() { - final StringBuilder min = new StringBuilder(); - docIds.forEach( - i -> { - if (StringUtils.isBlank(min.toString())) { - min.append(i); - } else { - if (min.toString().compareTo(i) > 0) { - min.setLength(0); - min.append(i); - } - } - }); - return min.toString(); - } + final StringBuilder min = new StringBuilder(); + docIds + .forEach( + i -> { + if (StringUtils.isBlank(min.toString())) { + min.append(i); + } else { + if (min.toString().compareTo(i) > 0) { + min.setLength(0); + min.append(i); + } + } + }); + return min.toString(); + } - @Override - public String toString() { - ObjectMapper mapper = new ObjectMapper(); - try { - return mapper.writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("Failed to create Json: ", e); - } - } + @Override + public String toString() { + ObjectMapper mapper = new ObjectMapper(); + try { + return mapper.writeValueAsString(this); + } catch (IOException e) { + throw new PaceException("Failed to create Json: ", e); + } + } - public Set getDocIds() { - return docIds; - } + public Set getDocIds() { + return docIds; + } - public void setDocIds(Set docIds) { - this.docIds = docIds; - } + public void setDocIds(Set docIds) { + this.docIds = docIds; + } - public String getCcId() { - return ccId; - } + public String getCcId() { + return ccId; + } - public void setCcId(String ccId) { - this.ccId = ccId; - } + public void setCcId(String ccId) { + this.ccId = ccId; + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java index 54a2d5dba..e3d4fdbe3 100644 --- a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkPropagateRelationsJob.java @@ -1,110 +1,112 @@ + package eu.dnetlib.dedup.sx; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import 
eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.utils.DHPUtils; import java.io.IOException; + import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; public class SparkPropagateRelationsJob { - enum FieldType { - SOURCE, - TARGET - } + enum FieldType { + SOURCE, TARGET + } - static final String SOURCEJSONPATH = "$.source"; - static final String TARGETJSONPATH = "$.target"; + static final String SOURCEJSONPATH = "$.source"; + static final String TARGETJSONPATH = "$.target"; - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkPropagateRelationsJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/dedup/dedup_propagate_relation_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .appName(SparkUpdateEntityJob.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkPropagateRelationsJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/sx/dedup/dedup_propagate_relation_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkUpdateEntityJob.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String relationPath = parser.get("relationPath"); - final String mergeRelPath = parser.get("mergeRelPath"); - final String targetRelPath = parser.get("targetRelPath"); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String relationPath = parser.get("relationPath"); + final String mergeRelPath = parser.get("mergeRelPath"); + final String targetRelPath = parser.get("targetRelPath"); - final Dataset merge = - spark - .read() - .load(mergeRelPath) - .as(Encoders.bean(Relation.class)) - .where("relClass == 'merges'"); + final Dataset merge = spark + .read() + .load(mergeRelPath) + .as(Encoders.bean(Relation.class)) + .where("relClass == 'merges'"); - final Dataset rels = - spark.read().load(relationPath).as(Encoders.bean(Relation.class)); + final Dataset rels = spark.read().load(relationPath).as(Encoders.bean(Relation.class)); - final Dataset firstJoin = - rels.joinWith(merge, merge.col("target").equalTo(rels.col("source")), "left_outer") - .map( - (MapFunction, Relation>) - r -> { - final Relation mergeRelation = r._2(); - final Relation relation = r._1(); + final Dataset firstJoin = rels + .joinWith(merge, merge.col("target").equalTo(rels.col("source")), "left_outer") + .map( + (MapFunction, Relation>) r -> { + final Relation mergeRelation = r._2(); + final Relation relation = r._1(); - if (mergeRelation != null) relation.setSource(mergeRelation.getSource()); - return relation; - }, - Encoders.bean(Relation.class)); + 
if (mergeRelation != null) + relation.setSource(mergeRelation.getSource()); + return relation; + }, + Encoders.bean(Relation.class)); - final Dataset secondJoin = - firstJoin - .joinWith(merge, merge.col("target").equalTo(firstJoin.col("target")), "left_outer") - .map( - (MapFunction, Relation>) - r -> { - final Relation mergeRelation = r._2(); - final Relation relation = r._1(); - if (mergeRelation != null) relation.setTarget(mergeRelation.getSource()); - return relation; - }, - Encoders.bean(Relation.class)); + final Dataset secondJoin = firstJoin + .joinWith(merge, merge.col("target").equalTo(firstJoin.col("target")), "left_outer") + .map( + (MapFunction, Relation>) r -> { + final Relation mergeRelation = r._2(); + final Relation relation = r._1(); + if (mergeRelation != null) + relation.setTarget(mergeRelation.getSource()); + return relation; + }, + Encoders.bean(Relation.class)); - secondJoin.write().mode(SaveMode.Overwrite).save(targetRelPath); - } + secondJoin.write().mode(SaveMode.Overwrite).save(targetRelPath); + } - private static boolean containsDedup(final String json) { - final String source = DHPUtils.getJPathString(SOURCEJSONPATH, json); - final String target = DHPUtils.getJPathString(TARGETJSONPATH, json); + private static boolean containsDedup(final String json) { + final String source = DHPUtils.getJPathString(SOURCEJSONPATH, json); + final String target = DHPUtils.getJPathString(TARGETJSONPATH, json); - return source.toLowerCase().contains("dedup") || target.toLowerCase().contains("dedup"); - } + return source.toLowerCase().contains("dedup") || target.toLowerCase().contains("dedup"); + } - private static String replaceField(final String json, final String id, final FieldType type) { - ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - try { - Relation relation = mapper.readValue(json, Relation.class); - if (relation.getDataInfo() == null) relation.setDataInfo(new DataInfo()); - relation.getDataInfo().setDeletedbyinference(false); - switch (type) { - case SOURCE: - relation.setSource(id); - return mapper.writeValueAsString(relation); - case TARGET: - relation.setTarget(id); - return mapper.writeValueAsString(relation); - default: - throw new IllegalArgumentException(""); - } - } catch (IOException e) { - throw new RuntimeException("unable to deserialize json relation: " + json, e); - } - } + private static String replaceField(final String json, final String id, final FieldType type) { + ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + try { + Relation relation = mapper.readValue(json, Relation.class); + if (relation.getDataInfo() == null) + relation.setDataInfo(new DataInfo()); + relation.getDataInfo().setDeletedbyinference(false); + switch (type) { + case SOURCE: + relation.setSource(id); + return mapper.writeValueAsString(relation); + case TARGET: + relation.setTarget(id); + return mapper.writeValueAsString(relation); + default: + throw new IllegalArgumentException(""); + } + } catch (IOException e) { + throw new RuntimeException("unable to deserialize json relation: " + json, e); + } + } } diff --git a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java index 6ebdb0572..a847ad612 100644 --- 
a/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java +++ b/dhp-workflows/dhp-dedup-scholexplorer/src/main/java/eu/dnetlib/dedup/sx/SparkUpdateEntityJob.java @@ -1,7 +1,19 @@ + package eu.dnetlib.dedup.sx; +import java.io.IOException; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.*; + import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Oaf; @@ -10,90 +22,81 @@ import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown; import eu.dnetlib.dhp.utils.DHPUtils; -import java.io.IOException; -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.io.compress.GzipCodec; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.sql.*; import scala.Tuple2; public class SparkUpdateEntityJob { - static final String IDJSONPATH = "$.id"; + static final String IDJSONPATH = "$.id"; - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkUpdateEntityJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .appName(SparkUpdateEntityJob.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkUpdateEntityJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/sx/dedup/dedup_delete_by_inference_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkUpdateEntityJob.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String entityPath = parser.get("entityPath"); - final String mergeRelPath = parser.get("mergeRelPath"); - final String dedupRecordPath = parser.get("dedupRecordPath"); - final String entity = parser.get("entity"); - final String destination = parser.get("targetPath"); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String entityPath = parser.get("entityPath"); + final String mergeRelPath = parser.get("mergeRelPath"); + final String dedupRecordPath = parser.get("dedupRecordPath"); + final String entity = parser.get("entity"); + final String destination = parser.get("targetPath"); - final Dataset df = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class)); - final JavaPairRDD mergedIds = - df.where("relClass == 'merges'") - .select(df.col("target")) - .distinct() - .toJavaRDD() - .mapToPair((PairFunction) r -> new Tuple2<>(r.getString(0), "d")); - final JavaRDD sourceEntity = 
sc.textFile(entityPath); + final Dataset df = spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class)); + final JavaPairRDD mergedIds = df + .where("relClass == 'merges'") + .select(df.col("target")) + .distinct() + .toJavaRDD() + .mapToPair((PairFunction) r -> new Tuple2<>(r.getString(0), "d")); + final JavaRDD sourceEntity = sc.textFile(entityPath); - final JavaRDD dedupEntity = sc.textFile(dedupRecordPath); - JavaPairRDD entitiesWithId = - sourceEntity.mapToPair( - (PairFunction) - s -> new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, s), s)); - Class mainClass; - switch (entity) { - case "publication": - mainClass = DLIPublication.class; - break; - case "dataset": - mainClass = DLIDataset.class; - break; - case "unknown": - mainClass = DLIUnknown.class; - break; - default: - throw new IllegalArgumentException("Illegal type " + entity); - } - JavaRDD map = - entitiesWithId - .leftOuterJoin(mergedIds) - .map( - k -> - k._2()._2().isPresent() - ? updateDeletedByInference(k._2()._1(), mainClass) - : k._2()._1()); - map.union(dedupEntity).saveAsTextFile(destination, GzipCodec.class); - } + final JavaRDD dedupEntity = sc.textFile(dedupRecordPath); + JavaPairRDD entitiesWithId = sourceEntity + .mapToPair( + (PairFunction) s -> new Tuple2<>(DHPUtils.getJPathString(IDJSONPATH, s), s)); + Class mainClass; + switch (entity) { + case "publication": + mainClass = DLIPublication.class; + break; + case "dataset": + mainClass = DLIDataset.class; + break; + case "unknown": + mainClass = DLIUnknown.class; + break; + default: + throw new IllegalArgumentException("Illegal type " + entity); + } + JavaRDD map = entitiesWithId + .leftOuterJoin(mergedIds) + .map( + k -> k._2()._2().isPresent() + ? updateDeletedByInference(k._2()._1(), mainClass) + : k._2()._1()); + map.union(dedupEntity).saveAsTextFile(destination, GzipCodec.class); + } - private static String updateDeletedByInference( - final String json, final Class clazz) { - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - try { - Oaf entity = mapper.readValue(json, clazz); - if (entity.getDataInfo() == null) entity.setDataInfo(new DataInfo()); - entity.getDataInfo().setDeletedbyinference(true); - return mapper.writeValueAsString(entity); - } catch (IOException e) { - throw new RuntimeException("Unable to convert json", e); - } - } + private static String updateDeletedByInference( + final String json, final Class clazz) { + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + try { + Oaf entity = mapper.readValue(json, clazz); + if (entity.getDataInfo() == null) + entity.setDataInfo(new DataInfo()); + entity.getDataInfo().setDeletedbyinference(true); + return mapper.writeValueAsString(entity); + } catch (IOException e) { + throw new RuntimeException("Unable to convert json", e); + } + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveImporterJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveImporterJob.java index 1046df609..0f74c6343 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveImporterJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveImporterJob.java @@ -1,11 +1,10 @@ + package eu.dnetlib.dhp.oa.graph.hive; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; -import 
com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.common.ModelSupport; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; @@ -15,61 +14,68 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelSupport; + public class GraphHiveImporterJob { - private static final Logger log = LoggerFactory.getLogger(GraphHiveImporterJob.class); + private static final Logger log = LoggerFactory.getLogger(GraphHiveImporterJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - GraphHiveImporterJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/input_graph_hive_parameters.json"))); - parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + GraphHiveImporterJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/input_graph_hive_parameters.json"))); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("inputPath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); - String hiveDbName = parser.get("hiveDbName"); - log.info("hiveDbName: {}", hiveDbName); + String hiveDbName = parser.get("hiveDbName"); + log.info("hiveDbName: {}", hiveDbName); - String hiveMetastoreUris = parser.get("hiveMetastoreUris"); - log.info("hiveMetastoreUris: {}", hiveMetastoreUris); + String hiveMetastoreUris = parser.get("hiveMetastoreUris"); + log.info("hiveMetastoreUris: {}", hiveMetastoreUris); - SparkConf conf = new SparkConf(); - conf.set("hive.metastore.uris", hiveMetastoreUris); + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", hiveMetastoreUris); - runWithSparkHiveSession( - conf, isSparkSessionManaged, spark -> loadGraphAsHiveDB(spark, inputPath, hiveDbName)); - } + runWithSparkHiveSession( + conf, isSparkSessionManaged, spark -> loadGraphAsHiveDB(spark, inputPath, hiveDbName)); + } - // protected for testing - private static void loadGraphAsHiveDB(SparkSession spark, String inputPath, String hiveDbName) { + // protected for testing + private static void loadGraphAsHiveDB(SparkSession spark, String inputPath, String hiveDbName) { - spark.sql(String.format("DROP DATABASE IF EXISTS %s CASCADE", hiveDbName)); - spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName)); + spark.sql(String.format("DROP DATABASE IF EXISTS %s CASCADE", hiveDbName)); + spark.sql(String.format("CREATE DATABASE IF NOT EXISTS %s", hiveDbName)); - 
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - // Read the input file and convert it into RDD of serializable object - ModelSupport.oafTypes.forEach( - (name, clazz) -> - spark - .createDataset( - sc.textFile(inputPath + "/" + name) - .map(s -> OBJECT_MAPPER.readValue(s, clazz)) - .rdd(), - Encoders.bean(clazz)) - .write() - .mode(SaveMode.Overwrite) - .saveAsTable(hiveDbName + "." + name)); - } + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + // Read the input file and convert it into RDD of serializable object + ModelSupport.oafTypes + .forEach( + (name, clazz) -> spark + .createDataset( + sc + .textFile(inputPath + "/" + name) + .map(s -> OBJECT_MAPPER.readValue(s, clazz)) + .rdd(), + Encoders.bean(clazz)) + .write() + .mode(SaveMode.Overwrite) + .saveAsTable(hiveDbName + "." + name)); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 69cd0001f..e20d1eb79 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; @@ -10,6 +11,19 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.dom4j.Document; +import org.dom4j.DocumentFactory; +import org.dom4j.DocumentHelper; +import org.dom4j.Node; + import eu.dnetlib.dhp.oa.graph.raw.common.MigrationConstants; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Context; @@ -29,440 +43,429 @@ import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Date; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import org.apache.commons.lang3.StringUtils; -import org.dom4j.Document; -import org.dom4j.DocumentFactory; -import org.dom4j.DocumentHelper; -import org.dom4j.Node; public abstract class AbstractMdRecordToOafMapper { - protected final Map code2name; - - protected static final Qualifier MAIN_TITLE_QUALIFIER = - qualifier("main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title"); - - protected AbstractMdRecordToOafMapper(final Map code2name) { - this.code2name = code2name; - } - - public List processMdRecord(final String xml) { - try { - final Map nsContext = new HashMap<>(); - nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); - nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); - nsContext.put("oaf", "http://namespace.openaire.eu/oaf"); - nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/"); - nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance"); - nsContext.put("dc", 
"http://purl.org/dc/elements/1.1/"); - nsContext.put("datacite", "http://datacite.org/schema/kernel-3"); - DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); - - final Document doc = - DocumentHelper.parseText( - xml.replaceAll( - "http://datacite.org/schema/kernel-4", "http://datacite.org/schema/kernel-3")); - - final String type = doc.valueOf("//dr:CobjCategory/@type"); - final KeyValue collectedFrom = - keyValue( - createOpenaireId(10, doc.valueOf("//oaf:collectedFrom/@id"), true), - doc.valueOf("//oaf:collectedFrom/@name")); - final KeyValue hostedBy = - StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) - ? collectedFrom - : keyValue( - createOpenaireId(10, doc.valueOf("//oaf:hostedBy/@id"), true), - doc.valueOf("//oaf:hostedBy/@name")); - - final DataInfo info = prepareDataInfo(doc); - final long lastUpdateTimestamp = new Date().getTime(); - - return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - - protected List createOafs( - final Document doc, - final String type, - final KeyValue collectedFrom, - final KeyValue hostedBy, - final DataInfo info, - final long lastUpdateTimestamp) { - - final List oafs = new ArrayList<>(); - - switch (type.toLowerCase()) { - case "": - case "publication": - final Publication p = new Publication(); - populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - p.setResulttype(MigrationConstants.PUBLICATION_RESULTTYPE_QUALIFIER); - p.setJournal(prepareJournal(doc, info)); - oafs.add(p); - break; - case "dataset": - final Dataset d = new Dataset(); - populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - d.setResulttype(MigrationConstants.DATASET_RESULTTYPE_QUALIFIER); - d.setStoragedate(prepareDatasetStorageDate(doc, info)); - d.setDevice(prepareDatasetDevice(doc, info)); - d.setSize(prepareDatasetSize(doc, info)); - d.setVersion(prepareDatasetVersion(doc, info)); - d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); - d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); - d.setGeolocation(prepareDatasetGeoLocations(doc, info)); - oafs.add(d); - break; - case "software": - final Software s = new Software(); - populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - s.setResulttype(MigrationConstants.SOFTWARE_RESULTTYPE_QUALIFIER); - s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); - s.setLicense(prepareSoftwareLicenses(doc, info)); - s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); - s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); - oafs.add(s); - break; - case "otherresearchproducts": - default: - final OtherResearchProduct o = new OtherResearchProduct(); - populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp); - o.setResulttype(MigrationConstants.OTHER_RESULTTYPE_QUALIFIER); - o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); - o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); - o.setTool(prepareOtherResearchProductTools(doc, info)); - oafs.add(o); - break; - } - - if (!oafs.isEmpty()) { - oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp)); - oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp)); - } - - return oafs; - } - - private List addProjectRels( - final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - 
final long lastUpdateTimestamp) { - - final List res = new ArrayList<>(); - - final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); - - for (final Object o : doc.selectNodes("//oaf:projectid")) { - - final String originalId = ((Node) o).getText(); - - if (StringUtils.isNotBlank(originalId)) { - final String projectId = createOpenaireId(40, originalId, true); - - final Relation r1 = new Relation(); - r1.setRelType("resultProject"); - r1.setSubRelType("outcome"); - r1.setRelClass("isProducedBy"); - r1.setSource(docId); - r1.setTarget(projectId); - r1.setCollectedfrom(Arrays.asList(collectedFrom)); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); - res.add(r1); - - final Relation r2 = new Relation(); - r2.setRelType("resultProject"); - r2.setSubRelType("outcome"); - r2.setRelClass("produces"); - r2.setSource(projectId); - r2.setTarget(docId); - r2.setCollectedfrom(Arrays.asList(collectedFrom)); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); - res.add(r2); - } - } - - return res; - } - - protected abstract List addOtherResultRels( - final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp); - - private void populateResultFields( - final Result r, - final Document doc, - final KeyValue collectedFrom, - final KeyValue hostedBy, - final DataInfo info, - final long lastUpdateTimestamp) { - r.setDataInfo(info); - r.setLastupdatetimestamp(lastUpdateTimestamp); - r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); - r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier"))); - r.setCollectedfrom(Arrays.asList(collectedFrom)); - r.setPid( - prepareListStructProps( - doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info)); - r.setDateofcollection(doc.valueOf("//dr:dateOfCollection")); - r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation")); - r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setOaiprovenance(prepareOAIprovenance(doc)); - r.setAuthor(prepareAuthors(doc, info)); - r.setLanguage(prepareLanguages(doc)); - r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setSubject(prepareSubjects(doc, info)); - r.setTitle(prepareTitles(doc, info)); - r.setRelevantdate(prepareRelevantDates(doc, info)); - r.setDescription(prepareDescriptions(doc, info)); - r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info)); - r.setPublisher(preparePublisher(doc, info)); - r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info)); - r.setSource(prepareSources(doc, info)); - r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setFormat(prepareFormats(doc, info)); - r.setContributor(prepareContributors(doc, info)); - r.setResourcetype(prepareResourceType(doc, info)); - r.setCoverage(prepareCoverages(doc, info)); - r.setContext(prepareContexts(doc, info)); - r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES - r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy)); - } - - private List prepareContexts(final Document doc, final DataInfo info) { - final List list = new ArrayList<>(); - for (final Object o : doc.selectNodes("//oaf:concept")) { - final String cid = ((Node) o).valueOf("@id"); - if (StringUtils.isNotBlank(cid)) { - final Context c = new Context(); - c.setId(cid); - c.setDataInfo(Arrays.asList(info)); - list.add(c); - } - } - return list; - } - - protected abstract Qualifier prepareResourceType(Document doc, 
DataInfo info); - - protected abstract List prepareInstances( - Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby); - - protected abstract List> prepareSources(Document doc, DataInfo info); - - protected abstract List prepareRelevantDates(Document doc, DataInfo info); - - protected abstract List> prepareCoverages(Document doc, DataInfo info); - - protected abstract List> prepareContributors(Document doc, DataInfo info); - - protected abstract List> prepareFormats(Document doc, DataInfo info); - - protected abstract Field preparePublisher(Document doc, DataInfo info); - - protected abstract List> prepareDescriptions(Document doc, DataInfo info); - - protected abstract List prepareTitles(Document doc, DataInfo info); - - protected abstract List prepareSubjects(Document doc, DataInfo info); - - protected abstract Qualifier prepareLanguages(Document doc); - - protected abstract List prepareAuthors(Document doc, DataInfo info); - - protected abstract List> prepareOtherResearchProductTools( - Document doc, DataInfo info); - - protected abstract List> prepareOtherResearchProductContactGroups( - Document doc, DataInfo info); - - protected abstract List> prepareOtherResearchProductContactPersons( - Document doc, DataInfo info); - - protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info); - - protected abstract Field prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info); - - protected abstract List prepareSoftwareLicenses(Document doc, DataInfo info); - - protected abstract List> prepareSoftwareDocumentationUrls( - Document doc, DataInfo info); - - protected abstract List prepareDatasetGeoLocations(Document doc, DataInfo info); - - protected abstract Field prepareDatasetMetadataVersionNumber(Document doc, DataInfo info); - - protected abstract Field prepareDatasetLastMetadataUpdate(Document doc, DataInfo info); - - protected abstract Field prepareDatasetVersion(Document doc, DataInfo info); - - protected abstract Field prepareDatasetSize(Document doc, DataInfo info); - - protected abstract Field prepareDatasetDevice(Document doc, DataInfo info); - - protected abstract Field prepareDatasetStorageDate(Document doc, DataInfo info); - - private Journal prepareJournal(final Document doc, final DataInfo info) { - final Node n = doc.selectSingleNode("//oaf:journal"); - if (n != null) { - final String name = n.getText(); - final String issnPrinted = n.valueOf("@issn"); - final String issnOnline = n.valueOf("@eissn"); - final String issnLinking = n.valueOf("@lissn"); - final String ep = n.valueOf("@ep"); - final String iss = n.valueOf("@iss"); - final String sp = n.valueOf("@sp"); - final String vol = n.valueOf("@vol"); - final String edition = n.valueOf("@edition"); - if (StringUtils.isNotBlank(name)) { - return journal( - name, - issnPrinted, - issnOnline, - issnLinking, - ep, - iss, - sp, - vol, - edition, - null, - null, - info); - } - } - return null; - } - - protected Qualifier prepareQualifier( - final Node node, final String xpath, final String schemeId, final String schemeName) { - final String classId = node.valueOf(xpath); - final String className = code2name.get(classId); - return qualifier(classId, className, schemeId, schemeName); - } - - protected List prepareListStructProps( - final Node node, - final String xpath, - final String xpathClassId, - final String schemeId, - final String schemeName, - final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : node.selectNodes(xpath)) { - final Node n = 
(Node) o;
-      final String classId = n.valueOf(xpathClassId);
-      final String className = code2name.get(classId);
-      res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info));
-    }
-    return res;
-  }
-
-  protected List<StructuredProperty> prepareListStructProps(
-      final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) {
-    final List<StructuredProperty> res = new ArrayList<>();
-    for (final Object o : node.selectNodes(xpath)) {
-      final Node n = (Node) o;
-      res.add(structuredProperty(n.getText(), qualifier, info));
-    }
-    return res;
-  }
-
-  protected List<StructuredProperty> prepareListStructProps(
-      final Node node, final String xpath, final DataInfo info) {
-    final List<StructuredProperty> res = new ArrayList<>();
-    for (final Object o : node.selectNodes(xpath)) {
-      final Node n = (Node) o;
-      res.add(
-          structuredProperty(
-              n.getText(),
-              n.valueOf("@classid"),
-              n.valueOf("@classname"),
-              n.valueOf("@schemeid"),
-              n.valueOf("@schemename"),
-              info));
-    }
-    return res;
-  }
-
-  protected OAIProvenance prepareOAIprovenance(final Document doc) {
-    final Node n =
-        doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
-
-    if (n == null) {
-      return null;
-    }
-
-    final String identifier = n.valueOf("./*[local-name()='identifier']");
-    final String baseURL = n.valueOf("./*[local-name()='baseURL']");
-    ;
-    final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");
-    ;
-    final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true");
-    final String datestamp = n.valueOf("./*[local-name()='datestamp']");
-    ;
-    final String harvestDate = n.valueOf("@harvestDate");
-    ;
-
-    return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate);
-  }
-
-  protected DataInfo prepareDataInfo(final Document doc) {
-    final Node n = doc.selectSingleNode("//oaf:datainfo");
-
-    if (n == null) {
-      return dataInfo(
-          false, null, false, false, MigrationConstants.REPOSITORY_PROVENANCE_ACTIONS, "0.9");
-    }
-
-    final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
-    final String paClassName = n.valueOf("./oaf:provenanceaction/@classname");
-    final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid");
-    final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename");
-
-    final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference"));
-    final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance");
-    final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred"));
-    final String trust = n.valueOf("./oaf:trust");
-
-    return dataInfo(
-        deletedbyinference,
-        inferenceprovenance,
-        inferred,
-        false,
-        qualifier(paClassId, paClassName, paSchemeId, paSchemeName),
-        trust);
-  }
-
-  protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
-    return field(node.valueOf(xpath), info);
-  }
-
-  protected List<Field<String>> prepareListFields(
-      final Node node, final String xpath, final DataInfo info) {
-    return listFields(info, prepareListString(node, xpath));
-  }
-
-  protected List<String> prepareListString(final Node node, final String xpath) {
-    final List<String> res = new ArrayList<>();
-    for (final Object o : node.selectNodes(xpath)) {
-      final String s = ((Node) o).getText().trim();
-      if (StringUtils.isNotBlank(s)) {
-        res.add(s);
-      }
-    }
-    return res;
-  }
+	protected final Map<String, String> code2name;
+
+	protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier(
+		"main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");
+
+	protected AbstractMdRecordToOafMapper(final Map<String, String> code2name) {
+		this.code2name = code2name;
+	}
+
+	public List<Oaf> processMdRecord(final String xml) {
+		try {
+			final Map<String, String> nsContext = new HashMap<>();
+			nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
+			nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
+			nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
+			nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
+			nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
+			nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
+			nsContext.put("datacite", "http://datacite.org/schema/kernel-3");
+			DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);
+
+			final Document doc = DocumentHelper
+				.parseText(
+					xml
+						.replaceAll(
+							"http://datacite.org/schema/kernel-4", "http://datacite.org/schema/kernel-3"));
+
+			final String type = doc.valueOf("//dr:CobjCategory/@type");
+			final KeyValue collectedFrom = keyValue(
+				createOpenaireId(10, doc.valueOf("//oaf:collectedFrom/@id"), true),
+				doc.valueOf("//oaf:collectedFrom/@name"));
+			final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id"))
+				? collectedFrom
+				: keyValue(
+					createOpenaireId(10, doc.valueOf("//oaf:hostedBy/@id"), true),
+					doc.valueOf("//oaf:hostedBy/@name"));
+
+			final DataInfo info = prepareDataInfo(doc);
+			final long lastUpdateTimestamp = new Date().getTime();
+
+			return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp);
+		} catch (final Exception e) {
+			throw new RuntimeException(e);
+		}
+	}
+
+	protected List<Oaf> createOafs(
+		final Document doc,
+		final String type,
+		final KeyValue collectedFrom,
+		final KeyValue hostedBy,
+		final DataInfo info,
+		final long lastUpdateTimestamp) {
+
+		final List<Oaf> oafs = new ArrayList<>();
+
+		switch (type.toLowerCase()) {
+		case "":
+		case "publication":
+			final Publication p = new Publication();
+			populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
+			p.setResulttype(MigrationConstants.PUBLICATION_RESULTTYPE_QUALIFIER);
+			p.setJournal(prepareJournal(doc, info));
+			oafs.add(p);
+			break;
+		case "dataset":
+			final Dataset d = new Dataset();
+			populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
+			d.setResulttype(MigrationConstants.DATASET_RESULTTYPE_QUALIFIER);
+			d.setStoragedate(prepareDatasetStorageDate(doc, info));
+			d.setDevice(prepareDatasetDevice(doc, info));
+			d.setSize(prepareDatasetSize(doc, info));
+			d.setVersion(prepareDatasetVersion(doc, info));
+			d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info));
+			d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info));
+			d.setGeolocation(prepareDatasetGeoLocations(doc, info));
+			oafs.add(d);
+			break;
+		case "software":
+			final Software s = new Software();
+			populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
+			s.setResulttype(MigrationConstants.SOFTWARE_RESULTTYPE_QUALIFIER);
+			s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
+			s.setLicense(prepareSoftwareLicenses(doc, info));
+			s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
+			s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info));
+			oafs.add(s);
+			break;
+		case "otherresearchproducts":
+		default:
+			final OtherResearchProduct o = new OtherResearchProduct();
+			populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
+			o.setResulttype(MigrationConstants.OTHER_RESULTTYPE_QUALIFIER);
+			o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
+			o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
+			o.setTool(prepareOtherResearchProductTools(doc, info));
+			oafs.add(o);
+			break;
+		}
+
+		if (!oafs.isEmpty()) {
+			oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp));
+			oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp));
+		}
+
+		return oafs;
+	}
+
+	private List<Oaf> addProjectRels(
+		final Document doc,
+		final KeyValue collectedFrom,
+		final DataInfo info,
+		final long lastUpdateTimestamp) {
+
+		final List<Oaf> res = new ArrayList<>();
+
+		final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
+
+		for (final Object o : doc.selectNodes("//oaf:projectid")) {
+
+			final String originalId = ((Node) o).getText();
+
+			if (StringUtils.isNotBlank(originalId)) {
+				final String projectId = createOpenaireId(40, originalId, true);
+
+				final Relation r1 = new Relation();
+				r1.setRelType("resultProject");
+				r1.setSubRelType("outcome");
+				r1.setRelClass("isProducedBy");
+				r1.setSource(docId);
+				r1.setTarget(projectId);
+				r1.setCollectedfrom(Arrays.asList(collectedFrom));
+				r1.setDataInfo(info);
+				r1.setLastupdatetimestamp(lastUpdateTimestamp);
+				res.add(r1);
+
+				final Relation r2 = new Relation();
+				r2.setRelType("resultProject");
+				r2.setSubRelType("outcome");
+				r2.setRelClass("produces");
+				r2.setSource(projectId);
+				r2.setTarget(docId);
+				r2.setCollectedfrom(Arrays.asList(collectedFrom));
+				r2.setDataInfo(info);
+				r2.setLastupdatetimestamp(lastUpdateTimestamp);
+				res.add(r2);
+			}
+		}
+
+		return res;
+	}
+
+	protected abstract List<Oaf> addOtherResultRels(
+		final Document doc,
+		final KeyValue collectedFrom,
+		final DataInfo info,
+		final long lastUpdateTimestamp);
+
+	private void populateResultFields(
+		final Result r,
+		final Document doc,
+		final KeyValue collectedFrom,
+		final KeyValue hostedBy,
+		final DataInfo info,
+		final long lastUpdateTimestamp) {
+		r.setDataInfo(info);
+		r.setLastupdatetimestamp(lastUpdateTimestamp);
+		r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
+		r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
+		r.setCollectedfrom(Arrays.asList(collectedFrom));
+		r
+			.setPid(
+				prepareListStructProps(
+					doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info));
+		r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
+		r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
+		r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
+		r.setOaiprovenance(prepareOAIprovenance(doc));
+		r.setAuthor(prepareAuthors(doc, info));
+		r.setLanguage(prepareLanguages(doc));
+		r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES
+		r.setSubject(prepareSubjects(doc, info));
+		r.setTitle(prepareTitles(doc, info));
+		r.setRelevantdate(prepareRelevantDates(doc, info));
+		r.setDescription(prepareDescriptions(doc, info));
+		r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info));
+		r.setPublisher(preparePublisher(doc, info));
+		r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
+		r.setSource(prepareSources(doc, info));
+		r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
+		r.setFormat(prepareFormats(doc, info));
+		r.setContributor(prepareContributors(doc, info));
+		r.setResourcetype(prepareResourceType(doc, info));
+		r.setCoverage(prepareCoverages(doc, info));
+		r.setContext(prepareContexts(doc, info));
+		r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
+		r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy));
+	}
+
+	private List<Context> prepareContexts(final Document doc, final DataInfo info) {
+		final List<Context> list = new ArrayList<>();
+		for (final Object o : doc.selectNodes("//oaf:concept")) {
+			final String cid = ((Node) o).valueOf("@id");
+			if (StringUtils.isNotBlank(cid)) {
+				final Context c = new Context();
+				c.setId(cid);
+				c.setDataInfo(Arrays.asList(info));
+				list.add(c);
+			}
+		}
+		return list;
+	}
+
+	protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);
+
+	protected abstract List<Instance> prepareInstances(
+		Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby);
+
+	protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info);
+
+	protected abstract List<StructuredProperty> prepareRelevantDates(Document doc, DataInfo info);
+
+	protected abstract List<Field<String>> prepareCoverages(Document doc, DataInfo info);
+
+	protected abstract List<Field<String>> prepareContributors(Document doc, DataInfo info);
+
+	protected abstract List<Field<String>> prepareFormats(Document doc, DataInfo info);
+
+	protected abstract Field<String> preparePublisher(Document doc, DataInfo info);
+
+	protected abstract List<Field<String>> prepareDescriptions(Document doc, DataInfo info);
+
+	protected abstract List<StructuredProperty> prepareTitles(Document doc, DataInfo info);
+
+	protected abstract List<StructuredProperty> prepareSubjects(Document doc, DataInfo info);
+
+	protected abstract Qualifier prepareLanguages(Document doc);
+
+	protected abstract List<Author> prepareAuthors(Document doc, DataInfo info);
+
+	protected abstract List<Field<String>> prepareOtherResearchProductTools(
+		Document doc, DataInfo info);
+
+	protected abstract List<Field<String>> prepareOtherResearchProductContactGroups(
+		Document doc, DataInfo info);
+
+	protected abstract List<Field<String>> prepareOtherResearchProductContactPersons(
+		Document doc, DataInfo info);
+
+	protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info);
+
+	protected abstract Field<String> prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info);
+
+	protected abstract List<StructuredProperty> prepareSoftwareLicenses(Document doc, DataInfo info);
+
+	protected abstract List<Field<String>> prepareSoftwareDocumentationUrls(
+		Document doc, DataInfo info);
+
+	protected abstract List<GeoLocation> prepareDatasetGeoLocations(Document doc, DataInfo info);
+
+	protected abstract Field<String> prepareDatasetMetadataVersionNumber(Document doc, DataInfo info);
+
+	protected abstract Field<String> prepareDatasetLastMetadataUpdate(Document doc, DataInfo info);
+
+	protected abstract Field<String> prepareDatasetVersion(Document doc, DataInfo info);
+
+	protected abstract Field<String> prepareDatasetSize(Document doc, DataInfo info);
+
+	protected abstract Field<String> prepareDatasetDevice(Document doc, DataInfo info);
+
+	protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
+
+	private Journal prepareJournal(final Document doc, final DataInfo info) {
+		final Node n = doc.selectSingleNode("//oaf:journal");
+		if (n != null) {
+			final String name = n.getText();
+			final String issnPrinted = n.valueOf("@issn");
+			final String issnOnline = n.valueOf("@eissn");
+			final String issnLinking = n.valueOf("@lissn");
+			final String ep = n.valueOf("@ep");
+			final String iss = n.valueOf("@iss");
+			final String sp = n.valueOf("@sp");
+			final String vol = n.valueOf("@vol");
+			final String edition = n.valueOf("@edition");
+			if (StringUtils.isNotBlank(name)) {
+				return journal(
+					name,
+					issnPrinted,
+					issnOnline,
+					issnLinking,
+					ep,
+					iss,
+					sp,
+					vol,
+					edition,
+					null,
+					null,
+					info);
+			}
+		}
+		return null;
+	}
+
+	protected Qualifier prepareQualifier(
+		final Node node, final String xpath, final String schemeId, final String schemeName) {
+		final String classId = node.valueOf(xpath);
+		final String className = code2name.get(classId);
+		return qualifier(classId, className, schemeId, schemeName);
+	}
+
+	protected List<StructuredProperty> prepareListStructProps(
+		final Node node,
+		final String xpath,
+		final String xpathClassId,
+		final String schemeId,
+		final String schemeName,
+		final DataInfo info) {
+		final List<StructuredProperty> res = new ArrayList<>();
+		for (final Object o : node.selectNodes(xpath)) {
+			final Node n = (Node) o;
+			final String classId = n.valueOf(xpathClassId);
+			final String className = code2name.get(classId);
+			res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info));
+		}
+		return res;
+	}
+
+	protected List<StructuredProperty> prepareListStructProps(
+		final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) {
+		final List<StructuredProperty> res = new ArrayList<>();
+		for (final Object o : node.selectNodes(xpath)) {
+			final Node n = (Node) o;
+			res.add(structuredProperty(n.getText(), qualifier, info));
+		}
+		return res;
+	}
+
+	protected List<StructuredProperty> prepareListStructProps(
+		final Node node, final String xpath, final DataInfo info) {
+		final List<StructuredProperty> res = new ArrayList<>();
+		for (final Object o : node.selectNodes(xpath)) {
+			final Node n = (Node) o;
+			res
+				.add(
+					structuredProperty(
+						n.getText(),
+						n.valueOf("@classid"),
+						n.valueOf("@classname"),
+						n.valueOf("@schemeid"),
+						n.valueOf("@schemename"),
+						info));
+		}
+		return res;
+	}
+
+	protected OAIProvenance prepareOAIprovenance(final Document doc) {
+		final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
+
+		if (n == null) {
+			return null;
+		}
+
+		final String identifier = n.valueOf("./*[local-name()='identifier']");
+		final String baseURL = n.valueOf("./*[local-name()='baseURL']");
+		;
+		final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");
+		;
+		final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true");
+		final String datestamp = n.valueOf("./*[local-name()='datestamp']");
+		;
+		final String harvestDate = n.valueOf("@harvestDate");
+		;
+
+		return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate);
+	}
+
+	protected DataInfo prepareDataInfo(final Document doc) {
+		final Node n = doc.selectSingleNode("//oaf:datainfo");
+
+		if (n == null) {
+			return dataInfo(
+				false, null, false, false, MigrationConstants.REPOSITORY_PROVENANCE_ACTIONS, "0.9");
+		}
+
+		final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
+		final String paClassName = n.valueOf("./oaf:provenanceaction/@classname");
+		final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid");
+		final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename");
+
+		final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference"));
+		final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance");
+		final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred"));
+		final String trust = n.valueOf("./oaf:trust");
+
+		return dataInfo(
+			deletedbyinference,
+			inferenceprovenance,
+			inferred,
+			false,
+			qualifier(paClassId, paClassName, paSchemeId, paSchemeName),
+			trust);
+	}
+
+	protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
+		return field(node.valueOf(xpath), info);
+	}
+
+	protected List<Field<String>> prepareListFields(
+		final Node node, final String xpath, final 
DataInfo info) { + return listFields(info, prepareListString(node, xpath)); + } + + protected List prepareListString(final Node node, final String xpath) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final String s = ((Node) o).getText().trim(); + if (StringUtils.isNotBlank(s)) { + res.add(s); + } + } + return res; + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/DispatchEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/DispatchEntitiesApplication.java index 8029f8422..1aab78afe 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/DispatchEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/DispatchEntitiesApplication.java @@ -1,12 +1,10 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; @@ -18,66 +16,72 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; + public class DispatchEntitiesApplication { - private static final Logger log = LoggerFactory.getLogger(DispatchEntitiesApplication.class); + private static final Logger log = LoggerFactory.getLogger(DispatchEntitiesApplication.class); - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - MigrateMongoMdstoresApplication.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/dispatch_entities_parameters.json"))); - parser.parseArgument(args); + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + MigrateMongoMdstoresApplication.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/dispatch_entities_parameters.json"))); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String sourcePath = parser.get("sourcePath"); - final String targetPath = parser.get("graphRawPath"); + final String sourcePath = parser.get("sourcePath"); + final String targetPath = parser.get("graphRawPath"); - SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, targetPath); - ModelSupport.oafTypes - .values() - .forEach(clazz -> processEntity(spark, clazz, sourcePath, targetPath)); - }); - } + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, 
targetPath); + ModelSupport.oafTypes + .values() + .forEach(clazz -> processEntity(spark, clazz, sourcePath, targetPath)); + }); + } - private static void processEntity( - final SparkSession spark, - final Class clazz, - final String sourcePath, - final String targetPath) { - final String type = clazz.getSimpleName().toLowerCase(); + private static void processEntity( + final SparkSession spark, + final Class clazz, + final String sourcePath, + final String targetPath) { + final String type = clazz.getSimpleName().toLowerCase(); - log.info("Processing entities ({}) in file: {}", type, sourcePath); + log.info("Processing entities ({}) in file: {}", type, sourcePath); - spark - .read() - .textFile(sourcePath) - .filter((FilterFunction) value -> isEntityType(value, type)) - .map( - (MapFunction) l -> StringUtils.substringAfter(l, "|"), - Encoders.STRING()) - .write() - .option("compression", "gzip") - .mode(SaveMode.Overwrite) - .text(targetPath + "/" + type); - } + spark + .read() + .textFile(sourcePath) + .filter((FilterFunction) value -> isEntityType(value, type)) + .map( + (MapFunction) l -> StringUtils.substringAfter(l, "|"), + Encoders.STRING()) + .write() + .option("compression", "gzip") + .mode(SaveMode.Overwrite) + .text(targetPath + "/" + type); + } - private static boolean isEntityType(final String line, final String type) { - return StringUtils.substringBefore(line, "|").equalsIgnoreCase(type); - } + private static boolean isEntityType(final String line, final String type) { + return StringUtils.substringBefore(line, "|").equalsIgnoreCase(type); + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index 21288ad98..ccc9f8a89 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -1,17 +1,13 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.graph.raw.common.DbClient; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; import java.io.IOException; import java.sql.SQLException; import java.util.*; import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.fs.FileSystem; @@ -24,172 +20,182 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.graph.raw.common.DbClient; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; public class 
GenerateEntitiesApplication { - private static final Logger log = LoggerFactory.getLogger(GenerateEntitiesApplication.class); + private static final Logger log = LoggerFactory.getLogger(GenerateEntitiesApplication.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - MigrateMongoMdstoresApplication.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json"))); + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + MigrateMongoMdstoresApplication.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json"))); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String sourcePaths = parser.get("sourcePaths"); - final String targetPath = parser.get("targetPath"); + final String sourcePaths = parser.get("sourcePaths"); + final String targetPath = parser.get("targetPath"); - final String dbUrl = parser.get("postgresUrl"); - final String dbUser = parser.get("postgresUser"); - final String dbPassword = parser.get("postgresPassword"); + final String dbUrl = parser.get("postgresUrl"); + final String dbUser = parser.get("postgresUser"); + final String dbPassword = parser.get("postgresPassword"); - final Map code2name = loadClassNames(dbUrl, dbUser, dbPassword); + final Map code2name = loadClassNames(dbUrl, dbUser, dbPassword); - SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, targetPath); - generateEntities(spark, code2name, sourcePaths, targetPath); - }); - } + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, targetPath); + generateEntities(spark, code2name, sourcePaths, targetPath); + }); + } - private static void generateEntities( - final SparkSession spark, - final Map code2name, - final String sourcePaths, - final String targetPath) { + private static void generateEntities( + final SparkSession spark, + final Map code2name, + final String sourcePaths, + final String targetPath) { - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - final List existingSourcePaths = - Arrays.stream(sourcePaths.split(",")) - .filter(p -> exists(sc, p)) - .collect(Collectors.toList()); + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + final List existingSourcePaths = Arrays + .stream(sourcePaths.split(",")) + .filter(p -> exists(sc, p)) + .collect(Collectors.toList()); - log.info("Generate entities from files:"); - existingSourcePaths.forEach(log::info); + log.info("Generate entities from files:"); + existingSourcePaths.forEach(log::info); - JavaRDD inputRdd = sc.emptyRDD(); + JavaRDD inputRdd = sc.emptyRDD(); - for (final String sp : 
existingSourcePaths) { - inputRdd = - inputRdd.union( - sc.sequenceFile(sp, Text.class, Text.class) - .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) - .map(k -> convertToListOaf(k._1(), k._2(), code2name)) - .flatMap(list -> list.iterator())); - } + for (final String sp : existingSourcePaths) { + inputRdd = inputRdd + .union( + sc + .sequenceFile(sp, Text.class, Text.class) + .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) + .map(k -> convertToListOaf(k._1(), k._2(), code2name)) + .flatMap(list -> list.iterator())); + } - inputRdd - .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf)) - .reduceByKey((o1, o2) -> merge(o1, o2)) - .map(Tuple2::_2) - .map( - oaf -> - oaf.getClass().getSimpleName().toLowerCase() - + "|" - + OBJECT_MAPPER.writeValueAsString(oaf)) - .saveAsTextFile(targetPath, GzipCodec.class); - } + inputRdd + .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf)) + .reduceByKey((o1, o2) -> merge(o1, o2)) + .map(Tuple2::_2) + .map( + oaf -> oaf.getClass().getSimpleName().toLowerCase() + + "|" + + OBJECT_MAPPER.writeValueAsString(oaf)) + .saveAsTextFile(targetPath, GzipCodec.class); + } - private static Oaf merge(Oaf o1, Oaf o2) { - if (ModelSupport.isSubClass(o1, OafEntity.class)) { - ((OafEntity) o1).mergeFrom((OafEntity) o2); - } else if (ModelSupport.isSubClass(o1, Relation.class)) { - ((Relation) o1).mergeFrom((Relation) o2); - } else { - throw new RuntimeException("invalid Oaf type:" + o1.getClass().getCanonicalName()); - } - return o1; - } + private static Oaf merge(Oaf o1, Oaf o2) { + if (ModelSupport.isSubClass(o1, OafEntity.class)) { + ((OafEntity) o1).mergeFrom((OafEntity) o2); + } else if (ModelSupport.isSubClass(o1, Relation.class)) { + ((Relation) o1).mergeFrom((Relation) o2); + } else { + throw new RuntimeException("invalid Oaf type:" + o1.getClass().getCanonicalName()); + } + return o1; + } - private static List convertToListOaf( - final String id, final String s, final Map code2name) { - final String type = StringUtils.substringAfter(id, ":"); + private static List convertToListOaf( + final String id, final String s, final Map code2name) { + final String type = StringUtils.substringAfter(id, ":"); - switch (type.toLowerCase()) { - case "native_oaf": - return new OafToOafMapper(code2name).processMdRecord(s); - case "native_odf": - return new OdfToOafMapper(code2name).processMdRecord(s); - case "datasource": - return Arrays.asList(convertFromJson(s, Datasource.class)); - case "organization": - return Arrays.asList(convertFromJson(s, Organization.class)); - case "project": - return Arrays.asList(convertFromJson(s, Project.class)); - case "relation": - return Arrays.asList(convertFromJson(s, Relation.class)); - case "publication": - return Arrays.asList(convertFromJson(s, Publication.class)); - case "dataset": - return Arrays.asList(convertFromJson(s, Dataset.class)); - case "software": - return Arrays.asList(convertFromJson(s, Software.class)); - case "otherresearchproduct": - return Arrays.asList(convertFromJson(s, OtherResearchProduct.class)); - default: - throw new RuntimeException("type not managed: " + type.toLowerCase()); - } - } + switch (type.toLowerCase()) { + case "native_oaf": + return new OafToOafMapper(code2name).processMdRecord(s); + case "native_odf": + return new OdfToOafMapper(code2name).processMdRecord(s); + case "datasource": + return Arrays.asList(convertFromJson(s, Datasource.class)); + case "organization": + return Arrays.asList(convertFromJson(s, Organization.class)); + case "project": 
+ return Arrays.asList(convertFromJson(s, Project.class)); + case "relation": + return Arrays.asList(convertFromJson(s, Relation.class)); + case "publication": + return Arrays.asList(convertFromJson(s, Publication.class)); + case "dataset": + return Arrays.asList(convertFromJson(s, Dataset.class)); + case "software": + return Arrays.asList(convertFromJson(s, Software.class)); + case "otherresearchproduct": + return Arrays.asList(convertFromJson(s, OtherResearchProduct.class)); + default: + throw new RuntimeException("type not managed: " + type.toLowerCase()); + } + } - private static Map loadClassNames( - final String dbUrl, final String dbUser, final String dbPassword) throws IOException { + private static Map loadClassNames( + final String dbUrl, final String dbUser, final String dbPassword) throws IOException { - log.info("Loading vocabulary terms from db..."); + log.info("Loading vocabulary terms from db..."); - final Map map = new HashMap<>(); + final Map map = new HashMap<>(); - try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { - dbClient.processResults( - "select code, name from class", - rs -> { - try { - map.put(rs.getString("code"), rs.getString("name")); - } catch (final SQLException e) { - e.printStackTrace(); - } - }); - } + try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { + dbClient + .processResults( + "select code, name from class", + rs -> { + try { + map.put(rs.getString("code"), rs.getString("name")); + } catch (final SQLException e) { + e.printStackTrace(); + } + }); + } - log.info("Found " + map.size() + " terms."); + log.info("Found " + map.size() + " terms."); - return map; - } + return map; + } - private static Oaf convertFromJson(final String s, final Class clazz) { - try { - return OBJECT_MAPPER.readValue(s, clazz); - } catch (final Exception e) { - log.error("Error parsing object of class: " + clazz); - log.error(s); - throw new RuntimeException(e); - } - } + private static Oaf convertFromJson(final String s, final Class clazz) { + try { + return OBJECT_MAPPER.readValue(s, clazz); + } catch (final Exception e) { + log.error("Error parsing object of class: " + clazz); + log.error(s); + throw new RuntimeException(e); + } + } - private static boolean exists(final JavaSparkContext context, final String pathToFile) { - try { - final FileSystem hdfs = FileSystem.get(context.hadoopConfiguration()); - final Path path = new Path(pathToFile); - return hdfs.exists(path); - } catch (final IOException e) { - throw new RuntimeException(e); - } - } + private static boolean exists(final JavaSparkContext context, final String pathToFile) { + try { + final FileSystem hdfs = FileSystem.get(context.hadoopConfiguration()); + final Path path = new Path(pathToFile); + return hdfs.exists(path); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MergeClaimsApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MergeClaimsApplication.java index 7667735cb..9b99097ce 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MergeClaimsApplication.java +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MergeClaimsApplication.java @@ -1,14 +1,11 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.Objects; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; @@ -20,113 +17,118 @@ import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; public class MergeClaimsApplication { - private static final Logger log = LoggerFactory.getLogger(MergeClaimsApplication.class); + private static final Logger log = LoggerFactory.getLogger(MergeClaimsApplication.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - MigrateMongoMdstoresApplication.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json"))); - parser.parseArgument(args); + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + MigrateMongoMdstoresApplication.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json"))); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String rawGraphPath = parser.get("rawGraphPath"); - log.info("rawGraphPath: {}", rawGraphPath); + final String rawGraphPath = parser.get("rawGraphPath"); + log.info("rawGraphPath: {}", rawGraphPath); - final String claimsGraphPath = parser.get("claimsGraphPath"); - log.info("claimsGraphPath: {}", claimsGraphPath); + final String claimsGraphPath = parser.get("claimsGraphPath"); + log.info("claimsGraphPath: {}", claimsGraphPath); - final String outputRawGaphPath = parser.get("outputRawGaphPath"); - log.info("outputRawGaphPath: {}", outputRawGaphPath); + final String outputRawGaphPath = parser.get("outputRawGaphPath"); + log.info("outputRawGaphPath: {}", outputRawGaphPath); - String graphTableClassName = parser.get("graphTableClassName"); - log.info("graphTableClassName: {}", graphTableClassName); + String graphTableClassName = parser.get("graphTableClassName"); + log.info("graphTableClassName: {}", graphTableClassName); - Class clazz = (Class) Class.forName(graphTableClassName); + Class clazz = (Class) 
Class.forName(graphTableClassName); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - String type = clazz.getSimpleName().toLowerCase(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + String type = clazz.getSimpleName().toLowerCase(); - String rawPath = rawGraphPath + "/" + type; - String claimPath = claimsGraphPath + "/" + type; - String outPath = outputRawGaphPath + "/" + type; + String rawPath = rawGraphPath + "/" + type; + String claimPath = claimsGraphPath + "/" + type; + String outPath = outputRawGaphPath + "/" + type; - removeOutputDir(spark, outPath); - mergeByType(spark, rawPath, claimPath, outPath, clazz); - }); - } + removeOutputDir(spark, outPath); + mergeByType(spark, rawPath, claimPath, outPath, clazz); + }); + } - private static void mergeByType( - SparkSession spark, String rawPath, String claimPath, String outPath, Class clazz) { - Dataset> raw = - readFromPath(spark, rawPath, clazz) - .map( - (MapFunction>) - value -> new Tuple2<>(ModelSupport.idFn().apply(value), value), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + private static void mergeByType( + SparkSession spark, String rawPath, String claimPath, String outPath, Class clazz) { + Dataset> raw = readFromPath(spark, rawPath, clazz) + .map( + (MapFunction>) value -> new Tuple2<>(ModelSupport.idFn().apply(value), value), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); - final JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - Dataset> claim = - jsc.broadcast(readFromPath(spark, claimPath, clazz)) - .getValue() - .map( - (MapFunction>) - value -> new Tuple2<>(ModelSupport.idFn().apply(value), value), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + final JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + Dataset> claim = jsc + .broadcast(readFromPath(spark, claimPath, clazz)) + .getValue() + .map( + (MapFunction>) value -> new Tuple2<>(ModelSupport.idFn().apply(value), value), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); - raw.joinWith(claim, raw.col("_1").equalTo(claim.col("_1")), "full_outer") - .map( - (MapFunction, Tuple2>, T>) - value -> { - Optional> opRaw = Optional.ofNullable(value._1()); - Optional> opClaim = Optional.ofNullable(value._2()); + raw + .joinWith(claim, raw.col("_1").equalTo(claim.col("_1")), "full_outer") + .map( + (MapFunction, Tuple2>, T>) value -> { + Optional> opRaw = Optional.ofNullable(value._1()); + Optional> opClaim = Optional.ofNullable(value._2()); - return opRaw.isPresent() - ? opRaw.get()._2() - : opClaim.isPresent() ? opClaim.get()._2() : null; - }, - Encoders.bean(clazz)) - .filter(Objects::nonNull) - .map( - (MapFunction) value -> OBJECT_MAPPER.writeValueAsString(value), - Encoders.STRING()) - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .text(outPath); - } + return opRaw.isPresent() + ? opRaw.get()._2() + : opClaim.isPresent() ? 
opClaim.get()._2() : null; + }, + Encoders.bean(clazz)) + .filter(Objects::nonNull) + .map( + (MapFunction) value -> OBJECT_MAPPER.writeValueAsString(value), + Encoders.STRING()) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(outPath); + } - private static Dataset readFromPath( - SparkSession spark, String path, Class clazz) { - return spark - .read() - .textFile(path) - .map( - (MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), - Encoders.bean(clazz)) - .filter((FilterFunction) value -> Objects.nonNull(ModelSupport.idFn().apply(value))); - } + private static Dataset readFromPath( + SparkSession spark, String path, Class clazz) { + return spark + .read() + .textFile(path) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), + Encoders.bean(clazz)) + .filter((FilterFunction) value -> Objects.nonNull(ModelSupport.idFn().apply(value))); + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index a5114abc3..aa63f9ebc 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.asString; @@ -10,6 +11,23 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listKeyValues; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; +import java.io.Closeable; +import java.io.IOException; +import java.sql.Array; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.List; +import java.util.function.Consumer; +import java.util.function.Function; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; import eu.dnetlib.dhp.oa.graph.raw.common.DbClient; @@ -31,24 +49,8 @@ import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import java.io.Closeable; -import java.io.IOException; -import java.sql.Array; -import java.sql.ResultSet; -import java.sql.SQLException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Date; -import java.util.List; -import java.util.function.Consumer; -import java.util.function.Function; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; public class MigrateDbEntitiesApplication extends AbstractMigrationApplication - implements Closeable 
{ private static final Log log = LogFactory.getLog(MigrateDbEntitiesApplication.class); @@ -541,4 +543,5 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication dbClient.close(); } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java index 95f4477e8..00c1dc4bb 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java @@ -1,70 +1,73 @@ + package eu.dnetlib.dhp.oa.graph.raw; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; -import eu.dnetlib.dhp.oa.graph.raw.common.MdstoreClient; import java.io.Closeable; import java.io.IOException; import java.util.Map; import java.util.Map.Entry; + import org.apache.commons.io.IOUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; +import eu.dnetlib.dhp.oa.graph.raw.common.MdstoreClient; + public class MigrateMongoMdstoresApplication extends AbstractMigrationApplication - implements Closeable { + implements Closeable { - private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class); + private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class); - private final MdstoreClient mdstoreClient; + private final MdstoreClient mdstoreClient; - public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - MigrateMongoMdstoresApplication.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/graph/migrate_mongo_mstores_parameters.json"))); - parser.parseArgument(args); + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + MigrateMongoMdstoresApplication.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/graph/migrate_mongo_mstores_parameters.json"))); + parser.parseArgument(args); - final String mongoBaseUrl = parser.get("mongoBaseUrl"); - final String mongoDb = parser.get("mongoDb"); + final String mongoBaseUrl = parser.get("mongoBaseUrl"); + final String mongoDb = parser.get("mongoDb"); - final String mdFormat = parser.get("mdFormat"); - final String mdLayout = parser.get("mdLayout"); - final String mdInterpretation = parser.get("mdInterpretation"); + final String mdFormat = parser.get("mdFormat"); + final String mdLayout = parser.get("mdLayout"); + final String mdInterpretation = parser.get("mdInterpretation"); - final String hdfsPath = parser.get("hdfsPath"); + final String hdfsPath = parser.get("hdfsPath"); - try (MigrateMongoMdstoresApplication app = - new MigrateMongoMdstoresApplication(hdfsPath, mongoBaseUrl, mongoDb)) { - app.execute(mdFormat, mdLayout, mdInterpretation); - } - } + try (MigrateMongoMdstoresApplication app = new MigrateMongoMdstoresApplication(hdfsPath, mongoBaseUrl, + mongoDb)) { + app.execute(mdFormat, mdLayout, mdInterpretation); + } + } - public MigrateMongoMdstoresApplication( - final String hdfsPath, final String mongoBaseUrl, final String 
mongoDb) throws Exception { - super(hdfsPath); - this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb); - } + public MigrateMongoMdstoresApplication( + final String hdfsPath, final String mongoBaseUrl, final String mongoDb) throws Exception { + super(hdfsPath); + this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb); + } - public void execute(final String format, final String layout, final String interpretation) { - final Map colls = - mdstoreClient.validCollections(format, layout, interpretation); - log.info("Found " + colls.size() + " mdstores"); + public void execute(final String format, final String layout, final String interpretation) { + final Map colls = mdstoreClient.validCollections(format, layout, interpretation); + log.info("Found " + colls.size() + " mdstores"); - for (final Entry entry : colls.entrySet()) { - log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")"); - final String currentColl = entry.getValue(); + for (final Entry entry : colls.entrySet()) { + log.info("Processing mdstore " + entry.getKey() + " (collection: " + entry.getValue() + ")"); + final String currentColl = entry.getValue(); - for (final String xml : mdstoreClient.listRecords(currentColl)) { - emit(xml, "native_" + format); - } - } - } + for (final String xml : mdstoreClient.listRecords(currentColl)) { + emit(xml, "native_" + format); + } + } + } - @Override - public void close() throws IOException { - super.close(); - mdstoreClient.close(); - } + @Override + public void close() throws IOException { + super.close(); + mdstoreClient.close(); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 2a40e1802..286656149 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -1,8 +1,18 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.dom4j.Document; +import org.dom4j.Node; + import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; @@ -14,254 +24,251 @@ import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import org.apache.commons.lang3.StringUtils; -import org.dom4j.Document; -import org.dom4j.Node; public class OafToOafMapper extends AbstractMdRecordToOafMapper { - public OafToOafMapper(final Map code2name) { - super(code2name); - } + public OafToOafMapper(final Map code2name) { + super(code2name); + } - @Override - protected List prepareAuthors(final Document doc, final DataInfo info) { - final List res = new ArrayList<>(); - int pos = 1; - for (final Object o : doc.selectNodes("//dc:creator")) { - final Node n = (Node) o; - final Author author = new Author(); - author.setFullname(n.getText()); - author.setRank(pos++); - final PacePerson p = new 
PacePerson(n.getText(), false); - if (p.isAccurate()) { - author.setName(p.getNormalisedFirstName()); - author.setSurname(p.getNormalisedSurname()); - } - res.add(author); - } - return res; - } + @Override + protected List prepareAuthors(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + int pos = 1; + for (final Object o : doc.selectNodes("//dc:creator")) { + final Node n = (Node) o; + final Author author = new Author(); + author.setFullname(n.getText()); + author.setRank(pos++); + final PacePerson p = new PacePerson(n.getText(), false); + if (p.isAccurate()) { + author.setName(p.getNormalisedFirstName()); + author.setSurname(p.getNormalisedSurname()); + } + res.add(author); + } + return res; + } - @Override - protected Qualifier prepareLanguages(final Document doc) { - return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages"); - } + @Override + protected Qualifier prepareLanguages(final Document doc) { + return prepareQualifier(doc, "//dc:language", "dnet:languages", "dnet:languages"); + } - @Override - protected List prepareSubjects(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//dc:subject", info); - } + @Override + protected List prepareSubjects(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//dc:subject", info); + } - @Override - protected List prepareTitles(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info); - } + @Override + protected List prepareTitles(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//dc:title", MAIN_TITLE_QUALIFIER, info); + } - @Override - protected List> prepareDescriptions(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:description", info); - } + @Override + protected List> prepareDescriptions(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:description", info); + } - @Override - protected Field preparePublisher(final Document doc, final DataInfo info) { - return prepareField(doc, "//dc:publisher", info); - } + @Override + protected Field preparePublisher(final Document doc, final DataInfo info) { + return prepareField(doc, "//dc:publisher", info); + } - @Override - protected List> prepareFormats(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:format", info); - } + @Override + protected List> prepareFormats(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:format", info); + } - @Override - protected List> prepareContributors(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:contributor", info); - } + @Override + protected List> prepareContributors(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:contributor", info); + } - @Override - protected List> prepareCoverages(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:coverage", info); - } + @Override + protected List> prepareCoverages(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:coverage", info); + } - @Override - protected List prepareInstances( - final Document doc, - final DataInfo info, - final KeyValue collectedfrom, - final KeyValue hostedby) { - final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("//dc:identifier")) { - final String url = ((Node) o).getText().trim(); - if (url.startsWith("http")) { - 
final Instance instance = new Instance(); - instance.setUrl(Arrays.asList(url)); - instance.setInstancetype( - prepareQualifier( - doc, - "//dr:CobjCategory", - "dnet:publication_resource", - "dnet:publication_resource")); - instance.setCollectedfrom(collectedfrom); - instance.setHostedby(hostedby); - instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); - instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); - instance.setAccessright( - prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes")); - instance.setLicense(field(doc.valueOf("//oaf:license"), info)); - instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); - instance.setProcessingchargeamount( - field(doc.valueOf("//oaf:processingchargeamount"), info)); - instance.setProcessingchargecurrency( - field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); - res.add(instance); - } - } - return res; - } + @Override + protected List prepareInstances( + final Document doc, + final DataInfo info, + final KeyValue collectedfrom, + final KeyValue hostedby) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("//dc:identifier")) { + final String url = ((Node) o).getText().trim(); + if (url.startsWith("http")) { + final Instance instance = new Instance(); + instance.setUrl(Arrays.asList(url)); + instance + .setInstancetype( + prepareQualifier( + doc, + "//dr:CobjCategory", + "dnet:publication_resource", + "dnet:publication_resource")); + instance.setCollectedfrom(collectedfrom); + instance.setHostedby(hostedby); + instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); + instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); + instance + .setAccessright( + prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes")); + instance.setLicense(field(doc.valueOf("//oaf:license"), info)); + instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); + instance + .setProcessingchargeamount( + field(doc.valueOf("//oaf:processingchargeamount"), info)); + instance + .setProcessingchargecurrency( + field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + res.add(instance); + } + } + return res; + } - @Override - protected List> prepareSources(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//dc:source", info); - } + @Override + protected List> prepareSources(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//dc:source", info); + } - @Override - protected List prepareRelevantDates(final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List prepareRelevantDates(final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - // SOFTWARES + // SOFTWARES - @Override - protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected Field prepareSoftwareCodeRepositoryUrl( - final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareSoftwareCodeRepositoryUrl( + final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected List 
prepareSoftwareLicenses( - final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List prepareSoftwareLicenses( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - @Override - protected List> prepareSoftwareDocumentationUrls( - final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List> prepareSoftwareDocumentationUrls( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - // DATASETS - @Override - protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + // DATASETS + @Override + protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetMetadataVersionNumber( - final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareDatasetMetadataVersionNumber( + final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetLastMetadataUpdate( - final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareDatasetLastMetadataUpdate( + final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetSize(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareDatasetSize(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - @Override - protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } - // OTHER PRODUCTS + // OTHER PRODUCTS - @Override - protected List> prepareOtherResearchProductTools( - final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List> prepareOtherResearchProductTools( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - @Override - protected List> prepareOtherResearchProductContactGroups( - final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List> prepareOtherResearchProductContactGroups( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - @Override - protected List> prepareOtherResearchProductContactPersons( - final Document doc, final DataInfo info) { - return new ArrayList<>(); // NOT PRESENT IN OAF - } + @Override + protected List> 
prepareOtherResearchProductContactPersons( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // NOT PRESENT IN OAF + } - @Override - protected List addOtherResultRels( - final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { - final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); + @Override + protected List addOtherResultRels( + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); - final List res = new ArrayList<>(); + final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) { + for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) { - final String originalId = ((Node) o).getText(); + final String originalId = ((Node) o).getText(); - if (StringUtils.isNotBlank(originalId)) { + if (StringUtils.isNotBlank(originalId)) { - final String otherId = createOpenaireId(50, originalId, false); + final String otherId = createOpenaireId(50, originalId, false); - final Relation r1 = new Relation(); - r1.setRelType("resultResult"); - r1.setSubRelType("publicationDataset"); - r1.setRelClass("isRelatedTo"); - r1.setSource(docId); - r1.setTarget(otherId); - r1.setCollectedfrom(Arrays.asList(collectedFrom)); - r1.setDataInfo(info); - r1.setLastupdatetimestamp(lastUpdateTimestamp); - res.add(r1); + final Relation r1 = new Relation(); + r1.setRelType("resultResult"); + r1.setSubRelType("publicationDataset"); + r1.setRelClass("isRelatedTo"); + r1.setSource(docId); + r1.setTarget(otherId); + r1.setCollectedfrom(Arrays.asList(collectedFrom)); + r1.setDataInfo(info); + r1.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r1); - final Relation r2 = new Relation(); - r2.setRelType("resultResult"); - r2.setSubRelType("publicationDataset"); - r2.setRelClass("isRelatedTo"); - r2.setSource(otherId); - r2.setTarget(docId); - r2.setCollectedfrom(Arrays.asList(collectedFrom)); - r2.setDataInfo(info); - r2.setLastupdatetimestamp(lastUpdateTimestamp); - res.add(r2); - } - } - return res; - } + final Relation r2 = new Relation(); + r2.setRelType("resultResult"); + r2.setSubRelType("publicationDataset"); + r2.setRelClass("isRelatedTo"); + r2.setSource(otherId); + r2.setTarget(docId); + r2.setCollectedfrom(Arrays.asList(collectedFrom)); + r2.setDataInfo(info); + r2.setLastupdatetimestamp(lastUpdateTimestamp); + res.add(r2); + } + } + return res; + } - @Override - protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { - return null; // NOT PRESENT IN OAF - } + @Override + protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { + return null; // NOT PRESENT IN OAF + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 64755a6eb..93b0eb29c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -1,9 +1,19 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; import static 
eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.dom4j.Document; +import org.dom4j.Node; + import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Field; @@ -14,338 +24,337 @@ import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import org.apache.commons.lang3.StringUtils; -import org.dom4j.Document; -import org.dom4j.Node; public class OdfToOafMapper extends AbstractMdRecordToOafMapper { - public OdfToOafMapper(final Map code2name) { - super(code2name); - } + public OdfToOafMapper(final Map code2name) { + super(code2name); + } - @Override - protected List prepareTitles(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//datacite:title", MAIN_TITLE_QUALIFIER, info); - } + @Override + protected List prepareTitles(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//datacite:title", MAIN_TITLE_QUALIFIER, info); + } - @Override - protected List prepareAuthors(final Document doc, final DataInfo info) { - final List res = new ArrayList<>(); - int pos = 1; - for (final Object o : doc.selectNodes("//datacite:creator")) { - final Node n = (Node) o; - final Author author = new Author(); - author.setFullname(n.valueOf("./datacite:creatorName")); - author.setName(n.valueOf("./datacite:givenName")); - author.setSurname(n.valueOf("./datacite:familyName")); - author.setAffiliation(prepareListFields(doc, "./datacite:affiliation", info)); - author.setPid(preparePids(doc, info)); - author.setRank(pos++); - res.add(author); - } - return res; - } + @Override + protected List prepareAuthors(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + int pos = 1; + for (final Object o : doc.selectNodes("//datacite:creator")) { + final Node n = (Node) o; + final Author author = new Author(); + author.setFullname(n.valueOf("./datacite:creatorName")); + author.setName(n.valueOf("./datacite:givenName")); + author.setSurname(n.valueOf("./datacite:familyName")); + author.setAffiliation(prepareListFields(doc, "./datacite:affiliation", info)); + author.setPid(preparePids(doc, info)); + author.setRank(pos++); + res.add(author); + } + return res; + } - private List preparePids(final Document doc, final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("./datacite:nameIdentifier")) { - res.add( - structuredProperty( - ((Node) o).getText(), - prepareQualifier( - (Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), - info)); - } - return res; - } + private List preparePids(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("./datacite:nameIdentifier")) { + res + .add( + structuredProperty( + ((Node) o).getText(), + prepareQualifier( + (Node) o, "./@nameIdentifierScheme", "dnet:pid_types", "dnet:pid_types"), + info)); + } + return res; + } - @Override - protected List prepareInstances( - final Document doc, - final DataInfo info, - final KeyValue collectedfrom, - final KeyValue hostedby) { + @Override + protected List prepareInstances( + final 
Document doc, + final DataInfo info, + final KeyValue collectedfrom, + final KeyValue hostedby) { - final Instance instance = new Instance(); - instance.setUrl(new ArrayList<>()); - instance.setInstancetype( - prepareQualifier( - doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource")); - instance.setCollectedfrom(collectedfrom); - instance.setHostedby(hostedby); - instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); - instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); - instance.setAccessright( - prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes")); - instance.setLicense(field(doc.valueOf("//oaf:license"), info)); - instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); - instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); - instance.setProcessingchargecurrency( - field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); + final Instance instance = new Instance(); + instance.setUrl(new ArrayList<>()); + instance + .setInstancetype( + prepareQualifier( + doc, "//dr:CobjCategory", "dnet:publication_resource", "dnet:publication_resource")); + instance.setCollectedfrom(collectedfrom); + instance.setHostedby(hostedby); + instance.setDateofacceptance(field(doc.valueOf("//oaf:dateAccepted"), info)); + instance.setDistributionlocation(doc.valueOf("//oaf:distributionlocation")); + instance + .setAccessright( + prepareQualifier(doc, "//oaf:accessrights", "dnet:access_modes", "dnet:access_modes")); + instance.setLicense(field(doc.valueOf("//oaf:license"), info)); + instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); + instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); + instance + .setProcessingchargecurrency( + field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); - for (final Object o : - doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) { - instance.getUrl().add(((Node) o).getText().trim()); - } - for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='URL']")) { - instance.getUrl().add(((Node) o).getText().trim()); - } - for (final Object o : - doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='DOI']")) { - instance.getUrl().add("http://dx.doi.org/" + ((Node) o).getText().trim()); - } - for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='DOI']")) { - instance.getUrl().add("http://dx.doi.org/" + ((Node) o).getText().trim()); - } - return Arrays.asList(instance); - } + for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='URL']")) { + instance.getUrl().add(((Node) o).getText().trim()); + } + for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='URL']")) { + instance.getUrl().add(((Node) o).getText().trim()); + } + for (final Object o : doc.selectNodes("//datacite:alternateIdentifier[@alternateIdentifierType='DOI']")) { + instance.getUrl().add("http://dx.doi.org/" + ((Node) o).getText().trim()); + } + for (final Object o : doc.selectNodes("//datacite:identifier[@identifierType='DOI']")) { + instance.getUrl().add("http://dx.doi.org/" + ((Node) o).getText().trim()); + } + return Arrays.asList(instance); + } - @Override - protected List> prepareSources(final Document doc, final DataInfo info) { - return new ArrayList<>(); // Not present in ODF ??? 
- } + @Override + protected List> prepareSources(final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? + } - @Override - protected List prepareRelevantDates(final Document doc, final DataInfo info) { - final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("//datacite:date")) { - final String dateType = ((Node) o).valueOf("@dateType"); - if (StringUtils.isBlank(dateType) - && !dateType.equalsIgnoreCase("Accepted") - && !dateType.equalsIgnoreCase("Issued") - && !dateType.equalsIgnoreCase("Updated") - && !dateType.equalsIgnoreCase("Available")) { - res.add( - structuredProperty( - ((Node) o).getText(), - "UNKNOWN", - "UNKNOWN", - "dnet:dataCite_date", - "dnet:dataCite_date", - info)); - } - } - return res; - } + @Override + protected List prepareRelevantDates(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : doc.selectNodes("//datacite:date")) { + final String dateType = ((Node) o).valueOf("@dateType"); + if (StringUtils.isBlank(dateType) + && !dateType.equalsIgnoreCase("Accepted") + && !dateType.equalsIgnoreCase("Issued") + && !dateType.equalsIgnoreCase("Updated") + && !dateType.equalsIgnoreCase("Available")) { + res + .add( + structuredProperty( + ((Node) o).getText(), + "UNKNOWN", + "UNKNOWN", + "dnet:dataCite_date", + "dnet:dataCite_date", + info)); + } + } + return res; + } - @Override - protected List> prepareCoverages(final Document doc, final DataInfo info) { - return new ArrayList<>(); // Not present in ODF ??? - } + @Override + protected List> prepareCoverages(final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? + } - @Override - protected List> prepareContributors(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:contributorName", info); - } + @Override + protected List> prepareContributors(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//datacite:contributorName", info); + } - @Override - protected List> prepareFormats(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:format", info); - } + @Override + protected List> prepareFormats(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//datacite:format", info); + } - @Override - protected Field preparePublisher(final Document doc, final DataInfo info) { - return prepareField(doc, "//datacite:publisher", info); - } + @Override + protected Field preparePublisher(final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:publisher", info); + } - @Override - protected List> prepareDescriptions(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info); - } + @Override + protected List> prepareDescriptions(final Document doc, final DataInfo info) { + return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info); + } - @Override - protected List prepareSubjects(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//datacite:subject", info); - } + @Override + protected List prepareSubjects(final Document doc, final DataInfo info) { + return prepareListStructProps(doc, "//datacite:subject", info); + } - @Override - protected Qualifier prepareLanguages(final Document doc) { - return prepareQualifier(doc, "//datacite:language", "dnet:languages", "dnet:languages"); - } + @Override + protected Qualifier 
prepareLanguages(final Document doc) { + return prepareQualifier(doc, "//datacite:language", "dnet:languages", "dnet:languages"); + } - @Override - protected List> prepareOtherResearchProductTools( - final Document doc, final DataInfo info) { - return new ArrayList<>(); // Not present in ODF ??? - } + @Override + protected List> prepareOtherResearchProductTools( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? + } - @Override - protected List> prepareOtherResearchProductContactGroups( - final Document doc, final DataInfo info) { - return prepareListFields( - doc, - "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", - info); - } + @Override + protected List> prepareOtherResearchProductContactGroups( + final Document doc, final DataInfo info) { + return prepareListFields( + doc, + "//datacite:contributor[@contributorType='ContactGroup']/datacite:contributorName", + info); + } - @Override - protected List> prepareOtherResearchProductContactPersons( - final Document doc, final DataInfo info) { - return prepareListFields( - doc, - "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", - info); - } + @Override + protected List> prepareOtherResearchProductContactPersons( + final Document doc, final DataInfo info) { + return prepareListFields( + doc, + "//datacite:contributor[@contributorType='ContactPerson']/datacite:contributorName", + info); + } - @Override - protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { - return prepareQualifier( - doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages"); - } + @Override + protected Qualifier prepareSoftwareProgrammingLanguage(final Document doc, final DataInfo info) { + return prepareQualifier( + doc, "//datacite:format", "dnet:programming_languages", "dnet:programming_languages"); + } - @Override - protected Field prepareSoftwareCodeRepositoryUrl( - final Document doc, final DataInfo info) { - return null; // Not present in ODF ??? - } + @Override + protected Field prepareSoftwareCodeRepositoryUrl( + final Document doc, final DataInfo info) { + return null; // Not present in ODF ??? + } - @Override - protected List prepareSoftwareLicenses( - final Document doc, final DataInfo info) { - return new ArrayList<>(); // Not present in ODF ??? - } + @Override + protected List prepareSoftwareLicenses( + final Document doc, final DataInfo info) { + return new ArrayList<>(); // Not present in ODF ??? 
+ } - @Override - protected List> prepareSoftwareDocumentationUrls( - final Document doc, final DataInfo info) { - return prepareListFields( - doc, - "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", - info); - } + @Override + protected List> prepareSoftwareDocumentationUrls( + final Document doc, final DataInfo info) { + return prepareListFields( + doc, + "//datacite:relatedIdentifier[@relatedIdentifierType='URL' and @relationType='IsDocumentedBy']", + info); + } - // DATASETS + // DATASETS - @Override - protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { - final List res = new ArrayList<>(); + @Override + protected List prepareDatasetGeoLocations(final Document doc, final DataInfo info) { + final List res = new ArrayList<>(); - for (final Object o : doc.selectNodes("//datacite:geoLocation")) { - final GeoLocation loc = new GeoLocation(); - loc.setBox(((Node) o).valueOf("./datacite:geoLocationBox")); - loc.setPlace(((Node) o).valueOf("./datacite:geoLocationPlace")); - loc.setPoint(((Node) o).valueOf("./datacite:geoLocationPoint")); - res.add(loc); - } - return res; - } + for (final Object o : doc.selectNodes("//datacite:geoLocation")) { + final GeoLocation loc = new GeoLocation(); + loc.setBox(((Node) o).valueOf("./datacite:geoLocationBox")); + loc.setPlace(((Node) o).valueOf("./datacite:geoLocationPlace")); + loc.setPoint(((Node) o).valueOf("./datacite:geoLocationPoint")); + res.add(loc); + } + return res; + } - @Override - protected Field prepareDatasetMetadataVersionNumber( - final Document doc, final DataInfo info) { - return null; // Not present in ODF ??? - } + @Override + protected Field prepareDatasetMetadataVersionNumber( + final Document doc, final DataInfo info) { + return null; // Not present in ODF ??? + } - @Override - protected Field prepareDatasetLastMetadataUpdate( - final Document doc, final DataInfo info) { - return prepareField(doc, "//datacite:date[@dateType='Updated']", info); - } + @Override + protected Field prepareDatasetLastMetadataUpdate( + final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:date[@dateType='Updated']", info); + } - @Override - protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { - return prepareField(doc, "//datacite:version", info); - } + @Override + protected Field prepareDatasetVersion(final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:version", info); + } - @Override - protected Field prepareDatasetSize(final Document doc, final DataInfo info) { - return prepareField(doc, "//datacite:size", info); - } + @Override + protected Field prepareDatasetSize(final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:size", info); + } - @Override - protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { - return null; // Not present in ODF ??? - } + @Override + protected Field prepareDatasetDevice(final Document doc, final DataInfo info) { + return null; // Not present in ODF ??? 
+ } - @Override - protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { - return prepareField(doc, "//datacite:date[@dateType='Issued']", info); - } + @Override + protected Field prepareDatasetStorageDate(final Document doc, final DataInfo info) { + return prepareField(doc, "//datacite:date[@dateType='Issued']", info); + } - @Override - protected List addOtherResultRels( - final Document doc, - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp) { + @Override + protected List addOtherResultRels( + final Document doc, + final KeyValue collectedFrom, + final DataInfo info, + final long lastUpdateTimestamp) { - final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); + final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); - final List res = new ArrayList<>(); + final List res = new ArrayList<>(); - for (final Object o : - doc.selectNodes("//datacite:relatedIdentifier[@relatedIdentifierType='OPENAIRE']")) { + for (final Object o : doc.selectNodes("//datacite:relatedIdentifier[@relatedIdentifierType='OPENAIRE']")) { - final String originalId = ((Node) o).getText(); + final String originalId = ((Node) o).getText(); - if (StringUtils.isNotBlank(originalId)) { - final String otherId = createOpenaireId(50, originalId, false); - final String type = ((Node) o).valueOf("@relationType"); + if (StringUtils.isNotBlank(originalId)) { + final String otherId = createOpenaireId(50, originalId, false); + final String type = ((Node) o).valueOf("@relationType"); - if (type.equals("IsSupplementTo")) { - res.add( - prepareOtherResultRel( - collectedFrom, - info, - lastUpdateTimestamp, - docId, - otherId, - "supplement", - "isSupplementTo")); - res.add( - prepareOtherResultRel( - collectedFrom, - info, - lastUpdateTimestamp, - otherId, - docId, - "supplement", - "isSupplementedBy")); - } else if (type.equals("IsPartOf")) { - res.add( - prepareOtherResultRel( - collectedFrom, info, lastUpdateTimestamp, docId, otherId, "part", "IsPartOf")); - res.add( - prepareOtherResultRel( - collectedFrom, info, lastUpdateTimestamp, otherId, docId, "part", "HasParts")); - } else { - } - } - } - return res; - } + if (type.equals("IsSupplementTo")) { + res + .add( + prepareOtherResultRel( + collectedFrom, + info, + lastUpdateTimestamp, + docId, + otherId, + "supplement", + "isSupplementTo")); + res + .add( + prepareOtherResultRel( + collectedFrom, + info, + lastUpdateTimestamp, + otherId, + docId, + "supplement", + "isSupplementedBy")); + } else if (type.equals("IsPartOf")) { + res + .add( + prepareOtherResultRel( + collectedFrom, info, lastUpdateTimestamp, docId, otherId, "part", "IsPartOf")); + res + .add( + prepareOtherResultRel( + collectedFrom, info, lastUpdateTimestamp, otherId, docId, "part", "HasParts")); + } else { + } + } + } + return res; + } - private Relation prepareOtherResultRel( - final KeyValue collectedFrom, - final DataInfo info, - final long lastUpdateTimestamp, - final String source, - final String target, - final String subRelType, - final String relClass) { - final Relation r = new Relation(); - r.setRelType("resultResult"); - r.setSubRelType(subRelType); - r.setRelClass(relClass); - r.setSource(source); - r.setTarget(target); - r.setCollectedfrom(Arrays.asList(collectedFrom)); - r.setDataInfo(info); - r.setLastupdatetimestamp(lastUpdateTimestamp); - return r; - } + private Relation prepareOtherResultRel( + final KeyValue collectedFrom, + final DataInfo info, + final long 
lastUpdateTimestamp, + final String source, + final String target, + final String subRelType, + final String relClass) { + final Relation r = new Relation(); + r.setRelType("resultResult"); + r.setSubRelType(subRelType); + r.setRelClass(relClass); + r.setSource(source); + r.setTarget(target); + r.setCollectedfrom(Arrays.asList(collectedFrom)); + r.setDataInfo(info); + r.setLastupdatetimestamp(lastUpdateTimestamp); + return r; + } - @Override - protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { - return prepareQualifier( - doc, - "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", - "dnet:dataCite_resource", - "dnet:dataCite_resource"); - } + @Override + protected Qualifier prepareResourceType(final Document doc, final DataInfo info) { + return prepareQualifier( + doc, + "//*[local-name() = 'resource']//*[local-name() = 'resourceType']", + "dnet:dataCite_resource", + "dnet:dataCite_resource"); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java index c7756be0d..f7579c0a0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java @@ -1,9 +1,10 @@ + package eu.dnetlib.dhp.oa.graph.raw.common; -import eu.dnetlib.dhp.schema.oaf.Oaf; import java.io.Closeable; import java.io.IOException; import java.util.concurrent.atomic.AtomicInteger; + import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; @@ -12,72 +13,74 @@ import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.codehaus.jackson.map.ObjectMapper; +import eu.dnetlib.dhp.schema.oaf.Oaf; + public class AbstractMigrationApplication implements Closeable { - private final AtomicInteger counter = new AtomicInteger(0); + private final AtomicInteger counter = new AtomicInteger(0); - private final Text key = new Text(); + private final Text key = new Text(); - private final Text value = new Text(); + private final Text value = new Text(); - private final SequenceFile.Writer writer; + private final SequenceFile.Writer writer; - private final ObjectMapper objectMapper = new ObjectMapper(); + private final ObjectMapper objectMapper = new ObjectMapper(); - private static final Log log = LogFactory.getLog(AbstractMigrationApplication.class); + private static final Log log = LogFactory.getLog(AbstractMigrationApplication.class); - protected AbstractMigrationApplication() { // ONLY FOR UNIT TEST - this.writer = null; - } + protected AbstractMigrationApplication() { // ONLY FOR UNIT TEST + this.writer = null; + } - public AbstractMigrationApplication(final String hdfsPath) throws Exception { + public AbstractMigrationApplication(final String hdfsPath) throws Exception { - log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s", hdfsPath)); + log.info(String.format("Creating SequenceFile Writer, hdfsPath=%s", hdfsPath)); - this.writer = - SequenceFile.createWriter( - getConf(), - SequenceFile.Writer.file(new Path(hdfsPath)), - SequenceFile.Writer.keyClass(Text.class), - SequenceFile.Writer.valueClass(Text.class)); - } + this.writer = SequenceFile + .createWriter( + getConf(), + SequenceFile.Writer.file(new 
Path(hdfsPath)), + SequenceFile.Writer.keyClass(Text.class), + SequenceFile.Writer.valueClass(Text.class)); + } - private Configuration getConf() throws IOException { - final Configuration conf = new Configuration(); - /* - * conf.set("fs.defaultFS", hdfsNameNode); conf.set("fs.hdfs.impl", - * org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); conf.set("fs.file.impl", - * org.apache.hadoop.fs.LocalFileSystem.class.getName()); System.setProperty("HADOOP_USER_NAME", hdfsUser); - * System.setProperty("hadoop.home.dir", "/"); FileSystem.get(URI.create(hdfsNameNode), conf); - */ - return conf; - } + private Configuration getConf() throws IOException { + final Configuration conf = new Configuration(); + /* + * conf.set("fs.defaultFS", hdfsNameNode); conf.set("fs.hdfs.impl", + * org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); conf.set("fs.file.impl", + * org.apache.hadoop.fs.LocalFileSystem.class.getName()); System.setProperty("HADOOP_USER_NAME", hdfsUser); + * System.setProperty("hadoop.home.dir", "/"); FileSystem.get(URI.create(hdfsNameNode), conf); + */ + return conf; + } - protected void emit(final String s, final String type) { - try { - key.set(counter.getAndIncrement() + ":" + type); - value.set(s); - writer.append(key, value); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } + protected void emit(final String s, final String type) { + try { + key.set(counter.getAndIncrement() + ":" + type); + value.set(s); + writer.append(key, value); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } - protected void emitOaf(final Oaf oaf) { - try { - emit(objectMapper.writeValueAsString(oaf), oaf.getClass().getSimpleName().toLowerCase()); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } + protected void emitOaf(final Oaf oaf) { + try { + emit(objectMapper.writeValueAsString(oaf), oaf.getClass().getSimpleName().toLowerCase()); + } catch (final Exception e) { + throw new RuntimeException(e); + } + } - public ObjectMapper getObjectMapper() { - return objectMapper; - } + public ObjectMapper getObjectMapper() { + return objectMapper; + } - @Override - public void close() throws IOException { - writer.hflush(); - writer.close(); - } + @Override + public void close() throws IOException { + writer.hflush(); + writer.close(); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java index ca7c9fffb..121df8131 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/DbClient.java @@ -1,61 +1,62 @@ + package eu.dnetlib.dhp.oa.graph.raw.common; import java.io.Closeable; import java.io.IOException; import java.sql.*; import java.util.function.Consumer; + import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; public class DbClient implements Closeable { - private static final Log log = LogFactory.getLog(DbClient.class); + private static final Log log = LogFactory.getLog(DbClient.class); - private Connection connection; + private Connection connection; - public DbClient(final String address, final String login, final String password) { + public DbClient(final String address, final String login, final String password) { - try { - Class.forName("org.postgresql.Driver"); 
+ try { + Class.forName("org.postgresql.Driver"); - this.connection = - StringUtils.isNoneBlank(login, password) - ? DriverManager.getConnection(address, login, password) - : DriverManager.getConnection(address); - this.connection.setAutoCommit(false); - } catch (final Exception e) { - log.error("Connection to postgresDB failed"); - throw new RuntimeException("Connection to postgresDB failed", e); - } - log.info("Opened database successfully"); - } + this.connection = StringUtils.isNoneBlank(login, password) + ? DriverManager.getConnection(address, login, password) + : DriverManager.getConnection(address); + this.connection.setAutoCommit(false); + } catch (final Exception e) { + log.error("Connection to postgresDB failed"); + throw new RuntimeException("Connection to postgresDB failed", e); + } + log.info("Opened database successfully"); + } - public void processResults(final String sql, final Consumer consumer) { + public void processResults(final String sql, final Consumer consumer) { - try (final Statement stmt = connection.createStatement()) { - stmt.setFetchSize(100); + try (final Statement stmt = connection.createStatement()) { + stmt.setFetchSize(100); - try (final ResultSet rs = stmt.executeQuery(sql)) { - while (rs.next()) { - consumer.accept(rs); - } - } catch (final SQLException e) { - log.error("Error executing sql query: " + sql, e); - throw new RuntimeException("Error executing sql query", e); - } - } catch (final SQLException e1) { - log.error("Error preparing sql statement", e1); - throw new RuntimeException("Error preparing sql statement", e1); - } - } + try (final ResultSet rs = stmt.executeQuery(sql)) { + while (rs.next()) { + consumer.accept(rs); + } + } catch (final SQLException e) { + log.error("Error executing sql query: " + sql, e); + throw new RuntimeException("Error executing sql query", e); + } + } catch (final SQLException e1) { + log.error("Error preparing sql statement", e1); + throw new RuntimeException("Error preparing sql statement", e1); + } + } - @Override - public void close() throws IOException { - try { - connection.close(); - } catch (final SQLException e) { - throw new RuntimeException(e); - } - } + @Override + public void close() throws IOException { + try { + connection.close(); + } catch (final SQLException e) { + throw new RuntimeException(e); + } + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java index 1602c9742..a2177935a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MdstoreClient.java @@ -1,100 +1,102 @@ + package eu.dnetlib.dhp.oa.graph.raw.common; -import com.google.common.collect.Iterables; -import com.mongodb.MongoClient; -import com.mongodb.MongoClientURI; -import com.mongodb.client.MongoCollection; -import com.mongodb.client.MongoDatabase; import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; import java.util.stream.StreamSupport; + import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.bson.Document; +import com.google.common.collect.Iterables; +import com.mongodb.MongoClient; +import com.mongodb.MongoClientURI; +import com.mongodb.client.MongoCollection; +import 
com.mongodb.client.MongoDatabase; + public class MdstoreClient implements Closeable { - private final MongoClient client; - private final MongoDatabase db; + private final MongoClient client; + private final MongoDatabase db; - private static final String COLL_METADATA = "metadata"; - private static final String COLL_METADATA_MANAGER = "metadataManager"; + private static final String COLL_METADATA = "metadata"; + private static final String COLL_METADATA_MANAGER = "metadataManager"; - private static final Log log = LogFactory.getLog(MdstoreClient.class); + private static final Log log = LogFactory.getLog(MdstoreClient.class); - public MdstoreClient(final String baseUrl, final String dbName) { - this.client = new MongoClient(new MongoClientURI(baseUrl)); - this.db = getDb(client, dbName); - } + public MdstoreClient(final String baseUrl, final String dbName) { + this.client = new MongoClient(new MongoClientURI(baseUrl)); + this.db = getDb(client, dbName); + } - public Map validCollections( - final String mdFormat, final String mdLayout, final String mdInterpretation) { + public Map validCollections( + final String mdFormat, final String mdLayout, final String mdInterpretation) { - final Map transactions = new HashMap<>(); - for (final Document entry : getColl(db, COLL_METADATA_MANAGER, true).find()) { - final String mdId = entry.getString("mdId"); - final String currentId = entry.getString("currentId"); - if (StringUtils.isNoneBlank(mdId, currentId)) { - transactions.put(mdId, currentId); - } - } + final Map transactions = new HashMap<>(); + for (final Document entry : getColl(db, COLL_METADATA_MANAGER, true).find()) { + final String mdId = entry.getString("mdId"); + final String currentId = entry.getString("currentId"); + if (StringUtils.isNoneBlank(mdId, currentId)) { + transactions.put(mdId, currentId); + } + } - final Map res = new HashMap<>(); - for (final Document entry : getColl(db, COLL_METADATA, true).find()) { - if (entry.getString("format").equals(mdFormat) - && entry.getString("layout").equals(mdLayout) - && entry.getString("interpretation").equals(mdInterpretation) - && transactions.containsKey(entry.getString("mdId"))) { - res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId"))); - } - } + final Map res = new HashMap<>(); + for (final Document entry : getColl(db, COLL_METADATA, true).find()) { + if (entry.getString("format").equals(mdFormat) + && entry.getString("layout").equals(mdLayout) + && entry.getString("interpretation").equals(mdInterpretation) + && transactions.containsKey(entry.getString("mdId"))) { + res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId"))); + } + } - return res; - } + return res; + } - private MongoDatabase getDb(final MongoClient client, final String dbName) { - if (!Iterables.contains(client.listDatabaseNames(), dbName)) { - final String err = - String.format("Database '%s' not found in %s", dbName, client.getAddress()); - log.warn(err); - throw new RuntimeException(err); - } - return client.getDatabase(dbName); - } + private MongoDatabase getDb(final MongoClient client, final String dbName) { + if (!Iterables.contains(client.listDatabaseNames(), dbName)) { + final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress()); + log.warn(err); + throw new RuntimeException(err); + } + return client.getDatabase(dbName); + } - private MongoCollection getColl( - final MongoDatabase db, final String collName, final boolean abortIfMissing) { - if 
(!Iterables.contains(db.listCollectionNames(), collName)) { - final String err = - String.format( - String.format("Missing collection '%s' in database '%s'", collName, db.getName())); - log.warn(err); - if (abortIfMissing) { - throw new RuntimeException(err); - } else { - return null; - } - } - return db.getCollection(collName); - } + private MongoCollection getColl( + final MongoDatabase db, final String collName, final boolean abortIfMissing) { + if (!Iterables.contains(db.listCollectionNames(), collName)) { + final String err = String + .format( + String.format("Missing collection '%s' in database '%s'", collName, db.getName())); + log.warn(err); + if (abortIfMissing) { + throw new RuntimeException(err); + } else { + return null; + } + } + return db.getCollection(collName); + } - public Iterable listRecords(final String collName) { - final MongoCollection coll = getColl(db, collName, false); - return coll == null - ? new ArrayList<>() - : () -> - StreamSupport.stream(coll.find().spliterator(), false) - .filter(e -> e.containsKey("body")) - .map(e -> e.getString("body")) - .iterator(); - } + public Iterable listRecords(final String collName) { + final MongoCollection coll = getColl(db, collName, false); + return coll == null + ? new ArrayList<>() + : () -> StreamSupport + .stream(coll.find().spliterator(), false) + .filter(e -> e.containsKey("body")) + .map(e -> e.getString("body")) + .iterator(); + } - @Override - public void close() throws IOException { - client.close(); - } + @Override + public void close() throws IOException { + client.close(); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MigrationConstants.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MigrationConstants.java index 4e0b2dbd3..15bff9565 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MigrationConstants.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/MigrationConstants.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.graph.raw.common; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier; @@ -6,26 +7,21 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier; public class MigrationConstants { - public static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER = - qualifier("publication", "publication", "dnet:result_typologies", "dnet:result_typologies"); - public static final Qualifier DATASET_RESULTTYPE_QUALIFIER = - qualifier( - "dataset", "dataset", - "dnet:result_typologies", "dnet:result_typologies"); - public static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = - qualifier( - "software", "software", - "dnet:result_typologies", "dnet:result_typologies"); - public static final Qualifier OTHER_RESULTTYPE_QUALIFIER = - qualifier( - "other", "other", - "dnet:result_typologies", "dnet:result_typologies"); - public static final Qualifier REPOSITORY_PROVENANCE_ACTIONS = - qualifier( - "sysimport:crosswalk:repository", "sysimport:crosswalk:repository", - "dnet:provenanceActions", "dnet:provenanceActions"); - public static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION = - qualifier( - "sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", - "dnet:provenanceActions", "dnet:provenanceActions"); + public static final Qualifier PUBLICATION_RESULTTYPE_QUALIFIER = qualifier( + "publication", "publication", "dnet:result_typologies", "dnet:result_typologies"); + public static final Qualifier 
DATASET_RESULTTYPE_QUALIFIER = qualifier( + "dataset", "dataset", + "dnet:result_typologies", "dnet:result_typologies"); + public static final Qualifier SOFTWARE_RESULTTYPE_QUALIFIER = qualifier( + "software", "software", + "dnet:result_typologies", "dnet:result_typologies"); + public static final Qualifier OTHER_RESULTTYPE_QUALIFIER = qualifier( + "other", "other", + "dnet:result_typologies", "dnet:result_typologies"); + public static final Qualifier REPOSITORY_PROVENANCE_ACTIONS = qualifier( + "sysimport:crosswalk:repository", "sysimport:crosswalk:repository", + "dnet:provenanceActions", "dnet:provenanceActions"); + public static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION = qualifier( + "sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry", + "dnet:provenanceActions", "dnet:provenanceActions"); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java index b9788a05c..9beed2837 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/OafMapperUtils.java @@ -1,215 +1,220 @@ + package eu.dnetlib.dhp.oa.graph.raw.common; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.utils.DHPUtils; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Objects; import java.util.stream.Collectors; + import org.apache.commons.lang3.StringUtils; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.utils.DHPUtils; + public class OafMapperUtils { - public static KeyValue keyValue(final String k, final String v) { - final KeyValue kv = new KeyValue(); - kv.setKey(k); - kv.setValue(v); - return kv; - } + public static KeyValue keyValue(final String k, final String v) { + final KeyValue kv = new KeyValue(); + kv.setKey(k); + kv.setValue(v); + return kv; + } - public static List listKeyValues(final String... s) { - if (s.length % 2 > 0) { - throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); - } + public static List listKeyValues(final String... s) { + if (s.length % 2 > 0) { + throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); + } - final List list = new ArrayList<>(); - for (int i = 0; i < s.length; i += 2) { - list.add(keyValue(s[i], s[i + 1])); - } - return list; - } + final List list = new ArrayList<>(); + for (int i = 0; i < s.length; i += 2) { + list.add(keyValue(s[i], s[i + 1])); + } + return list; + } - public static Field field(final T value, final DataInfo info) { - if (value == null || StringUtils.isBlank(value.toString())) { - return null; - } + public static Field field(final T value, final DataInfo info) { + if (value == null || StringUtils.isBlank(value.toString())) { + return null; + } - final Field field = new Field<>(); - field.setValue(value); - field.setDataInfo(info); - return field; - } + final Field field = new Field<>(); + field.setValue(value); + field.setDataInfo(info); + return field; + } - public static List> listFields(final DataInfo info, final String... values) { - return Arrays.stream(values) - .map(v -> field(v, info)) - .filter(Objects::nonNull) - .collect(Collectors.toList()); - } + public static List> listFields(final DataInfo info, final String... 
values) { + return Arrays + .stream(values) + .map(v -> field(v, info)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } - public static List> listFields(final DataInfo info, final List values) { - return values.stream() - .map(v -> field(v, info)) - .filter(Objects::nonNull) - .collect(Collectors.toList()); - } + public static List> listFields(final DataInfo info, final List values) { + return values + .stream() + .map(v -> field(v, info)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } - public static Qualifier qualifier( - final String classid, - final String classname, - final String schemeid, - final String schemename) { - final Qualifier q = new Qualifier(); - q.setClassid(classid); - q.setClassname(classname); - q.setSchemeid(schemeid); - q.setSchemename(schemename); - return q; - } + public static Qualifier qualifier( + final String classid, + final String classname, + final String schemeid, + final String schemename) { + final Qualifier q = new Qualifier(); + q.setClassid(classid); + q.setClassname(classname); + q.setSchemeid(schemeid); + q.setSchemename(schemename); + return q; + } - public static StructuredProperty structuredProperty( - final String value, - final String classid, - final String classname, - final String schemeid, - final String schemename, - final DataInfo dataInfo) { + public static StructuredProperty structuredProperty( + final String value, + final String classid, + final String classname, + final String schemeid, + final String schemename, + final DataInfo dataInfo) { - return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo); - } + return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo); + } - public static StructuredProperty structuredProperty( - final String value, final Qualifier qualifier, final DataInfo dataInfo) { - if (value == null) { - return null; - } - final StructuredProperty sp = new StructuredProperty(); - sp.setValue(value); - sp.setQualifier(qualifier); - sp.setDataInfo(dataInfo); - return sp; - } + public static StructuredProperty structuredProperty( + final String value, final Qualifier qualifier, final DataInfo dataInfo) { + if (value == null) { + return null; + } + final StructuredProperty sp = new StructuredProperty(); + sp.setValue(value); + sp.setQualifier(qualifier); + sp.setDataInfo(dataInfo); + return sp; + } - public static ExtraInfo extraInfo( - final String name, - final String value, - final String typology, - final String provenance, - final String trust) { - final ExtraInfo info = new ExtraInfo(); - info.setName(name); - info.setValue(value); - info.setTypology(typology); - info.setProvenance(provenance); - info.setTrust(trust); - return info; - } + public static ExtraInfo extraInfo( + final String name, + final String value, + final String typology, + final String provenance, + final String trust) { + final ExtraInfo info = new ExtraInfo(); + info.setName(name); + info.setValue(value); + info.setTypology(typology); + info.setProvenance(provenance); + info.setTrust(trust); + return info; + } - public static OAIProvenance oaiIProvenance( - final String identifier, - final String baseURL, - final String metadataNamespace, - final Boolean altered, - final String datestamp, - final String harvestDate) { + public static OAIProvenance oaiIProvenance( + final String identifier, + final String baseURL, + final String metadataNamespace, + final Boolean altered, + final String datestamp, + final String harvestDate) { - final 
OriginDescription desc = new OriginDescription(); - desc.setIdentifier(identifier); - desc.setBaseURL(baseURL); - desc.setMetadataNamespace(metadataNamespace); - desc.setAltered(altered); - desc.setDatestamp(datestamp); - desc.setHarvestDate(harvestDate); + final OriginDescription desc = new OriginDescription(); + desc.setIdentifier(identifier); + desc.setBaseURL(baseURL); + desc.setMetadataNamespace(metadataNamespace); + desc.setAltered(altered); + desc.setDatestamp(datestamp); + desc.setHarvestDate(harvestDate); - final OAIProvenance p = new OAIProvenance(); - p.setOriginDescription(desc); + final OAIProvenance p = new OAIProvenance(); + p.setOriginDescription(desc); - return p; - } + return p; + } - public static Journal journal( - final String name, - final String issnPrinted, - final String issnOnline, - final String issnLinking, - final String ep, - final String iss, - final String sp, - final String vol, - final String edition, - final String conferenceplace, - final String conferencedate, - final DataInfo dataInfo) { + public static Journal journal( + final String name, + final String issnPrinted, + final String issnOnline, + final String issnLinking, + final String ep, + final String iss, + final String sp, + final String vol, + final String edition, + final String conferenceplace, + final String conferencedate, + final DataInfo dataInfo) { - if (StringUtils.isNotBlank(name) - || StringUtils.isNotBlank(issnPrinted) - || StringUtils.isNotBlank(issnOnline) - || StringUtils.isNotBlank(issnLinking)) { - final Journal j = new Journal(); - j.setName(name); - j.setIssnPrinted(issnPrinted); - j.setIssnOnline(issnOnline); - j.setIssnLinking(issnLinking); - j.setEp(ep); - j.setIss(iss); - j.setSp(sp); - j.setVol(vol); - j.setEdition(edition); - j.setConferenceplace(conferenceplace); - j.setConferencedate(conferencedate); - j.setDataInfo(dataInfo); - return j; - } else { - return null; - } - } + if (StringUtils.isNotBlank(name) + || StringUtils.isNotBlank(issnPrinted) + || StringUtils.isNotBlank(issnOnline) + || StringUtils.isNotBlank(issnLinking)) { + final Journal j = new Journal(); + j.setName(name); + j.setIssnPrinted(issnPrinted); + j.setIssnOnline(issnOnline); + j.setIssnLinking(issnLinking); + j.setEp(ep); + j.setIss(iss); + j.setSp(sp); + j.setVol(vol); + j.setEdition(edition); + j.setConferenceplace(conferenceplace); + j.setConferencedate(conferencedate); + j.setDataInfo(dataInfo); + return j; + } else { + return null; + } + } - public static DataInfo dataInfo( - final Boolean deletedbyinference, - final String inferenceprovenance, - final Boolean inferred, - final Boolean invisible, - final Qualifier provenanceaction, - final String trust) { - final DataInfo d = new DataInfo(); - d.setDeletedbyinference(deletedbyinference); - d.setInferenceprovenance(inferenceprovenance); - d.setInferred(inferred); - d.setInvisible(invisible); - d.setProvenanceaction(provenanceaction); - d.setTrust(trust); - return d; - } + public static DataInfo dataInfo( + final Boolean deletedbyinference, + final String inferenceprovenance, + final Boolean inferred, + final Boolean invisible, + final Qualifier provenanceaction, + final String trust) { + final DataInfo d = new DataInfo(); + d.setDeletedbyinference(deletedbyinference); + d.setInferenceprovenance(inferenceprovenance); + d.setInferred(inferred); + d.setInvisible(invisible); + d.setProvenanceaction(provenanceaction); + d.setTrust(trust); + return d; + } - public static String createOpenaireId( - final int prefix, final String originalId, final 
boolean to_md5) { - if (to_md5) { - final String nsPrefix = StringUtils.substringBefore(originalId, "::"); - final String rest = StringUtils.substringAfter(originalId, "::"); - return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); - } else { - return String.format("%s|%s", prefix, originalId); - } - } + public static String createOpenaireId( + final int prefix, final String originalId, final boolean to_md5) { + if (to_md5) { + final String nsPrefix = StringUtils.substringBefore(originalId, "::"); + final String rest = StringUtils.substringAfter(originalId, "::"); + return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); + } else { + return String.format("%s|%s", prefix, originalId); + } + } - public static String createOpenaireId( - final String type, final String originalId, final boolean to_md5) { - switch (type) { - case "datasource": - return createOpenaireId(10, originalId, to_md5); - case "organization": - return createOpenaireId(20, originalId, to_md5); - case "person": - return createOpenaireId(30, originalId, to_md5); - case "project": - return createOpenaireId(40, originalId, to_md5); - default: - return createOpenaireId(50, originalId, to_md5); - } - } + public static String createOpenaireId( + final String type, final String originalId, final boolean to_md5) { + switch (type) { + case "datasource": + return createOpenaireId(10, originalId, to_md5); + case "organization": + return createOpenaireId(20, originalId, to_md5); + case "person": + return createOpenaireId(30, originalId, to_md5); + case "project": + return createOpenaireId(40, originalId, to_md5); + default: + return createOpenaireId(50, originalId, to_md5); + } + } - public static String asString(final Object o) { - return o == null ? "" : o.toString(); - } + public static String asString(final Object o) { + return o == null ? 
"" : o.toString(); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java index 5317983b1..8adcd565b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/PacePerson.java @@ -1,178 +1,183 @@ + package eu.dnetlib.dhp.oa.graph.raw.common; +import java.nio.charset.Charset; +import java.text.Normalizer; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.text.WordUtils; + import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.hash.Hashing; -import java.nio.charset.Charset; -import java.text.Normalizer; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.text.WordUtils; public class PacePerson { - private static final String UTF8 = "UTF-8"; - private List name = Lists.newArrayList(); - private List surname = Lists.newArrayList(); - private List fullname = Lists.newArrayList(); - private final String original; + private static final String UTF8 = "UTF-8"; + private List name = Lists.newArrayList(); + private List surname = Lists.newArrayList(); + private List fullname = Lists.newArrayList(); + private final String original; - private static Set particles = null; + private static Set particles = null; - public static final String capitalize(final String s) { - return WordUtils.capitalize(s.toLowerCase(), ' ', '-'); - } + public static final String capitalize(final String s) { + return WordUtils.capitalize(s.toLowerCase(), ' ', '-'); + } - public static final String dotAbbreviations(final String s) { - return s.length() == 1 ? s + "." : s; - } + public static final String dotAbbreviations(final String s) { + return s.length() == 1 ? s + "." 
: s; + } - public static Set loadFromClasspath(final String classpath) { - final Set h = new HashSet<>(); - try { - for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) { - h.add(s); - } - } catch (final Throwable e) { - return new HashSet<>(); - } - return h; - } + public static Set loadFromClasspath(final String classpath) { + final Set h = new HashSet<>(); + try { + for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) { + h.add(s); + } + } catch (final Throwable e) { + return new HashSet<>(); + } + return h; + } - public PacePerson(String s, final boolean aggressive) { - original = s; - s = Normalizer.normalize(s, Normalizer.Form.NFD); - s = s.replaceAll("\\(.+\\)", ""); - s = s.replaceAll("\\[.+\\]", ""); - s = s.replaceAll("\\{.+\\}", ""); - s = s.replaceAll("\\s+-\\s+", "-"); - s = s.replaceAll("[\\p{Punct}&&[^,-]]", " "); - s = s.replaceAll("\\d", " "); - s = s.replaceAll("\\n", " "); - s = s.replaceAll("\\.", " "); - s = s.replaceAll("\\s+", " "); + public PacePerson(String s, final boolean aggressive) { + original = s; + s = Normalizer.normalize(s, Normalizer.Form.NFD); + s = s.replaceAll("\\(.+\\)", ""); + s = s.replaceAll("\\[.+\\]", ""); + s = s.replaceAll("\\{.+\\}", ""); + s = s.replaceAll("\\s+-\\s+", "-"); + s = s.replaceAll("[\\p{Punct}&&[^,-]]", " "); + s = s.replaceAll("\\d", " "); + s = s.replaceAll("\\n", " "); + s = s.replaceAll("\\.", " "); + s = s.replaceAll("\\s+", " "); - if (aggressive) { - s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", ""); - // s = s.replaceAll("[\\W&&[^,-]]", ""); - } + if (aggressive) { + s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", ""); + // s = s.replaceAll("[\\W&&[^,-]]", ""); + } - if (s.contains(",")) { - final String[] arr = s.split(","); - if (arr.length == 1) { - fullname = splitTerms(arr[0]); - } else if (arr.length > 1) { - surname = splitTerms(arr[0]); - name = splitTerms(arr[1]); - fullname.addAll(surname); - fullname.addAll(name); - } - } else { - fullname = splitTerms(s); + if (s.contains(",")) { + final String[] arr = s.split(","); + if (arr.length == 1) { + fullname = splitTerms(arr[0]); + } else if (arr.length > 1) { + surname = splitTerms(arr[0]); + name = splitTerms(arr[1]); + fullname.addAll(surname); + fullname.addAll(name); + } + } else { + fullname = splitTerms(s); - int lastInitialPosition = fullname.size(); - boolean hasSurnameInUpperCase = false; + int lastInitialPosition = fullname.size(); + boolean hasSurnameInUpperCase = false; - for (int i = 0; i < fullname.size(); i++) { - final String term = fullname.get(i); - if (term.length() == 1) { - lastInitialPosition = i; - } else if (term.equals(term.toUpperCase())) { - hasSurnameInUpperCase = true; - } - } + for (int i = 0; i < fullname.size(); i++) { + final String term = fullname.get(i); + if (term.length() == 1) { + lastInitialPosition = i; + } else if (term.equals(term.toUpperCase())) { + hasSurnameInUpperCase = true; + } + } - if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini - name = fullname.subList(0, lastInitialPosition + 1); - surname = fullname.subList(lastInitialPosition + 1, fullname.size()); - } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI - for (final String term : fullname) { - if (term.length() > 1 && term.equals(term.toUpperCase())) { - surname.add(term); - } else { - name.add(term); - } - } - } - } - } + if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. 
Artini + name = fullname.subList(0, lastInitialPosition + 1); + surname = fullname.subList(lastInitialPosition + 1, fullname.size()); + } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI + for (final String term : fullname) { + if (term.length() > 1 && term.equals(term.toUpperCase())) { + surname.add(term); + } else { + name.add(term); + } + } + } + } + } - private List splitTerms(final String s) { - if (particles == null) { - particles = loadFromClasspath("/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt"); - } + private List splitTerms(final String s) { + if (particles == null) { + particles = loadFromClasspath("/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt"); + } - final List list = Lists.newArrayList(); - for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) { - if (!particles.contains(part.toLowerCase())) { - list.add(part); - } - } - return list; - } + final List list = Lists.newArrayList(); + for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) { + if (!particles.contains(part.toLowerCase())) { + list.add(part); + } + } + return list; + } - public List getName() { - return name; - } + public List getName() { + return name; + } - public String getNameString() { - return Joiner.on(" ").join(getName()); - } + public String getNameString() { + return Joiner.on(" ").join(getName()); + } - public List getSurname() { - return surname; - } + public List getSurname() { + return surname; + } - public List getFullname() { - return fullname; - } + public List getFullname() { + return fullname; + } - public String getOriginal() { - return original; - } + public String getOriginal() { + return original; + } - public String hash() { - return Hashing.murmur3_128() - .hashString(getNormalisedFullname(), Charset.forName(UTF8)) - .toString(); - } + public String hash() { + return Hashing + .murmur3_128() + .hashString(getNormalisedFullname(), Charset.forName(UTF8)) + .toString(); + } - public String getNormalisedFirstName() { - return Joiner.on(" ").join(getCapitalFirstnames()); - } + public String getNormalisedFirstName() { + return Joiner.on(" ").join(getCapitalFirstnames()); + } - public String getNormalisedSurname() { - return Joiner.on(" ").join(getCapitalSurname()); - } + public String getNormalisedSurname() { + return Joiner.on(" ").join(getCapitalSurname()); + } - public String getSurnameString() { - return Joiner.on(" ").join(getSurname()); - } + public String getSurnameString() { + return Joiner.on(" ").join(getSurname()); + } - public String getNormalisedFullname() { - return isAccurate() - ? getNormalisedSurname() + ", " + getNormalisedFirstName() - : Joiner.on(" ").join(fullname); - } + public String getNormalisedFullname() { + return isAccurate() + ? 
getNormalisedSurname() + ", " + getNormalisedFirstName() + : Joiner.on(" ").join(fullname); + } - public List getCapitalFirstnames() { - return Lists.newArrayList( - Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize)); - } + public List getCapitalFirstnames() { + return Lists + .newArrayList( + Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize)); + } - public List getCapitalSurname() { - return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize)); - } + public List getCapitalSurname() { + return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize)); + } - public List getNameWithAbbreviations() { - return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations)); - } + public List getNameWithAbbreviations() { + return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations)); + } - public boolean isAccurate() { - return name != null && surname != null && !name.isEmpty() && !surname.isEmpty(); - } + public boolean isAccurate() { + return name != null && surname != null && !name.isEmpty() && !surname.isEmpty(); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ImportDataFromMongo.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ImportDataFromMongo.java index 2787c61a9..bc40afbfd 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ImportDataFromMongo.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/ImportDataFromMongo.java @@ -1,12 +1,6 @@ + package eu.dnetlib.dhp.sx.graph; -import com.mongodb.DBObject; -import com.mongodb.MongoClient; -import com.mongodb.QueryBuilder; -import com.mongodb.client.FindIterable; -import com.mongodb.client.MongoCollection; -import com.mongodb.client.MongoDatabase; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; import java.io.IOException; import java.net.URI; import java.util.ArrayList; @@ -15,6 +9,7 @@ import java.util.Objects; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Consumer; import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -25,127 +20,134 @@ import org.apache.hadoop.io.Text; import org.bson.Document; import org.bson.conversions.Bson; +import com.mongodb.DBObject; +import com.mongodb.MongoClient; +import com.mongodb.QueryBuilder; +import com.mongodb.client.FindIterable; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + /** - * This job is responsible to collect data from mongoDatabase and store in a sequence File on HDFS - * Mongo database contains information of each MDSTore in two collections: -metadata That contains - * info like: ID, format, layout, interpretation -metadataManager: that contains info : ID, - * mongoCollectionName from the metadata collection we filter the ids with Format, layout, and - * Interpretation from the metadataManager we get the current MONGO collection name which contains - * metadata XML see function getCurrentId - * - *
This Job will be called different times in base at the triple we want import, and generates - * for each triple a sequence file of XML + * This job is responsible to collect data from mongoDatabase and store in a sequence File on HDFS Mongo database + * contains information of each MDSTore in two collections: -metadata That contains info like: ID, format, layout, + * interpretation -metadataManager: that contains info : ID, mongoCollectionName from the metadata collection we filter + * the ids with Format, layout, and Interpretation from the metadataManager we get the current MONGO collection name + * which contains metadata XML see function getCurrentId + *
+ * This Job will be called different times in base at the triple we want import, and generates for each triple a + * sequence file of XML */ public class ImportDataFromMongo { - /** - * It requires in input some parameters described on a file - * eu/dnetlib/dhp/graph/sx/import_from_mongo_parameters.json - * - *
- the name node - the paht where store HDFS File - the mongo host - the mongo port - the - * metadata format to import - the metadata layout to import - the metadata interpretation to - * import - the mongo database Name - * - *
This params are encoded into args - * - * @param args - * @throws Exception - */ - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - ImportDataFromMongo.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/graph/argumentparser/import_from_mongo_parameters.json"))); - parser.parseArgument(args); - final int port = Integer.parseInt(parser.get("dbport")); - final String host = parser.get("dbhost"); + /** + * It requires in input some parameters described on a file + * eu/dnetlib/dhp/graph/sx/import_from_mongo_parameters.json + *
+ * - the name node - the paht where store HDFS File - the mongo host - the mongo port - the metadata format to + * import - the metadata layout to import - the metadata interpretation to import - the mongo database Name + *
+ * This params are encoded into args + * + * @param args + * @throws Exception + */ + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + ImportDataFromMongo.class + .getResourceAsStream( + "/eu/dnetlib/dhp/sx/graph/argumentparser/import_from_mongo_parameters.json"))); + parser.parseArgument(args); + final int port = Integer.parseInt(parser.get("dbport")); + final String host = parser.get("dbhost"); - final String format = parser.get("format"); - final String layout = parser.get("layout"); - final String interpretation = parser.get("interpretation"); + final String format = parser.get("format"); + final String layout = parser.get("layout"); + final String interpretation = parser.get("interpretation"); - final String dbName = parser.get("dbName"); - final MongoClient client = new MongoClient(host, port); - MongoDatabase database = client.getDatabase(dbName); + final String dbName = parser.get("dbName"); + final MongoClient client = new MongoClient(host, port); + MongoDatabase database = client.getDatabase(dbName); - MongoCollection metadata = database.getCollection("metadata"); - MongoCollection metadataManager = database.getCollection("metadataManager"); - final DBObject query = - QueryBuilder.start("format") - .is(format) - .and("layout") - .is(layout) - .and("interpretation") - .is(interpretation) - .get(); - final List ids = new ArrayList<>(); - metadata - .find((Bson) query) - .forEach((Consumer) document -> ids.add(document.getString("mdId"))); - List databaseId = - ids.stream() - .map(it -> getCurrentId(it, metadataManager)) - .filter(Objects::nonNull) - .collect(Collectors.toList()); + MongoCollection metadata = database.getCollection("metadata"); + MongoCollection metadataManager = database.getCollection("metadataManager"); + final DBObject query = QueryBuilder + .start("format") + .is(format) + .and("layout") + .is(layout) + .and("interpretation") + .is(interpretation) + .get(); + final List ids = new ArrayList<>(); + metadata + .find((Bson) query) + .forEach((Consumer) document -> ids.add(document.getString("mdId"))); + List databaseId = ids + .stream() + .map(it -> getCurrentId(it, metadataManager)) + .filter(Objects::nonNull) + .collect(Collectors.toList()); - final String hdfsuri = parser.get("namenode"); - // ====== Init HDFS File System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", hdfsuri); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + final String hdfsuri = parser.get("namenode"); + // ====== Init HDFS File System Object + Configuration conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsuri); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - FileSystem.get(URI.create(hdfsuri), conf); - Path hdfswritepath = new Path(parser.get("targetPath")); + FileSystem.get(URI.create(hdfsuri), conf); + Path hdfswritepath = new Path(parser.get("targetPath")); - final AtomicInteger counter = new AtomicInteger(0); - try (SequenceFile.Writer writer = - SequenceFile.createWriter( - conf, - SequenceFile.Writer.file(hdfswritepath), - SequenceFile.Writer.keyClass(IntWritable.class), - 
SequenceFile.Writer.valueClass(Text.class))) { - final IntWritable key = new IntWritable(counter.get()); - final Text value = new Text(); - databaseId.forEach( - id -> { - System.out.println("Reading :" + id); - MongoCollection collection = database.getCollection(id); - collection - .find() - .forEach( - (Consumer) - document -> { - key.set(counter.getAndIncrement()); - value.set(document.getString("body")); + final AtomicInteger counter = new AtomicInteger(0); + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(hdfswritepath), + SequenceFile.Writer.keyClass(IntWritable.class), + SequenceFile.Writer.valueClass(Text.class))) { + final IntWritable key = new IntWritable(counter.get()); + final Text value = new Text(); + databaseId + .forEach( + id -> { + System.out.println("Reading :" + id); + MongoCollection collection = database.getCollection(id); + collection + .find() + .forEach( + (Consumer) document -> { + key.set(counter.getAndIncrement()); + value.set(document.getString("body")); - if (counter.get() % 10000 == 0) { - System.out.println("Added " + counter.get()); - } - try { - writer.append(key, value); - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - }); - } - } + if (counter.get() % 10000 == 0) { + System.out.println("Added " + counter.get()); + } + try { + writer.append(key, value); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + }); + } + } - /** - * Return the name of mongo collection giving an MdStore ID - * - * @param mdId The id of the MDStore - * @param metadataManager The collection metadataManager on mongo which contains this information - * @return - */ - private static String getCurrentId( - final String mdId, final MongoCollection metadataManager) { - FindIterable result = - metadataManager.find((Bson) QueryBuilder.start("mdId").is(mdId).get()); - final Document item = result.first(); - return item == null ? null : item.getString("currentId"); - } + /** + * Return the name of mongo collection giving an MdStore ID + * + * @param mdId The id of the MDStore + * @param metadataManager The collection metadataManager on mongo which contains this information + * @return + */ + private static String getCurrentId( + final String mdId, final MongoCollection metadataManager) { + FindIterable result = metadataManager.find((Bson) QueryBuilder.start("mdId").is(mdId).get()); + final Document item = result.first(); + return item == null ? 
null : item.getString("currentId"); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkExtractEntitiesJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkExtractEntitiesJob.java index 457f987eb..4f015a9ad 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkExtractEntitiesJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkExtractEntitiesJob.java @@ -1,11 +1,10 @@ + package eu.dnetlib.dhp.sx.graph; -import com.jayway.jsonpath.JsonPath; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; -import net.minidev.json.JSONArray; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.compress.GzipCodec; @@ -13,107 +12,115 @@ import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SparkSession; +import com.jayway.jsonpath.JsonPath; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import net.minidev.json.JSONArray; + /** - * This Job extracts a typology of entity and stores it in a new RDD This job is called different - * times, for each file generated by the Job {@link ImportDataFromMongo} and store the new RDD in a - * path that should be under a folder: extractedEntities/entity/version1 - * - *
at the end of this process we will have : extractedEntities/dataset/version1 - * extractedEntities/dataset/version2 extractedEntities/dataset/... - * extractedEntities/publication/version1 extractedEntities/publication/version2 - * extractedEntities/publication/... extractedEntities/unknown/version1 - * extractedEntities/unknown/version2 extractedEntities/unknown/... - * extractedEntities/relation/version1 extractedEntities/relation/version2 + * This Job extracts a typology of entity and stores it in a new RDD This job is called different times, for each file + * generated by the Job {@link ImportDataFromMongo} and store the new RDD in a path that should be under a folder: + * extractedEntities/entity/version1 + *
+ * at the end of this process we will have : extractedEntities/dataset/version1 extractedEntities/dataset/version2 + * extractedEntities/dataset/... extractedEntities/publication/version1 extractedEntities/publication/version2 + * extractedEntities/publication/... extractedEntities/unknown/version1 extractedEntities/unknown/version2 + * extractedEntities/unknown/... extractedEntities/relation/version1 extractedEntities/relation/version2 * extractedEntities/relation/... */ public class SparkExtractEntitiesJob { - static final String IDJSONPATH = "$.id"; - static final String SOURCEJSONPATH = "$.source"; - static final String TARGETJSONPATH = "$.target"; + static final String IDJSONPATH = "$.id"; + static final String SOURCEJSONPATH = "$.source"; + static final String TARGETJSONPATH = "$.target"; - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkExtractEntitiesJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .appName(SparkExtractEntitiesJob.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String targetPath = parser.get("targetPath"); - final String tdir = parser.get("targetDir"); - final JavaRDD inputRDD = sc.textFile(inputPath); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkExtractEntitiesJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkExtractEntitiesJob.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String inputPath = parser.get("sourcePath"); + final String targetPath = parser.get("targetPath"); + final String tdir = parser.get("targetDir"); + final JavaRDD inputRDD = sc.textFile(inputPath); - List entities = - Arrays.stream(parser.get("entities").split(",")) - .map(String::trim) - .collect(Collectors.toList()); - if (entities.stream().anyMatch("dataset"::equalsIgnoreCase)) { - // Extract Dataset - inputRDD - .filter(SparkExtractEntitiesJob::isDataset) - .saveAsTextFile(targetPath + "/dataset/" + tdir, GzipCodec.class); - } - if (entities.stream().anyMatch("unknown"::equalsIgnoreCase)) { - // Extract Unknown - inputRDD - .filter(SparkExtractEntitiesJob::isUnknown) - .saveAsTextFile(targetPath + "/unknown/" + tdir, GzipCodec.class); - } + List entities = Arrays + .stream(parser.get("entities").split(",")) + .map(String::trim) + .collect(Collectors.toList()); + if (entities.stream().anyMatch("dataset"::equalsIgnoreCase)) { + // Extract Dataset + inputRDD + .filter(SparkExtractEntitiesJob::isDataset) + .saveAsTextFile(targetPath + "/dataset/" + tdir, GzipCodec.class); + } + if (entities.stream().anyMatch("unknown"::equalsIgnoreCase)) { + // Extract Unknown + inputRDD + .filter(SparkExtractEntitiesJob::isUnknown) + .saveAsTextFile(targetPath + "/unknown/" + tdir, GzipCodec.class); + } - if (entities.stream().anyMatch("relation"::equalsIgnoreCase)) { - // Extract Relation - 
inputRDD - .filter(SparkExtractEntitiesJob::isRelation) - .saveAsTextFile(targetPath + "/relation/" + tdir, GzipCodec.class); - } - if (entities.stream().anyMatch("publication"::equalsIgnoreCase)) { - // Extract Relation - inputRDD - .filter(SparkExtractEntitiesJob::isPublication) - .saveAsTextFile(targetPath + "/publication/" + tdir, GzipCodec.class); - } - } + if (entities.stream().anyMatch("relation"::equalsIgnoreCase)) { + // Extract Relation + inputRDD + .filter(SparkExtractEntitiesJob::isRelation) + .saveAsTextFile(targetPath + "/relation/" + tdir, GzipCodec.class); + } + if (entities.stream().anyMatch("publication"::equalsIgnoreCase)) { + // Extract Relation + inputRDD + .filter(SparkExtractEntitiesJob::isPublication) + .saveAsTextFile(targetPath + "/publication/" + tdir, GzipCodec.class); + } + } - public static boolean isDataset(final String json) { - final String id = getJPathString(IDJSONPATH, json); - if (StringUtils.isBlank(id)) return false; - return id.startsWith("60|"); - } + public static boolean isDataset(final String json) { + final String id = getJPathString(IDJSONPATH, json); + if (StringUtils.isBlank(id)) + return false; + return id.startsWith("60|"); + } - public static boolean isPublication(final String json) { - final String id = getJPathString(IDJSONPATH, json); - if (StringUtils.isBlank(id)) return false; - return id.startsWith("50|"); - } + public static boolean isPublication(final String json) { + final String id = getJPathString(IDJSONPATH, json); + if (StringUtils.isBlank(id)) + return false; + return id.startsWith("50|"); + } - public static boolean isUnknown(final String json) { - final String id = getJPathString(IDJSONPATH, json); - if (StringUtils.isBlank(id)) return false; - return id.startsWith("70|"); - } + public static boolean isUnknown(final String json) { + final String id = getJPathString(IDJSONPATH, json); + if (StringUtils.isBlank(id)) + return false; + return id.startsWith("70|"); + } - public static boolean isRelation(final String json) { - final String source = getJPathString(SOURCEJSONPATH, json); - final String target = getJPathString(TARGETJSONPATH, json); - return StringUtils.isNotBlank(source) && StringUtils.isNotBlank(target); - } + public static boolean isRelation(final String json) { + final String source = getJPathString(SOURCEJSONPATH, json); + final String target = getJPathString(TARGETJSONPATH, json); + return StringUtils.isNotBlank(source) && StringUtils.isNotBlank(target); + } - public static String getJPathString(final String jsonPath, final String json) { - try { - Object o = JsonPath.read(json, jsonPath); - if (o instanceof String) return (String) o; - if (o instanceof JSONArray && ((JSONArray) o).size() > 0) - return (String) ((JSONArray) o).get(0); - return ""; - } catch (Exception e) { - return ""; - } - } + public static String getJPathString(final String jsonPath, final String json) { + try { + Object o = JsonPath.read(json, jsonPath); + if (o instanceof String) + return (String) o; + if (o instanceof JSONArray && ((JSONArray) o).size() > 0) + return (String) ((JSONArray) o).get(0); + return ""; + } catch (Exception e) { + return ""; + } + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSXGeneratePidSimlarity.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSXGeneratePidSimlarity.java index cd8375df5..f3d7fd40f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSXGeneratePidSimlarity.java +++ 
b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkSXGeneratePidSimlarity.java @@ -1,7 +1,6 @@ + package eu.dnetlib.dhp.sx.graph; -import eu.dnetlib.dhp.schema.scholexplorer.DLIRelation; -import eu.dnetlib.dhp.utils.DHPUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; @@ -10,70 +9,67 @@ import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; + +import eu.dnetlib.dhp.schema.scholexplorer.DLIRelation; +import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; /** - * In some case the identifier generated for the Entity in @{@link SparkExtractEntitiesJob} is - * different from the identifier * associated by the aggregator, this means that some relation - * points to missing identifier To avoid this problem we store in the model the Id and the - * OriginalObJIdentifier This jobs extract this pair and creates a Similar relation that will be - * used in SparkMergeEntities + * In some case the identifier generated for the Entity in @{@link SparkExtractEntitiesJob} is different from the + * identifier * associated by the aggregator, this means that some relation points to missing identifier To avoid this + * problem we store in the model the Id and the OriginalObJIdentifier This jobs extract this pair and creates a Similar + * relation that will be used in SparkMergeEntities */ public class SparkSXGeneratePidSimlarity { - static final String IDJSONPATH = "$.id"; - static final String OBJIDPATH = "$.originalObjIdentifier"; + static final String IDJSONPATH = "$.id"; + static final String OBJIDPATH = "$.originalObjIdentifier"; - public static void generateDataFrame( - final SparkSession spark, - final JavaSparkContext sc, - final String inputPath, - final String targetPath) { + public static void generateDataFrame( + final SparkSession spark, + final JavaSparkContext sc, + final String inputPath, + final String targetPath) { - final JavaPairRDD datasetSimRel = - sc.textFile(inputPath + "/dataset/*") - .mapToPair( - (PairFunction) - k -> - new Tuple2<>( - DHPUtils.getJPathString(IDJSONPATH, k), - DHPUtils.getJPathString(OBJIDPATH, k))) - .filter( - t -> - !StringUtils.substringAfter(t._1(), "|") - .equalsIgnoreCase(StringUtils.substringAfter(t._2(), "::"))) - .distinct(); + final JavaPairRDD datasetSimRel = sc + .textFile(inputPath + "/dataset/*") + .mapToPair( + (PairFunction) k -> new Tuple2<>( + DHPUtils.getJPathString(IDJSONPATH, k), + DHPUtils.getJPathString(OBJIDPATH, k))) + .filter( + t -> !StringUtils + .substringAfter(t._1(), "|") + .equalsIgnoreCase(StringUtils.substringAfter(t._2(), "::"))) + .distinct(); - final JavaPairRDD publicationSimRel = - sc.textFile(inputPath + "/publication/*") - .mapToPair( - (PairFunction) - k -> - new Tuple2<>( - DHPUtils.getJPathString(IDJSONPATH, k), - DHPUtils.getJPathString(OBJIDPATH, k))) - .filter( - t -> - !StringUtils.substringAfter(t._1(), "|") - .equalsIgnoreCase(StringUtils.substringAfter(t._2(), "::"))) - .distinct(); + final JavaPairRDD publicationSimRel = sc + .textFile(inputPath + "/publication/*") + .mapToPair( + (PairFunction) k -> new Tuple2<>( + DHPUtils.getJPathString(IDJSONPATH, k), + DHPUtils.getJPathString(OBJIDPATH, k))) + .filter( + t -> !StringUtils + .substringAfter(t._1(), "|") + .equalsIgnoreCase(StringUtils.substringAfter(t._2(), "::"))) + .distinct(); - JavaRDD simRel = - datasetSimRel - 
.union(publicationSimRel) - .map( - s -> { - final DLIRelation r = new DLIRelation(); - r.setSource(s._1()); - r.setTarget(s._2()); - r.setRelType("similar"); - return r; - }); - spark - .createDataset(simRel.rdd(), Encoders.bean(DLIRelation.class)) - .distinct() - .write() - .mode(SaveMode.Overwrite) - .save(targetPath + "/pid_simRel"); - } + JavaRDD simRel = datasetSimRel + .union(publicationSimRel) + .map( + s -> { + final DLIRelation r = new DLIRelation(); + r.setSource(s._1()); + r.setTarget(s._2()); + r.setRelType("similar"); + return r; + }); + spark + .createDataset(simRel.rdd(), Encoders.bean(DLIRelation.class)) + .distinct() + .write() + .mode(SaveMode.Overwrite) + .save(targetPath + "/pid_simRel"); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerCreateRawGraphJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerCreateRawGraphJob.java index 3d1d9ec49..385ac4d1a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerCreateRawGraphJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerCreateRawGraphJob.java @@ -1,20 +1,11 @@ + package eu.dnetlib.dhp.sx.graph; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.jayway.jsonpath.JsonPath; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; -import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; -import eu.dnetlib.dhp.schema.scholexplorer.DLIRelation; -import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown; -import eu.dnetlib.dhp.utils.DHPUtils; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.stream.Collectors; -import net.minidev.json.JSONArray; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.fs.FileStatus; @@ -31,228 +22,236 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.jayway.jsonpath.JsonPath; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; +import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; +import eu.dnetlib.dhp.schema.scholexplorer.DLIRelation; +import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown; +import eu.dnetlib.dhp.utils.DHPUtils; +import net.minidev.json.JSONArray; import scala.Tuple2; /** - * This job is responsible of the creation of RAW Graph It is applied to the different entities - * generated from {@link SparkExtractEntitiesJob} In case of dataset, publication and Unknown - * Entities we group all the entities of the same type by their identifier, and then in the reduce - * phase we merge all the entities. Merge means: -merge all the metadata -merge the collected From - * values - * - *
In case of relation we need to make a different work: -Phase 1: Map reduce jobs Map: Get all - * Relation and emit a key constructed by (source, relType, Target) and the relation itself Reduce: - * Merge all relations Looking at the javadoc of {@link SparkSXGeneratePidSimlarity} we take the - * dataset of pid relation and joining by source and target we replace the wrong identifier in the - * relation with the correct ones. At the end we replace the new Dataset of Relation + * This job is responsible of the creation of RAW Graph It is applied to the different entities generated from + * {@link SparkExtractEntitiesJob} In case of dataset, publication and Unknown Entities we group all the entities of the + * same type by their identifier, and then in the reduce phase we merge all the entities. Merge means: -merge all the + * metadata -merge the collected From values + *
+ * In case of relation we need to make a different work: -Phase 1: Map reduce jobs Map: Get all Relation and emit a key + * constructed by (source, relType, Target) and the relation itself Reduce: Merge all relations Looking at the javadoc + * of {@link SparkSXGeneratePidSimlarity} we take the dataset of pid relation and joining by source and target we + * replace the wrong identifier in the relation with the correct ones. At the end we replace the new Dataset of Relation */ public class SparkScholexplorerCreateRawGraphJob { - static final String IDJSONPATH = "$.id"; - static final String SOURCEJSONPATH = "$.source"; - static final String TARGETJSONPATH = "$.target"; - static final String RELJSONPATH = "$.relType"; + static final String IDJSONPATH = "$.id"; + static final String SOURCEJSONPATH = "$.source"; + static final String TARGETJSONPATH = "$.target"; + static final String RELJSONPATH = "$.relType"; - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkScholexplorerCreateRawGraphJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/graph/argumentparser/merge_entities_scholix_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .config( - new SparkConf() - .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")) - .appName(SparkScholexplorerCreateRawGraphJob.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String targetPath = parser.get("targetPath"); - final String entity = parser.get("entity"); - FileSystem fs = FileSystem.get(sc.sc().hadoopConfiguration()); - List subFolder = - Arrays.stream(fs.listStatus(new Path(inputPath))) - .filter(FileStatus::isDirectory) - .map(FileStatus::getPath) - .collect(Collectors.toList()); - List> inputRdd = new ArrayList<>(); - subFolder.forEach(p -> inputRdd.add(sc.textFile(p.toUri().getRawPath()))); - JavaRDD union = sc.emptyRDD(); - for (JavaRDD item : inputRdd) { - union = union.union(item); - } - switch (entity) { - case "dataset": - union - .mapToPair( - (PairFunction) - f -> { - final String id = getJPathString(IDJSONPATH, f); - ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - return new Tuple2<>(id, mapper.readValue(f, DLIDataset.class)); - }) - .reduceByKey( - (a, b) -> { - a.mergeFrom(b); - return a; - }) - .map( - item -> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(item._2()); - }) - .saveAsTextFile(targetPath, GzipCodec.class); - break; - case "publication": - union - .mapToPair( - (PairFunction) - f -> { - final String id = getJPathString(IDJSONPATH, f); - ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - return new Tuple2<>(id, mapper.readValue(f, DLIPublication.class)); - }) - .reduceByKey( - (a, b) -> { - a.mergeFrom(b); - return a; - }) - .map( - item -> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(item._2()); - }) - .saveAsTextFile(targetPath, GzipCodec.class); - break; - case "unknown": - union - .mapToPair( - (PairFunction) - f -> { - final String id = getJPathString(IDJSONPATH, f); - ObjectMapper mapper = new ObjectMapper(); 
- mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - return new Tuple2<>(id, mapper.readValue(f, DLIUnknown.class)); - }) - .reduceByKey( - (a, b) -> { - a.mergeFrom(b); - return a; - }) - .map( - item -> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(item._2()); - }) - .saveAsTextFile(targetPath, GzipCodec.class); - break; - case "relation": - SparkSXGeneratePidSimlarity.generateDataFrame( - spark, sc, inputPath.replace("/relation", ""), targetPath.replace("/relation", "")); - RDD rdd = - union - .mapToPair( - (PairFunction) - f -> { - final String source = getJPathString(SOURCEJSONPATH, f); - final String target = getJPathString(TARGETJSONPATH, f); - final String reltype = getJPathString(RELJSONPATH, f); - ObjectMapper mapper = new ObjectMapper(); - mapper.configure( - DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - return new Tuple2<>( - DHPUtils.md5( - String.format( - "%s::%s::%s", - source.toLowerCase(), - reltype.toLowerCase(), - target.toLowerCase())), - mapper.readValue(f, DLIRelation.class)); - }) - .reduceByKey( - (a, b) -> { - a.mergeFrom(b); - return a; - }) - .map(Tuple2::_2) - .rdd(); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkScholexplorerCreateRawGraphJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/sx/graph/argumentparser/merge_entities_scholix_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .config( + new SparkConf() + .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")) + .appName(SparkScholexplorerCreateRawGraphJob.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String inputPath = parser.get("sourcePath"); + final String targetPath = parser.get("targetPath"); + final String entity = parser.get("entity"); + FileSystem fs = FileSystem.get(sc.sc().hadoopConfiguration()); + List subFolder = Arrays + .stream(fs.listStatus(new Path(inputPath))) + .filter(FileStatus::isDirectory) + .map(FileStatus::getPath) + .collect(Collectors.toList()); + List> inputRdd = new ArrayList<>(); + subFolder.forEach(p -> inputRdd.add(sc.textFile(p.toUri().getRawPath()))); + JavaRDD union = sc.emptyRDD(); + for (JavaRDD item : inputRdd) { + union = union.union(item); + } + switch (entity) { + case "dataset": + union + .mapToPair( + (PairFunction) f -> { + final String id = getJPathString(IDJSONPATH, f); + ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + return new Tuple2<>(id, mapper.readValue(f, DLIDataset.class)); + }) + .reduceByKey( + (a, b) -> { + a.mergeFrom(b); + return a; + }) + .map( + item -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(item._2()); + }) + .saveAsTextFile(targetPath, GzipCodec.class); + break; + case "publication": + union + .mapToPair( + (PairFunction) f -> { + final String id = getJPathString(IDJSONPATH, f); + ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + return new Tuple2<>(id, mapper.readValue(f, DLIPublication.class)); + }) + .reduceByKey( + (a, b) -> { + a.mergeFrom(b); + return a; + }) + .map( + item -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(item._2()); + }) + .saveAsTextFile(targetPath, GzipCodec.class); + break; + case 
"unknown": + union + .mapToPair( + (PairFunction) f -> { + final String id = getJPathString(IDJSONPATH, f); + ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + return new Tuple2<>(id, mapper.readValue(f, DLIUnknown.class)); + }) + .reduceByKey( + (a, b) -> { + a.mergeFrom(b); + return a; + }) + .map( + item -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(item._2()); + }) + .saveAsTextFile(targetPath, GzipCodec.class); + break; + case "relation": + SparkSXGeneratePidSimlarity + .generateDataFrame( + spark, sc, inputPath.replace("/relation", ""), targetPath.replace("/relation", "")); + RDD rdd = union + .mapToPair( + (PairFunction) f -> { + final String source = getJPathString(SOURCEJSONPATH, f); + final String target = getJPathString(TARGETJSONPATH, f); + final String reltype = getJPathString(RELJSONPATH, f); + ObjectMapper mapper = new ObjectMapper(); + mapper + .configure( + DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + return new Tuple2<>( + DHPUtils + .md5( + String + .format( + "%s::%s::%s", + source.toLowerCase(), + reltype.toLowerCase(), + target.toLowerCase())), + mapper.readValue(f, DLIRelation.class)); + }) + .reduceByKey( + (a, b) -> { + a.mergeFrom(b); + return a; + }) + .map(Tuple2::_2) + .rdd(); - spark - .createDataset(rdd, Encoders.bean(DLIRelation.class)) - .write() - .mode(SaveMode.Overwrite) - .save(targetPath); - Dataset rel_ds = spark.read().load(targetPath).as(Encoders.bean(Relation.class)); + spark + .createDataset(rdd, Encoders.bean(DLIRelation.class)) + .write() + .mode(SaveMode.Overwrite) + .save(targetPath); + Dataset rel_ds = spark.read().load(targetPath).as(Encoders.bean(Relation.class)); - System.out.println("LOADING PATH :" + targetPath.replace("/relation", "") + "/pid_simRel"); - Dataset sim_ds = - spark - .read() - .load(targetPath.replace("/relation", "") + "/pid_simRel") - .as(Encoders.bean(Relation.class)); + System.out.println("LOADING PATH :" + targetPath.replace("/relation", "") + "/pid_simRel"); + Dataset sim_ds = spark + .read() + .load(targetPath.replace("/relation", "") + "/pid_simRel") + .as(Encoders.bean(Relation.class)); - Dataset ids = - sim_ds.map( - (MapFunction) - relation -> { - final String type = StringUtils.substringBefore(relation.getSource(), "|"); - relation.setTarget( - String.format( - "%s|%s", - type, StringUtils.substringAfter(relation.getTarget(), "::"))); - return relation; - }, - Encoders.bean(Relation.class)); + Dataset ids = sim_ds + .map( + (MapFunction) relation -> { + final String type = StringUtils.substringBefore(relation.getSource(), "|"); + relation + .setTarget( + String + .format( + "%s|%s", + type, StringUtils.substringAfter(relation.getTarget(), "::"))); + return relation; + }, + Encoders.bean(Relation.class)); - final Dataset firstJoin = - rel_ds - .joinWith(ids, ids.col("target").equalTo(rel_ds.col("source")), "left_outer") - .map( - (MapFunction, Relation>) - s -> { - if (s._2() != null) { - s._1().setSource(s._2().getSource()); - } - return s._1(); - }, - Encoders.bean(Relation.class)); + final Dataset firstJoin = rel_ds + .joinWith(ids, ids.col("target").equalTo(rel_ds.col("source")), "left_outer") + .map( + (MapFunction, Relation>) s -> { + if (s._2() != null) { + s._1().setSource(s._2().getSource()); + } + return s._1(); + }, + Encoders.bean(Relation.class)); - Dataset secondJoin = - firstJoin - .joinWith(ids, ids.col("target").equalTo(firstJoin.col("target")), "left_outer") - .map( - 
(MapFunction, Relation>) - s -> { - if (s._2() != null) { - s._1().setTarget(s._2().getSource()); - } - return s._1(); - }, - Encoders.bean(Relation.class)); - secondJoin.write().mode(SaveMode.Overwrite).save(targetPath + "_fixed"); + Dataset secondJoin = firstJoin + .joinWith(ids, ids.col("target").equalTo(firstJoin.col("target")), "left_outer") + .map( + (MapFunction, Relation>) s -> { + if (s._2() != null) { + s._1().setTarget(s._2().getSource()); + } + return s._1(); + }, + Encoders.bean(Relation.class)); + secondJoin.write().mode(SaveMode.Overwrite).save(targetPath + "_fixed"); - FileSystem fileSystem = FileSystem.get(sc.hadoopConfiguration()); + FileSystem fileSystem = FileSystem.get(sc.hadoopConfiguration()); - fileSystem.delete(new Path(targetPath), true); - fileSystem.rename(new Path(targetPath + "_fixed"), new Path(targetPath)); - } - } + fileSystem.delete(new Path(targetPath), true); + fileSystem.rename(new Path(targetPath + "_fixed"), new Path(targetPath)); + } + } - public static String getJPathString(final String jsonPath, final String json) { - try { - Object o = JsonPath.read(json, jsonPath); - if (o instanceof String) return (String) o; - if (o instanceof JSONArray && ((JSONArray) o).size() > 0) - return (String) ((JSONArray) o).get(0); - return ""; - } catch (Exception e) { - return ""; - } - } + public static String getJPathString(final String jsonPath, final String json) { + try { + Object o = JsonPath.read(json, jsonPath); + if (o instanceof String) + return (String) o; + if (o instanceof JSONArray && ((JSONArray) o).size() > 0) + return (String) ((JSONArray) o).get(0); + return ""; + } catch (Exception e) { + return ""; + } + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporter.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporter.java index e0b0710c9..97f1251f0 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporter.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporter.java @@ -1,11 +1,6 @@ + package eu.dnetlib.dhp.sx.graph; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser; -import eu.dnetlib.dhp.sx.graph.parser.PublicationScholexplorerParser; -import eu.dnetlib.scholexplorer.relation.RelationMapper; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; @@ -13,56 +8,65 @@ import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.sql.SparkSession; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser; +import eu.dnetlib.dhp.sx.graph.parser.PublicationScholexplorerParser; +import eu.dnetlib.scholexplorer.relation.RelationMapper; import scala.Tuple2; /** - * This Job read a sequential File containing XML stored in the aggregator and generates an RDD of - * heterogeneous entities like Dataset, Relation, Publication and Unknown + * This Job read a sequential File containing XML stored in the aggregator and generates an RDD of 
heterogeneous + * entities like Dataset, Relation, Publication and Unknown */ public class SparkScholexplorerGraphImporter { - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkScholexplorerGraphImporter.class.getResourceAsStream( - "/eu/dnetlib/dhp/sx/graph/argumentparser/input_graph_scholix_parameters.json"))); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkScholexplorerGraphImporter.class + .getResourceAsStream( + "/eu/dnetlib/dhp/sx/graph/argumentparser/input_graph_scholix_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .appName(SparkScholexplorerGraphImporter.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkScholexplorerGraphImporter.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final String inputPath = parser.get("sourcePath"); - RelationMapper relationMapper = RelationMapper.load(); + RelationMapper relationMapper = RelationMapper.load(); - sc.sequenceFile(inputPath, IntWritable.class, Text.class) - .map(Tuple2::_2) - .map(Text::toString) - .repartition(500) - .flatMap( - (FlatMapFunction) - record -> { - switch (parser.get("entity")) { - case "dataset": - final DatasetScholexplorerParser d = new DatasetScholexplorerParser(); - return d.parseObject(record, relationMapper).iterator(); - case "publication": - final PublicationScholexplorerParser p = new PublicationScholexplorerParser(); - return p.parseObject(record, relationMapper).iterator(); - default: - throw new IllegalArgumentException("wrong values of entities"); - } - }) - .map( - k -> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(k); - }) - .saveAsTextFile(parser.get("targetPath"), GzipCodec.class); - } + sc + .sequenceFile(inputPath, IntWritable.class, Text.class) + .map(Tuple2::_2) + .map(Text::toString) + .repartition(500) + .flatMap( + (FlatMapFunction) record -> { + switch (parser.get("entity")) { + case "dataset": + final DatasetScholexplorerParser d = new DatasetScholexplorerParser(); + return d.parseObject(record, relationMapper).iterator(); + case "publication": + final PublicationScholexplorerParser p = new PublicationScholexplorerParser(); + return p.parseObject(record, relationMapper).iterator(); + default: + throw new IllegalArgumentException("wrong values of entities"); + } + }) + .map( + k -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(k); + }) + .saveAsTextFile(parser.get("targetPath"), GzipCodec.class); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java index 5e11c2a53..c97753fdc 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/AbstractScholexplorerParser.java @@ -1,5 
+1,17 @@ + package eu.dnetlib.dhp.sx.graph.parser; +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import java.util.stream.Collectors; + +import javax.xml.stream.XMLStreamReader; + +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.scholexplorer.DLIRelation; @@ -8,199 +20,195 @@ import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo; import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.scholexplorer.relation.RelInfo; import eu.dnetlib.scholexplorer.relation.RelationMapper; -import java.util.*; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.stream.Collectors; -import javax.xml.stream.XMLStreamReader; -import org.apache.commons.lang3.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; public abstract class AbstractScholexplorerParser { - protected static final Log log = LogFactory.getLog(AbstractScholexplorerParser.class); - static final Pattern pattern = - Pattern.compile("10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$", Pattern.CASE_INSENSITIVE); - private List datasetSubTypes = - Arrays.asList( - "dataset", - "software", - "film", - "sound", - "physicalobject", - "audiovisual", - "collection", - "other", - "study", - "metadata"); + protected static final Log log = LogFactory.getLog(AbstractScholexplorerParser.class); + static final Pattern pattern = Pattern.compile("10\\.\\d{4,9}/[-._;()/:A-Z0-9]+$", Pattern.CASE_INSENSITIVE); + private List datasetSubTypes = Arrays + .asList( + "dataset", + "software", + "film", + "sound", + "physicalobject", + "audiovisual", + "collection", + "other", + "study", + "metadata"); - public abstract List parseObject(final String record, final RelationMapper relMapper); + public abstract List parseObject(final String record, final RelationMapper relMapper); - protected Map getAttributes(final XMLStreamReader parser) { - final Map attributesMap = new HashMap<>(); - for (int i = 0; i < parser.getAttributeCount(); i++) { - attributesMap.put(parser.getAttributeLocalName(i), parser.getAttributeValue(i)); - } - return attributesMap; - } + protected Map getAttributes(final XMLStreamReader parser) { + final Map attributesMap = new HashMap<>(); + for (int i = 0; i < parser.getAttributeCount(); i++) { + attributesMap.put(parser.getAttributeLocalName(i), parser.getAttributeValue(i)); + } + return attributesMap; + } - protected List extractSubject(List subjects) { - final List subjectResult = new ArrayList<>(); - if (subjects != null && subjects.size() > 0) { - subjects.forEach( - subjectMap -> { - final StructuredProperty subject = new StructuredProperty(); - subject.setValue(subjectMap.getTextValue()); - final Qualifier schema = new Qualifier(); - schema.setClassid("dnet:subject"); - schema.setClassname("dnet:subject"); - schema.setSchemeid(subjectMap.getAttributes().get("subjectScheme")); - schema.setSchemename(subjectMap.getAttributes().get("subjectScheme")); - subject.setQualifier(schema); - subjectResult.add(subject); - }); - } - return subjectResult; - } + protected List extractSubject(List subjects) { + final List subjectResult = new ArrayList<>(); + if (subjects != null && subjects.size() > 0) { + subjects + .forEach( + subjectMap -> { + final StructuredProperty subject = new StructuredProperty(); + subject.setValue(subjectMap.getTextValue()); + 
final Qualifier schema = new Qualifier(); + schema.setClassid("dnet:subject"); + schema.setClassname("dnet:subject"); + schema.setSchemeid(subjectMap.getAttributes().get("subjectScheme")); + schema.setSchemename(subjectMap.getAttributes().get("subjectScheme")); + subject.setQualifier(schema); + subjectResult.add(subject); + }); + } + return subjectResult; + } - protected StructuredProperty extractIdentifier( - List identifierType, final String fieldName) { - final StructuredProperty pid = new StructuredProperty(); - if (identifierType != null && identifierType.size() > 0) { - final VtdUtilityParser.Node result = identifierType.get(0); - pid.setValue(result.getTextValue()); - final Qualifier pidType = new Qualifier(); - pidType.setClassname(result.getAttributes().get(fieldName)); - pidType.setClassid(result.getAttributes().get(fieldName)); - pidType.setSchemename("dnet:pid_types"); - pidType.setSchemeid("dnet:pid_types"); - pid.setQualifier(pidType); - return pid; - } - return null; - } + protected StructuredProperty extractIdentifier( + List identifierType, final String fieldName) { + final StructuredProperty pid = new StructuredProperty(); + if (identifierType != null && identifierType.size() > 0) { + final VtdUtilityParser.Node result = identifierType.get(0); + pid.setValue(result.getTextValue()); + final Qualifier pidType = new Qualifier(); + pidType.setClassname(result.getAttributes().get(fieldName)); + pidType.setClassid(result.getAttributes().get(fieldName)); + pidType.setSchemename("dnet:pid_types"); + pidType.setSchemeid("dnet:pid_types"); + pid.setQualifier(pidType); + return pid; + } + return null; + } - protected void inferPid(final StructuredProperty input) { - final Matcher matcher = pattern.matcher(input.getValue()); - if (matcher.find()) { - input.setValue(matcher.group()); - if (input.getQualifier() == null) { - input.setQualifier(new Qualifier()); - input.getQualifier().setSchemename("dnet:pid_types"); - input.getQualifier().setSchemeid("dnet:pid_types"); - } - input.getQualifier().setClassid("doi"); - input.getQualifier().setClassname("doi"); - } - } + protected void inferPid(final StructuredProperty input) { + final Matcher matcher = pattern.matcher(input.getValue()); + if (matcher.find()) { + input.setValue(matcher.group()); + if (input.getQualifier() == null) { + input.setQualifier(new Qualifier()); + input.getQualifier().setSchemename("dnet:pid_types"); + input.getQualifier().setSchemeid("dnet:pid_types"); + } + input.getQualifier().setClassid("doi"); + input.getQualifier().setClassname("doi"); + } + } - protected String generateId(final String pid, final String pidType, final String entityType) { - String type; - switch (entityType) { - case "publication": - type = "50|"; - break; - case "dataset": - type = "60|"; - break; - case "unknown": - type = "70|"; - break; - default: - throw new IllegalArgumentException("unexpected value " + entityType); - } - if ("dnet".equalsIgnoreCase(pidType)) return type + StringUtils.substringAfter(pid, "::"); + protected String generateId(final String pid, final String pidType, final String entityType) { + String type; + switch (entityType) { + case "publication": + type = "50|"; + break; + case "dataset": + type = "60|"; + break; + case "unknown": + type = "70|"; + break; + default: + throw new IllegalArgumentException("unexpected value " + entityType); + } + if ("dnet".equalsIgnoreCase(pidType)) + return type + StringUtils.substringAfter(pid, "::"); - return type - + DHPUtils.md5( - String.format("%s::%s", pid.toLowerCase().trim(), 
pidType.toLowerCase().trim())); - } + return type + + DHPUtils + .md5( + String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); + } - protected DLIUnknown createUnknownObject( - final String pid, - final String pidType, - final KeyValue cf, - final DataInfo di, - final String dateOfCollection) { - final DLIUnknown uk = new DLIUnknown(); - uk.setId(generateId(pid, pidType, "unknown")); - ProvenaceInfo pi = new ProvenaceInfo(); - pi.setId(cf.getKey()); - pi.setName(cf.getValue()); - pi.setCompletionStatus("incomplete"); - uk.setDataInfo(di); - uk.setDlicollectedfrom(Collections.singletonList(pi)); - final StructuredProperty sourcePid = new StructuredProperty(); - sourcePid.setValue(pid); - final Qualifier pt = new Qualifier(); - pt.setClassname(pidType); - pt.setClassid(pidType); - pt.setSchemename("dnet:pid_types"); - pt.setSchemeid("dnet:pid_types"); - sourcePid.setQualifier(pt); - uk.setPid(Collections.singletonList(sourcePid)); - uk.setDateofcollection(dateOfCollection); - return uk; - } + protected DLIUnknown createUnknownObject( + final String pid, + final String pidType, + final KeyValue cf, + final DataInfo di, + final String dateOfCollection) { + final DLIUnknown uk = new DLIUnknown(); + uk.setId(generateId(pid, pidType, "unknown")); + ProvenaceInfo pi = new ProvenaceInfo(); + pi.setId(cf.getKey()); + pi.setName(cf.getValue()); + pi.setCompletionStatus("incomplete"); + uk.setDataInfo(di); + uk.setDlicollectedfrom(Collections.singletonList(pi)); + final StructuredProperty sourcePid = new StructuredProperty(); + sourcePid.setValue(pid); + final Qualifier pt = new Qualifier(); + pt.setClassname(pidType); + pt.setClassid(pidType); + pt.setSchemename("dnet:pid_types"); + pt.setSchemeid("dnet:pid_types"); + sourcePid.setQualifier(pt); + uk.setPid(Collections.singletonList(sourcePid)); + uk.setDateofcollection(dateOfCollection); + return uk; + } - protected void generateRelations( - RelationMapper relationMapper, - Result parsedObject, - List result, - DataInfo di, - String dateOfCollection, - List relatedIdentifiers) { - if (relatedIdentifiers != null) { - result.addAll( - relatedIdentifiers.stream() - .flatMap( - n -> { - final List rels = new ArrayList<>(); - DLIRelation r = new DLIRelation(); - r.setSource(parsedObject.getId()); - final String relatedPid = n.getTextValue(); - final String relatedPidType = n.getAttributes().get("relatedIdentifierType"); - final String relatedType = - n.getAttributes().getOrDefault("entityType", "unknown"); - String relationSemantic = n.getAttributes().get("relationType"); - String inverseRelation; - final String targetId = generateId(relatedPid, relatedPidType, relatedType); - r.setDateOfCollection(dateOfCollection); - if (relationMapper.containsKey(relationSemantic.toLowerCase())) { - RelInfo relInfo = relationMapper.get(relationSemantic.toLowerCase()); - relationSemantic = relInfo.getOriginal(); - inverseRelation = relInfo.getInverse(); - } else { - relationSemantic = "Unknown"; - inverseRelation = "Unknown"; - } - r.setTarget(targetId); - r.setRelType(relationSemantic); - r.setRelClass("datacite"); - r.setCollectedfrom(parsedObject.getCollectedfrom()); - r.setDataInfo(di); - rels.add(r); - r = new DLIRelation(); - r.setDataInfo(di); - r.setSource(targetId); - r.setTarget(parsedObject.getId()); - r.setRelType(inverseRelation); - r.setRelClass("datacite"); - r.setCollectedfrom(parsedObject.getCollectedfrom()); - r.setDateOfCollection(dateOfCollection); - rels.add(r); - if ("unknown".equalsIgnoreCase(relatedType)) - 
result.add( - createUnknownObject( - relatedPid, - relatedPidType, - parsedObject.getCollectedfrom().get(0), - di, - dateOfCollection)); - return rels.stream(); - }) - .collect(Collectors.toList())); - } - } + protected void generateRelations( + RelationMapper relationMapper, + Result parsedObject, + List result, + DataInfo di, + String dateOfCollection, + List relatedIdentifiers) { + if (relatedIdentifiers != null) { + result + .addAll( + relatedIdentifiers + .stream() + .flatMap( + n -> { + final List rels = new ArrayList<>(); + DLIRelation r = new DLIRelation(); + r.setSource(parsedObject.getId()); + final String relatedPid = n.getTextValue(); + final String relatedPidType = n.getAttributes().get("relatedIdentifierType"); + final String relatedType = n.getAttributes().getOrDefault("entityType", "unknown"); + String relationSemantic = n.getAttributes().get("relationType"); + String inverseRelation; + final String targetId = generateId(relatedPid, relatedPidType, relatedType); + r.setDateOfCollection(dateOfCollection); + if (relationMapper.containsKey(relationSemantic.toLowerCase())) { + RelInfo relInfo = relationMapper.get(relationSemantic.toLowerCase()); + relationSemantic = relInfo.getOriginal(); + inverseRelation = relInfo.getInverse(); + } else { + relationSemantic = "Unknown"; + inverseRelation = "Unknown"; + } + r.setTarget(targetId); + r.setRelType(relationSemantic); + r.setRelClass("datacite"); + r.setCollectedfrom(parsedObject.getCollectedfrom()); + r.setDataInfo(di); + rels.add(r); + r = new DLIRelation(); + r.setDataInfo(di); + r.setSource(targetId); + r.setTarget(parsedObject.getId()); + r.setRelType(inverseRelation); + r.setRelClass("datacite"); + r.setCollectedfrom(parsedObject.getCollectedfrom()); + r.setDateOfCollection(dateOfCollection); + rels.add(r); + if ("unknown".equalsIgnoreCase(relatedType)) + result + .add( + createUnknownObject( + relatedPid, + relatedPidType, + parsedObject.getCollectedfrom().get(0), + di, + dateOfCollection)); + return rels.stream(); + }) + .collect(Collectors.toList())); + } + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java index 07b711106..f49163c87 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/DatasetScholexplorerParser.java @@ -1,270 +1,292 @@ + package eu.dnetlib.dhp.sx.graph.parser; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; + import com.ximpleware.AutoPilot; import com.ximpleware.VTDGen; import com.ximpleware.VTDNav; + import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo; import eu.dnetlib.scholexplorer.relation.RelationMapper; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; -import org.apache.commons.lang3.StringUtils; public class DatasetScholexplorerParser extends AbstractScholexplorerParser { - @Override - public List parseObject(String 
record, final RelationMapper relationMapper) { - try { - final DLIDataset parsedObject = new DLIDataset(); - final VTDGen vg = new VTDGen(); - vg.setDoc(record.getBytes()); - final List result = new ArrayList<>(); - vg.parse(true); + @Override + public List parseObject(String record, final RelationMapper relationMapper) { + try { + final DLIDataset parsedObject = new DLIDataset(); + final VTDGen vg = new VTDGen(); + vg.setDoc(record.getBytes()); + final List result = new ArrayList<>(); + vg.parse(true); - final VTDNav vn = vg.getNav(); - final AutoPilot ap = new AutoPilot(vn); + final VTDNav vn = vg.getNav(); + final AutoPilot ap = new AutoPilot(vn); - DataInfo di = new DataInfo(); - di.setTrust("0.9"); - di.setDeletedbyinference(false); - di.setInvisible(false); - parsedObject.setDataInfo(di); + DataInfo di = new DataInfo(); + di.setTrust("0.9"); + di.setDeletedbyinference(false); + di.setInvisible(false); + parsedObject.setDataInfo(di); - parsedObject.setOriginalId( - Collections.singletonList( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']"))); + parsedObject + .setOriginalId( + Collections + .singletonList( + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']"))); - parsedObject.setOriginalObjIdentifier( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']")); - String dateOfCollection = - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"); - parsedObject.setDateofcollection(dateOfCollection); + parsedObject + .setOriginalObjIdentifier( + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']")); + String dateOfCollection = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"); + parsedObject.setDateofcollection(dateOfCollection); - final String resolvedDate = - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']"); + final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']"); - if (StringUtils.isNotBlank(resolvedDate)) { - StructuredProperty currentDate = new StructuredProperty(); - currentDate.setValue(resolvedDate); - final Qualifier dateQualifier = new Qualifier(); - dateQualifier.setClassname("resolvedDate"); - dateQualifier.setClassid("resolvedDate"); - dateQualifier.setSchemename("dnet::date"); - dateQualifier.setSchemeid("dnet::date"); - currentDate.setQualifier(dateQualifier); - parsedObject.setRelevantdate(Collections.singletonList(currentDate)); - } + if (StringUtils.isNotBlank(resolvedDate)) { + StructuredProperty currentDate = new StructuredProperty(); + currentDate.setValue(resolvedDate); + final Qualifier dateQualifier = new Qualifier(); + dateQualifier.setClassname("resolvedDate"); + dateQualifier.setClassid("resolvedDate"); + dateQualifier.setSchemename("dnet::date"); + dateQualifier.setSchemeid("dnet::date"); + currentDate.setQualifier(dateQualifier); + parsedObject.setRelevantdate(Collections.singletonList(currentDate)); + } - final String completionStatus = - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']"); - final String provisionMode = - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']"); + final String completionStatus = VtdUtilityParser + .getSingleValue(ap, vn, "//*[local-name()='completionStatus']"); + final String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']"); - final String publisher = - VtdUtilityParser.getSingleValue( - ap, vn, 
"//*[local-name()='resource']/*[local-name()='publisher']"); + final String publisher = VtdUtilityParser + .getSingleValue( + ap, vn, "//*[local-name()='resource']/*[local-name()='publisher']"); - List collectedFromNodes = - VtdUtilityParser.getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='collectedFrom']", - Arrays.asList("name", "id", "mode", "completionStatus")); + List collectedFromNodes = VtdUtilityParser + .getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='collectedFrom']", + Arrays.asList("name", "id", "mode", "completionStatus")); - List resolvededFromNodes = - VtdUtilityParser.getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='resolvedFrom']", - Arrays.asList("name", "id", "mode", "completionStatus")); + List resolvededFromNodes = VtdUtilityParser + .getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='resolvedFrom']", + Arrays.asList("name", "id", "mode", "completionStatus")); - Field pf = new Field<>(); - pf.setValue(publisher); + Field pf = new Field<>(); + pf.setValue(publisher); - parsedObject.setPublisher(pf); - final List provenances = new ArrayList<>(); - if (collectedFromNodes != null && collectedFromNodes.size() > 0) { - collectedFromNodes.forEach( - it -> { - final ProvenaceInfo provenance = new ProvenaceInfo(); - provenance.setId(it.getAttributes().get("id")); - provenance.setName(it.getAttributes().get("name")); - provenance.setCollectionMode(provisionMode); - provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); - provenances.add(provenance); - }); - } + parsedObject.setPublisher(pf); + final List provenances = new ArrayList<>(); + if (collectedFromNodes != null && collectedFromNodes.size() > 0) { + collectedFromNodes + .forEach( + it -> { + final ProvenaceInfo provenance = new ProvenaceInfo(); + provenance.setId(it.getAttributes().get("id")); + provenance.setName(it.getAttributes().get("name")); + provenance.setCollectionMode(provisionMode); + provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); + provenances.add(provenance); + }); + } - if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) { - resolvededFromNodes.forEach( - it -> { - final ProvenaceInfo provenance = new ProvenaceInfo(); - provenance.setId(it.getAttributes().get("id")); - provenance.setName(it.getAttributes().get("name")); - provenance.setCollectionMode("resolved"); - provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); - provenances.add(provenance); - }); - } + if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) { + resolvededFromNodes + .forEach( + it -> { + final ProvenaceInfo provenance = new ProvenaceInfo(); + provenance.setId(it.getAttributes().get("id")); + provenance.setName(it.getAttributes().get("name")); + provenance.setCollectionMode("resolved"); + provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); + provenances.add(provenance); + }); + } - parsedObject.setDlicollectedfrom(provenances); - parsedObject.setCollectedfrom( - parsedObject.getDlicollectedfrom().stream() - .map( - p -> { - final KeyValue cf = new KeyValue(); - cf.setKey(p.getId()); - cf.setValue(p.getName()); - return cf; - }) - .collect(Collectors.toList())); - parsedObject.setCompletionStatus( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']")); + parsedObject.setDlicollectedfrom(provenances); + parsedObject + .setCollectedfrom( + parsedObject + .getDlicollectedfrom() + .stream() + .map( + p -> { + final KeyValue cf = new 
KeyValue(); + cf.setKey(p.getId()); + cf.setValue(p.getName()); + return cf; + }) + .collect(Collectors.toList())); + parsedObject + .setCompletionStatus( + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']")); - final List identifierType = - VtdUtilityParser.getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='resource']/*[local-name()='identifier']", - Collections.singletonList("identifierType")); + final List identifierType = VtdUtilityParser + .getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='resource']/*[local-name()='identifier']", + Collections.singletonList("identifierType")); - StructuredProperty currentPid = extractIdentifier(identifierType, "identifierType"); - if (currentPid == null) return null; - inferPid(currentPid); - parsedObject.setPid(Collections.singletonList(currentPid)); + StructuredProperty currentPid = extractIdentifier(identifierType, "identifierType"); + if (currentPid == null) + return null; + inferPid(currentPid); + parsedObject.setPid(Collections.singletonList(currentPid)); - final String sourceId = - generateId(currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset"); - parsedObject.setId(sourceId); + final String sourceId = generateId( + currentPid.getValue(), currentPid.getQualifier().getClassid(), "dataset"); + parsedObject.setId(sourceId); - List descs = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='description']"); - if (descs != null && descs.size() > 0) - parsedObject.setDescription( - descs.stream() - .map(it -> it.length() < 10000 ? it : it.substring(0, 10000)) - .map( - it -> { - final Field d = new Field<>(); - d.setValue(it); - return d; - }) - .collect(Collectors.toList())); + List descs = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='description']"); + if (descs != null && descs.size() > 0) + parsedObject + .setDescription( + descs + .stream() + .map(it -> it.length() < 10000 ? 
it : it.substring(0, 10000)) + .map( + it -> { + final Field d = new Field<>(); + d.setValue(it); + return d; + }) + .collect(Collectors.toList())); - final List relatedIdentifiers = - VtdUtilityParser.getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='relatedIdentifier']", - Arrays.asList( - "relatedIdentifierType", "relationType", "entityType", "inverseRelationType")); + final List relatedIdentifiers = VtdUtilityParser + .getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='relatedIdentifier']", + Arrays + .asList( + "relatedIdentifierType", "relationType", "entityType", "inverseRelationType")); - generateRelations( - relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers); + generateRelations( + relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers); - final List hostedBy = - VtdUtilityParser.getTextValuesWithAttributes( - ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name")); + final List hostedBy = VtdUtilityParser + .getTextValuesWithAttributes( + ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name")); - if (hostedBy != null) { - parsedObject.setInstance( - hostedBy.stream() - .map( - it -> { - final Instance i = new Instance(); - i.setUrl(Collections.singletonList(currentPid.getValue())); - KeyValue h = new KeyValue(); - i.setHostedby(h); - h.setKey(it.getAttributes().get("id")); - h.setValue(it.getAttributes().get("name")); - return i; - }) - .collect(Collectors.toList())); - } + if (hostedBy != null) { + parsedObject + .setInstance( + hostedBy + .stream() + .map( + it -> { + final Instance i = new Instance(); + i.setUrl(Collections.singletonList(currentPid.getValue())); + KeyValue h = new KeyValue(); + i.setHostedby(h); + h.setKey(it.getAttributes().get("id")); + h.setValue(it.getAttributes().get("name")); + return i; + }) + .collect(Collectors.toList())); + } - List subjects = - extractSubject( - VtdUtilityParser.getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='resource']//*[local-name()='subject']", - Collections.singletonList("subjectScheme"))); + List subjects = extractSubject( + VtdUtilityParser + .getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='resource']//*[local-name()='subject']", + Collections.singletonList("subjectScheme"))); - parsedObject.setSubject(subjects); + parsedObject.setSubject(subjects); - Qualifier q = new Qualifier(); - q.setClassname("dataset"); - q.setClassid("dataset"); - q.setSchemename("dataset"); - q.setSchemeid("dataset"); - parsedObject.setResulttype(q); + Qualifier q = new Qualifier(); + q.setClassname("dataset"); + q.setClassid("dataset"); + q.setSchemename("dataset"); + q.setSchemeid("dataset"); + parsedObject.setResulttype(q); - parsedObject.setCompletionStatus(completionStatus); + parsedObject.setCompletionStatus(completionStatus); - final List creators = - VtdUtilityParser.getTextValue( - ap, - vn, - "//*[local-name()='resource']//*[local-name()='creator']/*[local-name()='creatorName']"); - if (creators != null && creators.size() > 0) { - parsedObject.setAuthor( - creators.stream() - .map( - a -> { - final Author author = new Author(); - author.setFullname(a); - return author; - }) - .collect(Collectors.toList())); - } - final List titles = - VtdUtilityParser.getTextValue( - ap, vn, "//*[local-name()='resource']//*[local-name()='title']"); - if (titles != null && titles.size() > 0) { - parsedObject.setTitle( - titles.stream() - .map( - t -> { - final StructuredProperty st = new StructuredProperty(); - st.setValue(t); - 
return st; - }) - .collect(Collectors.toList())); - } + final List creators = VtdUtilityParser + .getTextValue( + ap, + vn, + "//*[local-name()='resource']//*[local-name()='creator']/*[local-name()='creatorName']"); + if (creators != null && creators.size() > 0) { + parsedObject + .setAuthor( + creators + .stream() + .map( + a -> { + final Author author = new Author(); + author.setFullname(a); + return author; + }) + .collect(Collectors.toList())); + } + final List titles = VtdUtilityParser + .getTextValue( + ap, vn, "//*[local-name()='resource']//*[local-name()='title']"); + if (titles != null && titles.size() > 0) { + parsedObject + .setTitle( + titles + .stream() + .map( + t -> { + final StructuredProperty st = new StructuredProperty(); + st.setValue(t); + return st; + }) + .collect(Collectors.toList())); + } - final List dates = - VtdUtilityParser.getTextValue( - ap, - vn, - "//*[local-name()='resource']/*[local-name()='dates']/*[local-name()='date']"); + final List dates = VtdUtilityParser + .getTextValue( + ap, + vn, + "//*[local-name()='resource']/*[local-name()='dates']/*[local-name()='date']"); - if (dates != null && dates.size() > 0) { - parsedObject.setRelevantdate( - dates.stream() - .map( - cd -> { - StructuredProperty date = new StructuredProperty(); - date.setValue(cd); - final Qualifier dq = new Qualifier(); - dq.setClassname("date"); - dq.setClassid("date"); - dq.setSchemename("dnet::date"); - dq.setSchemeid("dnet::date"); - date.setQualifier(dq); - return date; - }) - .collect(Collectors.toList())); - } + if (dates != null && dates.size() > 0) { + parsedObject + .setRelevantdate( + dates + .stream() + .map( + cd -> { + StructuredProperty date = new StructuredProperty(); + date.setValue(cd); + final Qualifier dq = new Qualifier(); + dq.setClassname("date"); + dq.setClassid("date"); + dq.setSchemename("dnet::date"); + dq.setSchemeid("dnet::date"); + date.setQualifier(dq); + return date; + }) + .collect(Collectors.toList())); + } - result.add(parsedObject); - return result; - } catch (Throwable e) { - log.error("Error on parsing record " + record, e); - return null; - } - } + result.add(parsedObject); + return result; + } catch (Throwable e) { + log.error("Error on parsing record " + record, e); + return null; + } + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java index 2f7d48417..edbb444db 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/sx/graph/parser/PublicationScholexplorerParser.java @@ -1,243 +1,259 @@ + package eu.dnetlib.dhp.sx.graph.parser; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; + import com.ximpleware.AutoPilot; import com.ximpleware.VTDGen; import com.ximpleware.VTDNav; + import eu.dnetlib.dhp.parser.utility.VtdUtilityParser; import eu.dnetlib.dhp.parser.utility.VtdUtilityParser.Node; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; import eu.dnetlib.dhp.schema.scholexplorer.ProvenaceInfo; import eu.dnetlib.scholexplorer.relation.RelationMapper; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; 
-import java.util.List; -import java.util.stream.Collectors; -import org.apache.commons.lang3.StringUtils; public class PublicationScholexplorerParser extends AbstractScholexplorerParser { - @Override - public List<Oaf> parseObject(final String record, final RelationMapper relationMapper) { - try { - final List<Oaf> result = new ArrayList<>(); - final DLIPublication parsedObject = new DLIPublication(); - final VTDGen vg = new VTDGen(); - vg.setDoc(record.getBytes()); - vg.parse(true); + @Override + public List<Oaf> parseObject(final String record, final RelationMapper relationMapper) { + try { + final List<Oaf> result = new ArrayList<>(); + final DLIPublication parsedObject = new DLIPublication(); + final VTDGen vg = new VTDGen(); + vg.setDoc(record.getBytes()); + vg.parse(true); - final VTDNav vn = vg.getNav(); - final AutoPilot ap = new AutoPilot(vn); + final VTDNav vn = vg.getNav(); + final AutoPilot ap = new AutoPilot(vn); - final DataInfo di = new DataInfo(); - di.setTrust("0.9"); - di.setDeletedbyinference(false); - di.setInvisible(false); + final DataInfo di = new DataInfo(); + di.setTrust("0.9"); + di.setDeletedbyinference(false); + di.setInvisible(false); - String dateOfCollection = - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"); - parsedObject.setDateofcollection(dateOfCollection); + String dateOfCollection = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='dateOfCollection']"); + parsedObject.setDateofcollection(dateOfCollection); - final String resolvedDate = - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']"); - parsedObject.setOriginalId( - Collections.singletonList( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']"))); + final String resolvedDate = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='resolvedDate']"); + parsedObject + .setOriginalId( + Collections + .singletonList( + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='recordIdentifier']"))); - if (StringUtils.isNotBlank(resolvedDate)) { - StructuredProperty currentDate = new StructuredProperty(); - currentDate.setValue(resolvedDate); - final Qualifier dateQualifier = new Qualifier(); - dateQualifier.setClassname("resolvedDate"); - dateQualifier.setClassid("resolvedDate"); - dateQualifier.setSchemename("dnet::date"); - dateQualifier.setSchemeid("dnet::date"); - currentDate.setQualifier(dateQualifier); - parsedObject.setRelevantdate(Collections.singletonList(currentDate)); - } + if (StringUtils.isNotBlank(resolvedDate)) { + StructuredProperty currentDate = new StructuredProperty(); + currentDate.setValue(resolvedDate); + final Qualifier dateQualifier = new Qualifier(); + dateQualifier.setClassname("resolvedDate"); + dateQualifier.setClassid("resolvedDate"); + dateQualifier.setSchemename("dnet::date"); + dateQualifier.setSchemeid("dnet::date"); + currentDate.setQualifier(dateQualifier); + parsedObject.setRelevantdate(Collections.singletonList(currentDate)); + } - final List<Node> pid = - VtdUtilityParser.getTextValuesWithAttributes( - ap, vn, "//*[local-name()='pid']", Arrays.asList("type")); + final List<Node> pid = VtdUtilityParser + .getTextValuesWithAttributes( + ap, vn, "//*[local-name()='pid']", Arrays.asList("type")); - StructuredProperty currentPid = extractIdentifier(pid, "type"); - if (currentPid == null) return null; - inferPid(currentPid); - parsedObject.setPid(Collections.singletonList(currentPid)); - final String sourceId = - generateId(currentPid.getValue(), currentPid.getQualifier().getClassid(), "publication"); -
parsedObject.setId(sourceId); + StructuredProperty currentPid = extractIdentifier(pid, "type"); + if (currentPid == null) + return null; + inferPid(currentPid); + parsedObject.setPid(Collections.singletonList(currentPid)); + final String sourceId = generateId( + currentPid.getValue(), currentPid.getQualifier().getClassid(), "publication"); + parsedObject.setId(sourceId); - parsedObject.setOriginalObjIdentifier( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']")); + parsedObject + .setOriginalObjIdentifier( + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='objIdentifier']")); - String provisionMode = - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']"); + String provisionMode = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='provisionMode']"); - List collectedFromNodes = - VtdUtilityParser.getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='collectedFrom']", - Arrays.asList("name", "id", "mode", "completionStatus")); + List collectedFromNodes = VtdUtilityParser + .getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='collectedFrom']", + Arrays.asList("name", "id", "mode", "completionStatus")); - List resolvededFromNodes = - VtdUtilityParser.getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='resolvedFrom']", - Arrays.asList("name", "id", "mode", "completionStatus")); + List resolvededFromNodes = VtdUtilityParser + .getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='resolvedFrom']", + Arrays.asList("name", "id", "mode", "completionStatus")); - final String publisher = - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='publisher']"); - Field pf = new Field<>(); - pf.setValue(publisher); + final String publisher = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='publisher']"); + Field pf = new Field<>(); + pf.setValue(publisher); - parsedObject.setPublisher(pf); - final List provenances = new ArrayList<>(); - if (collectedFromNodes != null && collectedFromNodes.size() > 0) { - collectedFromNodes.forEach( - it -> { - final ProvenaceInfo provenance = new ProvenaceInfo(); - provenance.setId(it.getAttributes().get("id")); - provenance.setName(it.getAttributes().get("name")); - provenance.setCollectionMode(provisionMode); - provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); - provenances.add(provenance); - }); - } + parsedObject.setPublisher(pf); + final List provenances = new ArrayList<>(); + if (collectedFromNodes != null && collectedFromNodes.size() > 0) { + collectedFromNodes + .forEach( + it -> { + final ProvenaceInfo provenance = new ProvenaceInfo(); + provenance.setId(it.getAttributes().get("id")); + provenance.setName(it.getAttributes().get("name")); + provenance.setCollectionMode(provisionMode); + provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); + provenances.add(provenance); + }); + } - if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) { - resolvededFromNodes.forEach( - it -> { - final ProvenaceInfo provenance = new ProvenaceInfo(); - provenance.setId(it.getAttributes().get("id")); - provenance.setName(it.getAttributes().get("name")); - provenance.setCollectionMode("resolved"); - provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); - provenances.add(provenance); - }); - } + if (resolvededFromNodes != null && resolvededFromNodes.size() > 0) { + resolvededFromNodes + .forEach( + it -> { + final ProvenaceInfo provenance = new ProvenaceInfo(); + 
provenance.setId(it.getAttributes().get("id")); + provenance.setName(it.getAttributes().get("name")); + provenance.setCollectionMode("resolved"); + provenance.setCompletionStatus(it.getAttributes().get("completionStatus")); + provenances.add(provenance); + }); + } - parsedObject.setDlicollectedfrom(provenances); - parsedObject.setCompletionStatus( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']")); + parsedObject.setDlicollectedfrom(provenances); + parsedObject + .setCompletionStatus( + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='completionStatus']")); - parsedObject.setCollectedfrom( - parsedObject.getDlicollectedfrom().stream() - .map( - p -> { - final KeyValue cf = new KeyValue(); - cf.setKey(p.getId()); - cf.setValue(p.getName()); - return cf; - }) - .collect(Collectors.toList())); + parsedObject + .setCollectedfrom( + parsedObject + .getDlicollectedfrom() + .stream() + .map( + p -> { + final KeyValue cf = new KeyValue(); + cf.setKey(p.getId()); + cf.setValue(p.getName()); + return cf; + }) + .collect(Collectors.toList())); - final List relatedIdentifiers = - VtdUtilityParser.getTextValuesWithAttributes( - ap, - vn, - "//*[local-name()='relatedIdentifier']", - Arrays.asList( - "relatedIdentifierType", "relationType", "entityType", "inverseRelationType")); - generateRelations( - relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers); + final List relatedIdentifiers = VtdUtilityParser + .getTextValuesWithAttributes( + ap, + vn, + "//*[local-name()='relatedIdentifier']", + Arrays + .asList( + "relatedIdentifierType", "relationType", "entityType", "inverseRelationType")); + generateRelations( + relationMapper, parsedObject, result, di, dateOfCollection, relatedIdentifiers); - final List hostedBy = - VtdUtilityParser.getTextValuesWithAttributes( - ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name")); + final List hostedBy = VtdUtilityParser + .getTextValuesWithAttributes( + ap, vn, "//*[local-name()='hostedBy']", Arrays.asList("id", "name")); - if (hostedBy != null) { - parsedObject.setInstance( - hostedBy.stream() - .map( - it -> { - final Instance i = new Instance(); - i.setUrl(Collections.singletonList(currentPid.getValue())); - KeyValue h = new KeyValue(); - i.setHostedby(h); - h.setKey(it.getAttributes().get("id")); - h.setValue(it.getAttributes().get("name")); - return i; - }) - .collect(Collectors.toList())); - } + if (hostedBy != null) { + parsedObject + .setInstance( + hostedBy + .stream() + .map( + it -> { + final Instance i = new Instance(); + i.setUrl(Collections.singletonList(currentPid.getValue())); + KeyValue h = new KeyValue(); + i.setHostedby(h); + h.setKey(it.getAttributes().get("id")); + h.setValue(it.getAttributes().get("name")); + return i; + }) + .collect(Collectors.toList())); + } - final List authorsNode = - VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='creator']"); - if (authorsNode != null) - parsedObject.setAuthor( - authorsNode.stream() - .map( - a -> { - final Author author = new Author(); - author.setFullname(a); - return author; - }) - .collect(Collectors.toList())); + final List authorsNode = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='creator']"); + if (authorsNode != null) + parsedObject + .setAuthor( + authorsNode + .stream() + .map( + a -> { + final Author author = new Author(); + author.setFullname(a); + return author; + }) + .collect(Collectors.toList())); - final List titles = - VtdUtilityParser.getTextValue(ap, vn, 
"//*[local-name()='title']"); - if (titles != null) { - parsedObject.setTitle( - titles.stream() - .map( - t -> { - final StructuredProperty st = new StructuredProperty(); - st.setValue(t); - return st; - }) - .collect(Collectors.toList())); - } + final List titles = VtdUtilityParser.getTextValue(ap, vn, "//*[local-name()='title']"); + if (titles != null) { + parsedObject + .setTitle( + titles + .stream() + .map( + t -> { + final StructuredProperty st = new StructuredProperty(); + st.setValue(t); + return st; + }) + .collect(Collectors.toList())); + } - Field description = new Field<>(); + Field description = new Field<>(); - description.setValue( - VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='description']")); + description + .setValue( + VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='description']")); - if (StringUtils.isNotBlank(description.getValue()) - && description.getValue().length() > 10000) { - description.setValue(description.getValue().substring(0, 10000)); - } + if (StringUtils.isNotBlank(description.getValue()) + && description.getValue().length() > 10000) { + description.setValue(description.getValue().substring(0, 10000)); + } - parsedObject.setDescription(Collections.singletonList(description)); + parsedObject.setDescription(Collections.singletonList(description)); - final String cd = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='date']"); + final String cd = VtdUtilityParser.getSingleValue(ap, vn, "//*[local-name()='date']"); - StructuredProperty date = new StructuredProperty(); - date.setValue(cd); - final Qualifier dq = new Qualifier(); - dq.setClassname("date"); - dq.setClassid("date"); - dq.setSchemename("dnet::date"); - dq.setSchemeid("dnet::date"); - date.setQualifier(dq); - parsedObject.setRelevantdate(Collections.singletonList(date)); + StructuredProperty date = new StructuredProperty(); + date.setValue(cd); + final Qualifier dq = new Qualifier(); + dq.setClassname("date"); + dq.setClassid("date"); + dq.setSchemename("dnet::date"); + dq.setSchemeid("dnet::date"); + date.setQualifier(dq); + parsedObject.setRelevantdate(Collections.singletonList(date)); - List subjects = - extractSubject( - VtdUtilityParser.getTextValuesWithAttributes( - ap, vn, "//*[local-name()='subject']", Collections.singletonList("scheme"))); - parsedObject.setSubject(subjects); + List subjects = extractSubject( + VtdUtilityParser + .getTextValuesWithAttributes( + ap, vn, "//*[local-name()='subject']", Collections.singletonList("scheme"))); + parsedObject.setSubject(subjects); - parsedObject.setDataInfo(di); + parsedObject.setDataInfo(di); - parsedObject.setSubject(subjects); - Qualifier q = new Qualifier(); - q.setClassname("publication"); - q.setClassid("publication"); - q.setSchemename("publication"); - q.setSchemeid("publication"); - parsedObject.setResulttype(q); - result.add(parsedObject); - return result; + parsedObject.setSubject(subjects); + Qualifier q = new Qualifier(); + q.setClassname("publication"); + q.setClassid("publication"); + q.setSchemename("publication"); + q.setSchemeid("publication"); + parsedObject.setResulttype(q); + result.add(parsedObject); + return result; - } catch (Throwable e) { - log.error("Input record: " + record); - log.error("Error on parsing record ", e); - return null; - } - } + } catch (Throwable e) { + log.error("Input record: " + record); + log.error("Error on parsing record ", e); + return null; + } + } } diff --git 
a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/GraphHiveImporterJobTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/GraphHiveImporterJobTest.java index 06d9d1e8a..e95174670 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/GraphHiveImporterJobTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/GraphHiveImporterJobTest.java @@ -1,10 +1,10 @@ + package eu.dnetlib.dhp.oa.graph; -import eu.dnetlib.dhp.oa.graph.hive.GraphHiveImporterJob; -import eu.dnetlib.dhp.schema.common.ModelSupport; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; + import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.RandomStringUtils; import org.apache.spark.SparkConf; @@ -16,76 +16,82 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.oa.graph.hive.GraphHiveImporterJob; +import eu.dnetlib.dhp.schema.common.ModelSupport; + public class GraphHiveImporterJobTest { - private static final Logger log = LoggerFactory.getLogger(GraphHiveImporterJobTest.class); + private static final Logger log = LoggerFactory.getLogger(GraphHiveImporterJobTest.class); - public static final String JDBC_DERBY_TEMPLATE = - "jdbc:derby:;databaseName=%s/junit_metastore_db;create=true"; + public static final String JDBC_DERBY_TEMPLATE = "jdbc:derby:;databaseName=%s/junit_metastore_db;create=true"; - private static SparkSession spark; + private static SparkSession spark; - private static Path workingDir; + private static Path workingDir; - private static String dbName; + private static String dbName; - @BeforeAll - public static void beforeAll() throws IOException { - workingDir = Files.createTempDirectory(GraphHiveImporterJobTest.class.getSimpleName()); - log.info("using work dir {}", workingDir); + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(GraphHiveImporterJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); - dbName = RandomStringUtils.randomAlphabetic(5); - log.info("using DB name {}", "test_" + dbName); + dbName = RandomStringUtils.randomAlphabetic(5); + log.info("using DB name {}", "test_" + dbName); - SparkConf conf = new SparkConf(); - conf.setAppName(GraphHiveImporterJobTest.class.getSimpleName()); + SparkConf conf = new SparkConf(); + conf.setAppName(GraphHiveImporterJobTest.class.getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - conf.set( - "javax.jdo.option.ConnectionURL", - String.format(JDBC_DERBY_TEMPLATE, workingDir.resolve("warehouse").toString())); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + conf + .set( + "javax.jdo.option.ConnectionURL", + String.format(JDBC_DERBY_TEMPLATE, workingDir.resolve("warehouse").toString())); - spark = - SparkSession.builder() - .appName(GraphHiveImporterJobTest.class.getSimpleName()) - .config(conf) - .enableHiveSupport() - .getOrCreate(); - 
} + spark = SparkSession + .builder() + .appName(GraphHiveImporterJobTest.class.getSimpleName()) + .config(conf) + .enableHiveSupport() + .getOrCreate(); + } - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } - @Test - public void testImportGraphAsHiveDB() throws Exception { + @Test + public void testImportGraphAsHiveDB() throws Exception { - GraphHiveImporterJob.main( - new String[] { - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-inputPath", - getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(), - "-hiveMetastoreUris", - "", - "-hiveDbName", - dbName - }); + GraphHiveImporterJob + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-inputPath", + getClass().getResource("/eu/dnetlib/dhp/oa/graph/sample").getPath(), + "-hiveMetastoreUris", + "", + "-hiveDbName", + dbName + }); - ModelSupport.oafTypes.forEach( - (name, clazz) -> { - long count = spark.read().table(dbName + "." + name).count(); - int expected = name.equals("relation") ? 100 : 10; + ModelSupport.oafTypes + .forEach( + (name, clazz) -> { + long count = spark.read().table(dbName + "." + name).count(); + int expected = name.equals("relation") ? 100 : 10; - Assertions.assertEquals( - expected, count, String.format("%s should be %s", name, expected)); - }); - } + Assertions + .assertEquals( + expected, count, String.format("%s should be %s", name, expected)); + }); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 89740718b..951c97d9d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -6,14 +7,10 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import static org.mockito.ArgumentMatchers.anyString; import static org.mockito.Mockito.when; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Software; import java.io.IOException; import java.util.List; import java.util.Map; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.BeforeEach; @@ -22,124 +19,131 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Software; + @ExtendWith(MockitoExtension.class) public class MappersTest { - @Mock private Map code2name; + @Mock + private Map code2name; - @BeforeEach - public void setUp() throws Exception { - when(code2name.get(anyString())).thenAnswer(invocation -> invocation.getArgument(0)); - } + @BeforeEach + public void setUp() throws Exception { + when(code2name.get(anyString())).thenAnswer(invocation -> invocation.getArgument(0)); + } - @Test - 
void testPublication() throws IOException { - final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml")); + @Test + void testPublication() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf_record.xml")); - final List list = new OafToOafMapper(code2name).processMdRecord(xml); + final List list = new OafToOafMapper(code2name).processMdRecord(xml); - assertEquals(3, list.size()); - assertTrue(list.get(0) instanceof Publication); - assertTrue(list.get(1) instanceof Relation); - assertTrue(list.get(2) instanceof Relation); + assertEquals(3, list.size()); + assertTrue(list.get(0) instanceof Publication); + assertTrue(list.get(1) instanceof Relation); + assertTrue(list.get(2) instanceof Relation); - final Publication p = (Publication) list.get(0); - final Relation r1 = (Relation) list.get(1); - final Relation r2 = (Relation) list.get(2); + final Publication p = (Publication) list.get(0); + final Relation r1 = (Relation) list.get(1); + final Relation r2 = (Relation) list.get(2); - assertValidId(p.getId()); - assertValidId(p.getCollectedfrom().get(0).getKey()); - assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); - assertTrue(p.getAuthor().size() > 0); - assertTrue(p.getSubject().size() > 0); - assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline())); - assertTrue(StringUtils.isNotBlank(p.getJournal().getName())); - assertTrue(p.getInstance().size() > 0); + assertValidId(p.getId()); + assertValidId(p.getCollectedfrom().get(0).getKey()); + assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + assertTrue(p.getAuthor().size() > 0); + assertTrue(p.getSubject().size() > 0); + assertTrue(StringUtils.isNotBlank(p.getJournal().getIssnOnline())); + assertTrue(StringUtils.isNotBlank(p.getJournal().getName())); + assertTrue(p.getInstance().size() > 0); - assertValidId(r1.getSource()); - assertValidId(r1.getTarget()); - assertValidId(r2.getSource()); - assertValidId(r2.getTarget()); - assertValidId(r1.getCollectedfrom().get(0).getKey()); - assertValidId(r2.getCollectedfrom().get(0).getKey()); - assertNotNull(r1.getDataInfo()); - assertNotNull(r2.getDataInfo()); - assertNotNull(r1.getDataInfo().getTrust()); - assertNotNull(r2.getDataInfo().getTrust()); - assertEquals(r1.getSource(), r2.getTarget()); - assertEquals(r2.getSource(), r1.getTarget()); - assertTrue(StringUtils.isNotBlank(r1.getRelClass())); - assertTrue(StringUtils.isNotBlank(r2.getRelClass())); - assertTrue(StringUtils.isNotBlank(r1.getRelType())); - assertTrue(StringUtils.isNotBlank(r2.getRelType())); + assertValidId(r1.getSource()); + assertValidId(r1.getTarget()); + assertValidId(r2.getSource()); + assertValidId(r2.getTarget()); + assertValidId(r1.getCollectedfrom().get(0).getKey()); + assertValidId(r2.getCollectedfrom().get(0).getKey()); + assertNotNull(r1.getDataInfo()); + assertNotNull(r2.getDataInfo()); + assertNotNull(r1.getDataInfo().getTrust()); + assertNotNull(r2.getDataInfo().getTrust()); + assertEquals(r1.getSource(), r2.getTarget()); + assertEquals(r2.getSource(), r1.getTarget()); + assertTrue(StringUtils.isNotBlank(r1.getRelClass())); + assertTrue(StringUtils.isNotBlank(r2.getRelClass())); + assertTrue(StringUtils.isNotBlank(r1.getRelType())); + assertTrue(StringUtils.isNotBlank(r2.getRelType())); - // System.out.println(new ObjectMapper().writeValueAsString(r1)); - // System.out.println(new ObjectMapper().writeValueAsString(r2)); - } + // System.out.println(new ObjectMapper().writeValueAsString(r1)); + // 
System.out.println(new ObjectMapper().writeValueAsString(r2)); + } - @Test - void testDataset() throws IOException { - final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml")); + @Test + void testDataset() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_dataset.xml")); - final List list = new OdfToOafMapper(code2name).processMdRecord(xml); + final List list = new OdfToOafMapper(code2name).processMdRecord(xml); - assertEquals(3, list.size()); - assertTrue(list.get(0) instanceof Dataset); - assertTrue(list.get(1) instanceof Relation); - assertTrue(list.get(2) instanceof Relation); + assertEquals(3, list.size()); + assertTrue(list.get(0) instanceof Dataset); + assertTrue(list.get(1) instanceof Relation); + assertTrue(list.get(2) instanceof Relation); - final Dataset d = (Dataset) list.get(0); - final Relation r1 = (Relation) list.get(1); - final Relation r2 = (Relation) list.get(2); + final Dataset d = (Dataset) list.get(0); + final Relation r1 = (Relation) list.get(1); + final Relation r2 = (Relation) list.get(2); - assertValidId(d.getId()); - assertValidId(d.getCollectedfrom().get(0).getKey()); - assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); - assertTrue(d.getAuthor().size() > 0); - assertTrue(d.getSubject().size() > 0); - assertTrue(d.getInstance().size() > 0); - assertTrue(d.getContext().size() > 0); - assertTrue(d.getContext().get(0).getId().length() > 0); + assertValidId(d.getId()); + assertValidId(d.getCollectedfrom().get(0).getKey()); + assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); + assertTrue(d.getAuthor().size() > 0); + assertTrue(d.getSubject().size() > 0); + assertTrue(d.getInstance().size() > 0); + assertTrue(d.getContext().size() > 0); + assertTrue(d.getContext().get(0).getId().length() > 0); - assertValidId(r1.getSource()); - assertValidId(r1.getTarget()); - assertValidId(r2.getSource()); - assertValidId(r2.getTarget()); - assertNotNull(r1.getDataInfo()); - assertNotNull(r2.getDataInfo()); - assertNotNull(r1.getDataInfo().getTrust()); - assertNotNull(r2.getDataInfo().getTrust()); - assertEquals(r1.getSource(), r2.getTarget()); - assertEquals(r2.getSource(), r1.getTarget()); - assertTrue(StringUtils.isNotBlank(r1.getRelClass())); - assertTrue(StringUtils.isNotBlank(r2.getRelClass())); - assertTrue(StringUtils.isNotBlank(r1.getRelType())); - assertTrue(StringUtils.isNotBlank(r2.getRelType())); - } + assertValidId(r1.getSource()); + assertValidId(r1.getTarget()); + assertValidId(r2.getSource()); + assertValidId(r2.getTarget()); + assertNotNull(r1.getDataInfo()); + assertNotNull(r2.getDataInfo()); + assertNotNull(r1.getDataInfo().getTrust()); + assertNotNull(r2.getDataInfo().getTrust()); + assertEquals(r1.getSource(), r2.getTarget()); + assertEquals(r2.getSource(), r1.getTarget()); + assertTrue(StringUtils.isNotBlank(r1.getRelClass())); + assertTrue(StringUtils.isNotBlank(r2.getRelClass())); + assertTrue(StringUtils.isNotBlank(r1.getRelType())); + assertTrue(StringUtils.isNotBlank(r2.getRelType())); + } - @Test - void testSoftware() throws IOException { - final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml")); + @Test + void testSoftware() throws IOException { + final String xml = IOUtils.toString(getClass().getResourceAsStream("odf_software.xml")); - final List list = new OdfToOafMapper(code2name).processMdRecord(xml); + final List list = new OdfToOafMapper(code2name).processMdRecord(xml); - assertEquals(1, list.size()); - 
assertTrue(list.get(0) instanceof Software); + assertEquals(1, list.size()); + assertTrue(list.get(0) instanceof Software); - final Software s = (Software) list.get(0); + final Software s = (Software) list.get(0); - assertValidId(s.getId()); - assertValidId(s.getCollectedfrom().get(0).getKey()); - assertTrue(StringUtils.isNotBlank(s.getTitle().get(0).getValue())); - assertTrue(s.getAuthor().size() > 0); - assertTrue(s.getSubject().size() > 0); - assertTrue(s.getInstance().size() > 0); - } + assertValidId(s.getId()); + assertValidId(s.getCollectedfrom().get(0).getKey()); + assertTrue(StringUtils.isNotBlank(s.getTitle().get(0).getValue())); + assertTrue(s.getAuthor().size() > 0); + assertTrue(s.getSubject().size() > 0); + assertTrue(s.getInstance().size() > 0); + } - private void assertValidId(final String id) { - assertEquals(49, id.length()); - assertEquals('|', id.charAt(2)); - assertEquals(':', id.charAt(15)); - assertEquals(':', id.charAt(16)); - } + private void assertValidId(final String id) { + assertEquals(49, id.length()); + assertEquals('|', id.charAt(2)); + assertEquals(':', id.charAt(15)); + assertEquals(':', id.charAt(16)); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java index 0d3a273ec..1bbe57ee8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java @@ -1,17 +1,10 @@ + package eu.dnetlib.dhp.oa.graph.raw; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; -import com.fasterxml.jackson.core.type.TypeReference; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.schema.oaf.Datasource; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.Organization; -import eu.dnetlib.dhp.schema.oaf.Project; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Result; import java.io.IOException; import java.sql.Array; import java.sql.Date; @@ -19,6 +12,7 @@ import java.sql.ResultSet; import java.sql.SQLException; import java.util.List; import java.util.Objects; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.BeforeEach; @@ -28,316 +22,332 @@ import org.mockito.Mock; import org.mockito.Mockito; import org.mockito.junit.jupiter.MockitoExtension; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.Project; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Result; + @ExtendWith(MockitoExtension.class) public class MigrateDbEntitiesApplicationTest { - private MigrateDbEntitiesApplication app; + private MigrateDbEntitiesApplication app; - @Mock private ResultSet rs; + @Mock + private ResultSet rs; - @BeforeEach - public void setUp() { - this.app = new MigrateDbEntitiesApplication(); - } + @BeforeEach + public void setUp() { + this.app = new MigrateDbEntitiesApplication(); + } - @Test - public void 
testProcessDatasource() throws Exception { - final List fields = prepareMocks("datasources_resultset_entry.json"); + @Test + public void testProcessDatasource() throws Exception { + final List fields = prepareMocks("datasources_resultset_entry.json"); - final List list = app.processDatasource(rs); - assertEquals(1, list.size()); - verifyMocks(fields); + final List list = app.processDatasource(rs); + assertEquals(1, list.size()); + verifyMocks(fields); - final Datasource ds = (Datasource) list.get(0); - assertValidId(ds.getId()); - assertValidId(ds.getCollectedfrom().get(0).getKey()); - assertEquals(ds.getOfficialname().getValue(), getValueAsString("officialname", fields)); - assertEquals(ds.getEnglishname().getValue(), getValueAsString("englishname", fields)); - assertEquals(ds.getContactemail().getValue(), getValueAsString("contactemail", fields)); - assertEquals(ds.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields)); - assertEquals(ds.getNamespaceprefix().getValue(), getValueAsString("namespaceprefix", fields)); - assertEquals( - ds.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); - } + final Datasource ds = (Datasource) list.get(0); + assertValidId(ds.getId()); + assertValidId(ds.getCollectedfrom().get(0).getKey()); + assertEquals(ds.getOfficialname().getValue(), getValueAsString("officialname", fields)); + assertEquals(ds.getEnglishname().getValue(), getValueAsString("englishname", fields)); + assertEquals(ds.getContactemail().getValue(), getValueAsString("contactemail", fields)); + assertEquals(ds.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields)); + assertEquals(ds.getNamespaceprefix().getValue(), getValueAsString("namespaceprefix", fields)); + assertEquals( + ds.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); + } - @Test - public void testProcessProject() throws Exception { - final List fields = prepareMocks("projects_resultset_entry.json"); + @Test + public void testProcessProject() throws Exception { + final List fields = prepareMocks("projects_resultset_entry.json"); - final List list = app.processProject(rs); - assertEquals(1, list.size()); - verifyMocks(fields); + final List list = app.processProject(rs); + assertEquals(1, list.size()); + verifyMocks(fields); - final Project p = (Project) list.get(0); - assertValidId(p.getId()); - assertValidId(p.getCollectedfrom().get(0).getKey()); - assertEquals(p.getAcronym().getValue(), getValueAsString("acronym", fields)); - assertEquals(p.getTitle().getValue(), getValueAsString("title", fields)); - assertEquals( - p.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); - } + final Project p = (Project) list.get(0); + assertValidId(p.getId()); + assertValidId(p.getCollectedfrom().get(0).getKey()); + assertEquals(p.getAcronym().getValue(), getValueAsString("acronym", fields)); + assertEquals(p.getTitle().getValue(), getValueAsString("title", fields)); + assertEquals( + p.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); + } - @Test - public void testProcessOrganization() throws Exception { - final List fields = prepareMocks("organizations_resultset_entry.json"); + @Test + public void testProcessOrganization() throws Exception { + final List fields = prepareMocks("organizations_resultset_entry.json"); - final List list = app.processOrganization(rs); + final List list = app.processOrganization(rs); - assertEquals(1, list.size()); + assertEquals(1, list.size()); - 
verifyMocks(fields); + verifyMocks(fields); - final Organization o = (Organization) list.get(0); - assertValidId(o.getId()); - assertValidId(o.getCollectedfrom().get(0).getKey()); - assertEquals(o.getLegalshortname().getValue(), getValueAsString("legalshortname", fields)); - assertEquals(o.getLegalname().getValue(), getValueAsString("legalname", fields)); - assertEquals(o.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields)); - assertEquals(o.getCountry().getClassid(), getValueAsString("country", fields).split("@@@")[0]); - assertEquals( - o.getCountry().getClassname(), getValueAsString("country", fields).split("@@@")[1]); - assertEquals(o.getCountry().getSchemeid(), getValueAsString("country", fields).split("@@@")[2]); - assertEquals( - o.getCountry().getSchemename(), getValueAsString("country", fields).split("@@@")[3]); - assertEquals( - o.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); - } + final Organization o = (Organization) list.get(0); + assertValidId(o.getId()); + assertValidId(o.getCollectedfrom().get(0).getKey()); + assertEquals(o.getLegalshortname().getValue(), getValueAsString("legalshortname", fields)); + assertEquals(o.getLegalname().getValue(), getValueAsString("legalname", fields)); + assertEquals(o.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields)); + assertEquals(o.getCountry().getClassid(), getValueAsString("country", fields).split("@@@")[0]); + assertEquals( + o.getCountry().getClassname(), getValueAsString("country", fields).split("@@@")[1]); + assertEquals(o.getCountry().getSchemeid(), getValueAsString("country", fields).split("@@@")[2]); + assertEquals( + o.getCountry().getSchemename(), getValueAsString("country", fields).split("@@@")[3]); + assertEquals( + o.getCollectedfrom().get(0).getValue(), getValueAsString("collectedfromname", fields)); + } - @Test - public void testProcessDatasourceOrganization() throws Exception { - final List fields = prepareMocks("datasourceorganization_resultset_entry.json"); + @Test + public void testProcessDatasourceOrganization() throws Exception { + final List fields = prepareMocks("datasourceorganization_resultset_entry.json"); - final List list = app.processDatasourceOrganization(rs); + final List list = app.processDatasourceOrganization(rs); - assertEquals(2, list.size()); - verifyMocks(fields); + assertEquals(2, list.size()); + verifyMocks(fields); - final Relation r1 = (Relation) list.get(0); - final Relation r2 = (Relation) list.get(1); - assertValidId(r1.getSource()); - assertValidId(r2.getSource()); - assertEquals(r1.getSource(), r2.getTarget()); - assertEquals(r2.getSource(), r1.getTarget()); - } + final Relation r1 = (Relation) list.get(0); + final Relation r2 = (Relation) list.get(1); + assertValidId(r1.getSource()); + assertValidId(r2.getSource()); + assertEquals(r1.getSource(), r2.getTarget()); + assertEquals(r2.getSource(), r1.getTarget()); + } - @Test - public void testProcessProjectOrganization() throws Exception { - final List fields = prepareMocks("projectorganization_resultset_entry.json"); + @Test + public void testProcessProjectOrganization() throws Exception { + final List fields = prepareMocks("projectorganization_resultset_entry.json"); - final List list = app.processProjectOrganization(rs); + final List list = app.processProjectOrganization(rs); - assertEquals(2, list.size()); - verifyMocks(fields); + assertEquals(2, list.size()); + verifyMocks(fields); - final Relation r1 = (Relation) list.get(0); - final Relation r2 = (Relation) 
list.get(1); - assertValidId(r1.getSource()); - assertValidId(r2.getSource()); - assertEquals(r1.getSource(), r2.getTarget()); - assertEquals(r2.getSource(), r1.getTarget()); - assertValidId(r1.getCollectedfrom().get(0).getKey()); - assertValidId(r2.getCollectedfrom().get(0).getKey()); - } + final Relation r1 = (Relation) list.get(0); + final Relation r2 = (Relation) list.get(1); + assertValidId(r1.getSource()); + assertValidId(r2.getSource()); + assertEquals(r1.getSource(), r2.getTarget()); + assertEquals(r2.getSource(), r1.getTarget()); + assertValidId(r1.getCollectedfrom().get(0).getKey()); + assertValidId(r2.getCollectedfrom().get(0).getKey()); + } - @Test - public void testProcessClaims_context() throws Exception { - final List fields = prepareMocks("claimscontext_resultset_entry.json"); + @Test + public void testProcessClaims_context() throws Exception { + final List fields = prepareMocks("claimscontext_resultset_entry.json"); - final List list = app.processClaims(rs); + final List list = app.processClaims(rs); - assertEquals(1, list.size()); - assertTrue(list.get(0) instanceof Result); - final Result r = (Result) list.get(0); + assertEquals(1, list.size()); + assertTrue(list.get(0) instanceof Result); + final Result r = (Result) list.get(0); - verifyMocks(fields); + verifyMocks(fields); - assertValidId(r.getCollectedfrom().get(0).getKey()); - } + assertValidId(r.getCollectedfrom().get(0).getKey()); + } - @Test - public void testProcessClaims_rels() throws Exception { - final List fields = prepareMocks("claimsrel_resultset_entry.json"); + @Test + public void testProcessClaims_rels() throws Exception { + final List fields = prepareMocks("claimsrel_resultset_entry.json"); - final List list = app.processClaims(rs); + final List list = app.processClaims(rs); - assertEquals(2, list.size()); - verifyMocks(fields); + assertEquals(2, list.size()); + verifyMocks(fields); - assertTrue(list.get(0) instanceof Relation); - assertTrue(list.get(1) instanceof Relation); + assertTrue(list.get(0) instanceof Relation); + assertTrue(list.get(1) instanceof Relation); - final Relation r1 = (Relation) list.get(0); - final Relation r2 = (Relation) list.get(1); + final Relation r1 = (Relation) list.get(0); + final Relation r2 = (Relation) list.get(1); - assertValidId(r1.getSource()); - assertValidId(r1.getTarget()); - assertValidId(r2.getSource()); - assertValidId(r2.getTarget()); - assertNotNull(r1.getDataInfo()); - assertNotNull(r2.getDataInfo()); - assertNotNull(r1.getDataInfo().getTrust()); - assertNotNull(r2.getDataInfo().getTrust()); - assertEquals(r1.getSource(), r2.getTarget()); - assertEquals(r2.getSource(), r1.getTarget()); - assertTrue(StringUtils.isNotBlank(r1.getRelClass())); - assertTrue(StringUtils.isNotBlank(r2.getRelClass())); - assertTrue(StringUtils.isNotBlank(r1.getRelType())); - assertTrue(StringUtils.isNotBlank(r2.getRelType())); + assertValidId(r1.getSource()); + assertValidId(r1.getTarget()); + assertValidId(r2.getSource()); + assertValidId(r2.getTarget()); + assertNotNull(r1.getDataInfo()); + assertNotNull(r2.getDataInfo()); + assertNotNull(r1.getDataInfo().getTrust()); + assertNotNull(r2.getDataInfo().getTrust()); + assertEquals(r1.getSource(), r2.getTarget()); + assertEquals(r2.getSource(), r1.getTarget()); + assertTrue(StringUtils.isNotBlank(r1.getRelClass())); + assertTrue(StringUtils.isNotBlank(r2.getRelClass())); + assertTrue(StringUtils.isNotBlank(r1.getRelType())); + assertTrue(StringUtils.isNotBlank(r2.getRelType())); - assertValidId(r1.getCollectedfrom().get(0).getKey()); - 
assertValidId(r2.getCollectedfrom().get(0).getKey()); + assertValidId(r1.getCollectedfrom().get(0).getKey()); + assertValidId(r2.getCollectedfrom().get(0).getKey()); - // System.out.println(new ObjectMapper().writeValueAsString(r1)); - // System.out.println(new ObjectMapper().writeValueAsString(r2)); - } + // System.out.println(new ObjectMapper().writeValueAsString(r1)); + // System.out.println(new ObjectMapper().writeValueAsString(r2)); + } - private List prepareMocks(final String jsonFile) throws IOException, SQLException { - final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile)); - final ObjectMapper mapper = new ObjectMapper(); - final List list = mapper.readValue(json, new TypeReference>() {}); + private List prepareMocks(final String jsonFile) throws IOException, SQLException { + final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile)); + final ObjectMapper mapper = new ObjectMapper(); + final List list = mapper.readValue(json, new TypeReference>() { + }); - for (final TypedField tf : list) { - if (tf.getValue() == null) { - switch (tf.getType()) { - case "not_used": - break; - case "boolean": - Mockito.when(rs.getBoolean(tf.getField())).thenReturn(false); - break; - case "date": - Mockito.when(rs.getDate(tf.getField())).thenReturn(null); - break; - case "int": - Mockito.when(rs.getInt(tf.getField())).thenReturn(0); - break; - case "double": - Mockito.when(rs.getDouble(tf.getField())).thenReturn(0.0); - break; - case "array": - Mockito.when(rs.getArray(tf.getField())).thenReturn(null); - break; - case "string": - default: - Mockito.when(rs.getString(tf.getField())).thenReturn(null); - break; - } - } else { - switch (tf.getType()) { - case "not_used": - break; - case "boolean": - Mockito.when(rs.getBoolean(tf.getField())) - .thenReturn(Boolean.parseBoolean(tf.getValue().toString())); - break; - case "date": - Mockito.when(rs.getDate(tf.getField())) - .thenReturn(Date.valueOf(tf.getValue().toString())); - break; - case "int": - Mockito.when(rs.getInt(tf.getField())) - .thenReturn(new Integer(tf.getValue().toString())); - break; - case "double": - Mockito.when(rs.getDouble(tf.getField())) - .thenReturn(new Double(tf.getValue().toString())); - break; - case "array": - final Array arr = Mockito.mock(Array.class); - final String[] values = - ((List) tf.getValue()) - .stream() - .filter(Objects::nonNull) - .map(o -> o.toString()) - .toArray(String[]::new); + for (final TypedField tf : list) { + if (tf.getValue() == null) { + switch (tf.getType()) { + case "not_used": + break; + case "boolean": + Mockito.when(rs.getBoolean(tf.getField())).thenReturn(false); + break; + case "date": + Mockito.when(rs.getDate(tf.getField())).thenReturn(null); + break; + case "int": + Mockito.when(rs.getInt(tf.getField())).thenReturn(0); + break; + case "double": + Mockito.when(rs.getDouble(tf.getField())).thenReturn(0.0); + break; + case "array": + Mockito.when(rs.getArray(tf.getField())).thenReturn(null); + break; + case "string": + default: + Mockito.when(rs.getString(tf.getField())).thenReturn(null); + break; + } + } else { + switch (tf.getType()) { + case "not_used": + break; + case "boolean": + Mockito + .when(rs.getBoolean(tf.getField())) + .thenReturn(Boolean.parseBoolean(tf.getValue().toString())); + break; + case "date": + Mockito + .when(rs.getDate(tf.getField())) + .thenReturn(Date.valueOf(tf.getValue().toString())); + break; + case "int": + Mockito + .when(rs.getInt(tf.getField())) + .thenReturn(new Integer(tf.getValue().toString())); + break; + case 
"double": + Mockito + .when(rs.getDouble(tf.getField())) + .thenReturn(new Double(tf.getValue().toString())); + break; + case "array": + final Array arr = Mockito.mock(Array.class); + final String[] values = ((List) tf.getValue()) + .stream() + .filter(Objects::nonNull) + .map(o -> o.toString()) + .toArray(String[]::new); - Mockito.when(arr.getArray()).thenReturn(values); - Mockito.when(rs.getArray(tf.getField())).thenReturn(arr); - break; - case "string": - default: - Mockito.when(rs.getString(tf.getField())).thenReturn(tf.getValue().toString()); - break; - } - } - } + Mockito.when(arr.getArray()).thenReturn(values); + Mockito.when(rs.getArray(tf.getField())).thenReturn(arr); + break; + case "string": + default: + Mockito.when(rs.getString(tf.getField())).thenReturn(tf.getValue().toString()); + break; + } + } + } - return list; - } + return list; + } - private void verifyMocks(final List list) throws SQLException { - for (final TypedField tf : list) { + private void verifyMocks(final List list) throws SQLException { + for (final TypedField tf : list) { - switch (tf.getType()) { - case "not_used": - break; - case "boolean": - Mockito.verify(rs, Mockito.atLeastOnce()).getBoolean(tf.getField()); - break; - case "date": - Mockito.verify(rs, Mockito.atLeastOnce()).getDate(tf.getField()); - break; - case "int": - Mockito.verify(rs, Mockito.atLeastOnce()).getInt(tf.getField()); - break; - case "double": - Mockito.verify(rs, Mockito.atLeastOnce()).getDouble(tf.getField()); - break; - case "array": - Mockito.verify(rs, Mockito.atLeastOnce()).getArray(tf.getField()); - break; - case "string": - default: - Mockito.verify(rs, Mockito.atLeastOnce()).getString(tf.getField()); - break; - } - } - } + switch (tf.getType()) { + case "not_used": + break; + case "boolean": + Mockito.verify(rs, Mockito.atLeastOnce()).getBoolean(tf.getField()); + break; + case "date": + Mockito.verify(rs, Mockito.atLeastOnce()).getDate(tf.getField()); + break; + case "int": + Mockito.verify(rs, Mockito.atLeastOnce()).getInt(tf.getField()); + break; + case "double": + Mockito.verify(rs, Mockito.atLeastOnce()).getDouble(tf.getField()); + break; + case "array": + Mockito.verify(rs, Mockito.atLeastOnce()).getArray(tf.getField()); + break; + case "string": + default: + Mockito.verify(rs, Mockito.atLeastOnce()).getString(tf.getField()); + break; + } + } + } - private void assertValidId(final String id) { - assertEquals(49, id.length()); - assertEquals('|', id.charAt(2)); - assertEquals(':', id.charAt(15)); - assertEquals(':', id.charAt(16)); - } + private void assertValidId(final String id) { + assertEquals(49, id.length()); + assertEquals('|', id.charAt(2)); + assertEquals(':', id.charAt(15)); + assertEquals(':', id.charAt(16)); + } - private String getValueAsString(final String name, final List fields) { - return fields.stream() - .filter(f -> f.getField().equals(name)) - .map(TypedField::getValue) - .filter(Objects::nonNull) - .map(o -> o.toString()) - .findFirst() - .get(); - } + private String getValueAsString(final String name, final List fields) { + return fields + .stream() + .filter(f -> f.getField().equals(name)) + .map(TypedField::getValue) + .filter(Objects::nonNull) + .map(o -> o.toString()) + .findFirst() + .get(); + } } class TypedField { - private String field; - private String type; - private Object value; + private String field; + private String type; + private Object value; - public String getField() { - return field; - } + public String getField() { + return field; + } - public void setField(final String field) { 
- this.field = field; - } + public void setField(final String field) { + this.field = field; + } - public String getType() { - return type; - } + public String getType() { + return type; + } - public void setType(final String type) { - this.type = type; - } + public void setType(final String type) { + this.type = type; + } - public Object getValue() { - return value; - } + public Object getValue() { + return value; + } - public void setValue(final Object value) { - this.value = value; - } + public void setValue(final Object value) { + this.value = value; + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java index f5ba4af55..d418da594 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/ScholexplorerParserTest.java @@ -1,35 +1,40 @@ + package eu.dnetlib.dhp.sx.graph; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.Test; + import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializationFeature; + import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.sx.graph.parser.DatasetScholexplorerParser; import eu.dnetlib.scholexplorer.relation.RelationMapper; -import java.util.List; -import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; public class ScholexplorerParserTest { - @Test - public void testDataciteParser() throws Exception { - String xml = IOUtils.toString(this.getClass().getResourceAsStream("dmf.xml")); + @Test + public void testDataciteParser() throws Exception { + String xml = IOUtils.toString(this.getClass().getResourceAsStream("dmf.xml")); - DatasetScholexplorerParser p = new DatasetScholexplorerParser(); - List oaves = p.parseObject(xml, RelationMapper.load()); + DatasetScholexplorerParser p = new DatasetScholexplorerParser(); + List oaves = p.parseObject(xml, RelationMapper.load()); - ObjectMapper m = new ObjectMapper(); - m.enable(SerializationFeature.INDENT_OUTPUT); + ObjectMapper m = new ObjectMapper(); + m.enable(SerializationFeature.INDENT_OUTPUT); - oaves.forEach( - oaf -> { - try { - System.out.println(m.writeValueAsString(oaf)); - System.out.println("----------------------------"); - } catch (JsonProcessingException e) { + oaves + .forEach( + oaf -> { + try { + System.out.println(m.writeValueAsString(oaf)); + System.out.println("----------------------------"); + } catch (JsonProcessingException e) { - } - }); - } + } + }); + } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java index 7f32de318..ed3b6efdc 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerGraphImporterTest.java @@ -1,3 +1,5 @@ + package eu.dnetlib.dhp.sx.graph; -public class SparkScholexplorerGraphImporterTest {} +public class SparkScholexplorerGraphImporterTest { +} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerMergeEntitiesJobTest.java 
b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerMergeEntitiesJobTest.java index af6385803..348a2b030 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerMergeEntitiesJobTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/sx/graph/SparkScholexplorerMergeEntitiesJobTest.java @@ -1,3 +1,5 @@ + package eu.dnetlib.dhp.sx.graph; -public class SparkScholexplorerMergeEntitiesJobTest {} +public class SparkScholexplorerMergeEntitiesJobTest { +} diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/ProvisionUtil.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/ProvisionUtil.java index f9756c88b..1b0cb4d05 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/ProvisionUtil.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/ProvisionUtil.java @@ -1,46 +1,48 @@ + package eu.dnetlib.dhp.provision; +import org.apache.commons.lang3.StringUtils; + import eu.dnetlib.dhp.provision.scholix.summary.Typology; import eu.dnetlib.dhp.utils.DHPUtils; -import org.apache.commons.lang3.StringUtils; public class ProvisionUtil { - public static final String deletedByInferenceJPATH = "$.dataInfo.deletedbyinference"; - public static final String TARGETJSONPATH = "$.target"; - public static final String SOURCEJSONPATH = "$.source"; + public static final String deletedByInferenceJPATH = "$.dataInfo.deletedbyinference"; + public static final String TARGETJSONPATH = "$.target"; + public static final String SOURCEJSONPATH = "$.source"; - // public static RelatedItemInfo getItemType(final String item, final String idPath) { - // String targetId = DHPUtils.getJPathString(idPath, item); - // switch (StringUtils.substringBefore(targetId, "|")) { - // case "50": - // return new RelatedItemInfo(null,0,1,0); - // case "60": - // return new RelatedItemInfo(null,1,0,0); - // case "70": - // return new RelatedItemInfo(null,0,0,1); - // default: - // throw new RuntimeException("Unknonw target ID"); - // - // } - // - // } + // public static RelatedItemInfo getItemType(final String item, final String idPath) { + // String targetId = DHPUtils.getJPathString(idPath, item); + // switch (StringUtils.substringBefore(targetId, "|")) { + // case "50": + // return new RelatedItemInfo(null,0,1,0); + // case "60": + // return new RelatedItemInfo(null,1,0,0); + // case "70": + // return new RelatedItemInfo(null,0,0,1); + // default: + // throw new RuntimeException("Unknonw target ID"); + // + // } + // + // } - public static Boolean isNotDeleted(final String item) { - return !"true".equalsIgnoreCase(DHPUtils.getJPathString(deletedByInferenceJPATH, item)); - } + public static Boolean isNotDeleted(final String item) { + return !"true".equalsIgnoreCase(DHPUtils.getJPathString(deletedByInferenceJPATH, item)); + } - public static Typology getItemTypeFromId(String id) { + public static Typology getItemTypeFromId(String id) { - switch (StringUtils.substringBefore(id, "|")) { - case "50": - return Typology.publication; - case "60": - return Typology.dataset; - case "70": - return Typology.unknown; - default: - throw new RuntimeException("Unknonw ID type"); - } - } + switch (StringUtils.substringBefore(id, "|")) { + case "50": + return Typology.publication; + case "60": + return Typology.dataset; + case "70": + return Typology.unknown; + default: + throw new 
RuntimeException("Unknonw ID type"); + } + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/RelatedItemInfo.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/RelatedItemInfo.java index 7e322ce06..28826612d 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/RelatedItemInfo.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/RelatedItemInfo.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.provision; import java.io.Serializable; @@ -5,53 +6,54 @@ import java.io.Serializable; /** This class models the information of related items */ public class RelatedItemInfo implements Serializable { - private String source; + private String source; - private long relatedDataset = 0; + private long relatedDataset = 0; - private long relatedPublication = 0; + private long relatedPublication = 0; - private long relatedUnknown = 0; + private long relatedUnknown = 0; - public RelatedItemInfo() {} + public RelatedItemInfo() { + } - public RelatedItemInfo( - String source, long relatedDataset, long relatedPublication, long relatedUnknown) { - this.source = source; - this.relatedDataset = relatedDataset; - this.relatedPublication = relatedPublication; - this.relatedUnknown = relatedUnknown; - } + public RelatedItemInfo( + String source, long relatedDataset, long relatedPublication, long relatedUnknown) { + this.source = source; + this.relatedDataset = relatedDataset; + this.relatedPublication = relatedPublication; + this.relatedUnknown = relatedUnknown; + } - public String getSource() { - return source; - } + public String getSource() { + return source; + } - public void setSource(String source) { - this.source = source; - } + public void setSource(String source) { + this.source = source; + } - public long getRelatedDataset() { - return relatedDataset; - } + public long getRelatedDataset() { + return relatedDataset; + } - public void setRelatedDataset(long relatedDataset) { - this.relatedDataset = relatedDataset; - } + public void setRelatedDataset(long relatedDataset) { + this.relatedDataset = relatedDataset; + } - public long getRelatedPublication() { - return relatedPublication; - } + public long getRelatedPublication() { + return relatedPublication; + } - public void setRelatedPublication(long relatedPublication) { - this.relatedPublication = relatedPublication; - } + public void setRelatedPublication(long relatedPublication) { + this.relatedPublication = relatedPublication; + } - public long getRelatedUnknown() { - return relatedUnknown; - } + public long getRelatedUnknown() { + return relatedUnknown; + } - public void setRelatedUnknown(int relatedUnknown) { - this.relatedUnknown = relatedUnknown; - } + public void setRelatedUnknown(int relatedUnknown) { + this.relatedUnknown = relatedUnknown; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkExtractRelationCount.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkExtractRelationCount.java index 14ffb32e5..df167f104 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkExtractRelationCount.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkExtractRelationCount.java @@ -1,32 +1,34 @@ + package eu.dnetlib.dhp.provision; -import 
eu.dnetlib.dhp.application.ArgumentApplicationParser; import org.apache.commons.io.IOUtils; import org.apache.spark.sql.*; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + /** - * SparkExtractRelationCount is a spark job that takes in input relation RDD and retrieve for each - * item in relation which are the number of - Related Dataset - Related Publication - Related - * Unknown + * SparkExtractRelationCount is a spark job that takes in input relation RDD and retrieve for each item in relation + * which are the number of - Related Dataset - Related Publication - Related Unknown */ public class SparkExtractRelationCount { - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkExtractRelationCount.class.getResourceAsStream( - "/eu/dnetlib/dhp/provision/input_related_entities_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .appName(SparkExtractRelationCount.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkExtractRelationCount.class + .getResourceAsStream( + "/eu/dnetlib/dhp/provision/input_related_entities_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkExtractRelationCount.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); - final String workingDirPath = parser.get("workingDirPath"); + final String workingDirPath = parser.get("workingDirPath"); - final String relationPath = parser.get("relationPath"); - DatasetJoiner.startJoin(spark, relationPath, workingDirPath + "/relatedItemCount"); - } + final String relationPath = parser.get("relationPath"); + DatasetJoiner.startJoin(spark, relationPath, workingDirPath + "/relatedItemCount"); + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateScholix.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateScholix.java index 327bad94e..f9f3a58ce 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateScholix.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateScholix.java @@ -1,10 +1,6 @@ + package eu.dnetlib.dhp.provision; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.provision.scholix.*; -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; -import eu.dnetlib.dhp.schema.oaf.Relation; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -14,91 +10,100 @@ import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.provision.scholix.*; +import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; +import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; public class SparkGenerateScholix { - public static void main(String[] args) throws Exception { - 
final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkGenerateScholix.class.getResourceAsStream( - "/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json"))); - parser.parseArgument(args); - SparkConf conf = new SparkConf(); - conf.set("spark.sql.shuffle.partitions", "4000"); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - final SparkSession spark = - SparkSession.builder() - .config(conf) - .appName(SparkExtractRelationCount.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkGenerateScholix.class + .getResourceAsStream( + "/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json"))); + parser.parseArgument(args); + SparkConf conf = new SparkConf(); + conf.set("spark.sql.shuffle.partitions", "4000"); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + final SparkSession spark = SparkSession + .builder() + .config(conf) + .appName(SparkExtractRelationCount.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); - conf.registerKryoClasses( - new Class[] { - Scholix.class, ScholixCollectedFrom.class, ScholixEntityId.class, - ScholixIdentifier.class, ScholixRelationship.class, ScholixResource.class - }); + conf + .registerKryoClasses( + new Class[] { + Scholix.class, ScholixCollectedFrom.class, ScholixEntityId.class, + ScholixIdentifier.class, ScholixRelationship.class, ScholixResource.class + }); - final String graphPath = parser.get("graphPath"); - final String workingDirPath = parser.get("workingDirPath"); + final String graphPath = parser.get("graphPath"); + final String workingDirPath = parser.get("workingDirPath"); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final Dataset scholixSummary = - spark.read().load(workingDirPath + "/summary").as(Encoders.bean(ScholixSummary.class)); - final Dataset rels = - spark.read().load(graphPath + "/relation").as(Encoders.bean(Relation.class)); + final Dataset scholixSummary = spark + .read() + .load(workingDirPath + "/summary") + .as(Encoders.bean(ScholixSummary.class)); + final Dataset rels = spark.read().load(graphPath + "/relation").as(Encoders.bean(Relation.class)); - Dataset firstJoin = - scholixSummary - .joinWith(rels, scholixSummary.col("id").equalTo(rels.col("source"))) - .map( - (MapFunction, Scholix>) - f -> Scholix.generateScholixWithSource(f._1(), f._2()), - Encoders.bean(Scholix.class)); + Dataset firstJoin = scholixSummary + .joinWith(rels, scholixSummary.col("id").equalTo(rels.col("source"))) + .map( + (MapFunction, Scholix>) f -> Scholix + .generateScholixWithSource(f._1(), f._2()), + Encoders.bean(Scholix.class)); - firstJoin.write().mode(SaveMode.Overwrite).save(workingDirPath + "/scholix_1"); + firstJoin.write().mode(SaveMode.Overwrite).save(workingDirPath + "/scholix_1"); - Dataset scholix_final = - spark.read().load(workingDirPath + "/scholix_1").as(Encoders.bean(Scholix.class)); + Dataset scholix_final = spark + .read() + .load(workingDirPath + "/scholix_1") + .as(Encoders.bean(Scholix.class)); - scholixSummary - .map( - (MapFunction) ScholixResource::fromSummary, - Encoders.bean(ScholixResource.class)) - .repartition(1000) - .write() - .mode(SaveMode.Overwrite) - .save(workingDirPath + 
"/scholix_target"); + scholixSummary + .map( + (MapFunction) ScholixResource::fromSummary, + Encoders.bean(ScholixResource.class)) + .repartition(1000) + .write() + .mode(SaveMode.Overwrite) + .save(workingDirPath + "/scholix_target"); - Dataset target = - spark - .read() - .load(workingDirPath + "/scholix_target") - .as(Encoders.bean(ScholixResource.class)); + Dataset target = spark + .read() + .load(workingDirPath + "/scholix_target") + .as(Encoders.bean(ScholixResource.class)); - scholix_final - .joinWith( - target, scholix_final.col("identifier").equalTo(target.col("dnetIdentifier")), "inner") - .map( - (MapFunction, Scholix>) - f -> { - final Scholix scholix = f._1(); - final ScholixResource scholixTarget = f._2(); - scholix.setTarget(scholixTarget); - scholix.generateIdentifier(); - scholix.generatelinkPublisher(); - return scholix; - }, - Encoders.kryo(Scholix.class)) - .javaRDD() - .map( - s -> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(s); - }) - .saveAsTextFile(workingDirPath + "/scholix_json", GzipCodec.class); - } + scholix_final + .joinWith( + target, scholix_final.col("identifier").equalTo(target.col("dnetIdentifier")), "inner") + .map( + (MapFunction, Scholix>) f -> { + final Scholix scholix = f._1(); + final ScholixResource scholixTarget = f._2(); + scholix.setTarget(scholixTarget); + scholix.generateIdentifier(); + scholix.generatelinkPublisher(); + return scholix; + }, + Encoders.kryo(Scholix.class)) + .javaRDD() + .map( + s -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(s); + }) + .saveAsTextFile(workingDirPath + "/scholix_json", GzipCodec.class); + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateSummary.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateSummary.java index a4a19e833..04bde1099 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateSummary.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkGenerateSummary.java @@ -1,104 +1,106 @@ + package eu.dnetlib.dhp.provision; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; -import eu.dnetlib.dhp.utils.DHPUtils; import org.apache.commons.io.IOUtils; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; +import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; public class SparkGenerateSummary { - private static final String jsonIDPath = "$.id"; + private static final String jsonIDPath = "$.id"; - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkGenerateSummary.class.getResourceAsStream( - "/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = - SparkSession.builder() - .appName(SparkExtractRelationCount.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); + public static void main(String[] args) throws Exception { + final 
ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkGenerateSummary.class + .getResourceAsStream( + "/eu/dnetlib/dhp/provision/input_generate_summary_parameters.json"))); + parser.parseArgument(args); + final SparkSession spark = SparkSession + .builder() + .appName(SparkExtractRelationCount.class.getSimpleName()) + .master(parser.get("master")) + .getOrCreate(); - final String graphPath = parser.get("graphPath"); - final String workingDirPath = parser.get("workingDirPath"); + final String graphPath = parser.get("graphPath"); + final String workingDirPath = parser.get("workingDirPath"); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - Dataset rInfo = - spark - .read() - .load(workingDirPath + "/relatedItemCount") - .as(Encoders.bean(RelatedItemInfo.class)); + Dataset rInfo = spark + .read() + .load(workingDirPath + "/relatedItemCount") + .as(Encoders.bean(RelatedItemInfo.class)); - Dataset entity = - spark.createDataset( - sc.textFile( - graphPath + "/publication," + graphPath + "/dataset," + graphPath + "/unknown") - .map( - s -> - ScholixSummary.fromJsonOAF( - ProvisionUtil.getItemTypeFromId(DHPUtils.getJPathString(jsonIDPath, s)), - s)) - .rdd(), - Encoders.bean(ScholixSummary.class)); + Dataset entity = spark + .createDataset( + sc + .textFile( + graphPath + "/publication," + graphPath + "/dataset," + graphPath + "/unknown") + .map( + s -> ScholixSummary + .fromJsonOAF( + ProvisionUtil.getItemTypeFromId(DHPUtils.getJPathString(jsonIDPath, s)), + s)) + .rdd(), + Encoders.bean(ScholixSummary.class)); - Dataset summaryComplete = - rInfo - .joinWith(entity, rInfo.col("source").equalTo(entity.col("id"))) - .map( - (MapFunction, ScholixSummary>) - t -> { - ScholixSummary scholixSummary = t._2(); - RelatedItemInfo relatedItemInfo = t._1(); - scholixSummary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); - scholixSummary.setRelatedPublications( - relatedItemInfo.getRelatedPublication()); - scholixSummary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); - return scholixSummary; - }, - Encoders.bean(ScholixSummary.class)); + Dataset summaryComplete = rInfo + .joinWith(entity, rInfo.col("source").equalTo(entity.col("id"))) + .map( + (MapFunction, ScholixSummary>) t -> { + ScholixSummary scholixSummary = t._2(); + RelatedItemInfo relatedItemInfo = t._1(); + scholixSummary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); + scholixSummary + .setRelatedPublications( + relatedItemInfo.getRelatedPublication()); + scholixSummary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); + return scholixSummary; + }, + Encoders.bean(ScholixSummary.class)); - summaryComplete.write().save(workingDirPath + "/summary"); + summaryComplete.write().save(workingDirPath + "/summary"); - // JavaPairRDD relationCount = - // sc.textFile(workingDirPath+"/relatedItemCount").mapToPair((PairFunction) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)); - // - // JavaPairRDD entities = - // sc.textFile(graphPath + "/publication") - // .filter(ProvisionUtil::isNotDeleted) - // .mapToPair((PairFunction) i -> new - // Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) - // .union( - // sc.textFile(graphPath + "/dataset") - // .filter(ProvisionUtil::isNotDeleted) - // .mapToPair((PairFunction) - // i -> - // new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) - // ) - // .union( - // sc.textFile(graphPath + "/unknown") - // 
.filter(ProvisionUtil::isNotDeleted) - // .mapToPair((PairFunction) - // i -> - // new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) - // ); - // entities.join(relationCount).map((Function>, - // String>) k -> - // ScholixSummary.fromJsonOAF(ProvisionUtil.getItemTypeFromId(k._1()), - // k._2()._1(), k._2()._2())).saveAsTextFile(workingDirPath+"/summary", GzipCodec.class); - // - // - // ; + // JavaPairRDD relationCount = + // sc.textFile(workingDirPath+"/relatedItemCount").mapToPair((PairFunction) i -> new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)); + // + // JavaPairRDD entities = + // sc.textFile(graphPath + "/publication") + // .filter(ProvisionUtil::isNotDeleted) + // .mapToPair((PairFunction) i -> new + // Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) + // .union( + // sc.textFile(graphPath + "/dataset") + // .filter(ProvisionUtil::isNotDeleted) + // .mapToPair((PairFunction) + // i -> + // new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) + // ) + // .union( + // sc.textFile(graphPath + "/unknown") + // .filter(ProvisionUtil::isNotDeleted) + // .mapToPair((PairFunction) + // i -> + // new Tuple2<>(DHPUtils.getJPathString(jsonIDPath, i), i)) + // ); + // entities.join(relationCount).map((Function>, + // String>) k -> + // ScholixSummary.fromJsonOAF(ProvisionUtil.getItemTypeFromId(k._1()), + // k._2()._1(), k._2()._2())).saveAsTextFile(workingDirPath+"/summary", GzipCodec.class); + // + // + // ; - } + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java index bafdaa5d7..e79dad8d3 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/SparkIndexCollectionOnES.java @@ -1,10 +1,9 @@ + package eu.dnetlib.dhp.provision; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; import java.util.HashMap; import java.util.Map; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -14,58 +13,62 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.elasticsearch.spark.rdd.api.java.JavaEsSpark; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; + public class SparkIndexCollectionOnES { - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkIndexCollectionOnES.class.getResourceAsStream( - "/eu/dnetlib/dhp/provision/index_on_es.json"))); - parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkIndexCollectionOnES.class + .getResourceAsStream( + "/eu/dnetlib/dhp/provision/index_on_es.json"))); + parser.parseArgument(args); - SparkConf conf = - new SparkConf() - .setAppName(SparkIndexCollectionOnES.class.getSimpleName()) - .setMaster(parser.get("master")); + SparkConf conf = new SparkConf() + 
.setAppName(SparkIndexCollectionOnES.class.getSimpleName()) + .setMaster(parser.get("master")); - conf.set("spark.sql.shuffle.partitions", "4000"); + conf.set("spark.sql.shuffle.partitions", "4000"); - final String sourcePath = parser.get("sourcePath"); - final String index = parser.get("index"); - final String idPath = parser.get("idPath"); - final String type = parser.get("type"); + final String sourcePath = parser.get("sourcePath"); + final String index = parser.get("index"); + final String idPath = parser.get("idPath"); + final String type = parser.get("type"); - final SparkSession spark = SparkSession.builder().config(conf).getOrCreate(); + final SparkSession spark = SparkSession.builder().config(conf).getOrCreate(); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - JavaRDD inputRdd; + JavaRDD inputRdd; - if ("summary".equalsIgnoreCase(type)) - inputRdd = - spark - .read() - .load(sourcePath) - .as(Encoders.bean(ScholixSummary.class)) - .map( - (MapFunction) - f -> { - final ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(f); - }, - Encoders.STRING()) - .javaRDD(); - else inputRdd = sc.textFile(sourcePath); + if ("summary".equalsIgnoreCase(type)) + inputRdd = spark + .read() + .load(sourcePath) + .as(Encoders.bean(ScholixSummary.class)) + .map( + (MapFunction) f -> { + final ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(f); + }, + Encoders.STRING()) + .javaRDD(); + else + inputRdd = sc.textFile(sourcePath); - Map esCfg = new HashMap<>(); - esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54"); - esCfg.put("es.mapping.id", idPath); - esCfg.put("es.batch.write.retry.count", "8"); - esCfg.put("es.batch.write.retry.wait", "60s"); - esCfg.put("es.batch.size.entries", "200"); - esCfg.put("es.nodes.wan.only", "true"); - JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg); - } + Map esCfg = new HashMap<>(); + esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54"); + esCfg.put("es.mapping.id", idPath); + esCfg.put("es.batch.write.retry.count", "8"); + esCfg.put("es.batch.write.retry.wait", "60s"); + esCfg.put("es.batch.size.entries", "200"); + esCfg.put("es.nodes.wan.only", "true"); + JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg); + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java index 3130d8b98..d71415513 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/Scholix.java @@ -1,184 +1,200 @@ + package eu.dnetlib.dhp.provision.scholix; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.utils.DHPUtils; import java.io.Serializable; import java.util.*; import java.util.stream.Collectors; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.utils.DHPUtils; + public class Scholix implements Serializable { - private String publicationDate; + private String publicationDate; - private List 
publisher; + private List publisher; - private List linkprovider; + private List linkprovider; - private ScholixRelationship relationship; + private ScholixRelationship relationship; - private ScholixResource source; + private ScholixResource source; - private ScholixResource target; + private ScholixResource target; - private String identifier; + private String identifier; - public Scholix clone(final ScholixResource t) { - final Scholix clone = new Scholix(); - clone.setPublicationDate(publicationDate); - clone.setPublisher(publisher); - clone.setLinkprovider(linkprovider); - clone.setRelationship(relationship); - clone.setSource(source); - clone.setTarget(t); - clone.generatelinkPublisher(); - clone.generateIdentifier(); - return clone; - } + public Scholix clone(final ScholixResource t) { + final Scholix clone = new Scholix(); + clone.setPublicationDate(publicationDate); + clone.setPublisher(publisher); + clone.setLinkprovider(linkprovider); + clone.setRelationship(relationship); + clone.setSource(source); + clone.setTarget(t); + clone.generatelinkPublisher(); + clone.generateIdentifier(); + return clone; + } - public static Scholix generateScholixWithSource( - final String sourceSummaryJson, final String relation) { - final ObjectMapper mapper = new ObjectMapper(); + public static Scholix generateScholixWithSource( + final String sourceSummaryJson, final String relation) { + final ObjectMapper mapper = new ObjectMapper(); - try { - ScholixSummary scholixSummary = mapper.readValue(sourceSummaryJson, ScholixSummary.class); - Relation rel = mapper.readValue(relation, Relation.class); - final Scholix s = new Scholix(); - if (scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0) - s.setPublicationDate(scholixSummary.getDate().get(0)); - s.setLinkprovider( - rel.getCollectedfrom().stream() - .map( - cf -> - new ScholixEntityId( - cf.getValue(), - Collections.singletonList( - new ScholixIdentifier(cf.getKey(), "dnet_identifier")))) - .collect(Collectors.toList())); - s.setRelationship(new ScholixRelationship(rel.getRelType(), rel.getRelClass(), null)); - s.setSource(ScholixResource.fromSummary(scholixSummary)); - return s; - } catch (Throwable e) { - throw new RuntimeException( - String.format("Summary: %s \n relation:%s", sourceSummaryJson, relation), e); - } - } + try { + ScholixSummary scholixSummary = mapper.readValue(sourceSummaryJson, ScholixSummary.class); + Relation rel = mapper.readValue(relation, Relation.class); + final Scholix s = new Scholix(); + if (scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0) + s.setPublicationDate(scholixSummary.getDate().get(0)); + s + .setLinkprovider( + rel + .getCollectedfrom() + .stream() + .map( + cf -> new ScholixEntityId( + cf.getValue(), + Collections + .singletonList( + new ScholixIdentifier(cf.getKey(), "dnet_identifier")))) + .collect(Collectors.toList())); + s.setRelationship(new ScholixRelationship(rel.getRelType(), rel.getRelClass(), null)); + s.setSource(ScholixResource.fromSummary(scholixSummary)); + return s; + } catch (Throwable e) { + throw new RuntimeException( + String.format("Summary: %s \n relation:%s", sourceSummaryJson, relation), e); + } + } - public static Scholix generateScholixWithSource( - final ScholixSummary scholixSummary, final Relation rel) { - final Scholix s = new Scholix(); - if (scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0) - s.setPublicationDate(scholixSummary.getDate().get(0)); - s.setLinkprovider( - rel.getCollectedfrom().stream() - .map( - cf 
-> - new ScholixEntityId( - cf.getValue(), - Collections.singletonList( - new ScholixIdentifier(cf.getKey(), "dnet_identifier")))) - .collect(Collectors.toList())); - s.setRelationship(new ScholixRelationship(rel.getRelType(), rel.getRelClass(), null)); - s.setSource(ScholixResource.fromSummary(scholixSummary)); + public static Scholix generateScholixWithSource( + final ScholixSummary scholixSummary, final Relation rel) { + final Scholix s = new Scholix(); + if (scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0) + s.setPublicationDate(scholixSummary.getDate().get(0)); + s + .setLinkprovider( + rel + .getCollectedfrom() + .stream() + .map( + cf -> new ScholixEntityId( + cf.getValue(), + Collections + .singletonList( + new ScholixIdentifier(cf.getKey(), "dnet_identifier")))) + .collect(Collectors.toList())); + s.setRelationship(new ScholixRelationship(rel.getRelType(), rel.getRelClass(), null)); + s.setSource(ScholixResource.fromSummary(scholixSummary)); - s.setIdentifier(rel.getTarget()); - // ScholixResource mockTarget = new ScholixResource(); - // mockTarget.setDnetIdentifier(rel.getTarget()); - // s.setTarget(mockTarget); - // s.generateIdentifier(); - return s; - } + s.setIdentifier(rel.getTarget()); + // ScholixResource mockTarget = new ScholixResource(); + // mockTarget.setDnetIdentifier(rel.getTarget()); + // s.setTarget(mockTarget); + // s.generateIdentifier(); + return s; + } - public void generatelinkPublisher() { - Set publisher = new HashSet<>(); - if (source.getPublisher() != null) - publisher.addAll( - source.getPublisher().stream() - .map(ScholixEntityId::getName) - .collect(Collectors.toList())); - if (target.getPublisher() != null) - publisher.addAll( - target.getPublisher().stream() - .map(ScholixEntityId::getName) - .collect(Collectors.toList())); - this.publisher = - publisher.stream().map(k -> new ScholixEntityId(k, null)).collect(Collectors.toList()); - } + public void generatelinkPublisher() { + Set publisher = new HashSet<>(); + if (source.getPublisher() != null) + publisher + .addAll( + source + .getPublisher() + .stream() + .map(ScholixEntityId::getName) + .collect(Collectors.toList())); + if (target.getPublisher() != null) + publisher + .addAll( + target + .getPublisher() + .stream() + .map(ScholixEntityId::getName) + .collect(Collectors.toList())); + this.publisher = publisher.stream().map(k -> new ScholixEntityId(k, null)).collect(Collectors.toList()); + } - public void generateIdentifier() { - setIdentifier( - DHPUtils.md5( - String.format( - "%s::%s::%s", - source.getDnetIdentifier(), relationship.getName(), target.getDnetIdentifier()))); - } + public void generateIdentifier() { + setIdentifier( + DHPUtils + .md5( + String + .format( + "%s::%s::%s", + source.getDnetIdentifier(), relationship.getName(), target.getDnetIdentifier()))); + } - public Scholix addTarget(final String targetSummaryJson) { - final ObjectMapper mapper = new ObjectMapper(); + public Scholix addTarget(final String targetSummaryJson) { + final ObjectMapper mapper = new ObjectMapper(); - try { - ScholixSummary targetSummary = mapper.readValue(targetSummaryJson, ScholixSummary.class); - setTarget(ScholixResource.fromSummary(targetSummary)); - generateIdentifier(); - return this; - } catch (Throwable e) { - throw new RuntimeException(e); - } - } + try { + ScholixSummary targetSummary = mapper.readValue(targetSummaryJson, ScholixSummary.class); + setTarget(ScholixResource.fromSummary(targetSummary)); + generateIdentifier(); + return this; + } catch (Throwable e) { + throw 
new RuntimeException(e); + } + } - public String getPublicationDate() { - return publicationDate; - } + public String getPublicationDate() { + return publicationDate; + } - public void setPublicationDate(String publicationDate) { - this.publicationDate = publicationDate; - } + public void setPublicationDate(String publicationDate) { + this.publicationDate = publicationDate; + } - public List getPublisher() { - return publisher; - } + public List getPublisher() { + return publisher; + } - public void setPublisher(List publisher) { - this.publisher = publisher; - } + public void setPublisher(List publisher) { + this.publisher = publisher; + } - public List getLinkprovider() { - return linkprovider; - } + public List getLinkprovider() { + return linkprovider; + } - public void setLinkprovider(List linkprovider) { - this.linkprovider = linkprovider; - } + public void setLinkprovider(List linkprovider) { + this.linkprovider = linkprovider; + } - public ScholixRelationship getRelationship() { - return relationship; - } + public ScholixRelationship getRelationship() { + return relationship; + } - public void setRelationship(ScholixRelationship relationship) { - this.relationship = relationship; - } + public void setRelationship(ScholixRelationship relationship) { + this.relationship = relationship; + } - public ScholixResource getSource() { - return source; - } + public ScholixResource getSource() { + return source; + } - public void setSource(ScholixResource source) { - this.source = source; - } + public void setSource(ScholixResource source) { + this.source = source; + } - public ScholixResource getTarget() { - return target; - } + public ScholixResource getTarget() { + return target; + } - public void setTarget(ScholixResource target) { - this.target = target; - } + public void setTarget(ScholixResource target) { + this.target = target; + } - public String getIdentifier() { - return identifier; - } + public String getIdentifier() { + return identifier; + } - public void setIdentifier(String identifier) { - this.identifier = identifier; - } + public void setIdentifier(String identifier) { + this.identifier = identifier; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixCollectedFrom.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixCollectedFrom.java index c55bbb111..9ce071fbc 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixCollectedFrom.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixCollectedFrom.java @@ -1,43 +1,45 @@ + package eu.dnetlib.dhp.provision.scholix; import java.io.Serializable; public class ScholixCollectedFrom implements Serializable { - private ScholixEntityId provider; - private String provisionMode; - private String completionStatus; + private ScholixEntityId provider; + private String provisionMode; + private String completionStatus; - public ScholixCollectedFrom() {} + public ScholixCollectedFrom() { + } - public ScholixCollectedFrom( - ScholixEntityId provider, String provisionMode, String completionStatus) { - this.provider = provider; - this.provisionMode = provisionMode; - this.completionStatus = completionStatus; - } + public ScholixCollectedFrom( + ScholixEntityId provider, String provisionMode, String completionStatus) { + this.provider = provider; + this.provisionMode = provisionMode; + this.completionStatus 
= completionStatus; + } - public ScholixEntityId getProvider() { - return provider; - } + public ScholixEntityId getProvider() { + return provider; + } - public void setProvider(ScholixEntityId provider) { - this.provider = provider; - } + public void setProvider(ScholixEntityId provider) { + this.provider = provider; + } - public String getProvisionMode() { - return provisionMode; - } + public String getProvisionMode() { + return provisionMode; + } - public void setProvisionMode(String provisionMode) { - this.provisionMode = provisionMode; - } + public void setProvisionMode(String provisionMode) { + this.provisionMode = provisionMode; + } - public String getCompletionStatus() { - return completionStatus; - } + public String getCompletionStatus() { + return completionStatus; + } - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } + public void setCompletionStatus(String completionStatus) { + this.completionStatus = completionStatus; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixEntityId.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixEntityId.java index 226c3d20a..e797017bc 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixEntityId.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixEntityId.java @@ -1,32 +1,34 @@ + package eu.dnetlib.dhp.provision.scholix; import java.io.Serializable; import java.util.List; public class ScholixEntityId implements Serializable { - private String name; - private List identifiers; + private String name; + private List identifiers; - public ScholixEntityId() {} + public ScholixEntityId() { + } - public ScholixEntityId(String name, List identifiers) { - this.name = name; - this.identifiers = identifiers; - } + public ScholixEntityId(String name, List identifiers) { + this.name = name; + this.identifiers = identifiers; + } - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) { + this.name = name; + } - public List getIdentifiers() { - return identifiers; - } + public List getIdentifiers() { + return identifiers; + } - public void setIdentifiers(List identifiers) { - this.identifiers = identifiers; - } + public void setIdentifiers(List identifiers) { + this.identifiers = identifiers; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixIdentifier.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixIdentifier.java index 265ac1ef5..0dd15336a 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixIdentifier.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixIdentifier.java @@ -1,31 +1,33 @@ + package eu.dnetlib.dhp.provision.scholix; import java.io.Serializable; public class ScholixIdentifier implements Serializable { - private String identifier; - private String schema; + private String identifier; + private String schema; - public ScholixIdentifier() {} + public ScholixIdentifier() { + } - public ScholixIdentifier(String identifier, String schema) { - this.identifier = 
identifier; - this.schema = schema; - } + public ScholixIdentifier(String identifier, String schema) { + this.identifier = identifier; + this.schema = schema; + } - public String getIdentifier() { - return identifier; - } + public String getIdentifier() { + return identifier; + } - public void setIdentifier(String identifier) { - this.identifier = identifier; - } + public void setIdentifier(String identifier) { + this.identifier = identifier; + } - public String getSchema() { - return schema; - } + public String getSchema() { + return schema; + } - public void setSchema(String schema) { - this.schema = schema; - } + public void setSchema(String schema) { + this.schema = schema; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixRelationship.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixRelationship.java index b363eff2c..0cbdf43e7 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixRelationship.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixRelationship.java @@ -1,41 +1,43 @@ + package eu.dnetlib.dhp.provision.scholix; import java.io.Serializable; public class ScholixRelationship implements Serializable { - private String name; - private String schema; - private String inverse; + private String name; + private String schema; + private String inverse; - public ScholixRelationship() {} + public ScholixRelationship() { + } - public ScholixRelationship(String name, String schema, String inverse) { - this.name = name; - this.schema = schema; - this.inverse = inverse; - } + public ScholixRelationship(String name, String schema, String inverse) { + this.name = name; + this.schema = schema; + this.inverse = inverse; + } - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) { + this.name = name; + } - public String getSchema() { - return schema; - } + public String getSchema() { + return schema; + } - public void setSchema(String schema) { - this.schema = schema; - } + public void setSchema(String schema) { + this.schema = schema; + } - public String getInverse() { - return inverse; - } + public String getInverse() { + return inverse; + } - public void setInverse(String inverse) { - this.inverse = inverse; - } + public void setInverse(String inverse) { + this.inverse = inverse; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixResource.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixResource.java index 89342d281..6de30c748 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixResource.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/ScholixResource.java @@ -1,137 +1,151 @@ + package eu.dnetlib.dhp.provision.scholix; -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; import java.io.Serializable; import java.util.Collections; import java.util.List; import java.util.stream.Collectors; +import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; + public class ScholixResource implements Serializable { - private List identifier; - private String 
dnetIdentifier; - private String objectType; - private String objectSubType; - private String title; - private List creator; - private String publicationDate; - private List publisher; - private List collectedFrom; + private List identifier; + private String dnetIdentifier; + private String objectType; + private String objectSubType; + private String title; + private List creator; + private String publicationDate; + private List publisher; + private List collectedFrom; - public static ScholixResource fromSummary(ScholixSummary summary) { + public static ScholixResource fromSummary(ScholixSummary summary) { - final ScholixResource resource = new ScholixResource(); + final ScholixResource resource = new ScholixResource(); - resource.setDnetIdentifier(summary.getId()); + resource.setDnetIdentifier(summary.getId()); - resource.setIdentifier( - summary.getLocalIdentifier().stream() - .map(i -> new ScholixIdentifier(i.getId(), i.getType())) - .collect(Collectors.toList())); + resource + .setIdentifier( + summary + .getLocalIdentifier() + .stream() + .map(i -> new ScholixIdentifier(i.getId(), i.getType())) + .collect(Collectors.toList())); - resource.setObjectType(summary.getTypology().toString()); + resource.setObjectType(summary.getTypology().toString()); - if (summary.getTitle() != null && summary.getTitle().size() > 0) - resource.setTitle(summary.getTitle().get(0)); + if (summary.getTitle() != null && summary.getTitle().size() > 0) + resource.setTitle(summary.getTitle().get(0)); - if (summary.getAuthor() != null) - resource.setCreator( - summary.getAuthor().stream() - .map(c -> new ScholixEntityId(c, null)) - .collect(Collectors.toList())); + if (summary.getAuthor() != null) + resource + .setCreator( + summary + .getAuthor() + .stream() + .map(c -> new ScholixEntityId(c, null)) + .collect(Collectors.toList())); - if (summary.getDate() != null && summary.getDate().size() > 0) - resource.setPublicationDate(summary.getDate().get(0)); - if (summary.getPublisher() != null) - resource.setPublisher( - summary.getPublisher().stream() - .map(p -> new ScholixEntityId(p, null)) - .collect(Collectors.toList())); - if (summary.getDatasources() != null) - resource.setCollectedFrom( - summary.getDatasources().stream() - .map( - d -> - new ScholixCollectedFrom( - new ScholixEntityId( - d.getDatasourceName(), - Collections.singletonList( - new ScholixIdentifier(d.getDatasourceId(), "dnet_identifier"))), - "collected", - d.getCompletionStatus())) - .collect(Collectors.toList())); - return resource; - } + if (summary.getDate() != null && summary.getDate().size() > 0) + resource.setPublicationDate(summary.getDate().get(0)); + if (summary.getPublisher() != null) + resource + .setPublisher( + summary + .getPublisher() + .stream() + .map(p -> new ScholixEntityId(p, null)) + .collect(Collectors.toList())); + if (summary.getDatasources() != null) + resource + .setCollectedFrom( + summary + .getDatasources() + .stream() + .map( + d -> new ScholixCollectedFrom( + new ScholixEntityId( + d.getDatasourceName(), + Collections + .singletonList( + new ScholixIdentifier(d.getDatasourceId(), "dnet_identifier"))), + "collected", + d.getCompletionStatus())) + .collect(Collectors.toList())); + return resource; + } - public List getIdentifier() { - return identifier; - } + public List getIdentifier() { + return identifier; + } - public void setIdentifier(List identifier) { - this.identifier = identifier; - } + public void setIdentifier(List identifier) { + this.identifier = identifier; + } - public String getDnetIdentifier() { - 
return dnetIdentifier; - } + public String getDnetIdentifier() { + return dnetIdentifier; + } - public void setDnetIdentifier(String dnetIdentifier) { - this.dnetIdentifier = dnetIdentifier; - } + public void setDnetIdentifier(String dnetIdentifier) { + this.dnetIdentifier = dnetIdentifier; + } - public String getObjectType() { - return objectType; - } + public String getObjectType() { + return objectType; + } - public void setObjectType(String objectType) { - this.objectType = objectType; - } + public void setObjectType(String objectType) { + this.objectType = objectType; + } - public String getObjectSubType() { - return objectSubType; - } + public String getObjectSubType() { + return objectSubType; + } - public void setObjectSubType(String objectSubType) { - this.objectSubType = objectSubType; - } + public void setObjectSubType(String objectSubType) { + this.objectSubType = objectSubType; + } - public String getTitle() { - return title; - } + public String getTitle() { + return title; + } - public void setTitle(String title) { - this.title = title; - } + public void setTitle(String title) { + this.title = title; + } - public List getCreator() { - return creator; - } + public List getCreator() { + return creator; + } - public void setCreator(List creator) { - this.creator = creator; - } + public void setCreator(List creator) { + this.creator = creator; + } - public String getPublicationDate() { - return publicationDate; - } + public String getPublicationDate() { + return publicationDate; + } - public void setPublicationDate(String publicationDate) { - this.publicationDate = publicationDate; - } + public void setPublicationDate(String publicationDate) { + this.publicationDate = publicationDate; + } - public List getPublisher() { - return publisher; - } + public List getPublisher() { + return publisher; + } - public void setPublisher(List publisher) { - this.publisher = publisher; - } + public void setPublisher(List publisher) { + this.publisher = publisher; + } - public List getCollectedFrom() { - return collectedFrom; - } + public List getCollectedFrom() { + return collectedFrom; + } - public void setCollectedFrom(List collectedFrom) { - this.collectedFrom = collectedFrom; - } + public void setCollectedFrom(List collectedFrom) { + this.collectedFrom = collectedFrom; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/CollectedFromType.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/CollectedFromType.java index 0d58eacd6..6d6f46f54 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/CollectedFromType.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/CollectedFromType.java @@ -1,42 +1,44 @@ + package eu.dnetlib.dhp.provision.scholix.summary; import java.io.Serializable; public class CollectedFromType implements Serializable { - private String datasourceName; - private String datasourceId; - private String completionStatus; + private String datasourceName; + private String datasourceId; + private String completionStatus; - public CollectedFromType() {} + public CollectedFromType() { + } - public CollectedFromType(String datasourceName, String datasourceId, String completionStatus) { - this.datasourceName = datasourceName; - this.datasourceId = datasourceId; - this.completionStatus = completionStatus; - } + public 
CollectedFromType(String datasourceName, String datasourceId, String completionStatus) { + this.datasourceName = datasourceName; + this.datasourceId = datasourceId; + this.completionStatus = completionStatus; + } - public String getDatasourceName() { - return datasourceName; - } + public String getDatasourceName() { + return datasourceName; + } - public void setDatasourceName(String datasourceName) { - this.datasourceName = datasourceName; - } + public void setDatasourceName(String datasourceName) { + this.datasourceName = datasourceName; + } - public String getDatasourceId() { - return datasourceId; - } + public String getDatasourceId() { + return datasourceId; + } - public void setDatasourceId(String datasourceId) { - this.datasourceId = datasourceId; - } + public void setDatasourceId(String datasourceId) { + this.datasourceId = datasourceId; + } - public String getCompletionStatus() { - return completionStatus; - } + public String getCompletionStatus() { + return completionStatus; + } - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } + public void setCompletionStatus(String completionStatus) { + this.completionStatus = completionStatus; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/SchemeValue.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/SchemeValue.java index 8338e3995..e9d94fccf 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/SchemeValue.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/SchemeValue.java @@ -1,31 +1,33 @@ + package eu.dnetlib.dhp.provision.scholix.summary; import java.io.Serializable; public class SchemeValue implements Serializable { - private String scheme; - private String value; + private String scheme; + private String value; - public SchemeValue() {} + public SchemeValue() { + } - public SchemeValue(String scheme, String value) { - this.scheme = scheme; - this.value = value; - } + public SchemeValue(String scheme, String value) { + this.scheme = scheme; + this.value = value; + } - public String getScheme() { - return scheme; - } + public String getScheme() { + return scheme; + } - public void setScheme(String scheme) { - this.scheme = scheme; - } + public void setScheme(String scheme) { + this.scheme = scheme; + } - public String getValue() { - return value; - } + public String getValue() { + return value; + } - public void setValue(String value) { - this.value = value; - } + public void setValue(String value) { + this.value = value; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/ScholixSummary.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/ScholixSummary.java index 50aa2a75c..e5ea8b9f5 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/ScholixSummary.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/ScholixSummary.java @@ -1,313 +1,353 @@ + package eu.dnetlib.dhp.provision.scholix.summary; +import java.io.Serializable; +import java.util.Collections; +import java.util.List; +import java.util.stream.Collectors; + import com.fasterxml.jackson.annotation.JsonProperty; import 
com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; + import eu.dnetlib.dhp.provision.RelatedItemInfo; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset; import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication; import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown; -import java.io.Serializable; -import java.util.Collections; -import java.util.List; -import java.util.stream.Collectors; public class ScholixSummary implements Serializable { - private String id; - private List localIdentifier; - private Typology typology; - private List title; - private List author; - private List date; - private String description; - private List subject; - private List publisher; - private long relatedPublications; - private long relatedDatasets; - private long relatedUnknown; - private List datasources; + private String id; + private List localIdentifier; + private Typology typology; + private List title; + private List author; + private List date; + private String description; + private List subject; + private List publisher; + private long relatedPublications; + private long relatedDatasets; + private long relatedUnknown; + private List datasources; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public List getLocalIdentifier() { - return localIdentifier; - } + public List getLocalIdentifier() { + return localIdentifier; + } - public void setLocalIdentifier(List localIdentifier) { - this.localIdentifier = localIdentifier; - } + public void setLocalIdentifier(List localIdentifier) { + this.localIdentifier = localIdentifier; + } - public Typology getTypology() { - return typology; - } + public Typology getTypology() { + return typology; + } - public void setTypology(Typology typology) { - this.typology = typology; - } + public void setTypology(Typology typology) { + this.typology = typology; + } - public List getTitle() { - return title; - } + public List getTitle() { + return title; + } - public void setTitle(List title) { - this.title = title; - } + public void setTitle(List title) { + this.title = title; + } - public List getAuthor() { - return author; - } + public List getAuthor() { + return author; + } - public void setAuthor(List author) { - this.author = author; - } + public void setAuthor(List author) { + this.author = author; + } - public List getDate() { - return date; - } + public List getDate() { + return date; + } - public void setDate(List date) { - this.date = date; - } + public void setDate(List date) { + this.date = date; + } - @JsonProperty("abstract") - public String getDescription() { - return description; - } + @JsonProperty("abstract") + public String getDescription() { + return description; + } - @JsonProperty("abstract") - public void setDescription(String description) { - this.description = description; - } + @JsonProperty("abstract") + public void setDescription(String description) { + this.description = description; + } - public List getSubject() { - return subject; - } + public List getSubject() { + return subject; + } - public void setSubject(List subject) { - this.subject = subject; - } + public void setSubject(List subject) { + this.subject = subject; + } - public List getPublisher() { - return publisher; - } + public List getPublisher() { + return publisher; + } - public void 
setPublisher(List publisher) { - this.publisher = publisher; - } + public void setPublisher(List publisher) { + this.publisher = publisher; + } - public long getRelatedPublications() { - return relatedPublications; - } + public long getRelatedPublications() { + return relatedPublications; + } - public void setRelatedPublications(long relatedPublications) { - this.relatedPublications = relatedPublications; - } + public void setRelatedPublications(long relatedPublications) { + this.relatedPublications = relatedPublications; + } - public long getRelatedDatasets() { - return relatedDatasets; - } + public long getRelatedDatasets() { + return relatedDatasets; + } - public void setRelatedDatasets(long relatedDatasets) { - this.relatedDatasets = relatedDatasets; - } + public void setRelatedDatasets(long relatedDatasets) { + this.relatedDatasets = relatedDatasets; + } - public long getRelatedUnknown() { - return relatedUnknown; - } + public long getRelatedUnknown() { + return relatedUnknown; + } - public void setRelatedUnknown(long relatedUnknown) { - this.relatedUnknown = relatedUnknown; - } + public void setRelatedUnknown(long relatedUnknown) { + this.relatedUnknown = relatedUnknown; + } - public List getDatasources() { - return datasources; - } + public List getDatasources() { + return datasources; + } - public void setDatasources(List datasources) { - this.datasources = datasources; - } + public void setDatasources(List datasources) { + this.datasources = datasources; + } - public static ScholixSummary fromJsonOAF(final Typology oafType, final String oafJson) { - try { - final ObjectMapper mapper = new ObjectMapper(); - final RelatedItemInfo relatedItemInfo = new RelatedItemInfo(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - switch (oafType) { - case dataset: - return summaryFromDataset(mapper.readValue(oafJson, DLIDataset.class), relatedItemInfo); - case publication: - return summaryFromPublication( - mapper.readValue(oafJson, DLIPublication.class), relatedItemInfo); - case unknown: - return summaryFromUnknown(mapper.readValue(oafJson, DLIUnknown.class), relatedItemInfo); - } - } catch (Throwable e) { - throw new RuntimeException(e); - } - return null; - } + public static ScholixSummary fromJsonOAF(final Typology oafType, final String oafJson) { + try { + final ObjectMapper mapper = new ObjectMapper(); + final RelatedItemInfo relatedItemInfo = new RelatedItemInfo(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + switch (oafType) { + case dataset: + return summaryFromDataset(mapper.readValue(oafJson, DLIDataset.class), relatedItemInfo); + case publication: + return summaryFromPublication( + mapper.readValue(oafJson, DLIPublication.class), relatedItemInfo); + case unknown: + return summaryFromUnknown(mapper.readValue(oafJson, DLIUnknown.class), relatedItemInfo); + } + } catch (Throwable e) { + throw new RuntimeException(e); + } + return null; + } - public static String fromJsonOAF( - final Typology oafType, final String oafJson, final String relEntityJson) { - try { - final ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + public static String fromJsonOAF( + final Typology oafType, final String oafJson, final String relEntityJson) { + try { + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - RelatedItemInfo relatedItemInfo = mapper.readValue(relEntityJson, RelatedItemInfo.class); 
+ RelatedItemInfo relatedItemInfo = mapper.readValue(relEntityJson, RelatedItemInfo.class); - switch (oafType) { - case dataset: - return mapper.writeValueAsString( - summaryFromDataset(mapper.readValue(oafJson, DLIDataset.class), relatedItemInfo)); - case publication: - return mapper.writeValueAsString( - summaryFromPublication( - mapper.readValue(oafJson, DLIPublication.class), relatedItemInfo)); - case unknown: - return mapper.writeValueAsString( - summaryFromUnknown(mapper.readValue(oafJson, DLIUnknown.class), relatedItemInfo)); - } + switch (oafType) { + case dataset: + return mapper + .writeValueAsString( + summaryFromDataset(mapper.readValue(oafJson, DLIDataset.class), relatedItemInfo)); + case publication: + return mapper + .writeValueAsString( + summaryFromPublication( + mapper.readValue(oafJson, DLIPublication.class), relatedItemInfo)); + case unknown: + return mapper + .writeValueAsString( + summaryFromUnknown(mapper.readValue(oafJson, DLIUnknown.class), relatedItemInfo)); + } - } catch (Throwable e) { - throw new RuntimeException(e); - } + } catch (Throwable e) { + throw new RuntimeException(e); + } - return null; - } + return null; + } - private static ScholixSummary summaryFromDataset( - final DLIDataset item, final RelatedItemInfo relatedItemInfo) { - ScholixSummary summary = new ScholixSummary(); - summary.setId(item.getId()); + private static ScholixSummary summaryFromDataset( + final DLIDataset item, final RelatedItemInfo relatedItemInfo) { + ScholixSummary summary = new ScholixSummary(); + summary.setId(item.getId()); - if (item.getPid() != null) - summary.setLocalIdentifier( - item.getPid().stream() - .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) - .collect(Collectors.toList())); + if (item.getPid() != null) + summary + .setLocalIdentifier( + item + .getPid() + .stream() + .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) + .collect(Collectors.toList())); - summary.setTypology(Typology.dataset); - if (item.getTitle() != null) - summary.setTitle( - item.getTitle().stream().map(StructuredProperty::getValue).collect(Collectors.toList())); + summary.setTypology(Typology.dataset); + if (item.getTitle() != null) + summary + .setTitle( + item.getTitle().stream().map(StructuredProperty::getValue).collect(Collectors.toList())); - if (item.getAuthor() != null) { - summary.setAuthor( - item.getAuthor().stream().map(Author::getFullname).collect(Collectors.toList())); - } + if (item.getAuthor() != null) { + summary + .setAuthor( + item.getAuthor().stream().map(Author::getFullname).collect(Collectors.toList())); + } - if (item.getRelevantdate() != null) - summary.setDate( - item.getRelevantdate().stream() - .filter(d -> "date".equalsIgnoreCase(d.getQualifier().getClassname())) - .map(StructuredProperty::getValue) - .collect(Collectors.toList())); + if (item.getRelevantdate() != null) + summary + .setDate( + item + .getRelevantdate() + .stream() + .filter(d -> "date".equalsIgnoreCase(d.getQualifier().getClassname())) + .map(StructuredProperty::getValue) + .collect(Collectors.toList())); - if (item.getDescription() != null && item.getDescription().size() > 0) - summary.setDescription(item.getDescription().get(0).getValue()); + if (item.getDescription() != null && item.getDescription().size() > 0) + summary.setDescription(item.getDescription().get(0).getValue()); - if (item.getSubject() != null) { - summary.setSubject( - item.getSubject().stream() - .map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue())) - 
.collect(Collectors.toList())); - } - if (item.getPublisher() != null) - summary.setPublisher(Collections.singletonList(item.getPublisher().getValue())); + if (item.getSubject() != null) { + summary + .setSubject( + item + .getSubject() + .stream() + .map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue())) + .collect(Collectors.toList())); + } + if (item.getPublisher() != null) + summary.setPublisher(Collections.singletonList(item.getPublisher().getValue())); - summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); - summary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); - summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); + summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); + summary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); + summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); - if (item.getDlicollectedfrom() != null) - summary.setDatasources( - item.getDlicollectedfrom().stream() - .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())) - .collect(Collectors.toList())); - return summary; - } + if (item.getDlicollectedfrom() != null) + summary + .setDatasources( + item + .getDlicollectedfrom() + .stream() + .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())) + .collect(Collectors.toList())); + return summary; + } - private static ScholixSummary summaryFromPublication( - final DLIPublication item, final RelatedItemInfo relatedItemInfo) { - ScholixSummary summary = new ScholixSummary(); - summary.setId(item.getId()); + private static ScholixSummary summaryFromPublication( + final DLIPublication item, final RelatedItemInfo relatedItemInfo) { + ScholixSummary summary = new ScholixSummary(); + summary.setId(item.getId()); - if (item.getPid() != null) - summary.setLocalIdentifier( - item.getPid().stream() - .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) - .collect(Collectors.toList())); + if (item.getPid() != null) + summary + .setLocalIdentifier( + item + .getPid() + .stream() + .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) + .collect(Collectors.toList())); - summary.setTypology(Typology.publication); - if (item.getTitle() != null) - summary.setTitle( - item.getTitle().stream().map(StructuredProperty::getValue).collect(Collectors.toList())); + summary.setTypology(Typology.publication); + if (item.getTitle() != null) + summary + .setTitle( + item.getTitle().stream().map(StructuredProperty::getValue).collect(Collectors.toList())); - if (item.getAuthor() != null) { - summary.setAuthor( - item.getAuthor().stream().map(Author::getFullname).collect(Collectors.toList())); - } + if (item.getAuthor() != null) { + summary + .setAuthor( + item.getAuthor().stream().map(Author::getFullname).collect(Collectors.toList())); + } - if (item.getRelevantdate() != null) - summary.setDate( - item.getRelevantdate().stream() - .filter(d -> "date".equalsIgnoreCase(d.getQualifier().getClassname())) - .map(StructuredProperty::getValue) - .collect(Collectors.toList())); + if (item.getRelevantdate() != null) + summary + .setDate( + item + .getRelevantdate() + .stream() + .filter(d -> "date".equalsIgnoreCase(d.getQualifier().getClassname())) + .map(StructuredProperty::getValue) + .collect(Collectors.toList())); - if (item.getDescription() != null && item.getDescription().size() > 0) - summary.setDescription(item.getDescription().get(0).getValue()); + if (item.getDescription() != null && 
item.getDescription().size() > 0) + summary.setDescription(item.getDescription().get(0).getValue()); - if (item.getSubject() != null) { - summary.setSubject( - item.getSubject().stream() - .map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue())) - .collect(Collectors.toList())); - } + if (item.getSubject() != null) { + summary + .setSubject( + item + .getSubject() + .stream() + .map(s -> new SchemeValue(s.getQualifier().getClassid(), s.getValue())) + .collect(Collectors.toList())); + } - if (item.getPublisher() != null) - summary.setPublisher(Collections.singletonList(item.getPublisher().getValue())); + if (item.getPublisher() != null) + summary.setPublisher(Collections.singletonList(item.getPublisher().getValue())); - summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); - summary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); - summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); + summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); + summary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); + summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); - if (item.getDlicollectedfrom() != null) - summary.setDatasources( - item.getDlicollectedfrom().stream() - .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())) - .collect(Collectors.toList())); + if (item.getDlicollectedfrom() != null) + summary + .setDatasources( + item + .getDlicollectedfrom() + .stream() + .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())) + .collect(Collectors.toList())); - return summary; - } + return summary; + } - private static ScholixSummary summaryFromUnknown( - final DLIUnknown item, final RelatedItemInfo relatedItemInfo) { - ScholixSummary summary = new ScholixSummary(); - summary.setId(item.getId()); - if (item.getPid() != null) - summary.setLocalIdentifier( - item.getPid().stream() - .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) - .collect(Collectors.toList())); + private static ScholixSummary summaryFromUnknown( + final DLIUnknown item, final RelatedItemInfo relatedItemInfo) { + ScholixSummary summary = new ScholixSummary(); + summary.setId(item.getId()); + if (item.getPid() != null) + summary + .setLocalIdentifier( + item + .getPid() + .stream() + .map(p -> new TypedIdentifier(p.getValue(), p.getQualifier().getClassid())) + .collect(Collectors.toList())); - summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); - summary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); - summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); - summary.setTypology(Typology.unknown); - if (item.getDlicollectedfrom() != null) - summary.setDatasources( - item.getDlicollectedfrom().stream() - .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())) - .collect(Collectors.toList())); - return summary; - } + summary.setRelatedDatasets(relatedItemInfo.getRelatedDataset()); + summary.setRelatedPublications(relatedItemInfo.getRelatedPublication()); + summary.setRelatedUnknown(relatedItemInfo.getRelatedUnknown()); + summary.setTypology(Typology.unknown); + if (item.getDlicollectedfrom() != null) + summary + .setDatasources( + item + .getDlicollectedfrom() + .stream() + .map(c -> new CollectedFromType(c.getName(), c.getId(), c.getCompletionStatus())) + .collect(Collectors.toList())); + return summary; + } } diff --git 
a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/TypedIdentifier.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/TypedIdentifier.java index 773695eff..c4148ad24 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/TypedIdentifier.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/TypedIdentifier.java @@ -1,31 +1,33 @@ + package eu.dnetlib.dhp.provision.scholix.summary; import java.io.Serializable; public class TypedIdentifier implements Serializable { - private String id; - private String type; + private String id; + private String type; - public TypedIdentifier() {} + public TypedIdentifier() { + } - public TypedIdentifier(String id, String type) { - this.id = id; - this.type = type; - } + public TypedIdentifier(String id, String type) { + this.id = id; + this.type = type; + } - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public String getType() { - return type; - } + public String getType() { + return type; + } - public void setType(String type) { - this.type = type; - } + public void setType(String type) { + this.type = type; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/Typology.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/Typology.java index d90e224f9..effa32b6b 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/Typology.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/scholix/summary/Typology.java @@ -1,9 +1,8 @@ + package eu.dnetlib.dhp.provision.scholix.summary; import java.io.Serializable; public enum Typology implements Serializable { - dataset, - publication, - unknown + dataset, publication, unknown } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossRefParserJSON.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossRefParserJSON.java index 7e8e7aef3..bc9562e08 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossRefParserJSON.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossRefParserJSON.java @@ -1,121 +1,131 @@ + package eu.dnetlib.dhp.provision.update; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonObject; import com.google.gson.JsonParser; + import eu.dnetlib.dhp.provision.scholix.ScholixCollectedFrom; import eu.dnetlib.dhp.provision.scholix.ScholixEntityId; import eu.dnetlib.dhp.provision.scholix.ScholixIdentifier; import eu.dnetlib.dhp.provision.scholix.ScholixResource; import eu.dnetlib.dhp.utils.DHPUtils; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; public class CrossRefParserJSON { - private static List collectedFrom = - generateCrossrefCollectedFrom("complete"); + private static List collectedFrom = 
generateCrossrefCollectedFrom("complete"); - public static ScholixResource parseRecord(final String record) { - if (record == null) return null; - JsonElement jElement = new JsonParser().parse(record); - JsonElement source = null; - if (jElement.getAsJsonObject().has("_source")) { - source = jElement.getAsJsonObject().get("_source"); - if (source == null || !source.isJsonObject()) return null; - } else if (jElement.getAsJsonObject().has("DOI")) { - source = jElement; - } else { - return null; - } + public static ScholixResource parseRecord(final String record) { + if (record == null) + return null; + JsonElement jElement = new JsonParser().parse(record); + JsonElement source = null; + if (jElement.getAsJsonObject().has("_source")) { + source = jElement.getAsJsonObject().get("_source"); + if (source == null || !source.isJsonObject()) + return null; + } else if (jElement.getAsJsonObject().has("DOI")) { + source = jElement; + } else { + return null; + } - final JsonObject message = source.getAsJsonObject(); - ScholixResource currentObject = new ScholixResource(); + final JsonObject message = source.getAsJsonObject(); + ScholixResource currentObject = new ScholixResource(); - if (message.get("DOI") != null) { - final String doi = message.get("DOI").getAsString(); - currentObject.setIdentifier(Collections.singletonList(new ScholixIdentifier(doi, "doi"))); - } + if (message.get("DOI") != null) { + final String doi = message.get("DOI").getAsString(); + currentObject.setIdentifier(Collections.singletonList(new ScholixIdentifier(doi, "doi"))); + } - if ((!message.get("created").isJsonNull()) - && (message.getAsJsonObject("created").get("date-time") != null)) { - currentObject.setPublicationDate( - message.getAsJsonObject("created").get("date-time").getAsString()); - } + if ((!message.get("created").isJsonNull()) + && (message.getAsJsonObject("created").get("date-time") != null)) { + currentObject + .setPublicationDate( + message.getAsJsonObject("created").get("date-time").getAsString()); + } - if (message.get("title") != null - && !message.get("title").isJsonNull() - && message.get("title").isJsonArray()) { + if (message.get("title") != null + && !message.get("title").isJsonNull() + && message.get("title").isJsonArray()) { - JsonArray array = message.get("title").getAsJsonArray(); - currentObject.setTitle(array.get(0).getAsString()); - } - if (message.get("author") != null && !message.get("author").isJsonNull()) { - JsonArray author = message.getAsJsonArray("author"); - List authorList = new ArrayList<>(); - for (JsonElement anAuthor : author) { - JsonObject currentAuth = anAuthor.getAsJsonObject(); + JsonArray array = message.get("title").getAsJsonArray(); + currentObject.setTitle(array.get(0).getAsString()); + } + if (message.get("author") != null && !message.get("author").isJsonNull()) { + JsonArray author = message.getAsJsonArray("author"); + List authorList = new ArrayList<>(); + for (JsonElement anAuthor : author) { + JsonObject currentAuth = anAuthor.getAsJsonObject(); - String family = ""; - String given = ""; - if (currentAuth != null - && currentAuth.get("family") != null - && !currentAuth.get("family").isJsonNull()) { - family = currentAuth.get("family").getAsString(); - } - if (currentAuth != null - && currentAuth.get("given") != null - && !currentAuth.get("given").isJsonNull()) { - given = currentAuth.get("given").getAsString(); - } - authorList.add(new ScholixEntityId(String.format("%s %s", family, given), null)); - } - currentObject.setCreator(authorList); - } - if 
(message.get("publisher") != null && !message.get("publisher").isJsonNull()) { - currentObject.setPublisher( - Collections.singletonList( - new ScholixEntityId(message.get("publisher").getAsString(), null))); - } - currentObject.setCollectedFrom(collectedFrom); - currentObject.setObjectType("publication"); - currentObject.setDnetIdentifier( - generateId(message.get("DOI").getAsString(), "doi", "publication")); + String family = ""; + String given = ""; + if (currentAuth != null + && currentAuth.get("family") != null + && !currentAuth.get("family").isJsonNull()) { + family = currentAuth.get("family").getAsString(); + } + if (currentAuth != null + && currentAuth.get("given") != null + && !currentAuth.get("given").isJsonNull()) { + given = currentAuth.get("given").getAsString(); + } + authorList.add(new ScholixEntityId(String.format("%s %s", family, given), null)); + } + currentObject.setCreator(authorList); + } + if (message.get("publisher") != null && !message.get("publisher").isJsonNull()) { + currentObject + .setPublisher( + Collections + .singletonList( + new ScholixEntityId(message.get("publisher").getAsString(), null))); + } + currentObject.setCollectedFrom(collectedFrom); + currentObject.setObjectType("publication"); + currentObject + .setDnetIdentifier( + generateId(message.get("DOI").getAsString(), "doi", "publication")); - return currentObject; - } + return currentObject; + } - private static List generateCrossrefCollectedFrom( - final String completionStatus) { - final ScholixEntityId scholixEntityId = - new ScholixEntityId( - "Crossref", - Collections.singletonList( - new ScholixIdentifier("dli_________::crossref", "dnet_identifier"))); - return Collections.singletonList( - new ScholixCollectedFrom(scholixEntityId, "resolved", completionStatus)); - } + private static List generateCrossrefCollectedFrom( + final String completionStatus) { + final ScholixEntityId scholixEntityId = new ScholixEntityId( + "Crossref", + Collections + .singletonList( + new ScholixIdentifier("dli_________::crossref", "dnet_identifier"))); + return Collections + .singletonList( + new ScholixCollectedFrom(scholixEntityId, "resolved", completionStatus)); + } - private static String generateId( - final String pid, final String pidType, final String entityType) { - String type; - switch (entityType) { - case "publication": - type = "50|"; - break; - case "dataset": - type = "60|"; - break; - case "unknown": - type = "70|"; - break; - default: - throw new IllegalArgumentException("unexpected value " + entityType); - } - return type - + DHPUtils.md5( - String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); - } + private static String generateId( + final String pid, final String pidType, final String entityType) { + String type; + switch (entityType) { + case "publication": + type = "50|"; + break; + case "dataset": + type = "60|"; + break; + case "unknown": + type = "70|"; + break; + default: + throw new IllegalArgumentException("unexpected value " + entityType); + } + return type + + DHPUtils + .md5( + String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossrefClient.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossrefClient.java index e5aa38c1d..fac1da253 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossrefClient.java +++ 
b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/CrossrefClient.java @@ -1,10 +1,9 @@ + package eu.dnetlib.dhp.provision.update; -import com.google.gson.JsonElement; -import com.google.gson.JsonParser; -import eu.dnetlib.dhp.provision.scholix.ScholixResource; import java.io.ByteArrayOutputStream; import java.util.zip.Inflater; + import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; @@ -12,77 +11,81 @@ import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; +import com.google.gson.JsonElement; +import com.google.gson.JsonParser; + +import eu.dnetlib.dhp.provision.scholix.ScholixResource; + public class CrossrefClient { - private String host; - private String index = "crossref"; - private String indexType = "item"; + private String host; + private String index = "crossref"; + private String indexType = "item"; - public CrossrefClient(String host) { - this.host = host; - } + public CrossrefClient(String host) { + this.host = host; + } - public String getHost() { - return host; - } + public String getHost() { + return host; + } - public void setHost(String host) { - this.host = host; - } + public void setHost(String host) { + this.host = host; + } - public String getIndex() { - return index; - } + public String getIndex() { + return index; + } - public void setIndex(String index) { - this.index = index; - } + public void setIndex(String index) { + this.index = index; + } - public String getIndexType() { - return indexType; - } + public String getIndexType() { + return indexType; + } - public void setIndexType(String indexType) { - this.indexType = indexType; - } + public void setIndexType(String indexType) { + this.indexType = indexType; + } - private static String decompressBlob(final String blob) { - try { - byte[] byteArray = Base64.decodeBase64(blob.getBytes()); - final Inflater decompresser = new Inflater(); - decompresser.setInput(byteArray); - final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length); - byte[] buffer = new byte[8192]; - while (!decompresser.finished()) { - int size = decompresser.inflate(buffer); - bos.write(buffer, 0, size); - } - byte[] unzippeddata = bos.toByteArray(); - decompresser.end(); - return new String(unzippeddata); - } catch (Throwable e) { - throw new RuntimeException("Wrong record:" + blob, e); - } - } + private static String decompressBlob(final String blob) { + try { + byte[] byteArray = Base64.decodeBase64(blob.getBytes()); + final Inflater decompresser = new Inflater(); + decompresser.setInput(byteArray); + final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length); + byte[] buffer = new byte[8192]; + while (!decompresser.finished()) { + int size = decompresser.inflate(buffer); + bos.write(buffer, 0, size); + } + byte[] unzippeddata = bos.toByteArray(); + decompresser.end(); + return new String(unzippeddata); + } catch (Throwable e) { + throw new RuntimeException("Wrong record:" + blob, e); + } + } - public ScholixResource getResourceByDOI(final String doi) { - try (CloseableHttpClient client = HttpClients.createDefault()) { - HttpGet httpGet = - new HttpGet( - String.format( - "http://%s:9200/%s/%s/%s", host, index, indexType, doi.replaceAll("/", "%2F"))); - CloseableHttpResponse response = client.execute(httpGet); - String json = IOUtils.toString(response.getEntity().getContent()); 
- if (json.contains("blob")) { - JsonParser p = new JsonParser(); - final JsonElement root = p.parse(json); - json = - decompressBlob( - root.getAsJsonObject().get("_source").getAsJsonObject().get("blob").getAsString()); - } - return CrossRefParserJSON.parseRecord(json); - } catch (Throwable e) { - return null; - } - } + public ScholixResource getResourceByDOI(final String doi) { + try (CloseableHttpClient client = HttpClients.createDefault()) { + HttpGet httpGet = new HttpGet( + String + .format( + "http://%s:9200/%s/%s/%s", host, index, indexType, doi.replaceAll("/", "%2F"))); + CloseableHttpResponse response = client.execute(httpGet); + String json = IOUtils.toString(response.getEntity().getContent()); + if (json.contains("blob")) { + JsonParser p = new JsonParser(); + final JsonElement root = p.parse(json); + json = decompressBlob( + root.getAsJsonObject().get("_source").getAsJsonObject().get("blob").getAsString()); + } + return CrossRefParserJSON.parseRecord(json); + } catch (Throwable e) { + return null; + } + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/Datacite2Scholix.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/Datacite2Scholix.java index 3eed64d4d..10426b29c 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/Datacite2Scholix.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/Datacite2Scholix.java @@ -1,218 +1,229 @@ + package eu.dnetlib.dhp.provision.update; -import com.jayway.jsonpath.JsonPath; -import eu.dnetlib.dhp.provision.scholix.*; -import eu.dnetlib.dhp.utils.DHPUtils; -import eu.dnetlib.scholexplorer.relation.RelInfo; -import eu.dnetlib.scholexplorer.relation.RelationMapper; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.stream.Collectors; + import org.apache.commons.lang3.StringUtils; +import com.jayway.jsonpath.JsonPath; + +import eu.dnetlib.dhp.provision.scholix.*; +import eu.dnetlib.dhp.utils.DHPUtils; +import eu.dnetlib.scholexplorer.relation.RelInfo; +import eu.dnetlib.scholexplorer.relation.RelationMapper; + public class Datacite2Scholix { - private String rootPath = "$.attributes"; - final RelationMapper relationMapper; + private String rootPath = "$.attributes"; + final RelationMapper relationMapper; - public Datacite2Scholix(RelationMapper relationMapper) { - this.relationMapper = relationMapper; - } + public Datacite2Scholix(RelationMapper relationMapper) { + this.relationMapper = relationMapper; + } - public List generateScholixFromJson(final String dJson) { - List> relIds = getRelatedIendtifiers(dJson); - relIds = - relIds != null - ? relIds.stream() - .filter( - m -> - m.containsKey("relatedIdentifierType") - && m.containsKey("relationType") - && m.containsKey("relatedIdentifier")) - .collect(Collectors.toList()) - : null; - if (relIds == null || relIds.size() == 0) return null; + public List generateScholixFromJson(final String dJson) { + List> relIds = getRelatedIendtifiers(dJson); + relIds = relIds != null + ? 
relIds + .stream() + .filter( + m -> m.containsKey("relatedIdentifierType") + && m.containsKey("relationType") + && m.containsKey("relatedIdentifier")) + .collect(Collectors.toList()) + : null; + if (relIds == null || relIds.size() == 0) + return null; - final String updated = JsonPath.read(dJson, rootPath + ".updated"); - ScholixResource resource = generateDataciteScholixResource(dJson); + final String updated = JsonPath.read(dJson, rootPath + ".updated"); + ScholixResource resource = generateDataciteScholixResource(dJson); - return relIds.stream() - .flatMap( - s -> { - try { - final List result = - generateScholix( - resource, - "" + s.get("relatedIdentifier"), - s.get("relatedIdentifierType"), - s.get("relationType"), - updated); - return result.stream(); - } catch (Throwable e) { - return new ArrayList().stream(); - } - }) - .collect(Collectors.toList()); - } + return relIds + .stream() + .flatMap( + s -> { + try { + final List result = generateScholix( + resource, + "" + s.get("relatedIdentifier"), + s.get("relatedIdentifierType"), + s.get("relationType"), + updated); + return result.stream(); + } catch (Throwable e) { + return new ArrayList().stream(); + } + }) + .collect(Collectors.toList()); + } - public String getRootPath() { - return rootPath; - } + public String getRootPath() { + return rootPath; + } - public void setRootPath(String rootPath) { - this.rootPath = rootPath; - } + public void setRootPath(String rootPath) { + this.rootPath = rootPath; + } - private List generateScholix( - ScholixResource source, - final String pid, - final String pidtype, - final String relType, - final String updated) { + private List generateScholix( + ScholixResource source, + final String pid, + final String pidtype, + final String relType, + final String updated) { - if ("doi".equalsIgnoreCase(pidtype)) { - ScholixResource target = new ScholixResource(); - target.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, pidtype))); - final RelInfo relInfo = relationMapper.get(relType.toLowerCase()); - final ScholixRelationship rel = - new ScholixRelationship(relInfo.getOriginal(), "datacite", relInfo.getInverse()); - final ScholixEntityId provider = source.getCollectedFrom().get(0).getProvider(); - final Scholix s = new Scholix(); - s.setSource(source); - s.setTarget(target); - s.setLinkprovider(Collections.singletonList(provider)); - s.setPublisher(source.getPublisher()); - s.setRelationship(rel); - s.setPublicationDate(updated); - return Collections.singletonList(s); - } else { - final List result = new ArrayList<>(); - ScholixResource target = new ScholixResource(); - target.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, pidtype))); - target.setDnetIdentifier(generateId(pid, pidtype, "unknown")); - target.setObjectType("unknown"); - target.setCollectedFrom(generateDataciteCollectedFrom("incomplete")); - final RelInfo relInfo = relationMapper.get(relType.toLowerCase()); - final ScholixRelationship rel = - new ScholixRelationship(relInfo.getOriginal(), "datacite", relInfo.getInverse()); - final ScholixEntityId provider = source.getCollectedFrom().get(0).getProvider(); - final Scholix s = new Scholix(); - s.setSource(source); - s.setTarget(target); - s.setLinkprovider(Collections.singletonList(provider)); - s.setPublisher(source.getPublisher()); - s.setRelationship(rel); - s.setPublicationDate(updated); - s.generateIdentifier(); - result.add(s); - final Scholix s2 = new Scholix(); - s2.setSource(target); - s2.setTarget(source); - 
s2.setLinkprovider(Collections.singletonList(provider)); - s2.setPublisher(source.getPublisher()); - s2.setRelationship( - new ScholixRelationship(relInfo.getInverse(), "datacite", relInfo.getOriginal())); - s2.setPublicationDate(updated); - s2.generateIdentifier(); - result.add(s2); - return result; - } - } + if ("doi".equalsIgnoreCase(pidtype)) { + ScholixResource target = new ScholixResource(); + target.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, pidtype))); + final RelInfo relInfo = relationMapper.get(relType.toLowerCase()); + final ScholixRelationship rel = new ScholixRelationship(relInfo.getOriginal(), "datacite", + relInfo.getInverse()); + final ScholixEntityId provider = source.getCollectedFrom().get(0).getProvider(); + final Scholix s = new Scholix(); + s.setSource(source); + s.setTarget(target); + s.setLinkprovider(Collections.singletonList(provider)); + s.setPublisher(source.getPublisher()); + s.setRelationship(rel); + s.setPublicationDate(updated); + return Collections.singletonList(s); + } else { + final List result = new ArrayList<>(); + ScholixResource target = new ScholixResource(); + target.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, pidtype))); + target.setDnetIdentifier(generateId(pid, pidtype, "unknown")); + target.setObjectType("unknown"); + target.setCollectedFrom(generateDataciteCollectedFrom("incomplete")); + final RelInfo relInfo = relationMapper.get(relType.toLowerCase()); + final ScholixRelationship rel = new ScholixRelationship(relInfo.getOriginal(), "datacite", + relInfo.getInverse()); + final ScholixEntityId provider = source.getCollectedFrom().get(0).getProvider(); + final Scholix s = new Scholix(); + s.setSource(source); + s.setTarget(target); + s.setLinkprovider(Collections.singletonList(provider)); + s.setPublisher(source.getPublisher()); + s.setRelationship(rel); + s.setPublicationDate(updated); + s.generateIdentifier(); + result.add(s); + final Scholix s2 = new Scholix(); + s2.setSource(target); + s2.setTarget(source); + s2.setLinkprovider(Collections.singletonList(provider)); + s2.setPublisher(source.getPublisher()); + s2 + .setRelationship( + new ScholixRelationship(relInfo.getInverse(), "datacite", relInfo.getOriginal())); + s2.setPublicationDate(updated); + s2.generateIdentifier(); + result.add(s2); + return result; + } + } - public ScholixResource generateDataciteScholixResource(String dJson) { - ScholixResource resource = new ScholixResource(); - String DOI_PATH = rootPath + ".doi"; - final String doi = JsonPath.read(dJson, DOI_PATH); - resource.setIdentifier(Collections.singletonList(new ScholixIdentifier(doi, "doi"))); - resource.setObjectType(getType(dJson)); - resource.setDnetIdentifier(generateId(doi, "doi", resource.getObjectType())); - resource.setCollectedFrom(generateDataciteCollectedFrom("complete")); - final String publisher = JsonPath.read(dJson, rootPath + ".publisher"); - if (StringUtils.isNotBlank(publisher)) - resource.setPublisher(Collections.singletonList(new ScholixEntityId(publisher, null))); - final String date = getDate(dJson); - if (StringUtils.isNotBlank(date)) resource.setPublicationDate(date); - final String title = getTitle(dJson); - if (StringUtils.isNotBlank(title)) resource.setTitle(title); - resource.setCreator(getCreators(dJson)); - return resource; - } + public ScholixResource generateDataciteScholixResource(String dJson) { + ScholixResource resource = new ScholixResource(); + String DOI_PATH = rootPath + ".doi"; + final String doi = JsonPath.read(dJson, DOI_PATH); + 
resource.setIdentifier(Collections.singletonList(new ScholixIdentifier(doi, "doi"))); + resource.setObjectType(getType(dJson)); + resource.setDnetIdentifier(generateId(doi, "doi", resource.getObjectType())); + resource.setCollectedFrom(generateDataciteCollectedFrom("complete")); + final String publisher = JsonPath.read(dJson, rootPath + ".publisher"); + if (StringUtils.isNotBlank(publisher)) + resource.setPublisher(Collections.singletonList(new ScholixEntityId(publisher, null))); + final String date = getDate(dJson); + if (StringUtils.isNotBlank(date)) + resource.setPublicationDate(date); + final String title = getTitle(dJson); + if (StringUtils.isNotBlank(title)) + resource.setTitle(title); + resource.setCreator(getCreators(dJson)); + return resource; + } - private List getCreators(final String json) { - final List creatorName = JsonPath.read(json, rootPath + ".creators[*].name"); - if (creatorName != null && creatorName.size() > 0) { - return creatorName.stream() - .map(s -> new ScholixEntityId(s, null)) - .collect(Collectors.toList()); - } - return null; - } + private List getCreators(final String json) { + final List creatorName = JsonPath.read(json, rootPath + ".creators[*].name"); + if (creatorName != null && creatorName.size() > 0) { + return creatorName + .stream() + .map(s -> new ScholixEntityId(s, null)) + .collect(Collectors.toList()); + } + return null; + } - private String getTitle(final String json) { - final List titles = JsonPath.read(json, rootPath + ".titles[*].title"); - return titles != null && titles.size() > 0 ? titles.get(0) : null; - } + private String getTitle(final String json) { + final List titles = JsonPath.read(json, rootPath + ".titles[*].title"); + return titles != null && titles.size() > 0 ? titles.get(0) : null; + } - private String getDate(final String json) { - final List> dates = JsonPath.read(json, rootPath + ".dates"); - if (dates != null && dates.size() > 0) { + private String getDate(final String json) { + final List> dates = JsonPath.read(json, rootPath + ".dates"); + if (dates != null && dates.size() > 0) { - List> issued = - dates.stream() - .filter(s -> "issued".equalsIgnoreCase(s.get("dateType"))) - .collect(Collectors.toList()); - if (issued.size() > 0) return issued.get(0).get("date"); - } - return null; - } + List> issued = dates + .stream() + .filter(s -> "issued".equalsIgnoreCase(s.get("dateType"))) + .collect(Collectors.toList()); + if (issued.size() > 0) + return issued.get(0).get("date"); + } + return null; + } - private List generateDataciteCollectedFrom(final String completionStatus) { - final ScholixEntityId scholixEntityId = - new ScholixEntityId( - "Datasets in Datacite", - Collections.singletonList( - new ScholixIdentifier("dli_________::datacite", "dnet_identifier"))); - return Collections.singletonList( - new ScholixCollectedFrom(scholixEntityId, "collected", completionStatus)); - } + private List generateDataciteCollectedFrom(final String completionStatus) { + final ScholixEntityId scholixEntityId = new ScholixEntityId( + "Datasets in Datacite", + Collections + .singletonList( + new ScholixIdentifier("dli_________::datacite", "dnet_identifier"))); + return Collections + .singletonList( + new ScholixCollectedFrom(scholixEntityId, "collected", completionStatus)); + } - private String getType(final String json) { - try { - final String bibtext = JsonPath.read(json, rootPath + ".types.bibtex"); - if ("article".equalsIgnoreCase(bibtext)) { - return "publication"; - } - return "dataset"; - } catch (Throwable e) { - return "dataset"; 
- } - } + private String getType(final String json) { + try { + final String bibtext = JsonPath.read(json, rootPath + ".types.bibtex"); + if ("article".equalsIgnoreCase(bibtext)) { + return "publication"; + } + return "dataset"; + } catch (Throwable e) { + return "dataset"; + } + } - private List> getRelatedIendtifiers(final String json) { - String REL_IDENTIFIER_PATH = rootPath + ".relatedIdentifiers[*]"; - List> res = JsonPath.read(json, REL_IDENTIFIER_PATH); - return res; - } + private List> getRelatedIendtifiers(final String json) { + String REL_IDENTIFIER_PATH = rootPath + ".relatedIdentifiers[*]"; + List> res = JsonPath.read(json, REL_IDENTIFIER_PATH); + return res; + } - public static String generateId(final String pid, final String pidType, final String entityType) { - String type; - switch (entityType) { - case "publication": - type = "50|"; - break; - case "dataset": - type = "60|"; - break; - case "unknown": - type = "70|"; - break; - default: - throw new IllegalArgumentException("unexpected value " + entityType); - } - return type - + DHPUtils.md5( - String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); - } + public static String generateId(final String pid, final String pidType, final String entityType) { + String type; + switch (entityType) { + case "publication": + type = "50|"; + break; + case "dataset": + type = "60|"; + break; + case "unknown": + type = "70|"; + break; + default: + throw new IllegalArgumentException("unexpected value " + entityType); + } + return type + + DHPUtils + .md5( + String.format("%s::%s", pid.toLowerCase().trim(), pidType.toLowerCase().trim())); + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClient.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClient.java index a4e77b37c..e84ec4376 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClient.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClient.java @@ -1,72 +1,75 @@ + package eu.dnetlib.dhp.provision.update; -import eu.dnetlib.dhp.provision.scholix.ScholixResource; import java.io.IOException; + import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; +import eu.dnetlib.dhp.provision.scholix.ScholixResource; + public class DataciteClient { - private String host; - private String index = "datacite"; - private String indexType = "dump"; - private Datacite2Scholix d2s; + private String host; + private String index = "datacite"; + private String indexType = "dump"; + private Datacite2Scholix d2s; - public DataciteClient(String host) { - this.host = host; + public DataciteClient(String host) { + this.host = host; - d2s = new Datacite2Scholix(null); - d2s.setRootPath("$._source.attributes"); - } + d2s = new Datacite2Scholix(null); + d2s.setRootPath("$._source.attributes"); + } - public Iterable getDatasetsFromTs(final Long timestamp) { - return () -> { - try { - return new DataciteClientIterator(host, index, timestamp); - } catch (IOException e) { - throw new RuntimeException(e); - } - }; - } + public Iterable getDatasetsFromTs(final Long timestamp) { + return () -> { + try { + return new DataciteClientIterator(host, index, 
timestamp); + } catch (IOException e) { + throw new RuntimeException(e); + } + }; + } - public String getHost() { - return host; - } + public String getHost() { + return host; + } - public void setHost(String host) { - this.host = host; - } + public void setHost(String host) { + this.host = host; + } - public String getIndex() { - return index; - } + public String getIndex() { + return index; + } - public void setIndex(String index) { - this.index = index; - } + public void setIndex(String index) { + this.index = index; + } - public String getIndexType() { - return indexType; - } + public String getIndexType() { + return indexType; + } - public void setIndexType(String indexType) { - this.indexType = indexType; - } + public void setIndexType(String indexType) { + this.indexType = indexType; + } - public ScholixResource getDatasetByDOI(final String doi) { - try (CloseableHttpClient client = HttpClients.createDefault()) { - HttpGet httpGet = - new HttpGet( - String.format( - "http://%s:9200/%s/%s/%s", host, index, indexType, doi.replaceAll("/", "%2F"))); - CloseableHttpResponse response = client.execute(httpGet); - final String json = IOUtils.toString(response.getEntity().getContent()); - return d2s.generateDataciteScholixResource(json); - } catch (Throwable e) { - return null; - } - } + public ScholixResource getDatasetByDOI(final String doi) { + try (CloseableHttpClient client = HttpClients.createDefault()) { + HttpGet httpGet = new HttpGet( + String + .format( + "http://%s:9200/%s/%s/%s", host, index, indexType, doi.replaceAll("/", "%2F"))); + CloseableHttpResponse response = client.execute(httpGet); + final String json = IOUtils.toString(response.getEntity().getContent()); + return d2s.generateDataciteScholixResource(json); + } catch (Throwable e) { + return null; + } + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClientIterator.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClientIterator.java index fa9dc5646..2c70c8b09 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClientIterator.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/DataciteClientIterator.java @@ -1,12 +1,11 @@ + package eu.dnetlib.dhp.provision.update; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.jayway.jsonpath.JsonPath; import java.io.IOException; import java.util.Iterator; import java.util.List; import java.util.stream.Collectors; -import net.minidev.json.JSONArray; + import org.apache.commons.io.IOUtils; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpPost; @@ -14,103 +13,108 @@ import org.apache.http.entity.StringEntity; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.jayway.jsonpath.JsonPath; + +import net.minidev.json.JSONArray; + public class DataciteClientIterator implements Iterator { - static final String blobPath = "$.hits.hits[*]._source"; - static final String scrollIdPath = "$._scroll_id"; + static final String blobPath = "$.hits.hits[*]._source"; + static final String scrollIdPath = "$._scroll_id"; - String scrollId; + String scrollId; - List buffer; + List buffer; - final String esHost; - final String esIndex; - final ObjectMapper mapper = new ObjectMapper(); 
+ final String esHost; + final String esIndex; + final ObjectMapper mapper = new ObjectMapper(); - public DataciteClientIterator(final String esHost, final String esIndex, long timestamp) - throws IOException { + public DataciteClientIterator(final String esHost, final String esIndex, long timestamp) + throws IOException { - this.esHost = esHost; - this.esIndex = esIndex; - // THIS FIX IS NECESSARY to avoid different timezone - timestamp -= (60 * 60 * 2); - final String body = - getResponse( - String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), - String.format( - "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}", timestamp)); - scrollId = getJPathString(scrollIdPath, body); - buffer = getBlobs(body); - } + this.esHost = esHost; + this.esIndex = esIndex; + // THIS FIX IS NECESSARY to avoid different timezone + timestamp -= (60 * 60 * 2); + final String body = getResponse( + String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), + String + .format( + "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}", timestamp)); + scrollId = getJPathString(scrollIdPath, body); + buffer = getBlobs(body); + } - public String getResponse(final String url, final String json) { - CloseableHttpClient client = HttpClients.createDefault(); - try { + public String getResponse(final String url, final String json) { + CloseableHttpClient client = HttpClients.createDefault(); + try { - HttpPost httpPost = new HttpPost(url); - if (json != null) { - StringEntity entity = new StringEntity(json); - httpPost.setEntity(entity); - httpPost.setHeader("Accept", "application/json"); - httpPost.setHeader("Content-type", "application/json"); - } - CloseableHttpResponse response = client.execute(httpPost); + HttpPost httpPost = new HttpPost(url); + if (json != null) { + StringEntity entity = new StringEntity(json); + httpPost.setEntity(entity); + httpPost.setHeader("Accept", "application/json"); + httpPost.setHeader("Content-type", "application/json"); + } + CloseableHttpResponse response = client.execute(httpPost); - return IOUtils.toString(response.getEntity().getContent()); - } catch (Throwable e) { - throw new RuntimeException("Error on executing request ", e); - } finally { - try { - client.close(); - } catch (IOException e) { - throw new RuntimeException("Unable to close client ", e); - } - } - } + return IOUtils.toString(response.getEntity().getContent()); + } catch (Throwable e) { + throw new RuntimeException("Error on executing request ", e); + } finally { + try { + client.close(); + } catch (IOException e) { + throw new RuntimeException("Unable to close client ", e); + } + } + } - private String getJPathString(final String jsonPath, final String json) { - try { - Object o = JsonPath.read(json, jsonPath); - if (o instanceof String) return (String) o; - return null; - } catch (Exception e) { - return ""; - } - } + private String getJPathString(final String jsonPath, final String json) { + try { + Object o = JsonPath.read(json, jsonPath); + if (o instanceof String) + return (String) o; + return null; + } catch (Exception e) { + return ""; + } + } - private List getBlobs(final String body) { - JSONArray array = JsonPath.read(body, blobPath); - return array.stream() - .map( - o -> { - try { - return mapper.writeValueAsString(o); - } catch (Throwable e) { - throw new RuntimeException(e); - } - }) - .collect(Collectors.toList()); - } + private List getBlobs(final String body) { + JSONArray array = JsonPath.read(body, blobPath); + return array + .stream() 
+ .map( + o -> { + try { + return mapper.writeValueAsString(o); + } catch (Throwable e) { + throw new RuntimeException(e); + } + }) + .collect(Collectors.toList()); + } - @Override - public boolean hasNext() { - return (buffer != null && !buffer.isEmpty()); - } + @Override + public boolean hasNext() { + return (buffer != null && !buffer.isEmpty()); + } - @Override - public String next() { - final String nextItem = buffer.remove(0); - if (buffer.isEmpty()) { - final String json_param = - String.format("{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}", scrollId); - final String body = - getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param); - try { - buffer = getBlobs(body); - } catch (Throwable e) { - System.out.println(body); - } - } - return nextItem; - } + @Override + public String next() { + final String nextItem = buffer.remove(0); + if (buffer.isEmpty()) { + final String json_param = String.format("{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}", scrollId); + final String body = getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param); + try { + buffer = getBlobs(body); + } catch (Throwable e) { + System.out.println(body); + } + } + return nextItem; + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/RetrieveUpdateFromDatacite.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/RetrieveUpdateFromDatacite.java index 15c396b10..e876d05a1 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/RetrieveUpdateFromDatacite.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/RetrieveUpdateFromDatacite.java @@ -1,11 +1,9 @@ + package eu.dnetlib.dhp.provision.update; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.provision.scholix.Scholix; -import eu.dnetlib.scholexplorer.relation.RelationMapper; import java.net.URI; import java.util.List; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -14,54 +12,61 @@ import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.provision.scholix.Scholix; +import eu.dnetlib.scholexplorer.relation.RelationMapper; + public class RetrieveUpdateFromDatacite { - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - RetrieveUpdateFromDatacite.class.getResourceAsStream( - "/eu/dnetlib/dhp/provision/input_retrieve_update_parameters.json"))); - parser.parseArgument(args); - final String hdfsuri = parser.get("namenode"); - Path hdfswritepath = new Path(parser.get("targetPath")); - final long timestamp = Long.parseLong(parser.get("timestamp")); - final String host = parser.get("indexHost"); - final String index = parser.get("indexName"); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + RetrieveUpdateFromDatacite.class + .getResourceAsStream( + "/eu/dnetlib/dhp/provision/input_retrieve_update_parameters.json"))); + parser.parseArgument(args); + final 
String hdfsuri = parser.get("namenode"); + Path hdfswritepath = new Path(parser.get("targetPath")); + final long timestamp = Long.parseLong(parser.get("timestamp")); + final String host = parser.get("indexHost"); + final String index = parser.get("indexName"); - // ====== Init HDFS File System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", hdfsuri); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + // ====== Init HDFS File System Object + Configuration conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", hdfsuri); + // Because of Maven + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - FileSystem.get(URI.create(hdfsuri), conf); - final Datacite2Scholix d2s = new Datacite2Scholix(RelationMapper.load()); - final ObjectMapper mapper = new ObjectMapper(); - try (SequenceFile.Writer writer = - SequenceFile.createWriter( - conf, - SequenceFile.Writer.file(hdfswritepath), - SequenceFile.Writer.keyClass(IntWritable.class), - SequenceFile.Writer.valueClass(Text.class))) { - final Text value = new Text(); - final IntWritable key = new IntWritable(); - int i = 0; - for (String dataset : new DataciteClient(host).getDatasetsFromTs(timestamp)) { - i++; - List scholix = d2s.generateScholixFromJson(dataset); - if (scholix != null) - for (Scholix s : scholix) { - key.set(i); - value.set(mapper.writeValueAsString(s)); - writer.append(key, value); - if (i % 10000 == 0) { - System.out.println("wrote " + i); - } - } - } - } - } + FileSystem.get(URI.create(hdfsuri), conf); + final Datacite2Scholix d2s = new Datacite2Scholix(RelationMapper.load()); + final ObjectMapper mapper = new ObjectMapper(); + try (SequenceFile.Writer writer = SequenceFile + .createWriter( + conf, + SequenceFile.Writer.file(hdfswritepath), + SequenceFile.Writer.keyClass(IntWritable.class), + SequenceFile.Writer.valueClass(Text.class))) { + final Text value = new Text(); + final IntWritable key = new IntWritable(); + int i = 0; + for (String dataset : new DataciteClient(host).getDatasetsFromTs(timestamp)) { + i++; + List scholix = d2s.generateScholixFromJson(dataset); + if (scholix != null) + for (Scholix s : scholix) { + key.set(i); + value.set(mapper.writeValueAsString(s)); + writer.append(key, value); + if (i % 10000 == 0) { + System.out.println("wrote " + i); + } + } + } + } + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/SparkResolveScholixTarget.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/SparkResolveScholixTarget.java index 09a5c7c3d..981c471ae 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/SparkResolveScholixTarget.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/main/java/eu/dnetlib/dhp/provision/update/SparkResolveScholixTarget.java @@ -1,16 +1,11 @@ + package eu.dnetlib.dhp.provision.update; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.provision.scholix.Scholix; -import eu.dnetlib.dhp.provision.scholix.ScholixIdentifier; -import eu.dnetlib.dhp.provision.scholix.ScholixRelationship; -import 
eu.dnetlib.dhp.provision.scholix.ScholixResource; -import eu.dnetlib.dhp.utils.DHPUtils; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.IntWritable; @@ -20,150 +15,170 @@ import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.provision.scholix.Scholix; +import eu.dnetlib.dhp.provision.scholix.ScholixIdentifier; +import eu.dnetlib.dhp.provision.scholix.ScholixRelationship; +import eu.dnetlib.dhp.provision.scholix.ScholixResource; +import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; public class SparkResolveScholixTarget { - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - SparkResolveScholixTarget.class.getResourceAsStream( - "/eu/dnetlib/dhp/provision/input_resolve_scholix_parameters.json"))); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + SparkResolveScholixTarget.class + .getResourceAsStream( + "/eu/dnetlib/dhp/provision/input_resolve_scholix_parameters.json"))); + parser.parseArgument(args); - final SparkConf conf = new SparkConf(); + final SparkConf conf = new SparkConf(); - final String master = parser.get("master"); - final String sourcePath = parser.get("sourcePath"); - final String workingDirPath = parser.get("workingDirPath"); - final String indexHost = parser.get("indexHost"); - try (SparkSession spark = getSession(conf, master)) { + final String master = parser.get("master"); + final String sourcePath = parser.get("sourcePath"); + final String workingDirPath = parser.get("workingDirPath"); + final String indexHost = parser.get("indexHost"); + try (SparkSession spark = getSession(conf, master)) { - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - spark - .createDataset( - sc.sequenceFile(sourcePath, IntWritable.class, Text.class) - .map(Tuple2::_2) - .map(s -> new ObjectMapper().readValue(s.toString(), Scholix.class)) - .rdd(), - Encoders.bean(Scholix.class)) - .write() - .save(workingDirPath + "/stepA"); + spark + .createDataset( + sc + .sequenceFile(sourcePath, IntWritable.class, Text.class) + .map(Tuple2::_2) + .map(s -> new ObjectMapper().readValue(s.toString(), Scholix.class)) + .rdd(), + Encoders.bean(Scholix.class)) + .write() + .save(workingDirPath + "/stepA"); - Dataset s1 = - spark.read().load(workingDirPath + "/stepA").as(Encoders.bean(Scholix.class)); + Dataset s1 = spark.read().load(workingDirPath + "/stepA").as(Encoders.bean(Scholix.class)); - s1.where(s1.col("target.dnetIdentifier").isNull()) - .select(s1.col("target.identifier")) - .distinct() - .map( - (MapFunction) - f -> { - final String pid = ((Row) f.getList(0).get(0)).getString(0); - ScholixResource publication = - new CrossrefClient(indexHost).getResourceByDOI(pid); - if (publication != null) { - return publication; - } - ScholixResource dataset = new DataciteClient(indexHost).getDatasetByDOI(pid); - 
if (dataset != null) { - return dataset; - } - ScholixResource r = new ScholixResource(); - r.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, "doi"))); - r.setObjectType("unknown"); - r.setDnetIdentifier( - "70|" + DHPUtils.md5(String.format("%s::doi", pid.toLowerCase().trim()))); + s1 + .where(s1.col("target.dnetIdentifier").isNull()) + .select(s1.col("target.identifier")) + .distinct() + .map( + (MapFunction) f -> { + final String pid = ((Row) f.getList(0).get(0)).getString(0); + ScholixResource publication = new CrossrefClient(indexHost).getResourceByDOI(pid); + if (publication != null) { + return publication; + } + ScholixResource dataset = new DataciteClient(indexHost).getDatasetByDOI(pid); + if (dataset != null) { + return dataset; + } + ScholixResource r = new ScholixResource(); + r.setIdentifier(Collections.singletonList(new ScholixIdentifier(pid, "doi"))); + r.setObjectType("unknown"); + r + .setDnetIdentifier( + "70|" + DHPUtils.md5(String.format("%s::doi", pid.toLowerCase().trim()))); - return r; - }, - Encoders.bean(ScholixResource.class)) - .write() - .mode(SaveMode.Overwrite) - .save(workingDirPath + "/stepB"); + return r; + }, + Encoders.bean(ScholixResource.class)) + .write() + .mode(SaveMode.Overwrite) + .save(workingDirPath + "/stepB"); - Dataset s2 = - spark.read().load(workingDirPath + "/stepB").as(Encoders.bean(ScholixResource.class)); + Dataset s2 = spark + .read() + .load(workingDirPath + "/stepB") + .as(Encoders.bean(ScholixResource.class)); - s1.joinWith( - s2, - s1.col("target.identifier.identifier").equalTo(s2.col("identifier.identifier")), - "left") - .flatMap( - (FlatMapFunction, Scholix>) - f -> { - final List res = new ArrayList<>(); - final Scholix s = f._1(); - final ScholixResource target = f._2(); - if (StringUtils.isNotBlank(s.getIdentifier())) res.add(s); - else if (target == null) { - ScholixResource currentTarget = s.getTarget(); - currentTarget.setObjectType("unknown"); - currentTarget.setDnetIdentifier( - Datacite2Scholix.generateId( - currentTarget.getIdentifier().get(0).getIdentifier(), - currentTarget.getIdentifier().get(0).getSchema(), - currentTarget.getObjectType())); + s1 + .joinWith( + s2, + s1.col("target.identifier.identifier").equalTo(s2.col("identifier.identifier")), + "left") + .flatMap( + (FlatMapFunction, Scholix>) f -> { + final List res = new ArrayList<>(); + final Scholix s = f._1(); + final ScholixResource target = f._2(); + if (StringUtils.isNotBlank(s.getIdentifier())) + res.add(s); + else if (target == null) { + ScholixResource currentTarget = s.getTarget(); + currentTarget.setObjectType("unknown"); + currentTarget + .setDnetIdentifier( + Datacite2Scholix + .generateId( + currentTarget.getIdentifier().get(0).getIdentifier(), + currentTarget.getIdentifier().get(0).getSchema(), + currentTarget.getObjectType())); - s.generateIdentifier(); - res.add(s); - final Scholix inverse = new Scholix(); - inverse.setTarget(s.getSource()); - inverse.setSource(s.getTarget()); - inverse.setLinkprovider(s.getLinkprovider()); - inverse.setPublicationDate(s.getPublicationDate()); - inverse.setPublisher(s.getPublisher()); - inverse.setRelationship( - new ScholixRelationship( - s.getRelationship().getInverse(), - s.getRelationship().getSchema(), - s.getRelationship().getName())); - inverse.generateIdentifier(); - res.add(inverse); + s.generateIdentifier(); + res.add(s); + final Scholix inverse = new Scholix(); + inverse.setTarget(s.getSource()); + inverse.setSource(s.getTarget()); + inverse.setLinkprovider(s.getLinkprovider()); + 
inverse.setPublicationDate(s.getPublicationDate()); + inverse.setPublisher(s.getPublisher()); + inverse + .setRelationship( + new ScholixRelationship( + s.getRelationship().getInverse(), + s.getRelationship().getSchema(), + s.getRelationship().getName())); + inverse.generateIdentifier(); + res.add(inverse); - } else { - target.setIdentifier( - target.getIdentifier().stream() - .map( - d -> - new ScholixIdentifier( - d.getIdentifier().toLowerCase(), - d.getSchema().toLowerCase())) - .collect(Collectors.toList())); - s.setTarget(target); - s.generateIdentifier(); - res.add(s); - final Scholix inverse = new Scholix(); - inverse.setTarget(s.getSource()); - inverse.setSource(s.getTarget()); - inverse.setLinkprovider(s.getLinkprovider()); - inverse.setPublicationDate(s.getPublicationDate()); - inverse.setPublisher(s.getPublisher()); - inverse.setRelationship( - new ScholixRelationship( - s.getRelationship().getInverse(), - s.getRelationship().getSchema(), - s.getRelationship().getName())); - inverse.generateIdentifier(); - res.add(inverse); - } + } else { + target + .setIdentifier( + target + .getIdentifier() + .stream() + .map( + d -> new ScholixIdentifier( + d.getIdentifier().toLowerCase(), + d.getSchema().toLowerCase())) + .collect(Collectors.toList())); + s.setTarget(target); + s.generateIdentifier(); + res.add(s); + final Scholix inverse = new Scholix(); + inverse.setTarget(s.getSource()); + inverse.setSource(s.getTarget()); + inverse.setLinkprovider(s.getLinkprovider()); + inverse.setPublicationDate(s.getPublicationDate()); + inverse.setPublisher(s.getPublisher()); + inverse + .setRelationship( + new ScholixRelationship( + s.getRelationship().getInverse(), + s.getRelationship().getSchema(), + s.getRelationship().getName())); + inverse.generateIdentifier(); + res.add(inverse); + } - return res.iterator(); - }, - Encoders.bean(Scholix.class)) - .javaRDD() - .map(s -> new ObjectMapper().writeValueAsString(s)) - .saveAsTextFile(workingDirPath + "/resolved_json"); - } - } + return res.iterator(); + }, + Encoders.bean(Scholix.class)) + .javaRDD() + .map(s -> new ObjectMapper().writeValueAsString(s)) + .saveAsTextFile(workingDirPath + "/resolved_json"); + } + } - private static SparkSession getSession(SparkConf conf, String master) { - return SparkSession.builder() - .config(conf) - .appName(SparkResolveScholixTarget.class.getSimpleName()) - .master(master) - .getOrCreate(); - } + private static SparkSession getSession(SparkConf conf, String master) { + return SparkSession + .builder() + .config(conf) + .appName(SparkResolveScholixTarget.class.getSimpleName()) + .master(master) + .getOrCreate(); + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DataciteClientTest.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DataciteClientTest.java index 7dba9c95e..d9cbd22f3 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DataciteClientTest.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/DataciteClientTest.java @@ -1,46 +1,50 @@ + package eu.dnetlib.dhp.provision; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.provision.scholix.Scholix; -import eu.dnetlib.dhp.provision.scholix.ScholixResource; -import eu.dnetlib.dhp.provision.update.*; -import eu.dnetlib.scholexplorer.relation.RelationMapper; import java.util.List; + import org.apache.commons.io.IOUtils; import 
org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.provision.scholix.Scholix; +import eu.dnetlib.dhp.provision.scholix.ScholixResource; +import eu.dnetlib.dhp.provision.update.*; +import eu.dnetlib.scholexplorer.relation.RelationMapper; + public class DataciteClientTest { - @Test - public void dataciteSCholixTest() throws Exception { - final String json = IOUtils.toString(getClass().getResourceAsStream("datacite.json")); - final RelationMapper mapper = RelationMapper.load(); + @Test + public void dataciteSCholixTest() throws Exception { + final String json = IOUtils.toString(getClass().getResourceAsStream("datacite.json")); + final RelationMapper mapper = RelationMapper.load(); - Datacite2Scholix ds = new Datacite2Scholix(mapper); - final List s = ds.generateScholixFromJson(json); - System.out.println(new ObjectMapper().writeValueAsString(s)); - } + Datacite2Scholix ds = new Datacite2Scholix(mapper); + final List s = ds.generateScholixFromJson(json); + System.out.println(new ObjectMapper().writeValueAsString(s)); + } - // public void testS() throws Exception { - // RetrieveUpdateFromDatacite.main(new String[]{ - // "-n", "file:///data/new_s2.txt", - // "-t", "/data/new_s2.txt", - // "-ts", "1586974078", - // "-ih", "ip-90-147-167-25.ct1.garrservices.it", - // "-in", "datacite", - // }); - // - // } + // public void testS() throws Exception { + // RetrieveUpdateFromDatacite.main(new String[]{ + // "-n", "file:///data/new_s2.txt", + // "-t", "/data/new_s2.txt", + // "-ts", "1586974078", + // "-ih", "ip-90-147-167-25.ct1.garrservices.it", + // "-in", "datacite", + // }); + // + // } - public void testResolveDataset() throws Exception { - DataciteClient dc = new DataciteClient("ip-90-147-167-25.ct1.garrservices.it"); - ScholixResource datasetByDOI = dc.getDatasetByDOI("10.17182/hepdata.15392.v1/t5"); - Assertions.assertNotNull(datasetByDOI); - System.out.println(new ObjectMapper().writeValueAsString(datasetByDOI)); + public void testResolveDataset() throws Exception { + DataciteClient dc = new DataciteClient("ip-90-147-167-25.ct1.garrservices.it"); + ScholixResource datasetByDOI = dc.getDatasetByDOI("10.17182/hepdata.15392.v1/t5"); + Assertions.assertNotNull(datasetByDOI); + System.out.println(new ObjectMapper().writeValueAsString(datasetByDOI)); - CrossrefClient cr = new CrossrefClient("ip-90-147-167-25.ct1.garrservices.it"); - ScholixResource crossrefByDOI = cr.getResourceByDOI("10.26850/1678-4618eqj.v35.1.2010.p41-46"); - Assertions.assertNotNull(crossrefByDOI); - System.out.println(new ObjectMapper().writeValueAsString(crossrefByDOI)); - } + CrossrefClient cr = new CrossrefClient("ip-90-147-167-25.ct1.garrservices.it"); + ScholixResource crossrefByDOI = cr.getResourceByDOI("10.26850/1678-4618eqj.v35.1.2010.p41-46"); + Assertions.assertNotNull(crossrefByDOI); + System.out.println(new ObjectMapper().writeValueAsString(crossrefByDOI)); + } } diff --git a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java index cc6e999ae..be97072b5 100644 --- a/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java +++ b/dhp-workflows/dhp-graph-provision-scholexplorer/src/test/java/eu/dnetlib/dhp/provision/ExtractInfoTest.java @@ -1,27 +1,30 @@ + package eu.dnetlib.dhp.provision; -import 
com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.provision.scholix.Scholix; -import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.Test; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.provision.scholix.Scholix; +import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary; + public class ExtractInfoTest { - @Test - public void testSerialization() throws Exception { + @Test + public void testSerialization() throws Exception { - ScholixSummary summary = new ScholixSummary(); - summary.setDescription("descrizione"); - ObjectMapper mapper = new ObjectMapper(); - String json = mapper.writeValueAsString(summary); - System.out.println(json); - System.out.println(mapper.readValue(json, ScholixSummary.class).getDescription()); - } + ScholixSummary summary = new ScholixSummary(); + summary.setDescription("descrizione"); + ObjectMapper mapper = new ObjectMapper(); + String json = mapper.writeValueAsString(summary); + System.out.println(json); + System.out.println(mapper.readValue(json, ScholixSummary.class).getDescription()); + } - @Test - public void testScholix() throws Exception { - final String jsonSummary = IOUtils.toString(getClass().getResourceAsStream("summary.json")); - final String jsonRelation = IOUtils.toString(getClass().getResourceAsStream("relation.json")); - Scholix.generateScholixWithSource(jsonSummary, jsonRelation); - } + @Test + public void testScholix() throws Exception { + final String jsonSummary = IOUtils.toString(getClass().getResourceAsStream("summary.json")); + final String jsonRelation = IOUtils.toString(getClass().getResourceAsStream("relation.json")); + Scholix.generateScholixWithSource(jsonSummary, jsonRelation); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java index 07b11010b..99247b756 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/AdjacencyListBuilderJob.java @@ -1,16 +1,12 @@ + package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; -import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; -import eu.dnetlib.dhp.oa.provision.model.Tuple2; -import eu.dnetlib.dhp.schema.common.ModelSupport; import java.util.ArrayList; import java.util.List; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -21,103 +17,108 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; +import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; +import eu.dnetlib.dhp.oa.provision.model.Tuple2; +import eu.dnetlib.dhp.schema.common.ModelSupport; + /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of - * linked objects. 
The operation considers all the entity types (publication, dataset, software, - * ORP, project, datasource, organization, and all the possible relationships (similarity links - * produced by the Dedup process are excluded). - * - *
The operation is implemented by sequentially joining one entity type at time (E) with the - * relationships (R), and again by E, finally grouped by E.id; - * - *
The workflow is organized in different parts aimed to to reduce the complexity of the - * operation 1) PrepareRelationsJob: only consider relationships that are not virtually deleted - * ($.dataInfo.deletedbyinference == false), each entity can be linked at most to 100 other objects - * - *
2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - - * T): for each entity type E_i map E_i as RelatedEntity T_i to simplify the model and extracting - * only the necessary information join (R.target = T_i.id) save the tuples (R_i, T_i) (phase 2): - * create the union of all the entity types E, hash by id read the tuples (R, T), hash by R.source - * join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) - * - *
3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - - * T ], mapping the result as JoinedEntity - * - *
4) XmlConverterJob: convert the JoinedEntities as XML records + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The + * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and + * all the possible relationships (similarity links produced by the Dedup process are excluded). + *
+ * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and + * again by E, finally grouped by E.id; + *
+ * The workflow is organized in different parts aimed to to reduce the complexity of the operation 1) + * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == + * false), each entity can be linked at most to 100 other objects + *
+ * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type + * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target = + * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples + * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) + *
+ * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the + * result as JoinedEntity + *
+ * 4) XmlConverterJob: convert the JoinedEntities as XML records */ public class AdjacencyListBuilderJob { - private static final Logger log = LoggerFactory.getLogger(AdjacencyListBuilderJob.class); + private static final Logger log = LoggerFactory.getLogger(AdjacencyListBuilderJob.class); - public static final int MAX_LINKS = 100; + public static final int MAX_LINKS = 100; - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - AdjacencyListBuilderJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json"))); - parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + AdjacencyListBuilderJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json"))); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("inputPath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); - String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - createAdjacencyLists(spark, inputPath, outputPath); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + createAdjacencyLists(spark, inputPath, outputPath); + }); + } - private static void createAdjacencyLists( - SparkSession spark, String inputPath, String outputPath) { + private static void createAdjacencyLists( + SparkSession spark, String inputPath, String outputPath) { - log.info("Reading joined entities from: {}", inputPath); - spark - .read() - .load(inputPath) - .as(Encoders.bean(EntityRelEntity.class)) - .groupByKey( - (MapFunction) value -> value.getEntity().getId(), - Encoders.STRING()) - .mapGroups( - (MapGroupsFunction) - (key, values) -> { - JoinedEntity j = new JoinedEntity(); - List links = new ArrayList<>(); - while (values.hasNext() && links.size() < MAX_LINKS) { - EntityRelEntity curr = values.next(); - if (j.getEntity() == null) { - j.setEntity(curr.getEntity()); - } - links.add(new Tuple2(curr.getRelation(), curr.getTarget())); - } - j.setLinks(links); - return j; - }, - Encoders.bean(JoinedEntity.class)) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath); - } + log.info("Reading joined entities from: {}", inputPath); + spark + .read() + .load(inputPath) + 
.as(Encoders.bean(EntityRelEntity.class)) + .groupByKey( + (MapFunction) value -> value.getEntity().getId(), + Encoders.STRING()) + .mapGroups( + (MapGroupsFunction) (key, values) -> { + JoinedEntity j = new JoinedEntity(); + List links = new ArrayList<>(); + while (values.hasNext() && links.size() < MAX_LINKS) { + EntityRelEntity curr = values.next(); + if (j.getEntity() == null) { + j.setEntity(curr.getEntity()); + } + links.add(new Tuple2(curr.getRelation(), curr.getTarget())); + } + j.setLinks(links); + return j; + }, + Encoders.bean(JoinedEntity.class)) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java index a9c97155c..606fa4cc0 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -1,21 +1,14 @@ + package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; -import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; -import eu.dnetlib.dhp.oa.provision.model.SortableRelation; -import eu.dnetlib.dhp.schema.common.EntityType; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.List; import java.util.Objects; import java.util.Optional; import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; @@ -25,224 +18,228 @@ import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; +import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; +import eu.dnetlib.dhp.oa.provision.model.SortableRelation; +import eu.dnetlib.dhp.schema.common.EntityType; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of - * linked objects. The operation considers all the entity types (publication, dataset, software, - * ORP, project, datasource, organization, and all the possible relationships (similarity links - * produced by the Dedup process are excluded). - * - *
The operation is implemented by sequentially joining one entity type at time (E) with the - * relationships (R), and again by E, finally grouped by E.id; - * - *
The workflow is organized in different parts aimed to to reduce the complexity of the - * operation 1) PrepareRelationsJob: only consider relationships that are not virtually deleted - * ($.dataInfo.deletedbyinference == false), each entity can be linked at most to 100 other objects - * - *
2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - - * T): for each entity type E_i map E_i as RelatedEntity T_i to simplify the model and extracting - * only the necessary information join (R.target = T_i.id) save the tuples (R_i, T_i) (phase 2): - * create the union of all the entity types E, hash by id read the tuples (R, T), hash by R.source - * join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) - * - *
3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - - * T ], mapping the result as JoinedEntity - * - *
4) XmlConverterJob: convert the JoinedEntities as XML records + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The + * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and + * all the possible relationships (similarity links produced by the Dedup process are excluded). + *
+ * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and + * again by E, finally grouped by E.id; + *
+ * The workflow is organized in different parts aimed to to reduce the complexity of the operation 1) + * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == + * false), each entity can be linked at most to 100 other objects + *
+ * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type + * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target = + * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples + * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) + *
+ * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the + * result as JoinedEntity + *
+ * 4) XmlConverterJob: convert the JoinedEntities as XML records */ public class CreateRelatedEntitiesJob_phase1 { - private static final Logger log = LoggerFactory.getLogger(CreateRelatedEntitiesJob_phase1.class); + private static final Logger log = LoggerFactory.getLogger(CreateRelatedEntitiesJob_phase1.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - PrepareRelationsJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + String jsonConfiguration = IOUtils + .toString( + PrepareRelationsJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputRelationsPath = parser.get("inputRelationsPath"); - log.info("inputRelationsPath: {}", inputRelationsPath); + String inputRelationsPath = parser.get("inputRelationsPath"); + log.info("inputRelationsPath: {}", inputRelationsPath); - String inputEntityPath = parser.get("inputEntityPath"); - log.info("inputEntityPath: {}", inputEntityPath); + String inputEntityPath = parser.get("inputEntityPath"); + log.info("inputEntityPath: {}", inputEntityPath); - String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - String graphTableClassName = parser.get("graphTableClassName"); - log.info("graphTableClassName: {}", graphTableClassName); + String graphTableClassName = parser.get("graphTableClassName"); + log.info("graphTableClassName: {}", graphTableClassName); - Class entityClazz = - (Class) Class.forName(graphTableClassName); + Class entityClazz = (Class) Class.forName(graphTableClassName); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - joinRelationEntity(spark, inputRelationsPath, inputEntityPath, entityClazz, outputPath); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + joinRelationEntity(spark, inputRelationsPath, inputEntityPath, entityClazz, outputPath); + }); + } - private static void joinRelationEntity( - SparkSession spark, - String inputRelationsPath, - String inputEntityPath, - Class clazz, - String outputPath) 
{ + private static void joinRelationEntity( + SparkSession spark, + String inputRelationsPath, + String inputEntityPath, + Class clazz, + String outputPath) { - Dataset> relsByTarget = - readPathRelation(spark, inputRelationsPath) - .filter("dataInfo.deletedbyinference == false") - .map( - (MapFunction>) - r -> new Tuple2<>(r.getTarget(), r), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(SortableRelation.class))) - .cache(); + Dataset> relsByTarget = readPathRelation(spark, inputRelationsPath) + .filter("dataInfo.deletedbyinference == false") + .map( + (MapFunction>) r -> new Tuple2<>(r.getTarget(), r), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(SortableRelation.class))) + .cache(); - Dataset> entities = - readPathEntity(spark, inputEntityPath, clazz) - .filter("dataInfo.invisible == false") - .map( - (MapFunction) value -> asRelatedEntity(value, clazz), - Encoders.bean(RelatedEntity.class)) - .map( - (MapFunction>) - e -> new Tuple2<>(e.getId(), e), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class))) - .cache(); + Dataset> entities = readPathEntity(spark, inputEntityPath, clazz) + .filter("dataInfo.invisible == false") + .map( + (MapFunction) value -> asRelatedEntity(value, clazz), + Encoders.bean(RelatedEntity.class)) + .map( + (MapFunction>) e -> new Tuple2<>(e.getId(), e), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class))) + .cache(); - relsByTarget - .joinWith(entities, entities.col("_1").equalTo(relsByTarget.col("_1")), "inner") - .map( - (MapFunction< - Tuple2, Tuple2>, - EntityRelEntity>) - t -> new EntityRelEntity(t._1()._2(), t._2()._2()), - Encoders.bean(EntityRelEntity.class)) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath + "/" + EntityType.fromClass(clazz)); - } + relsByTarget + .joinWith(entities, entities.col("_1").equalTo(relsByTarget.col("_1")), "inner") + .map( + (MapFunction, Tuple2>, EntityRelEntity>) t -> new EntityRelEntity( + t._1()._2(), t._2()._2()), + Encoders.bean(EntityRelEntity.class)) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath + "/" + EntityType.fromClass(clazz)); + } - private static Dataset readPathEntity( - SparkSession spark, String inputEntityPath, Class entityClazz) { + private static Dataset readPathEntity( + SparkSession spark, String inputEntityPath, Class entityClazz) { - log.info("Reading Graph table from: {}", inputEntityPath); - return spark - .read() - .textFile(inputEntityPath) - .map( - (MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), - Encoders.bean(entityClazz)); - } + log.info("Reading Graph table from: {}", inputEntityPath); + return spark + .read() + .textFile(inputEntityPath) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), + Encoders.bean(entityClazz)); + } - public static RelatedEntity asRelatedEntity(E entity, Class clazz) { + public static RelatedEntity asRelatedEntity(E entity, Class clazz) { - final RelatedEntity re = new RelatedEntity(); - re.setId(entity.getId()); - re.setType(EntityType.fromClass(clazz).name()); + final RelatedEntity re = new RelatedEntity(); + re.setId(entity.getId()); + re.setType(EntityType.fromClass(clazz).name()); - re.setPid(entity.getPid()); - re.setCollectedfrom(entity.getCollectedfrom()); + re.setPid(entity.getPid()); + re.setCollectedfrom(entity.getCollectedfrom()); - switch (EntityType.fromClass(clazz)) { - case publication: - case dataset: - case otherresearchproduct: - case software: - Result result = (Result) entity; + switch (EntityType.fromClass(clazz)) { + case 
publication: + case dataset: + case otherresearchproduct: + case software: + Result result = (Result) entity; - if (result.getTitle() != null && !result.getTitle().isEmpty()) { - re.setTitle(result.getTitle().stream().findFirst().get()); - } + if (result.getTitle() != null && !result.getTitle().isEmpty()) { + re.setTitle(result.getTitle().stream().findFirst().get()); + } - re.setDateofacceptance(getValue(result.getDateofacceptance())); - re.setPublisher(getValue(result.getPublisher())); - re.setResulttype(result.getResulttype()); - re.setInstances(result.getInstance()); + re.setDateofacceptance(getValue(result.getDateofacceptance())); + re.setPublisher(getValue(result.getPublisher())); + re.setResulttype(result.getResulttype()); + re.setInstances(result.getInstance()); - // TODO still to be mapped - // re.setCodeRepositoryUrl(j.read("$.coderepositoryurl")); + // TODO still to be mapped + // re.setCodeRepositoryUrl(j.read("$.coderepositoryurl")); - break; - case datasource: - Datasource d = (Datasource) entity; + break; + case datasource: + Datasource d = (Datasource) entity; - re.setOfficialname(getValue(d.getOfficialname())); - re.setWebsiteurl(getValue(d.getWebsiteurl())); - re.setDatasourcetype(d.getDatasourcetype()); - re.setOpenairecompatibility(d.getOpenairecompatibility()); + re.setOfficialname(getValue(d.getOfficialname())); + re.setWebsiteurl(getValue(d.getWebsiteurl())); + re.setDatasourcetype(d.getDatasourcetype()); + re.setOpenairecompatibility(d.getOpenairecompatibility()); - break; - case organization: - Organization o = (Organization) entity; + break; + case organization: + Organization o = (Organization) entity; - re.setLegalname(getValue(o.getLegalname())); - re.setLegalshortname(getValue(o.getLegalshortname())); - re.setCountry(o.getCountry()); - re.setWebsiteurl(getValue(o.getWebsiteurl())); - break; - case project: - Project p = (Project) entity; + re.setLegalname(getValue(o.getLegalname())); + re.setLegalshortname(getValue(o.getLegalshortname())); + re.setCountry(o.getCountry()); + re.setWebsiteurl(getValue(o.getWebsiteurl())); + break; + case project: + Project p = (Project) entity; - re.setProjectTitle(getValue(p.getTitle())); - re.setCode(getValue(p.getCode())); - re.setAcronym(getValue(p.getAcronym())); - re.setContracttype(p.getContracttype()); + re.setProjectTitle(getValue(p.getTitle())); + re.setCode(getValue(p.getCode())); + re.setAcronym(getValue(p.getAcronym())); + re.setContracttype(p.getContracttype()); - List> f = p.getFundingtree(); - if (!f.isEmpty()) { - re.setFundingtree(f.stream().map(s -> s.getValue()).collect(Collectors.toList())); - } - break; - } - return re; - } + List> f = p.getFundingtree(); + if (!f.isEmpty()) { + re.setFundingtree(f.stream().map(s -> s.getValue()).collect(Collectors.toList())); + } + break; + } + return re; + } - private static String getValue(Field field) { - return getFieldValueWithDefault(field, ""); - } + private static String getValue(Field field) { + return getFieldValueWithDefault(field, ""); + } - private static T getFieldValueWithDefault(Field f, T defaultValue) { - return Optional.ofNullable(f) - .filter(Objects::nonNull) - .map(x -> x.getValue()) - .orElse(defaultValue); - } + private static T getFieldValueWithDefault(Field f, T defaultValue) { + return Optional + .ofNullable(f) + .filter(Objects::nonNull) + .map(x -> x.getValue()) + .orElse(defaultValue); + } - /** - * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline - * delimited json text file, - * - * @param spark - * 
@param relationPath - * @return the Dataset containing all the relationships - */ - private static Dataset readPathRelation( - SparkSession spark, final String relationPath) { + /** + * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text + * file, + * + * @param spark + * @param relationPath + * @return the Dataset containing all the relationships + */ + private static Dataset readPathRelation( + SparkSession spark, final String relationPath) { - log.info("Reading relations from: {}", relationPath); - return spark.read().load(relationPath).as(Encoders.bean(SortableRelation.class)); - } + log.info("Reading relations from: {}", relationPath); + return spark.read().load(relationPath).as(Encoders.bean(SortableRelation.class)); + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java index 021ef86ba..403817019 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase2.java @@ -1,17 +1,11 @@ + package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; -import eu.dnetlib.dhp.oa.provision.model.TypedRow; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.List; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; @@ -23,203 +17,200 @@ import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity; +import eu.dnetlib.dhp.oa.provision.model.TypedRow; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; import scala.collection.JavaConverters; import scala.collection.Seq; /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of - * linked objects. The operation considers all the entity types (publication, dataset, software, - * ORP, project, datasource, organization, and all the possible relationships (similarity links - * produced by the Dedup process are excluded). - * - *
The operation is implemented by sequentially joining one entity type at time (E) with the - * relationships (R), and again by E, finally grouped by E.id; - * - *

The workflow is organized in different parts aimed to to reduce the complexity of the - * operation 1) PrepareRelationsJob: only consider relationships that are not virtually deleted - * ($.dataInfo.deletedbyinference == false), each entity can be linked at most to 100 other objects - * - *

2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - - * T): for each entity type E_i map E_i as RelatedEntity T_i to simplify the model and extracting - * only the necessary information join (R.target = T_i.id) save the tuples (R_i, T_i) (phase 2): - * create the union of all the entity types E, hash by id read the tuples (R, T), hash by R.source - * join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) - * - *

3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - - * T ], mapping the result as JoinedEntity - * - *

4) XmlConverterJob: convert the JoinedEntities as XML records + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The + * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and + * all the possible relationships (similarity links produced by the Dedup process are excluded). + *

+ * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and + * again by E, finally grouped by E.id; + *

+ * The workflow is organized in different parts aimed to to reduce the complexity of the operation 1) + * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == + * false), each entity can be linked at most to 100 other objects + *

+ * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type + * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target = + * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples + * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) + *
<p>
+ * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the + * result as JoinedEntity + *
<p>
+ * 4) XmlConverterJob: convert the JoinedEntities as XML records */ public class CreateRelatedEntitiesJob_phase2 { - private static final Logger log = LoggerFactory.getLogger(CreateRelatedEntitiesJob_phase2.class); + private static final Logger log = LoggerFactory.getLogger(CreateRelatedEntitiesJob_phase2.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - PrepareRelationsJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase2.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + String jsonConfiguration = IOUtils + .toString( + PrepareRelationsJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase2.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputRelatedEntitiesPath = parser.get("inputRelatedEntitiesPath"); - log.info("inputRelatedEntitiesPath: {}", inputRelatedEntitiesPath); + String inputRelatedEntitiesPath = parser.get("inputRelatedEntitiesPath"); + log.info("inputRelatedEntitiesPath: {}", inputRelatedEntitiesPath); - String inputGraphRootPath = parser.get("inputGraphRootPath"); - log.info("inputGraphRootPath: {}", inputGraphRootPath); + String inputGraphRootPath = parser.get("inputGraphRootPath"); + log.info("inputGraphRootPath: {}", inputGraphRootPath); - String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - int numPartitions = Integer.parseInt(parser.get("numPartitions")); - log.info("numPartitions: {}", numPartitions); + int numPartitions = Integer.parseInt(parser.get("numPartitions")); + log.info("numPartitions: {}", numPartitions); - SparkConf conf = new SparkConf(); - conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); - conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - joinAllEntities( - spark, inputRelatedEntitiesPath, inputGraphRootPath, outputPath, numPartitions); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + joinAllEntities( + spark, inputRelatedEntitiesPath, inputGraphRootPath, outputPath, numPartitions); + }); + } - private static void joinAllEntities( - SparkSession spark, - String inputRelatedEntitiesPath, - String inputGraphRootPath, - String outputPath, - int numPartitions) { + private static void joinAllEntities( + 
SparkSession spark, + String inputRelatedEntitiesPath, + String inputGraphRootPath, + String outputPath, + int numPartitions) { - Dataset> entities = - readAllEntities(spark, inputGraphRootPath, numPartitions); - Dataset> relsBySource = - readRelatedEntities(spark, inputRelatedEntitiesPath); + Dataset> entities = readAllEntities(spark, inputGraphRootPath, numPartitions); + Dataset> relsBySource = readRelatedEntities(spark, inputRelatedEntitiesPath); - entities - .joinWith(relsBySource, entities.col("_1").equalTo(relsBySource.col("_1")), "left_outer") - .map( - (MapFunction< - Tuple2, Tuple2>, - EntityRelEntity>) - value -> { - EntityRelEntity re = new EntityRelEntity(); - re.setEntity(value._1()._2()); - Optional related = - Optional.ofNullable(value._2()).map(Tuple2::_2); - if (related.isPresent()) { - re.setRelation(related.get().getRelation()); - re.setTarget(related.get().getTarget()); - } - return re; - }, - Encoders.bean(EntityRelEntity.class)) - .repartition(numPartitions) - .filter( - (FilterFunction) - value -> - value.getEntity() != null && StringUtils.isNotBlank(value.getEntity().getId())) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath); - } + entities + .joinWith(relsBySource, entities.col("_1").equalTo(relsBySource.col("_1")), "left_outer") + .map( + (MapFunction, Tuple2>, EntityRelEntity>) value -> { + EntityRelEntity re = new EntityRelEntity(); + re.setEntity(value._1()._2()); + Optional related = Optional.ofNullable(value._2()).map(Tuple2::_2); + if (related.isPresent()) { + re.setRelation(related.get().getRelation()); + re.setTarget(related.get().getTarget()); + } + return re; + }, + Encoders.bean(EntityRelEntity.class)) + .repartition(numPartitions) + .filter( + (FilterFunction) value -> value.getEntity() != null + && StringUtils.isNotBlank(value.getEntity().getId())) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + } - private static Dataset> readAllEntities( - SparkSession spark, String inputGraphPath, int numPartitions) { - Dataset publication = - readPathEntity(spark, inputGraphPath + "/publication", Publication.class); - Dataset dataset = - readPathEntity(spark, inputGraphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); - Dataset other = - readPathEntity(spark, inputGraphPath + "/otherresearchproduct", OtherResearchProduct.class); - Dataset software = - readPathEntity(spark, inputGraphPath + "/software", Software.class); - Dataset datasource = - readPathEntity(spark, inputGraphPath + "/datasource", Datasource.class); - Dataset organization = - readPathEntity(spark, inputGraphPath + "/organization", Organization.class); - Dataset project = readPathEntity(spark, inputGraphPath + "/project", Project.class); + private static Dataset> readAllEntities( + SparkSession spark, String inputGraphPath, int numPartitions) { + Dataset publication = readPathEntity(spark, inputGraphPath + "/publication", Publication.class); + Dataset dataset = readPathEntity( + spark, inputGraphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class); + Dataset other = readPathEntity( + spark, inputGraphPath + "/otherresearchproduct", OtherResearchProduct.class); + Dataset software = readPathEntity(spark, inputGraphPath + "/software", Software.class); + Dataset datasource = readPathEntity(spark, inputGraphPath + "/datasource", Datasource.class); + Dataset organization = readPathEntity(spark, inputGraphPath + "/organization", Organization.class); + Dataset project = readPathEntity(spark, inputGraphPath + "/project", Project.class); - return publication - 
.union(dataset) - .union(other) - .union(software) - .union(datasource) - .union(organization) - .union(project) - .map( - (MapFunction>) - value -> new Tuple2<>(value.getId(), value), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class))) - .repartition(numPartitions); - } + return publication + .union(dataset) + .union(other) + .union(software) + .union(datasource) + .union(organization) + .union(project) + .map( + (MapFunction>) value -> new Tuple2<>(value.getId(), value), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class))) + .repartition(numPartitions); + } - private static Dataset> readRelatedEntities( - SparkSession spark, String inputRelatedEntitiesPath) { + private static Dataset> readRelatedEntities( + SparkSession spark, String inputRelatedEntitiesPath) { - log.info("Reading related entities from: {}", inputRelatedEntitiesPath); + log.info("Reading related entities from: {}", inputRelatedEntitiesPath); - final List paths = - HdfsSupport.listFiles(inputRelatedEntitiesPath, spark.sparkContext().hadoopConfiguration()); + final List paths = HdfsSupport + .listFiles(inputRelatedEntitiesPath, spark.sparkContext().hadoopConfiguration()); - log.info("Found paths: {}", String.join(",", paths)); + log.info("Found paths: {}", String.join(",", paths)); - return spark - .read() - .load(toSeq(paths)) - .as(Encoders.bean(EntityRelEntity.class)) - .map( - (MapFunction>) - value -> new Tuple2<>(value.getRelation().getSource(), value), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(EntityRelEntity.class))); - } + return spark + .read() + .load(toSeq(paths)) + .as(Encoders.bean(EntityRelEntity.class)) + .map( + (MapFunction>) value -> new Tuple2<>( + value.getRelation().getSource(), value), + Encoders.tuple(Encoders.STRING(), Encoders.kryo(EntityRelEntity.class))); + } - private static Dataset readPathEntity( - SparkSession spark, String inputEntityPath, Class entityClazz) { + private static Dataset readPathEntity( + SparkSession spark, String inputEntityPath, Class entityClazz) { - log.info("Reading Graph table from: {}", inputEntityPath); - return spark - .read() - .textFile(inputEntityPath) - .map( - (MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), - Encoders.bean(entityClazz)) - .filter("dataInfo.invisible == false") - .map( - (MapFunction) - value -> getTypedRow(StringUtils.substringAfterLast(inputEntityPath, "/"), value), - Encoders.bean(TypedRow.class)); - } + log.info("Reading Graph table from: {}", inputEntityPath); + return spark + .read() + .textFile(inputEntityPath) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, entityClazz), + Encoders.bean(entityClazz)) + .filter("dataInfo.invisible == false") + .map( + (MapFunction) value -> getTypedRow( + StringUtils.substringAfterLast(inputEntityPath, "/"), value), + Encoders.bean(TypedRow.class)); + } - private static TypedRow getTypedRow(String type, OafEntity entity) - throws JsonProcessingException { - TypedRow t = new TypedRow(); - t.setType(type); - t.setDeleted(entity.getDataInfo().getDeletedbyinference()); - t.setId(entity.getId()); - t.setOaf(OBJECT_MAPPER.writeValueAsString(entity)); - return t; - } + private static TypedRow getTypedRow(String type, OafEntity entity) + throws JsonProcessingException { + TypedRow t = new TypedRow(); + t.setType(type); + t.setDeleted(entity.getDataInfo().getDeletedbyinference()); + t.setId(entity.getId()); + t.setOaf(OBJECT_MAPPER.writeValueAsString(entity)); + return t; + } - private static void removeOutputDir(SparkSession spark, String 
path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } - private static Seq toSeq(List list) { - return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq(); - } + private static Seq toSeq(List list) { + return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq(); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java index 72eb15cbb..dbdc54fc0 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PrepareRelationsJob.java @@ -1,15 +1,10 @@ + package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Iterables; -import com.google.common.collect.Iterators; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.SortableRelation; -import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -22,139 +17,144 @@ import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Iterables; +import com.google.common.collect.Iterators; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.SortableRelation; +import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner; import scala.Tuple2; /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of - * linked objects. The operation considers all the entity types (publication, dataset, software, - * ORP, project, datasource, organization, and all the possible relationships (similarity links - * produced by the Dedup process are excluded). - * - *
<p>
The operation is implemented by sequentially joining one entity type at time (E) with the - * relationships (R), and again by E, finally grouped by E.id; - * - *
<p>
The workflow is organized in different parts aimed to to reduce the complexity of the - * operation 1) PrepareRelationsJob: only consider relationships that are not virtually deleted - * ($.dataInfo.deletedbyinference == false), each entity can be linked at most to 100 other objects - * - *
<p>
2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - - * T): for each entity type E_i map E_i as RelatedEntity T_i to simplify the model and extracting - * only the necessary information join (R.target = T_i.id) save the tuples (R_i, T_i) (phase 2): - * create the union of all the entity types E, hash by id read the tuples (R, T), hash by R.source - * join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) - * - *
<p>
3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - - * T ], mapping the result as JoinedEntity - * - *
<p>
4) XmlConverterJob: convert the JoinedEntities as XML records + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The + * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and + * all the possible relationships (similarity links produced by the Dedup process are excluded). + *
<p>
+ * The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and + * again by E, finally grouped by E.id; + *
<p>
+ * The workflow is organized in different parts aimed to to reduce the complexity of the operation 1) + * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == + * false), each entity can be linked at most to 100 other objects + *
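The 100-relationship cap mentioned in point 1) is implemented further down in this hunk with a groupByKey/flatMapGroups pair over SortableRelation. The following stand-alone sketch shows the same pattern on a minimal bean; it is illustrative only and not part of the patch, and the Rel class and capBySource method are hypothetical names.

import java.io.Serializable;

import org.apache.spark.api.java.function.FlatMapGroupsFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;

import com.google.common.collect.Iterators;

public class RelationCapSketch {

	public static final int MAX_RELS = 100;

	// minimal stand-in for SortableRelation: a plain java bean with a source and a target
	public static class Rel implements Serializable {
		private String source;
		private String target;

		public String getSource() {
			return source;
		}

		public void setSource(String source) {
			this.source = source;
		}

		public String getTarget() {
			return target;
		}

		public void setTarget(String target) {
			this.target = target;
		}
	}

	// keeps at most MAX_RELS relationships per source identifier
	public static Dataset<Rel> capBySource(Dataset<Rel> rels) {
		return rels
			.groupByKey((MapFunction<Rel, String>) r -> r.getSource(), Encoders.STRING())
			.flatMapGroups(
				(FlatMapGroupsFunction<String, Rel, Rel>) (source, values) -> Iterators.limit(values, MAX_RELS),
				Encoders.bean(Rel.class));
	}
}

Iterators.limit simply truncates each group's iterator, so whichever MAX_RELS relations happen to arrive first for a given source are the ones kept.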
<p>
+ * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type + * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target = + * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples + * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) + *
<p>
+ * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the + * result as JoinedEntity + *
<p>
+ * 4) XmlConverterJob: convert the JoinedEntities as XML records */ public class PrepareRelationsJob { - private static final Logger log = LoggerFactory.getLogger(PrepareRelationsJob.class); + private static final Logger log = LoggerFactory.getLogger(PrepareRelationsJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static final int MAX_RELS = 100; + public static final int MAX_RELS = 100; - public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - PrepareRelationsJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + PrepareRelationsJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputRelationsPath = parser.get("inputRelationsPath"); - log.info("inputRelationsPath: {}", inputRelationsPath); + String inputRelationsPath = parser.get("inputRelationsPath"); + log.info("inputRelationsPath: {}", inputRelationsPath); - String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - SparkConf conf = new SparkConf(); + SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - prepareRelationsFromPaths(spark, inputRelationsPath, outputPath); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + prepareRelationsFromPaths(spark, inputRelationsPath, outputPath); + }); + } - private static void prepareRelationsFromPaths( - SparkSession spark, String inputRelationsPath, String outputPath) { - readPathRelation(spark, inputRelationsPath) - .filter("dataInfo.deletedbyinference == false") - .groupByKey( - (MapFunction) value -> value.getSource(), Encoders.STRING()) - .flatMapGroups( - (FlatMapGroupsFunction) - (key, values) -> Iterators.limit(values, MAX_RELS), - Encoders.bean(SortableRelation.class)) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath); - } + private static void prepareRelationsFromPaths( + SparkSession spark, String inputRelationsPath, String outputPath) { + readPathRelation(spark, inputRelationsPath) + .filter("dataInfo.deletedbyinference == false") + .groupByKey( + (MapFunction) value -> value.getSource(), Encoders.STRING()) + .flatMapGroups( + (FlatMapGroupsFunction) (key, values) -> Iterators + .limit(values, MAX_RELS), + Encoders.bean(SortableRelation.class)) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + } - /** - * Reads a Dataset of 
eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline - * delimited json text file, - * - * @param spark - * @param inputPath - * @return the Dataset containing all the relationships - */ - private static Dataset readPathRelation( - SparkSession spark, final String inputPath) { - return spark - .read() - .textFile(inputPath) - .map( - (MapFunction) - value -> OBJECT_MAPPER.readValue(value, SortableRelation.class), - Encoders.bean(SortableRelation.class)); - } + /** + * Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text + * file, + * + * @param spark + * @param inputPath + * @return the Dataset containing all the relationships + */ + private static Dataset readPathRelation( + SparkSession spark, final String inputPath) { + return spark + .read() + .textFile(inputPath) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, SortableRelation.class), + Encoders.bean(SortableRelation.class)); + } - // TODO work in progress - private static void prepareRelationsRDDFromPaths( - SparkSession spark, String inputRelationsPath, String outputPath, int numPartitions) { - JavaRDD rels = - readPathRelationRDD(spark, inputRelationsPath).repartition(numPartitions); + // TODO work in progress + private static void prepareRelationsRDDFromPaths( + SparkSession spark, String inputRelationsPath, String outputPath, int numPartitions) { + JavaRDD rels = readPathRelationRDD(spark, inputRelationsPath).repartition(numPartitions); - RDD d = - rels.filter(rel -> !rel.getDataInfo().getDeletedbyinference()) // only - // consider - // those - // that are not virtually - // deleted - .mapToPair( - (PairFunction) - rel -> new Tuple2<>(rel, rel)) - .groupByKey(new RelationPartitioner(rels.getNumPartitions())) - .map(p -> Iterables.limit(p._2(), MAX_RELS)) - .flatMap(p -> p.iterator()) - .rdd(); + RDD d = rels + .filter(rel -> !rel.getDataInfo().getDeletedbyinference()) // only + // consider + // those + // that are not virtually + // deleted + .mapToPair( + (PairFunction) rel -> new Tuple2<>(rel, rel)) + .groupByKey(new RelationPartitioner(rels.getNumPartitions())) + .map(p -> Iterables.limit(p._2(), MAX_RELS)) + .flatMap(p -> p.iterator()) + .rdd(); - spark - .createDataset(d, Encoders.bean(SortableRelation.class)) - .write() - .mode(SaveMode.Overwrite) - .parquet(outputPath); - } + spark + .createDataset(d, Encoders.bean(SortableRelation.class)) + .write() + .mode(SaveMode.Overwrite) + .parquet(outputPath); + } - private static JavaRDD readPathRelationRDD( - SparkSession spark, final String inputPath) { - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, SortableRelation.class)); - } + private static JavaRDD readPathRelationRDD( + SparkSession spark, final String inputPath) { + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, SortableRelation.class)); + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java index aabeae5ee..a88b28592 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlConverterJob.java @@ -1,19 +1,13 @@ + package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Maps; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.oa.provision.model.*; -import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; -import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.ArrayList; import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.GzipCodec; @@ -27,178 +21,205 @@ import org.apache.spark.sql.SparkSession; import org.apache.spark.util.LongAccumulator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Maps; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.oa.provision.model.*; +import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; +import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; +import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; /** - * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of - * linked objects. The operation considers all the entity types (publication, dataset, software, - * ORP, project, datasource, organization, and all the possible relationships (similarity links - * produced by the Dedup process are excluded). - * - *
<p>
The workflow is organized in different parts aimed to to reduce the complexity of the - * operation 1) PrepareRelationsJob: only consider relationships that are not virtually deleted - * ($.dataInfo.deletedbyinference == false), each entity can be linked at most to 100 other objects - * - *
<p>
2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - - * T): for each entity type E_i map E_i as RelatedEntity T_i to simplify the model and extracting - * only the necessary information join (R.target = T_i.id) save the tuples (R_i, T_i) (phase 2): - * create the union of all the entity types E, hash by id read the tuples (R, T), hash by R.source - * join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) - * - *
<p>
3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - - * T ], mapping the result as JoinedEntity - * - *
<p>
4) XmlConverterJob: convert the JoinedEntities as XML records + * Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects. The + * operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization, and + * all the possible relationships (similarity links produced by the Dedup process are excluded). + *
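The conversion step named at the start of this comment (4, XmlConverterJob) ultimately writes (id, xml) pairs as a gzip-compressed SequenceFile; the actual call sits in convertToXml further down this hunk. Below is a minimal, stand-alone sketch of that write path, assuming plain String pairs; the SequenceFileWriteSketch class and the save method are illustrative names only, not part of the patch.

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;

import scala.Tuple2;

public class SequenceFileWriteSketch {

	// writes (id, xml) string pairs as a gzip-compressed SequenceFile of Text/Text records
	public static void save(JavaRDD<Tuple2<String, String>> idAndXml, String outputPath) {
		JavaPairRDD<Text, Text> pairs = idAndXml
			.mapToPair(t -> new Tuple2<>(new Text(t._1()), new Text(t._2())));
		pairs
			.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
	}
}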
<p>
+ * The workflow is organized in different parts aimed to to reduce the complexity of the operation 1) + * PrepareRelationsJob: only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == + * false), each entity can be linked at most to 100 other objects + *
<p>
+ * 2) JoinRelationEntityByTargetJob: (phase 1): prepare tuples [relation - target entity] (R - T): for each entity type + * E_i map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information join (R.target = + * T_i.id) save the tuples (R_i, T_i) (phase 2): create the union of all the entity types E, hash by id read the tuples + * (R, T), hash by R.source join E.id = (R, T).source, where E becomes the Source Entity S save the tuples (S, R, T) + *
<p>
+ * 3) AdjacencyListBuilderJob: given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the + * result as JoinedEntity + *
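The grouping summarised in point 3) is not part of this file's hunk (it belongs to AdjacencyListBuilderJob), but the underlying pattern is a plain groupByKey/mapGroups over the joined tuples. A hedged sketch with simplified stand-in types follows; the Adjacency bean and the group method are hypothetical, not the project's JoinedEntity model.

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;

import scala.Tuple2;

public class AdjacencyListSketch {

	// minimal stand-in for JoinedEntity: an identifier plus the link payloads grouped under it
	public static class Adjacency implements Serializable {
		private String id;
		private List<String> links = new ArrayList<>();

		public String getId() {
			return id;
		}

		public void setId(String id) {
			this.id = id;
		}

		public List<String> getLinks() {
			return links;
		}

		public void setLinks(List<String> links) {
			this.links = links;
		}
	}

	// groups (sourceId, link) tuples into one Adjacency object per source id
	public static Dataset<Adjacency> group(Dataset<Tuple2<String, String>> links) {
		return links
			.groupByKey((MapFunction<Tuple2<String, String>, String>) t -> t._1(), Encoders.STRING())
			.mapGroups(
				(MapGroupsFunction<String, Tuple2<String, String>, Adjacency>) (id, values) -> {
					Adjacency a = new Adjacency();
					a.setId(id);
					values.forEachRemaining(v -> a.getLinks().add(v._2()));
					return a;
				},
				Encoders.bean(Adjacency.class));
	}
}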
<p>
+ * 4) XmlConverterJob: convert the JoinedEntities as XML records */ public class XmlConverterJob { - private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class); + private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; + public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - XmlConverterJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json"))); - parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + XmlConverterJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json"))); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("inputPath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); - String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - String isLookupUrl = parser.get("isLookupUrl"); - log.info("isLookupUrl: {}", isLookupUrl); + String isLookupUrl = parser.get("isLookupUrl"); + log.info("isLookupUrl: {}", isLookupUrl); - String otherDsTypeId = parser.get("otherDsTypeId"); - log.info("otherDsTypeId: {}", otherDsTypeId); + String otherDsTypeId = parser.get("otherDsTypeId"); + log.info("otherDsTypeId: {}", otherDsTypeId); - SparkConf conf = new SparkConf(); + SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - convertToXml( - spark, inputPath, outputPath, ContextMapper.fromIS(isLookupUrl), otherDsTypeId); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + convertToXml( + spark, inputPath, outputPath, ContextMapper.fromIS(isLookupUrl), otherDsTypeId); + }); + } - private static void convertToXml( - SparkSession spark, - String inputPath, - String outputPath, - ContextMapper contextMapper, - String otherDsTypeId) { + private static void convertToXml( + SparkSession spark, + String inputPath, + String outputPath, + ContextMapper contextMapper, + String otherDsTypeId) { - final XmlRecordFactory recordFactory = - new XmlRecordFactory( - prepareAccumulators(spark.sparkContext()), - contextMapper, - false, - schemaLocation, - otherDsTypeId); + final XmlRecordFactory recordFactory = new XmlRecordFactory( + prepareAccumulators(spark.sparkContext()), + contextMapper, + false, + schemaLocation, + otherDsTypeId); - spark - .read() - 
.load(inputPath) - .as(Encoders.bean(JoinedEntity.class)) - .map( - (MapFunction) - j -> { - if (j.getLinks() != null) { - j.setLinks( - j.getLinks().stream() - .filter(t -> t.getRelation() != null & t.getRelatedEntity() != null) - .collect(Collectors.toCollection(ArrayList::new))); - } - return j; - }, - Encoders.bean(JoinedEntity.class)) - .map( - (MapFunction>) - je -> new Tuple2<>(je.getEntity().getId(), recordFactory.build(je)), - Encoders.tuple(Encoders.STRING(), Encoders.STRING())) - .javaRDD() - .mapToPair( - (PairFunction, Text, Text>) - t -> new Tuple2<>(new Text(t._1()), new Text(t._2()))) - .saveAsHadoopFile( - outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); - } + spark + .read() + .load(inputPath) + .as(Encoders.bean(JoinedEntity.class)) + .map( + (MapFunction) j -> { + if (j.getLinks() != null) { + j + .setLinks( + j + .getLinks() + .stream() + .filter(t -> t.getRelation() != null & t.getRelatedEntity() != null) + .collect(Collectors.toCollection(ArrayList::new))); + } + return j; + }, + Encoders.bean(JoinedEntity.class)) + .map( + (MapFunction>) je -> new Tuple2<>(je.getEntity().getId(), + recordFactory.build(je)), + Encoders.tuple(Encoders.STRING(), Encoders.STRING())) + .javaRDD() + .mapToPair( + (PairFunction, Text, Text>) t -> new Tuple2<>(new Text(t._1()), + new Text(t._2()))) + .saveAsHadoopFile( + outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); + } - private static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } - private static Map prepareAccumulators(SparkContext sc) { - Map accumulators = Maps.newHashMap(); - accumulators.put( - "resultResult_similarity_isAmongTopNSimilarDocuments", - sc.longAccumulator("resultResult_similarity_isAmongTopNSimilarDocuments")); - accumulators.put( - "resultResult_similarity_hasAmongTopNSimilarDocuments", - sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments")); - accumulators.put( - "resultResult_supplement_isSupplementTo", - sc.longAccumulator("resultResult_supplement_isSupplementTo")); - accumulators.put( - "resultResult_supplement_isSupplementedBy", - sc.longAccumulator("resultResult_supplement_isSupplementedBy")); - accumulators.put( - "resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn")); - accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges")); + private static Map prepareAccumulators(SparkContext sc) { + Map accumulators = Maps.newHashMap(); + accumulators + .put( + "resultResult_similarity_isAmongTopNSimilarDocuments", + sc.longAccumulator("resultResult_similarity_isAmongTopNSimilarDocuments")); + accumulators + .put( + "resultResult_similarity_hasAmongTopNSimilarDocuments", + sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments")); + accumulators + .put( + "resultResult_supplement_isSupplementTo", + sc.longAccumulator("resultResult_supplement_isSupplementTo")); + accumulators + .put( + "resultResult_supplement_isSupplementedBy", + sc.longAccumulator("resultResult_supplement_isSupplementedBy")); + accumulators + .put( + "resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn")); + accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges")); - 
accumulators.put( - "resultResult_publicationDataset_isRelatedTo", - sc.longAccumulator("resultResult_publicationDataset_isRelatedTo")); - accumulators.put( - "resultResult_relationship_isRelatedTo", - sc.longAccumulator("resultResult_relationship_isRelatedTo")); - accumulators.put( - "resultProject_outcome_isProducedBy", - sc.longAccumulator("resultProject_outcome_isProducedBy")); - accumulators.put( - "resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces")); - accumulators.put( - "resultOrganization_affiliation_isAuthorInstitutionOf", - sc.longAccumulator("resultOrganization_affiliation_isAuthorInstitutionOf")); + accumulators + .put( + "resultResult_publicationDataset_isRelatedTo", + sc.longAccumulator("resultResult_publicationDataset_isRelatedTo")); + accumulators + .put( + "resultResult_relationship_isRelatedTo", + sc.longAccumulator("resultResult_relationship_isRelatedTo")); + accumulators + .put( + "resultProject_outcome_isProducedBy", + sc.longAccumulator("resultProject_outcome_isProducedBy")); + accumulators + .put( + "resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces")); + accumulators + .put( + "resultOrganization_affiliation_isAuthorInstitutionOf", + sc.longAccumulator("resultOrganization_affiliation_isAuthorInstitutionOf")); - accumulators.put( - "resultOrganization_affiliation_hasAuthorInstitution", - sc.longAccumulator("resultOrganization_affiliation_hasAuthorInstitution")); - accumulators.put( - "projectOrganization_participation_hasParticipant", - sc.longAccumulator("projectOrganization_participation_hasParticipant")); - accumulators.put( - "projectOrganization_participation_isParticipant", - sc.longAccumulator("projectOrganization_participation_isParticipant")); - accumulators.put( - "organizationOrganization_dedup_isMergedIn", - sc.longAccumulator("organizationOrganization_dedup_isMergedIn")); - accumulators.put( - "organizationOrganization_dedup_merges", - sc.longAccumulator("resultProject_outcome_produces")); - accumulators.put( - "datasourceOrganization_provision_isProvidedBy", - sc.longAccumulator("datasourceOrganization_provision_isProvidedBy")); - accumulators.put( - "datasourceOrganization_provision_provides", - sc.longAccumulator("datasourceOrganization_provision_provides")); + accumulators + .put( + "resultOrganization_affiliation_hasAuthorInstitution", + sc.longAccumulator("resultOrganization_affiliation_hasAuthorInstitution")); + accumulators + .put( + "projectOrganization_participation_hasParticipant", + sc.longAccumulator("projectOrganization_participation_hasParticipant")); + accumulators + .put( + "projectOrganization_participation_isParticipant", + sc.longAccumulator("projectOrganization_participation_isParticipant")); + accumulators + .put( + "organizationOrganization_dedup_isMergedIn", + sc.longAccumulator("organizationOrganization_dedup_isMergedIn")); + accumulators + .put( + "organizationOrganization_dedup_merges", + sc.longAccumulator("resultProject_outcome_produces")); + accumulators + .put( + "datasourceOrganization_provision_isProvidedBy", + sc.longAccumulator("datasourceOrganization_provision_isProvidedBy")); + accumulators + .put( + "datasourceOrganization_provision_provides", + sc.longAccumulator("datasourceOrganization_provision_provides")); - return accumulators; - } + return accumulators; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java index ca81e0b3f..b9746f153 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJob.java @@ -1,25 +1,20 @@ + package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.lucidworks.spark.util.SolrSupport; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import java.io.IOException; import java.io.StringReader; import java.io.StringWriter; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Optional; + import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.Text; @@ -30,197 +25,206 @@ import org.apache.spark.rdd.RDD; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.lucidworks.spark.util.SolrSupport; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.oa.provision.utils.StreamingInputDocumentFactory; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpDocumentNotFoundException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + public class XmlIndexingJob { - private static final Logger log = LoggerFactory.getLogger(XmlIndexingJob.class); + private static final Logger log = LoggerFactory.getLogger(XmlIndexingJob.class); - private static final Integer DEFAULT_BATCH_SIZE = 1000; + private static final Integer DEFAULT_BATCH_SIZE = 1000; - private static final String LAYOUT = "index"; - private static final String INTERPRETATION = "openaire"; - private static final String SEPARATOR = "-"; - public static final String DATE_FORMAT = "yyyy-MM-dd'T'hh:mm:ss'Z'"; + private static final String LAYOUT = "index"; + private static final String INTERPRETATION = "openaire"; + private static final String SEPARATOR = "-"; + public static final String DATE_FORMAT = "yyyy-MM-dd'T'hh:mm:ss'Z'"; - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = - new ArgumentApplicationParser( - IOUtils.toString( - XmlIndexingJob.class.getResourceAsStream( - "/eu/dnetlib/dhp/oa/provision/input_params_update_index.json"))); - parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + XmlIndexingJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/provision/input_params_update_index.json"))); + parser.parseArgument(args); - Boolean isSparkSessionManaged = - Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - 
log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String inputPath = parser.get("inputPath"); - log.info("inputPath: {}", inputPath); + final String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); - final String isLookupUrl = parser.get("isLookupUrl"); - log.info("isLookupUrl: {}", isLookupUrl); + final String isLookupUrl = parser.get("isLookupUrl"); + log.info("isLookupUrl: {}", isLookupUrl); - final String format = parser.get("format"); - log.info("format: {}", format); + final String format = parser.get("format"); + log.info("format: {}", format); - final Integer batchSize = - parser.getObjectMap().containsKey("batchSize") - ? Integer.valueOf(parser.get("batchSize")) - : DEFAULT_BATCH_SIZE; - log.info("batchSize: {}", batchSize); + final Integer batchSize = parser.getObjectMap().containsKey("batchSize") + ? Integer.valueOf(parser.get("batchSize")) + : DEFAULT_BATCH_SIZE; + log.info("batchSize: {}", batchSize); - final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl); - final String fields = getLayoutSource(isLookup, format); - log.info("fields: {}", fields); + final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl); + final String fields = getLayoutSource(isLookup, format); + log.info("fields: {}", fields); - final String xslt = getLayoutTransformer(isLookup); + final String xslt = getLayoutTransformer(isLookup); - final String dsId = getDsId(format, isLookup); - log.info("dsId: {}", dsId); + final String dsId = getDsId(format, isLookup); + log.info("dsId: {}", dsId); - final String zkHost = getZkHost(isLookup); - log.info("zkHost: {}", zkHost); + final String zkHost = getZkHost(isLookup); + log.info("zkHost: {}", zkHost); - final String version = getRecordDatestamp(); + final String version = getRecordDatestamp(); - final String indexRecordXslt = getLayoutTransformer(format, fields, xslt); - log.info("indexRecordTransformer {}", indexRecordXslt); + final String indexRecordXslt = getLayoutTransformer(format, fields, xslt); + log.info("indexRecordTransformer {}", indexRecordXslt); - final SparkConf conf = new SparkConf(); + final SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - RDD docs = - sc.sequenceFile(inputPath, Text.class, Text.class) - .map(t -> t._2().toString()) - .map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s)) - .map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s)) - .rdd(); + RDD docs = sc + .sequenceFile(inputPath, Text.class, Text.class) + .map(t -> t._2().toString()) + .map(s -> toIndexRecord(SaxonTransformerFactory.newInstance(indexRecordXslt), s)) + .map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s)) + .rdd(); - final String collection = format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION; - SolrSupport.indexDocs(zkHost, collection, batchSize, docs); - }); - } + final String collection = format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION; + SolrSupport.indexDocs(zkHost, 
collection, batchSize, docs); + }); + } - private static String toIndexRecord(Transformer tr, final String record) { - final StreamResult res = new StreamResult(new StringWriter()); - try { - tr.transform(new StreamSource(new StringReader(record)), res); - return res.getWriter().toString(); - } catch (Throwable e) { - log.error("XPathException on record: \n {}", record, e); - throw new IllegalArgumentException(e); - } - } + private static String toIndexRecord(Transformer tr, final String record) { + final StreamResult res = new StreamResult(new StringWriter()); + try { + tr.transform(new StreamSource(new StringReader(record)), res); + return res.getWriter().toString(); + } catch (Throwable e) { + log.error("XPathException on record: \n {}", record, e); + throw new IllegalArgumentException(e); + } + } - /** - * Creates the XSLT responsible for building the index xml records. - * - * @param format Metadata format name (DMF|TMF) - * @param xslt xslt for building the index record transformer - * @param fields the list of fields - * @return the javax.xml.transform.Transformer - * @throws ISLookUpException could happen - * @throws IOException could happen - * @throws TransformerException could happen - */ - private static String getLayoutTransformer(String format, String fields, String xslt) - throws TransformerException { + /** + * Creates the XSLT responsible for building the index xml records. + * + * @param format Metadata format name (DMF|TMF) + * @param xslt xslt for building the index record transformer + * @param fields the list of fields + * @return the javax.xml.transform.Transformer + * @throws ISLookUpException could happen + * @throws IOException could happen + * @throws TransformerException could happen + */ + private static String getLayoutTransformer(String format, String fields, String xslt) + throws TransformerException { - final Transformer layoutTransformer = SaxonTransformerFactory.newInstance(xslt); - final StreamResult layoutToXsltXslt = new StreamResult(new StringWriter()); + final Transformer layoutTransformer = SaxonTransformerFactory.newInstance(xslt); + final StreamResult layoutToXsltXslt = new StreamResult(new StringWriter()); - layoutTransformer.setParameter("format", format); - layoutTransformer.transform(new StreamSource(new StringReader(fields)), layoutToXsltXslt); + layoutTransformer.setParameter("format", format); + layoutTransformer.transform(new StreamSource(new StringReader(fields)), layoutToXsltXslt); - return layoutToXsltXslt.getWriter().toString(); - } + return layoutToXsltXslt.getWriter().toString(); + } - /** - * method return a solr-compatible string representation of a date, used to mark all records as - * indexed today - * - * @return the parsed date - */ - public static String getRecordDatestamp() { - return new SimpleDateFormat(DATE_FORMAT).format(new Date()); - } + /** + * method return a solr-compatible string representation of a date, used to mark all records as indexed today + * + * @return the parsed date + */ + public static String getRecordDatestamp() { + return new SimpleDateFormat(DATE_FORMAT).format(new Date()); + } - /** - * Method retrieves from the information system the list of fields associated to the given - * MDFormat name - * - * @param isLookup the ISLookup service stub - * @param format the Metadata format name - * @return the string representation of the list of fields to be indexed - * @throws ISLookUpDocumentNotFoundException - * @throws ISLookUpException - */ - private static String getLayoutSource(final ISLookUpService 
isLookup, final String format) - throws ISLookUpDocumentNotFoundException, ISLookUpException { - return doLookup( - isLookup, - String.format( - "collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']", - format, LAYOUT)); - } + /** + * Method retrieves from the information system the list of fields associated to the given MDFormat name + * + * @param isLookup the ISLookup service stub + * @param format the Metadata format name + * @return the string representation of the list of fields to be indexed + * @throws ISLookUpDocumentNotFoundException + * @throws ISLookUpException + */ + private static String getLayoutSource(final ISLookUpService isLookup, final String format) + throws ISLookUpDocumentNotFoundException, ISLookUpException { + return doLookup( + isLookup, + String + .format( + "collection('')//RESOURCE_PROFILE[.//RESOURCE_TYPE/@value = 'MDFormatDSResourceType' and .//NAME='%s']//LAYOUT[@name='%s']", + format, LAYOUT)); + } - /** - * Method retrieves from the information system the openaireLayoutToRecordStylesheet - * - * @param isLookup the ISLookup service stub - * @return the string representation of the XSLT contained in the transformation rule profile - * @throws ISLookUpDocumentNotFoundException - * @throws ISLookUpException - */ - private static String getLayoutTransformer(ISLookUpService isLookup) throws ISLookUpException { - return doLookup( - isLookup, - "collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')" - + "//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()"); - } + /** + * Method retrieves from the information system the openaireLayoutToRecordStylesheet + * + * @param isLookup the ISLookup service stub + * @return the string representation of the XSLT contained in the transformation rule profile + * @throws ISLookUpDocumentNotFoundException + * @throws ISLookUpException + */ + private static String getLayoutTransformer(ISLookUpService isLookup) throws ISLookUpException { + return doLookup( + isLookup, + "collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType')" + + "//RESOURCE_PROFILE[./BODY/CONFIGURATION/SCRIPT/TITLE/text() = 'openaireLayoutToRecordStylesheet']//CODE/node()"); + } - /** - * Method retrieves from the information system the IndexDS profile ID associated to the given - * MDFormat name - * - * @param format - * @param isLookup - * @return the IndexDS identifier - * @throws ISLookUpException - */ - private static String getDsId(String format, ISLookUpService isLookup) throws ISLookUpException { - return doLookup( - isLookup, - String.format( - "collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')" - + "//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()", - format)); - } + /** + * Method retrieves from the information system the IndexDS profile ID associated to the given MDFormat name + * + * @param format + * @param isLookup + * @return the IndexDS identifier + * @throws ISLookUpException + */ + private static String getDsId(String format, ISLookUpService isLookup) throws ISLookUpException { + return doLookup( + isLookup, + String + .format( + "collection('/db/DRIVER/IndexDSResources/IndexDSResourceType')" + + "//RESOURCE_PROFILE[./BODY/CONFIGURATION/METADATA_FORMAT/text() = '%s']//RESOURCE_IDENTIFIER/@value/string()", + format)); + } - /** - * Method retrieves from the information system the 
zookeeper quorum of the Solr server - * - * @param isLookup - * @return the zookeeper quorum of the Solr server - * @throws ISLookUpException - */ - private static String getZkHost(ISLookUpService isLookup) throws ISLookUpException { - return doLookup( - isLookup, - "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()"); - } + /** + * Method retrieves from the information system the zookeeper quorum of the Solr server + * + * @param isLookup + * @return the zookeeper quorum of the Solr server + * @throws ISLookUpException + */ + private static String getZkHost(ISLookUpService isLookup) throws ISLookUpException { + return doLookup( + isLookup, + "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='IndexServiceResourceType'] return $x//PROTOCOL[./@name='solr']/@address/string()"); + } - private static String doLookup(ISLookUpService isLookup, String xquery) throws ISLookUpException { - log.info(String.format("running xquery: %s", xquery)); - final String res = isLookup.getResourceProfileByQuery(xquery); - log.info(String.format("got response (100 chars): %s", StringUtils.left(res, 100) + " ...")); - return res; - } + private static String doLookup(ISLookUpService isLookup, String xquery) throws ISLookUpException { + log.info(String.format("running xquery: %s", xquery)); + final String res = isLookup.getResourceProfileByQuery(xquery); + log.info(String.format("got response (100 chars): %s", StringUtils.left(res, 100) + " ...")); + return res; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java index e47356c13..a6b3c5591 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/EntityRelEntity.java @@ -1,62 +1,67 @@ + package eu.dnetlib.dhp.oa.provision.model; -import com.google.common.base.Objects; import java.io.Serializable; +import com.google.common.base.Objects; + public class EntityRelEntity implements Serializable { - private TypedRow entity; - private SortableRelation relation; - private RelatedEntity target; + private TypedRow entity; + private SortableRelation relation; + private RelatedEntity target; - public EntityRelEntity() {} + public EntityRelEntity() { + } - public EntityRelEntity(SortableRelation relation, RelatedEntity target) { - this(null, relation, target); - } + public EntityRelEntity(SortableRelation relation, RelatedEntity target) { + this(null, relation, target); + } - public EntityRelEntity(TypedRow entity, SortableRelation relation, RelatedEntity target) { - this.entity = entity; - this.relation = relation; - this.target = target; - } + public EntityRelEntity(TypedRow entity, SortableRelation relation, RelatedEntity target) { + this.entity = entity; + this.relation = relation; + this.target = target; + } - public TypedRow getEntity() { - return entity; - } + public TypedRow getEntity() { + return entity; + } - public void setEntity(TypedRow entity) { - this.entity = entity; - } + public void setEntity(TypedRow entity) { + this.entity = entity; + } - public SortableRelation getRelation() { - return relation; - } + public SortableRelation getRelation() { + return relation; + } - public void setRelation(SortableRelation relation) { - this.relation = relation; - } + public 
void setRelation(SortableRelation relation) { + this.relation = relation; + } - public RelatedEntity getTarget() { - return target; - } + public RelatedEntity getTarget() { + return target; + } - public void setTarget(RelatedEntity target) { - this.target = target; - } + public void setTarget(RelatedEntity target) { + this.target = target; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - EntityRelEntity that = (EntityRelEntity) o; - return Objects.equal(entity, that.entity) - && Objects.equal(relation, that.relation) - && Objects.equal(target, that.target); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + EntityRelEntity that = (EntityRelEntity) o; + return Objects.equal(entity, that.entity) + && Objects.equal(relation, that.relation) + && Objects.equal(target, that.target); + } - @Override - public int hashCode() { - return Objects.hashCode(entity, relation, target); - } + @Override + public int hashCode() { + return Objects.hashCode(entity, relation, target); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java index daa069255..e29ec9d19 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/JoinedEntity.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.provision.model; import java.io.Serializable; @@ -5,25 +6,26 @@ import java.util.List; public class JoinedEntity implements Serializable { - private TypedRow entity; + private TypedRow entity; - private List links; + private List links; - public JoinedEntity() {} + public JoinedEntity() { + } - public TypedRow getEntity() { - return entity; - } + public TypedRow getEntity() { + return entity; + } - public void setEntity(TypedRow entity) { - this.entity = entity; - } + public void setEntity(TypedRow entity) { + this.entity = entity; + } - public List getLinks() { - return links; - } + public List getLinks() { + return links; + } - public void setLinks(List links) { - this.links = links; - } + public void setLinks(List links) { + this.links = links; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java index 9671d505c..e15ceff76 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java @@ -1,289 +1,295 @@ + package eu.dnetlib.dhp.oa.provision.model; +import java.io.Serializable; +import java.util.List; + import com.google.common.base.Objects; + import eu.dnetlib.dhp.schema.oaf.Instance; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import java.io.Serializable; -import java.util.List; public class RelatedEntity implements Serializable { - private String id; - private String type; + private String id; + private String type; - // common fields - private StructuredProperty title; - private String websiteurl; // datasource, 
organizations, projects + // common fields + private StructuredProperty title; + private String websiteurl; // datasource, organizations, projects - // results - private String dateofacceptance; - private String publisher; - private List pid; - private String codeRepositoryUrl; - private Qualifier resulttype; - private List collectedfrom; - private List instances; + // results + private String dateofacceptance; + private String publisher; + private List pid; + private String codeRepositoryUrl; + private Qualifier resulttype; + private List collectedfrom; + private List instances; - // datasource - private String officialname; - private Qualifier datasourcetype; - private Qualifier datasourcetypeui; - private Qualifier openairecompatibility; - // private String aggregatortype; + // datasource + private String officialname; + private Qualifier datasourcetype; + private Qualifier datasourcetypeui; + private Qualifier openairecompatibility; + // private String aggregatortype; - // organization - private String legalname; - private String legalshortname; - private Qualifier country; + // organization + private String legalname; + private String legalshortname; + private Qualifier country; - // project - private String projectTitle; - private String code; - private String acronym; - private Qualifier contracttype; - private List fundingtree; + // project + private String projectTitle; + private String code; + private String acronym; + private Qualifier contracttype; + private List fundingtree; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public String getType() { - return type; - } + public String getType() { + return type; + } - public void setType(String type) { - this.type = type; - } + public void setType(String type) { + this.type = type; + } - public StructuredProperty getTitle() { - return title; - } + public StructuredProperty getTitle() { + return title; + } - public void setTitle(StructuredProperty title) { - this.title = title; - } + public void setTitle(StructuredProperty title) { + this.title = title; + } - public String getWebsiteurl() { - return websiteurl; - } + public String getWebsiteurl() { + return websiteurl; + } - public void setWebsiteurl(String websiteurl) { - this.websiteurl = websiteurl; - } + public void setWebsiteurl(String websiteurl) { + this.websiteurl = websiteurl; + } - public String getDateofacceptance() { - return dateofacceptance; - } + public String getDateofacceptance() { + return dateofacceptance; + } - public void setDateofacceptance(String dateofacceptance) { - this.dateofacceptance = dateofacceptance; - } + public void setDateofacceptance(String dateofacceptance) { + this.dateofacceptance = dateofacceptance; + } - public String getPublisher() { - return publisher; - } + public String getPublisher() { + return publisher; + } - public void setPublisher(String publisher) { - this.publisher = publisher; - } + public void setPublisher(String publisher) { + this.publisher = publisher; + } - public List getPid() { - return pid; - } + public List getPid() { + return pid; + } - public void setPid(List pid) { - this.pid = pid; - } + public void setPid(List pid) { + this.pid = pid; + } - public String getCodeRepositoryUrl() { - return codeRepositoryUrl; - } + public String getCodeRepositoryUrl() { + return codeRepositoryUrl; + } - public void setCodeRepositoryUrl(String codeRepositoryUrl) { - this.codeRepositoryUrl = 
codeRepositoryUrl; - } + public void setCodeRepositoryUrl(String codeRepositoryUrl) { + this.codeRepositoryUrl = codeRepositoryUrl; + } - public Qualifier getResulttype() { - return resulttype; - } + public Qualifier getResulttype() { + return resulttype; + } - public void setResulttype(Qualifier resulttype) { - this.resulttype = resulttype; - } + public void setResulttype(Qualifier resulttype) { + this.resulttype = resulttype; + } - public List getCollectedfrom() { - return collectedfrom; - } + public List getCollectedfrom() { + return collectedfrom; + } - public void setCollectedfrom(List collectedfrom) { - this.collectedfrom = collectedfrom; - } + public void setCollectedfrom(List collectedfrom) { + this.collectedfrom = collectedfrom; + } - public List getInstances() { - return instances; - } + public List getInstances() { + return instances; + } - public void setInstances(List instances) { - this.instances = instances; - } + public void setInstances(List instances) { + this.instances = instances; + } - public String getOfficialname() { - return officialname; - } + public String getOfficialname() { + return officialname; + } - public void setOfficialname(String officialname) { - this.officialname = officialname; - } + public void setOfficialname(String officialname) { + this.officialname = officialname; + } - public Qualifier getDatasourcetype() { - return datasourcetype; - } + public Qualifier getDatasourcetype() { + return datasourcetype; + } - public void setDatasourcetype(Qualifier datasourcetype) { - this.datasourcetype = datasourcetype; - } + public void setDatasourcetype(Qualifier datasourcetype) { + this.datasourcetype = datasourcetype; + } - public Qualifier getDatasourcetypeui() { - return datasourcetypeui; - } + public Qualifier getDatasourcetypeui() { + return datasourcetypeui; + } - public void setDatasourcetypeui(Qualifier datasourcetypeui) { - this.datasourcetypeui = datasourcetypeui; - } + public void setDatasourcetypeui(Qualifier datasourcetypeui) { + this.datasourcetypeui = datasourcetypeui; + } - public Qualifier getOpenairecompatibility() { - return openairecompatibility; - } + public Qualifier getOpenairecompatibility() { + return openairecompatibility; + } - public void setOpenairecompatibility(Qualifier openairecompatibility) { - this.openairecompatibility = openairecompatibility; - } + public void setOpenairecompatibility(Qualifier openairecompatibility) { + this.openairecompatibility = openairecompatibility; + } - public String getLegalname() { - return legalname; - } + public String getLegalname() { + return legalname; + } - public void setLegalname(String legalname) { - this.legalname = legalname; - } + public void setLegalname(String legalname) { + this.legalname = legalname; + } - public String getLegalshortname() { - return legalshortname; - } + public String getLegalshortname() { + return legalshortname; + } - public void setLegalshortname(String legalshortname) { - this.legalshortname = legalshortname; - } + public void setLegalshortname(String legalshortname) { + this.legalshortname = legalshortname; + } - public Qualifier getCountry() { - return country; - } + public Qualifier getCountry() { + return country; + } - public void setCountry(Qualifier country) { - this.country = country; - } + public void setCountry(Qualifier country) { + this.country = country; + } - public String getProjectTitle() { - return projectTitle; - } + public String getProjectTitle() { + return projectTitle; + } - public void setProjectTitle(String projectTitle) { - 
this.projectTitle = projectTitle; - } + public void setProjectTitle(String projectTitle) { + this.projectTitle = projectTitle; + } - public String getCode() { - return code; - } + public String getCode() { + return code; + } - public void setCode(String code) { - this.code = code; - } + public void setCode(String code) { + this.code = code; + } - public String getAcronym() { - return acronym; - } + public String getAcronym() { + return acronym; + } - public void setAcronym(String acronym) { - this.acronym = acronym; - } + public void setAcronym(String acronym) { + this.acronym = acronym; + } - public Qualifier getContracttype() { - return contracttype; - } + public Qualifier getContracttype() { + return contracttype; + } - public void setContracttype(Qualifier contracttype) { - this.contracttype = contracttype; - } + public void setContracttype(Qualifier contracttype) { + this.contracttype = contracttype; + } - public List getFundingtree() { - return fundingtree; - } + public List getFundingtree() { + return fundingtree; + } - public void setFundingtree(List fundingtree) { - this.fundingtree = fundingtree; - } + public void setFundingtree(List fundingtree) { + this.fundingtree = fundingtree; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - RelatedEntity that = (RelatedEntity) o; - return Objects.equal(id, that.id) - && Objects.equal(type, that.type) - && Objects.equal(title, that.title) - && Objects.equal(websiteurl, that.websiteurl) - && Objects.equal(dateofacceptance, that.dateofacceptance) - && Objects.equal(publisher, that.publisher) - && Objects.equal(pid, that.pid) - && Objects.equal(codeRepositoryUrl, that.codeRepositoryUrl) - && Objects.equal(resulttype, that.resulttype) - && Objects.equal(collectedfrom, that.collectedfrom) - && Objects.equal(instances, that.instances) - && Objects.equal(officialname, that.officialname) - && Objects.equal(datasourcetype, that.datasourcetype) - && Objects.equal(datasourcetypeui, that.datasourcetypeui) - && Objects.equal(openairecompatibility, that.openairecompatibility) - && Objects.equal(legalname, that.legalname) - && Objects.equal(legalshortname, that.legalshortname) - && Objects.equal(country, that.country) - && Objects.equal(projectTitle, that.projectTitle) - && Objects.equal(code, that.code) - && Objects.equal(acronym, that.acronym) - && Objects.equal(contracttype, that.contracttype) - && Objects.equal(fundingtree, that.fundingtree); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + RelatedEntity that = (RelatedEntity) o; + return Objects.equal(id, that.id) + && Objects.equal(type, that.type) + && Objects.equal(title, that.title) + && Objects.equal(websiteurl, that.websiteurl) + && Objects.equal(dateofacceptance, that.dateofacceptance) + && Objects.equal(publisher, that.publisher) + && Objects.equal(pid, that.pid) + && Objects.equal(codeRepositoryUrl, that.codeRepositoryUrl) + && Objects.equal(resulttype, that.resulttype) + && Objects.equal(collectedfrom, that.collectedfrom) + && Objects.equal(instances, that.instances) + && Objects.equal(officialname, that.officialname) + && Objects.equal(datasourcetype, that.datasourcetype) + && Objects.equal(datasourcetypeui, that.datasourcetypeui) + && Objects.equal(openairecompatibility, that.openairecompatibility) + && Objects.equal(legalname, that.legalname) + && Objects.equal(legalshortname, 
that.legalshortname) + && Objects.equal(country, that.country) + && Objects.equal(projectTitle, that.projectTitle) + && Objects.equal(code, that.code) + && Objects.equal(acronym, that.acronym) + && Objects.equal(contracttype, that.contracttype) + && Objects.equal(fundingtree, that.fundingtree); + } - @Override - public int hashCode() { - return Objects.hashCode( - id, - type, - title, - websiteurl, - dateofacceptance, - publisher, - pid, - codeRepositoryUrl, - resulttype, - collectedfrom, - instances, - officialname, - datasourcetype, - datasourcetypeui, - openairecompatibility, - legalname, - legalshortname, - country, - projectTitle, - code, - acronym, - contracttype, - fundingtree); - } + @Override + public int hashCode() { + return Objects + .hashCode( + id, + type, + title, + websiteurl, + dateofacceptance, + publisher, + pid, + codeRepositoryUrl, + resulttype, + collectedfrom, + instances, + officialname, + datasourcetype, + datasourcetypeui, + openairecompatibility, + legalname, + legalshortname, + country, + projectTitle, + code, + acronym, + contracttype, + fundingtree); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java index 0a35a9752..7c866001b 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/SortableRelation.java @@ -1,34 +1,38 @@ + package eu.dnetlib.dhp.oa.provision.model; -import com.google.common.collect.ComparisonChain; -import com.google.common.collect.Maps; -import eu.dnetlib.dhp.schema.oaf.Relation; import java.io.Serializable; import java.util.Map; +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.Maps; + +import eu.dnetlib.dhp.schema.oaf.Relation; + public class SortableRelation extends Relation implements Comparable, Serializable { - private static final Map weights = Maps.newHashMap(); + private static final Map weights = Maps.newHashMap(); - static { - weights.put("outcome", 0); - weights.put("supplement", 1); - weights.put("publicationDataset", 2); - weights.put("relationship", 3); - weights.put("similarity", 4); - weights.put("affiliation", 5); + static { + weights.put("outcome", 0); + weights.put("supplement", 1); + weights.put("publicationDataset", 2); + weights.put("relationship", 3); + weights.put("similarity", 4); + weights.put("affiliation", 5); - weights.put("provision", 6); - weights.put("participation", 7); - weights.put("dedup", 8); - } + weights.put("provision", 6); + weights.put("participation", 7); + weights.put("dedup", 8); + } - @Override - public int compareTo(Relation o) { - return ComparisonChain.start() - .compare(weights.get(getSubRelType()), weights.get(o.getSubRelType())) - .compare(getSource(), o.getSource()) - .compare(getTarget(), o.getTarget()) - .result(); - } + @Override + public int compareTo(Relation o) { + return ComparisonChain + .start() + .compare(weights.get(getSubRelType()), weights.get(o.getSubRelType())) + .compare(getSource(), o.getSource()) + .compare(getTarget(), o.getTarget()) + .result(); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java index e7e4aea3c..5ebe9c9eb 100644 --- 
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/Tuple2.java @@ -1,48 +1,53 @@ + package eu.dnetlib.dhp.oa.provision.model; -import eu.dnetlib.dhp.schema.oaf.Relation; import java.io.Serializable; import java.util.Objects; +import eu.dnetlib.dhp.schema.oaf.Relation; + public class Tuple2 implements Serializable { - private Relation relation; + private Relation relation; - private RelatedEntity relatedEntity; + private RelatedEntity relatedEntity; - public Tuple2() {} + public Tuple2() { + } - public Tuple2(Relation relation, RelatedEntity relatedEntity) { - this.relation = relation; - this.relatedEntity = relatedEntity; - } + public Tuple2(Relation relation, RelatedEntity relatedEntity) { + this.relation = relation; + this.relatedEntity = relatedEntity; + } - public Relation getRelation() { - return relation; - } + public Relation getRelation() { + return relation; + } - public void setRelation(Relation relation) { - this.relation = relation; - } + public void setRelation(Relation relation) { + this.relation = relation; + } - public RelatedEntity getRelatedEntity() { - return relatedEntity; - } + public RelatedEntity getRelatedEntity() { + return relatedEntity; + } - public void setRelatedEntity(RelatedEntity relatedEntity) { - this.relatedEntity = relatedEntity; - } + public void setRelatedEntity(RelatedEntity relatedEntity) { + this.relatedEntity = relatedEntity; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - Tuple2 t2 = (Tuple2) o; - return getRelation().equals(t2.getRelation()); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + Tuple2 t2 = (Tuple2) o; + return getRelation().equals(t2.getRelation()); + } - @Override - public int hashCode() { - return Objects.hash(getRelation().hashCode()); - } + @Override + public int hashCode() { + return Objects.hash(getRelation().hashCode()); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java index 01067707e..cbec372e4 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/TypedRow.java @@ -1,60 +1,64 @@ + package eu.dnetlib.dhp.oa.provision.model; -import com.google.common.base.Objects; import java.io.Serializable; +import com.google.common.base.Objects; + public class TypedRow implements Serializable { - private String id; + private String id; - private Boolean deleted; + private Boolean deleted; - private String type; + private String type; - private String oaf; + private String oaf; - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(String id) { - this.id = id; - } + public void setId(String id) { + this.id = id; + } - public Boolean getDeleted() { - return deleted; - } + public Boolean getDeleted() { + return deleted; + } - public void setDeleted(Boolean deleted) { - this.deleted = deleted; - } + public void setDeleted(Boolean deleted) { + this.deleted = deleted; + } - public String getType() { - return type; - } + public String getType() { + return type; + } - public 
void setType(String type) { - this.type = type; - } + public void setType(String type) { + this.type = type; + } - public String getOaf() { - return oaf; - } + public String getOaf() { + return oaf; + } - public void setOaf(String oaf) { - this.oaf = oaf; - } + public void setOaf(String oaf) { + this.oaf = oaf; + } - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - TypedRow typedRow2 = (TypedRow) o; - return Objects.equal(id, typedRow2.id); - } + @Override + public boolean equals(Object o) { + if (this == o) + return true; + if (o == null || getClass() != o.getClass()) + return false; + TypedRow typedRow2 = (TypedRow) o; + return Objects.equal(id, typedRow2.id); + } - @Override - public int hashCode() { - return Objects.hashCode(id); - } + @Override + public int hashCode() { + return Objects.hashCode(id); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextDef.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextDef.java index dc6170445..8afd6400c 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextDef.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextDef.java @@ -1,51 +1,52 @@ + package eu.dnetlib.dhp.oa.provision.utils; import java.io.Serializable; public class ContextDef implements Serializable { - private String id; - private String label; - private String name; - private String type; + private String id; + private String label; + private String name; + private String type; - public ContextDef(final String id, final String label, final String name, final String type) { - super(); - this.setId(id); - this.setLabel(label); - this.setName(name); - this.setType(type); - } + public ContextDef(final String id, final String label, final String name, final String type) { + super(); + this.setId(id); + this.setLabel(label); + this.setName(name); + this.setType(type); + } - public String getLabel() { - return label; - } + public String getLabel() { + return label; + } - public void setLabel(final String label) { - this.label = label; - } + public void setLabel(final String label) { + this.label = label; + } - public String getId() { - return id; - } + public String getId() { + return id; + } - public void setId(final String id) { - this.id = id; - } + public void setId(final String id) { + this.id = id; + } - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(final String name) { - this.name = name; - } + public void setName(final String name) { + this.name = name; + } - public String getType() { - return type; - } + public String getType() { + return type; + } - public void setType(final String type) { - this.type = type; - } + public void setType(final String type) { + this.type = type; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextMapper.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextMapper.java index d1d6521db..ac418f2b9 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextMapper.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/ContextMapper.java @@ -1,46 +1,49 @@ + package eu.dnetlib.dhp.oa.provision.utils; -import com.google.common.base.Joiner; 
-import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import java.io.Serializable; import java.io.StringReader; import java.util.HashMap; + import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Node; import org.dom4j.io.SAXReader; +import com.google.common.base.Joiner; + +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + public class ContextMapper extends HashMap implements Serializable { - private static final long serialVersionUID = 2159682308502487305L; + private static final long serialVersionUID = 2159682308502487305L; - private static final String XQUERY = - "for $x in //RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ContextDSResourceType']//*[name()='context' or name()='category' or name()='concept'] return "; + private static final String XQUERY = "for $x in //RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ContextDSResourceType']//*[name()='context' or name()='category' or name()='concept'] return "; - public static ContextMapper fromIS(final String isLookupUrl) - throws DocumentException, ISLookUpException { - ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); - StringBuilder sb = new StringBuilder(""); - Joiner.on("").appendTo(sb, isLookUp.quickSearchProfile(XQUERY)); - sb.append(""); - return fromXml(sb.toString()); - } + public static ContextMapper fromIS(final String isLookupUrl) + throws DocumentException, ISLookUpException { + ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); + StringBuilder sb = new StringBuilder(""); + Joiner.on("").appendTo(sb, isLookUp.quickSearchProfile(XQUERY)); + sb.append(""); + return fromXml(sb.toString()); + } - public static ContextMapper fromXml(final String xml) throws DocumentException { - final ContextMapper contextMapper = new ContextMapper(); + public static ContextMapper fromXml(final String xml) throws DocumentException { + final ContextMapper contextMapper = new ContextMapper(); - final Document doc = new SAXReader().read(new StringReader(xml)); - for (Object o : doc.selectNodes("//entry")) { - Node node = (Node) o; - String id = node.valueOf("./@id"); - String label = node.valueOf("./@label"); - String name = node.valueOf("./@name"); - String type = node.valueOf("./@type") + ""; + final Document doc = new SAXReader().read(new StringReader(xml)); + for (Object o : doc.selectNodes("//entry")) { + Node node = (Node) o; + String id = node.valueOf("./@id"); + String label = node.valueOf("./@label"); + String name = node.valueOf("./@name"); + String type = node.valueOf("./@type") + ""; - contextMapper.put(id, new ContextDef(id, label, name, type)); - } - return contextMapper; - } + contextMapper.put(id, new ContextDef(id, label, name, type)); + } + return contextMapper; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java index 96ffb4c90..0e742365a 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/GraphMappingUtils.java @@ -1,23 +1,27 @@ + package eu.dnetlib.dhp.oa.provision.utils; import static 
org.apache.commons.lang3.StringUtils.substringAfter; -import com.google.common.collect.Sets; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.Set; +import com.google.common.collect.Sets; + +import eu.dnetlib.dhp.schema.oaf.*; + public class GraphMappingUtils { - public static final String SEPARATOR = "_"; + public static final String SEPARATOR = "_"; - public static Set authorPidTypes = Sets.newHashSet("orcid", "magidentifier"); + public static Set authorPidTypes = Sets.newHashSet("orcid", "magidentifier"); - public static String removePrefix(final String s) { - if (s.contains("|")) return substringAfter(s, "|"); - return s; - } + public static String removePrefix(final String s) { + if (s.contains("|")) + return substringAfter(s, "|"); + return s; + } - public static String getRelDescriptor(String relType, String subRelType, String relClass) { - return relType + SEPARATOR + subRelType + SEPARATOR + relClass; - } + public static String getRelDescriptor(String relType, String subRelType, String relClass) { + return relType + SEPARATOR + subRelType + SEPARATOR + relClass; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/LicenseComparator.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/LicenseComparator.java index 823997b6d..9dbac1936 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/LicenseComparator.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/LicenseComparator.java @@ -1,47 +1,69 @@ + package eu.dnetlib.dhp.oa.provision.utils; -import eu.dnetlib.dhp.schema.oaf.Qualifier; import java.util.Comparator; +import eu.dnetlib.dhp.schema.oaf.Qualifier; + public class LicenseComparator implements Comparator { - @Override - public int compare(Qualifier left, Qualifier right) { + @Override + public int compare(Qualifier left, Qualifier right) { - if (left == null && right == null) return 0; - if (left == null) return 1; - if (right == null) return -1; + if (left == null && right == null) + return 0; + if (left == null) + return 1; + if (right == null) + return -1; - String lClass = left.getClassid(); - String rClass = right.getClassid(); + String lClass = left.getClassid(); + String rClass = right.getClassid(); - if (lClass.equals(rClass)) return 0; + if (lClass.equals(rClass)) + return 0; - if (lClass.equals("OPEN SOURCE")) return -1; - if (rClass.equals("OPEN SOURCE")) return 1; + if (lClass.equals("OPEN SOURCE")) + return -1; + if (rClass.equals("OPEN SOURCE")) + return 1; - if (lClass.equals("OPEN")) return -1; - if (rClass.equals("OPEN")) return 1; + if (lClass.equals("OPEN")) + return -1; + if (rClass.equals("OPEN")) + return 1; - if (lClass.equals("6MONTHS")) return -1; - if (rClass.equals("6MONTHS")) return 1; + if (lClass.equals("6MONTHS")) + return -1; + if (rClass.equals("6MONTHS")) + return 1; - if (lClass.equals("12MONTHS")) return -1; - if (rClass.equals("12MONTHS")) return 1; + if (lClass.equals("12MONTHS")) + return -1; + if (rClass.equals("12MONTHS")) + return 1; - if (lClass.equals("EMBARGO")) return -1; - if (rClass.equals("EMBARGO")) return 1; + if (lClass.equals("EMBARGO")) + return -1; + if (rClass.equals("EMBARGO")) + return 1; - if (lClass.equals("RESTRICTED")) return -1; - if (rClass.equals("RESTRICTED")) return 1; + if (lClass.equals("RESTRICTED")) + return -1; + if (rClass.equals("RESTRICTED")) + return 1; - if (lClass.equals("CLOSED")) return -1; - if (rClass.equals("CLOSED")) return 1; 
+ if (lClass.equals("CLOSED")) + return -1; + if (rClass.equals("CLOSED")) + return 1; - if (lClass.equals("UNKNOWN")) return -1; - if (rClass.equals("UNKNOWN")) return 1; + if (lClass.equals("UNKNOWN")) + return -1; + if (rClass.equals("UNKNOWN")) + return 1; - // Else (but unlikely), lexicographical ordering will do. - return lClass.compareTo(rClass); - } + // Else (but unlikely), lexicographical ordering will do. + return lClass.compareTo(rClass); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java index 6db8b12de..bac2278e6 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/RelationPartitioner.java @@ -1,29 +1,30 @@ + package eu.dnetlib.dhp.oa.provision.utils; -import eu.dnetlib.dhp.oa.provision.model.SortableRelation; import org.apache.spark.Partitioner; import org.apache.spark.util.Utils; +import eu.dnetlib.dhp.oa.provision.model.SortableRelation; + /** - * Used in combination with SortableRelationKey, allows to partition the records by source id, - * therefore allowing to sort relations sharing the same source id by the ordering defined in - * SortableRelationKey. + * Used in combination with SortableRelationKey, allows to partition the records by source id, therefore allowing to + * sort relations sharing the same source id by the ordering defined in SortableRelationKey. */ public class RelationPartitioner extends Partitioner { - private int numPartitions; + private int numPartitions; - public RelationPartitioner(int numPartitions) { - this.numPartitions = numPartitions; - } + public RelationPartitioner(int numPartitions) { + this.numPartitions = numPartitions; + } - @Override - public int numPartitions() { - return numPartitions; - } + @Override + public int numPartitions() { + return numPartitions; + } - @Override - public int getPartition(Object key) { - return Utils.nonNegativeMod(((SortableRelation) key).getSource().hashCode(), numPartitions()); - } + @Override + public int getPartition(Object key) { + return Utils.nonNegativeMod(((SortableRelation) key).getSource().hashCode(), numPartitions()); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java index c472e6e85..de221b2ee 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/StreamingInputDocumentFactory.java @@ -1,262 +1,260 @@ + package eu.dnetlib.dhp.oa.provision.utils; -import com.google.common.collect.Lists; import java.io.StringReader; import java.io.StringWriter; import java.util.Arrays; import java.util.HashMap; import java.util.Iterator; import java.util.List; + import javax.xml.stream.*; import javax.xml.stream.events.Namespace; import javax.xml.stream.events.StartElement; import javax.xml.stream.events.XMLEvent; + import org.apache.solr.common.SolrInputDocument; +import com.google.common.collect.Lists; + /** * Optimized version of the document parser, drop in replacement of InputDocumentFactory. - * - *

- * <p>Faster because:
- *
+ * <p>
+ * Faster because:
 *
- * <p>This class is fully reentrant and can be invoked in parallel.
+ * <p>
+ * This class is fully reentrant and can be invoked in parallel. * * @author claudio */ public class StreamingInputDocumentFactory { - private static final String INDEX_FIELD_PREFIX = "__"; + private static final String INDEX_FIELD_PREFIX = "__"; - private static final String DS_VERSION = INDEX_FIELD_PREFIX + "dsversion"; + private static final String DS_VERSION = INDEX_FIELD_PREFIX + "dsversion"; - private static final String DS_ID = INDEX_FIELD_PREFIX + "dsid"; + private static final String DS_ID = INDEX_FIELD_PREFIX + "dsid"; - private static final String RESULT = "result"; + private static final String RESULT = "result"; - private static final String INDEX_RESULT = INDEX_FIELD_PREFIX + RESULT; + private static final String INDEX_RESULT = INDEX_FIELD_PREFIX + RESULT; - private static final String INDEX_RECORD_ID = INDEX_FIELD_PREFIX + "indexrecordidentifier"; + private static final String INDEX_RECORD_ID = INDEX_FIELD_PREFIX + "indexrecordidentifier"; - private static final String outFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'"); + private static final String outFormat = new String("yyyy-MM-dd'T'hh:mm:ss'Z'"); - private static final List dateFormats = - Arrays.asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy"); + private static final List dateFormats = Arrays + .asList("yyyy-MM-dd'T'hh:mm:ss", "yyyy-MM-dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyy"); - private static final String DEFAULTDNETRESULT = "dnetResult"; + private static final String DEFAULTDNETRESULT = "dnetResult"; - private static final String TARGETFIELDS = "targetFields"; + private static final String TARGETFIELDS = "targetFields"; - private static final String INDEX_RECORD_ID_ELEMENT = "indexRecordIdentifier"; + private static final String INDEX_RECORD_ID_ELEMENT = "indexRecordIdentifier"; - private static final String ROOT_ELEMENT = "indexRecord"; + private static final String ROOT_ELEMENT = "indexRecord"; - private static final int MAX_FIELD_LENGTH = 25000; + private static final int MAX_FIELD_LENGTH = 25000; - private ThreadLocal inputFactory = - ThreadLocal.withInitial(() -> XMLInputFactory.newInstance()); + private ThreadLocal inputFactory = ThreadLocal.withInitial(() -> XMLInputFactory.newInstance()); - private ThreadLocal outputFactory = - ThreadLocal.withInitial(() -> XMLOutputFactory.newInstance()); + private ThreadLocal outputFactory = ThreadLocal.withInitial(() -> XMLOutputFactory.newInstance()); - private ThreadLocal eventFactory = - ThreadLocal.withInitial(() -> XMLEventFactory.newInstance()); + private ThreadLocal eventFactory = ThreadLocal.withInitial(() -> XMLEventFactory.newInstance()); - private String version; + private String version; - private String dsId; + private String dsId; - private String resultName = DEFAULTDNETRESULT; + private String resultName = DEFAULTDNETRESULT; - public StreamingInputDocumentFactory(final String version, final String dsId) { - this(version, dsId, DEFAULTDNETRESULT); - } + public StreamingInputDocumentFactory(final String version, final String dsId) { + this(version, dsId, DEFAULTDNETRESULT); + } - public StreamingInputDocumentFactory( - final String version, final String dsId, final String resultName) { - this.version = version; - this.dsId = dsId; - this.resultName = resultName; - } + public StreamingInputDocumentFactory( + final String version, final String dsId, final String resultName) { + this.version = version; + this.dsId = dsId; + this.resultName = resultName; + } - public SolrInputDocument parseDocument(final String inputDocument) { + 
public SolrInputDocument parseDocument(final String inputDocument) { - final StringWriter results = new StringWriter(); - final List nsList = Lists.newLinkedList(); - try { + final StringWriter results = new StringWriter(); + final List nsList = Lists.newLinkedList(); + try { - XMLEventReader parser = - inputFactory.get().createXMLEventReader(new StringReader(inputDocument)); + XMLEventReader parser = inputFactory.get().createXMLEventReader(new StringReader(inputDocument)); - final SolrInputDocument indexDocument = new SolrInputDocument(new HashMap<>()); + final SolrInputDocument indexDocument = new SolrInputDocument(new HashMap<>()); - while (parser.hasNext()) { - final XMLEvent event = parser.nextEvent(); - if ((event != null) && event.isStartElement()) { - final String localName = event.asStartElement().getName().getLocalPart(); + while (parser.hasNext()) { + final XMLEvent event = parser.nextEvent(); + if ((event != null) && event.isStartElement()) { + final String localName = event.asStartElement().getName().getLocalPart(); - if (ROOT_ELEMENT.equals(localName)) { - nsList.addAll(getNamespaces(event)); - } else if (INDEX_RECORD_ID_ELEMENT.equals(localName)) { - final XMLEvent text = parser.nextEvent(); - String recordId = getText(text); - indexDocument.addField(INDEX_RECORD_ID, recordId); - } else if (TARGETFIELDS.equals(localName)) { - parseTargetFields(indexDocument, parser); - } else if (resultName.equals(localName)) { - copyResult(indexDocument, results, parser, nsList, resultName); - } - } - } + if (ROOT_ELEMENT.equals(localName)) { + nsList.addAll(getNamespaces(event)); + } else if (INDEX_RECORD_ID_ELEMENT.equals(localName)) { + final XMLEvent text = parser.nextEvent(); + String recordId = getText(text); + indexDocument.addField(INDEX_RECORD_ID, recordId); + } else if (TARGETFIELDS.equals(localName)) { + parseTargetFields(indexDocument, parser); + } else if (resultName.equals(localName)) { + copyResult(indexDocument, results, parser, nsList, resultName); + } + } + } - if (version != null) { - indexDocument.addField(DS_VERSION, version); - } + if (version != null) { + indexDocument.addField(DS_VERSION, version); + } - if (dsId != null) { - indexDocument.addField(DS_ID, dsId); - } + if (dsId != null) { + indexDocument.addField(DS_ID, dsId); + } - if (!indexDocument.containsKey(INDEX_RECORD_ID)) { - indexDocument.clear(); - System.err.println("missing indexrecord id:\n" + inputDocument); - } + if (!indexDocument.containsKey(INDEX_RECORD_ID)) { + indexDocument.clear(); + System.err.println("missing indexrecord id:\n" + inputDocument); + } - return indexDocument; - } catch (XMLStreamException e) { - return new SolrInputDocument(); - } - } + return indexDocument; + } catch (XMLStreamException e) { + return new SolrInputDocument(); + } + } - private List getNamespaces(final XMLEvent event) { - final List res = Lists.newLinkedList(); - @SuppressWarnings("unchecked") - Iterator nsIter = event.asStartElement().getNamespaces(); - while (nsIter.hasNext()) { - Namespace ns = nsIter.next(); - res.add(ns); - } - return res; - } + private List getNamespaces(final XMLEvent event) { + final List res = Lists.newLinkedList(); + @SuppressWarnings("unchecked") + Iterator nsIter = event.asStartElement().getNamespaces(); + while (nsIter.hasNext()) { + Namespace ns = nsIter.next(); + res.add(ns); + } + return res; + } - /** - * Parse the targetFields block and add fields to the solr document. 
- * - * @param indexDocument - * @param parser - * @throws XMLStreamException - */ - protected void parseTargetFields( - final SolrInputDocument indexDocument, final XMLEventReader parser) - throws XMLStreamException { + /** + * Parse the targetFields block and add fields to the solr document. + * + * @param indexDocument + * @param parser + * @throws XMLStreamException + */ + protected void parseTargetFields( + final SolrInputDocument indexDocument, final XMLEventReader parser) + throws XMLStreamException { - boolean hasFields = false; + boolean hasFields = false; - while (parser.hasNext()) { - final XMLEvent targetEvent = parser.nextEvent(); - if (targetEvent.isEndElement() - && targetEvent.asEndElement().getName().getLocalPart().equals(TARGETFIELDS)) { - break; - } + while (parser.hasNext()) { + final XMLEvent targetEvent = parser.nextEvent(); + if (targetEvent.isEndElement() + && targetEvent.asEndElement().getName().getLocalPart().equals(TARGETFIELDS)) { + break; + } - if (targetEvent.isStartElement()) { - final String fieldName = targetEvent.asStartElement().getName().getLocalPart(); - final XMLEvent text = parser.nextEvent(); + if (targetEvent.isStartElement()) { + final String fieldName = targetEvent.asStartElement().getName().getLocalPart(); + final XMLEvent text = parser.nextEvent(); - String data = getText(text); + String data = getText(text); - addField(indexDocument, fieldName, data); - hasFields = true; - } - } + addField(indexDocument, fieldName, data); + hasFields = true; + } + } - if (!hasFields) { - indexDocument.clear(); - } - } + if (!hasFields) { + indexDocument.clear(); + } + } - /** - * Copy the /indexRecord/result element and children, preserving namespace declarations etc. - * - * @param indexDocument - * @param results - * @param parser - * @param nsList - * @throws XMLStreamException - */ - protected void copyResult( - final SolrInputDocument indexDocument, - final StringWriter results, - final XMLEventReader parser, - final List nsList, - final String dnetResult) - throws XMLStreamException { - final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(results); + /** + * Copy the /indexRecord/result element and children, preserving namespace declarations etc. + * + * @param indexDocument + * @param results + * @param parser + * @param nsList + * @throws XMLStreamException + */ + protected void copyResult( + final SolrInputDocument indexDocument, + final StringWriter results, + final XMLEventReader parser, + final List nsList, + final String dnetResult) + throws XMLStreamException { + final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(results); - for (Namespace ns : nsList) { - eventFactory.get().createNamespace(ns.getPrefix(), ns.getNamespaceURI()); - } + for (Namespace ns : nsList) { + eventFactory.get().createNamespace(ns.getPrefix(), ns.getNamespaceURI()); + } - StartElement newRecord = - eventFactory.get().createStartElement("", null, RESULT, null, nsList.iterator()); + StartElement newRecord = eventFactory.get().createStartElement("", null, RESULT, null, nsList.iterator()); - // new root record - writer.add(newRecord); + // new root record + writer.add(newRecord); - // copy the rest as it is - while (parser.hasNext()) { - final XMLEvent resultEvent = parser.nextEvent(); + // copy the rest as it is + while (parser.hasNext()) { + final XMLEvent resultEvent = parser.nextEvent(); - // TODO: replace with depth tracking instead of close tag tracking. 
- if (resultEvent.isEndElement() - && resultEvent.asEndElement().getName().getLocalPart().equals(dnetResult)) { - writer.add(eventFactory.get().createEndElement("", null, RESULT)); - break; - } + // TODO: replace with depth tracking instead of close tag tracking. + if (resultEvent.isEndElement() + && resultEvent.asEndElement().getName().getLocalPart().equals(dnetResult)) { + writer.add(eventFactory.get().createEndElement("", null, RESULT)); + break; + } - writer.add(resultEvent); - } - writer.close(); - indexDocument.addField(INDEX_RESULT, results.toString()); - } + writer.add(resultEvent); + } + writer.close(); + indexDocument.addField(INDEX_RESULT, results.toString()); + } - /** - * Helper used to add a field to a solr doc. It avoids to add empy fields - * - * @param indexDocument - * @param field - * @param value - */ - private final void addField( - final SolrInputDocument indexDocument, final String field, final String value) { - String cleaned = value.trim(); - if (!cleaned.isEmpty()) { - // log.info("\n\n adding field " + field.toLowerCase() + " value: " + cleaned + "\n"); - indexDocument.addField(field.toLowerCase(), cleaned); - } - } + /** + * Helper used to add a field to a solr doc. It avoids to add empy fields + * + * @param indexDocument + * @param field + * @param value + */ + private final void addField( + final SolrInputDocument indexDocument, final String field, final String value) { + String cleaned = value.trim(); + if (!cleaned.isEmpty()) { + // log.info("\n\n adding field " + field.toLowerCase() + " value: " + cleaned + "\n"); + indexDocument.addField(field.toLowerCase(), cleaned); + } + } - /** - * Helper used to get the string from a text element. - * - * @param text - * @return the - */ - protected final String getText(final XMLEvent text) { - if (text.isEndElement()) // log.warn("skipping because isEndOfElement " + - // text.asEndElement().getName().getLocalPart()); - return ""; + /** + * Helper used to get the string from a text element. 
+ * + * @param text + * @return the + */ + protected final String getText(final XMLEvent text) { + if (text.isEndElement()) // log.warn("skipping because isEndOfElement " + + // text.asEndElement().getName().getLocalPart()); + return ""; - final String data = text.asCharacters().getData(); - if (data != null && data.length() > MAX_FIELD_LENGTH) { - return data.substring(0, MAX_FIELD_LENGTH); - } + final String data = text.asCharacters().getData(); + if (data != null && data.length() > MAX_FIELD_LENGTH) { + return data.substring(0, MAX_FIELD_LENGTH); + } - return data; - } + return data; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java index 7c919d952..3d9cf1ae7 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateFactory.java @@ -1,113 +1,117 @@ + package eu.dnetlib.dhp.oa.provision.utils; import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix; import static eu.dnetlib.dhp.oa.provision.utils.XmlSerializationUtils.escapeXml; -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.OafEntity; import java.io.IOException; import java.util.Collection; import java.util.List; import java.util.stream.Collectors; + import org.apache.commons.lang3.StringUtils; import org.stringtemplate.v4.ST; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.OafEntity; + public class TemplateFactory { - private TemplateResources resources; + private TemplateResources resources; - private static final char DELIMITER = '$'; + private static final char DELIMITER = '$'; - public TemplateFactory() { - try { - resources = new TemplateResources(); - } catch (IOException e) { - throw new IllegalStateException(e); - } - } + public TemplateFactory() { + try { + resources = new TemplateResources(); + } catch (IOException e) { + throw new IllegalStateException(e); + } + } - public String buildBody( - final String type, - final List metadata, - final List rels, - final List children, - final List extraInfo) { - ST body = getTemplate(resources.getEntity()); + public String buildBody( + final String type, + final List metadata, + final List rels, + final List children, + final List extraInfo) { + ST body = getTemplate(resources.getEntity()); - body.add("name", type); - body.add("metadata", metadata); - body.add("rels", rels); - body.add("children", children); - body.add("extrainfo", extraInfo); + body.add("name", type); + body.add("metadata", metadata); + body.add("rels", rels); + body.add("children", children); + body.add("extrainfo", extraInfo); - return body.render(); - } + return body.render(); + } - public String getChild(final String name, final String id, final List metadata) { - return getTemplate(resources.getChild()) - .add("name", name) - .add("hasId", !(id == null)) - .add("id", id != null ? escapeXml(removePrefix(id)) : "") - .add("metadata", metadata) - .render(); - } + public String getChild(final String name, final String id, final List metadata) { + return getTemplate(resources.getChild()) + .add("name", name) + .add("hasId", !(id == null)) + .add("id", id != null ? 
escapeXml(removePrefix(id)) : "") + .add("metadata", metadata) + .render(); + } - public String buildRecord( - final OafEntity entity, final String schemaLocation, final String body) { - return getTemplate(resources.getRecord()) - .add("id", escapeXml(removePrefix(entity.getId()))) - .add("dateofcollection", entity.getDateofcollection()) - .add("dateoftransformation", entity.getDateoftransformation()) - .add("schemaLocation", schemaLocation) - .add("it", body) - .render(); - } + public String buildRecord( + final OafEntity entity, final String schemaLocation, final String body) { + return getTemplate(resources.getRecord()) + .add("id", escapeXml(removePrefix(entity.getId()))) + .add("dateofcollection", entity.getDateofcollection()) + .add("dateoftransformation", entity.getDateoftransformation()) + .add("schemaLocation", schemaLocation) + .add("it", body) + .render(); + } - public String getRel( - final String type, - final String objIdentifier, - final Collection fields, - final String semanticclass, - final String semantischeme, - final DataInfo info) { - return getTemplate(resources.getRel()) - .add("type", type) - .add("objIdentifier", escapeXml(removePrefix(objIdentifier))) - .add("class", semanticclass) - .add("scheme", semantischeme) - .add("metadata", fields) - .add("inferred", info.getInferred()) - .add("trust", info.getTrust()) - .add("inferenceprovenance", info.getInferenceprovenance()) - .add( - "provenanceaction", - info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : "") - .render(); - } + public String getRel( + final String type, + final String objIdentifier, + final Collection fields, + final String semanticclass, + final String semantischeme, + final DataInfo info) { + return getTemplate(resources.getRel()) + .add("type", type) + .add("objIdentifier", escapeXml(removePrefix(objIdentifier))) + .add("class", semanticclass) + .add("scheme", semantischeme) + .add("metadata", fields) + .add("inferred", info.getInferred()) + .add("trust", info.getTrust()) + .add("inferenceprovenance", info.getInferenceprovenance()) + .add( + "provenanceaction", + info.getProvenanceaction() != null ? 
info.getProvenanceaction().getClassid() : "") + .render(); + } - public String getInstance( - final String resultId, final List instancemetadata, final List webresources) { - return getTemplate(resources.getInstance()) - .add("instanceId", escapeXml(removePrefix(resultId))) - .add("metadata", instancemetadata) - .add( - "webresources", - webresources.stream() - .filter(StringUtils::isNotBlank) - .map(w -> getWebResource(w)) - .collect(Collectors.toList())) - .render(); - } + public String getInstance( + final String resultId, final List instancemetadata, final List webresources) { + return getTemplate(resources.getInstance()) + .add("instanceId", escapeXml(removePrefix(resultId))) + .add("metadata", instancemetadata) + .add( + "webresources", + webresources + .stream() + .filter(StringUtils::isNotBlank) + .map(w -> getWebResource(w)) + .collect(Collectors.toList())) + .render(); + } - private String getWebResource(final String identifier) { - return getTemplate(resources.getWebresource()) - .add("identifier", escapeXml(identifier)) - .render(); - } + private String getWebResource(final String identifier) { + return getTemplate(resources.getWebresource()) + .add("identifier", escapeXml(identifier)) + .render(); + } - // HELPERS + // HELPERS - private ST getTemplate(final String res) { - return new ST(res, DELIMITER, DELIMITER); - } + private ST getTemplate(final String res) { + return new ST(res, DELIMITER, DELIMITER); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateResources.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateResources.java index 3ffc33bd8..746f8ebe6 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateResources.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/TemplateResources.java @@ -1,50 +1,53 @@ + package eu.dnetlib.dhp.oa.provision.utils; -import com.google.common.io.Resources; import java.io.IOException; import java.nio.charset.StandardCharsets; +import com.google.common.io.Resources; + public class TemplateResources { - private String record = read("eu/dnetlib/dhp/oa/provision/template/record.st"); + private String record = read("eu/dnetlib/dhp/oa/provision/template/record.st"); - private String instance = read("eu/dnetlib/dhp/oa/provision/template/instance.st"); + private String instance = read("eu/dnetlib/dhp/oa/provision/template/instance.st"); - private String rel = read("eu/dnetlib/dhp/oa/provision/template/rel.st"); + private String rel = read("eu/dnetlib/dhp/oa/provision/template/rel.st"); - private String webresource = read("eu/dnetlib/dhp/oa/provision/template/webresource.st"); + private String webresource = read("eu/dnetlib/dhp/oa/provision/template/webresource.st"); - private String child = read("eu/dnetlib/dhp/oa/provision/template/child.st"); + private String child = read("eu/dnetlib/dhp/oa/provision/template/child.st"); - private String entity = read("eu/dnetlib/dhp/oa/provision/template/entity.st"); + private String entity = read("eu/dnetlib/dhp/oa/provision/template/entity.st"); - private static String read(final String classpathResource) throws IOException { - return Resources.toString(Resources.getResource(classpathResource), StandardCharsets.UTF_8); - } + private static String read(final String classpathResource) throws IOException { + return Resources.toString(Resources.getResource(classpathResource), StandardCharsets.UTF_8); + } - public 
TemplateResources() throws IOException {} + public TemplateResources() throws IOException { + } - public String getEntity() { - return entity; - } + public String getEntity() { + return entity; + } - public String getRecord() { - return record; - } + public String getRecord() { + return record; + } - public String getInstance() { - return instance; - } + public String getInstance() { + return instance; + } - public String getRel() { - return rel; - } + public String getRel() { + return rel; + } - public String getWebresource() { - return webresource; - } + public String getWebresource() { + return webresource; + } - public String getChild() { - return child; - } + public String getChild() { + return child; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index 52a509409..f667d9f3c 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -1,23 +1,10 @@ + package eu.dnetlib.dhp.oa.provision.utils; import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*; import static org.apache.commons.lang3.StringUtils.isNotBlank; import static org.apache.commons.lang3.StringUtils.substringBefore; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.base.Joiner; -import com.google.common.base.Splitter; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; -import com.mycila.xmltool.XMLDoc; -import com.mycila.xmltool.XMLTag; -import eu.dnetlib.dhp.oa.provision.model.*; -import eu.dnetlib.dhp.schema.common.EntityType; -import eu.dnetlib.dhp.schema.common.MainEntityType; -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.oaf.Result; import java.io.IOException; import java.io.Serializable; import java.io.StringReader; @@ -27,9 +14,11 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; + import javax.xml.transform.*; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; + import org.apache.commons.lang3.StringUtils; import org.apache.spark.util.LongAccumulator; import org.dom4j.Document; @@ -40,1130 +29,1347 @@ import org.dom4j.io.OutputFormat; import org.dom4j.io.SAXReader; import org.dom4j.io.XMLWriter; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Sets; +import com.mycila.xmltool.XMLDoc; +import com.mycila.xmltool.XMLTag; + +import eu.dnetlib.dhp.oa.provision.model.*; +import eu.dnetlib.dhp.schema.common.EntityType; +import eu.dnetlib.dhp.schema.common.MainEntityType; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.Result; + public class XmlRecordFactory implements Serializable { - public static final String REL_SUBTYPE_DEDUP = "dedup"; - private Map accumulators; - - private Set specialDatasourceTypes; - - private ContextMapper contextMapper; - - private String schemaLocation; - - private boolean indent = false; - - private static 
final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - - public XmlRecordFactory( - final ContextMapper contextMapper, - final boolean indent, - final String schemaLocation, - final String otherDatasourceTypesUForUI) { - - this(Maps.newHashMap(), contextMapper, indent, schemaLocation, otherDatasourceTypesUForUI); - } - - public XmlRecordFactory( - final Map accumulators, - final ContextMapper contextMapper, - final boolean indent, - final String schemaLocation, - final String otherDatasourceTypesUForUI) { - - this.accumulators = accumulators; - this.contextMapper = contextMapper; - this.schemaLocation = schemaLocation; - this.specialDatasourceTypes = - Sets.newHashSet(Splitter.on(",").trimResults().split(otherDatasourceTypesUForUI)); - - this.indent = indent; - } - - public String build(final JoinedEntity je) { - - final Set contexts = Sets.newHashSet(); - - final OafEntity entity = toOafEntity(je.getEntity()); - TemplateFactory templateFactory = new TemplateFactory(); - try { - final EntityType type = EntityType.valueOf(je.getEntity().getType()); - final List metadata = metadata(type, entity, contexts); - - // rels has to be processed before the contexts because they enrich the contextMap with - // the - // funding info. - final List relations = - je.getLinks().stream() - .filter(t -> !REL_SUBTYPE_DEDUP.equalsIgnoreCase(t.getRelation().getSubRelType())) - .map(link -> mapRelation(link, templateFactory, contexts)) - .collect(Collectors.toCollection(ArrayList::new)); - - final String mainType = ModelSupport.getMainType(type); - metadata.addAll(buildContexts(mainType, contexts)); - metadata.add(XmlSerializationUtils.parseDataInfo(entity.getDataInfo())); - - final String body = - templateFactory.buildBody( - mainType, - metadata, - relations, - listChildren(entity, je, templateFactory), - listExtraInfo(entity)); - - return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent); - } catch (final Throwable e) { - throw new RuntimeException(String.format("error building record '%s'", entity.getId()), e); - } - } - - private static OafEntity toOafEntity(TypedRow typedRow) { - return parseOaf(typedRow.getOaf(), typedRow.getType()); - } - - private static OafEntity parseOaf(final String json, final String type) { - try { - switch (EntityType.valueOf(type)) { - case publication: - return OBJECT_MAPPER.readValue(json, Publication.class); - case dataset: - return OBJECT_MAPPER.readValue(json, Dataset.class); - case otherresearchproduct: - return OBJECT_MAPPER.readValue(json, OtherResearchProduct.class); - case software: - return OBJECT_MAPPER.readValue(json, Software.class); - case datasource: - return OBJECT_MAPPER.readValue(json, Datasource.class); - case organization: - return OBJECT_MAPPER.readValue(json, Organization.class); - case project: - return OBJECT_MAPPER.readValue(json, Project.class); - default: - throw new IllegalArgumentException("invalid type: " + type); - } - } catch (IOException e) { - throw new IllegalArgumentException(e); - } - } - - private String printXML(String xml, boolean indent) { - try { - final Document doc = new SAXReader().read(new StringReader(xml)); - OutputFormat format = - indent ? 
OutputFormat.createPrettyPrint() : OutputFormat.createCompactFormat(); - format.setExpandEmptyElements(false); - format.setSuppressDeclaration(true); - StringWriter sw = new StringWriter(); - XMLWriter writer = new XMLWriter(sw, format); - writer.write(doc); - return sw.toString(); - } catch (IOException | DocumentException e) { - throw new IllegalArgumentException("Unable to indent XML. Invalid record:\n" + xml, e); - } - } - - private List metadata( - final EntityType type, final OafEntity entity, final Set contexts) { - - final List metadata = Lists.newArrayList(); - - if (entity.getCollectedfrom() != null) { - metadata.addAll( - entity.getCollectedfrom().stream() - .map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv)) - .collect(Collectors.toList())); - } - if (entity.getOriginalId() != null) { - metadata.addAll( - entity.getOriginalId().stream() - .map(s -> XmlSerializationUtils.asXmlElement("originalId", s)) - .collect(Collectors.toList())); - } - if (entity.getPid() != null) { - metadata.addAll( - entity.getPid().stream() - .map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p)) - .collect(Collectors.toList())); - } - - if (ModelSupport.isResult(type)) { - final Result r = (Result) entity; - - if (r.getContext() != null) { - contexts.addAll(r.getContext().stream().map(c -> c.getId()).collect(Collectors.toList())); - /* FIXME: Workaround for CLARIN mining issue: #3670#note-29 */ - if (contexts.contains("dh-ch::subcommunity::2")) { - contexts.add("clarin"); - } - } - - if (r.getTitle() != null) { - metadata.addAll( - r.getTitle().stream() - .map(t -> XmlSerializationUtils.mapStructuredProperty("title", t)) - .collect(Collectors.toList())); - } - if (r.getBestaccessright() != null) { - metadata.add(XmlSerializationUtils.mapQualifier("bestaccessright", r.getBestaccessright())); - } - if (r.getAuthor() != null) { - metadata.addAll( - r.getAuthor().stream() - .map( - a -> { - final StringBuilder sb = - new StringBuilder(" - isNotBlank(sp.getQualifier().getClassid()) - && isNotBlank(sp.getValue())) - .forEach( - sp -> { - String pidType = - XmlSerializationUtils.escapeXml( - sp.getQualifier().getClassid()) - .replaceAll("\\W", ""); - String pidValue = XmlSerializationUtils.escapeXml(sp.getValue()); - - // ugly hack: some records - // provide swapped pidtype and - // pidvalue - if (authorPidTypes.contains(pidValue.toLowerCase().trim())) { - sb.append(String.format(" %s=\"%s\"", pidValue, pidType)); - } else { - pidType = pidType.replaceAll("\\W", "").replaceAll("\\d", ""); - if (isNotBlank(pidType)) { - sb.append( - String.format( - " %s=\"%s\"", - pidType, - pidValue.toLowerCase().replaceAll("orcid", ""))); - } - } - }); - } - sb.append( - ">" + XmlSerializationUtils.escapeXml(a.getFullname()) + ""); - return sb.toString(); - }) - .collect(Collectors.toList())); - } - if (r.getContributor() != null) { - metadata.addAll( - r.getContributor().stream() - .map(c -> XmlSerializationUtils.asXmlElement("contributor", c.getValue())) - .collect(Collectors.toList())); - } - if (r.getCountry() != null) { - metadata.addAll( - r.getCountry().stream() - .map(c -> XmlSerializationUtils.mapQualifier("country", c)) - .collect(Collectors.toList())); - } - if (r.getCoverage() != null) { - metadata.addAll( - r.getCoverage().stream() - .map(c -> XmlSerializationUtils.asXmlElement("coverage", c.getValue())) - .collect(Collectors.toList())); - } - if (r.getDateofacceptance() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "dateofacceptance", 
r.getDateofacceptance().getValue())); - } - if (r.getDescription() != null) { - metadata.addAll( - r.getDescription().stream() - .map(c -> XmlSerializationUtils.asXmlElement("description", c.getValue())) - .collect(Collectors.toList())); - } - if (r.getEmbargoenddate() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("embargoenddate", r.getEmbargoenddate().getValue())); - } - if (r.getSubject() != null) { - metadata.addAll( - r.getSubject().stream() - .map(s -> XmlSerializationUtils.mapStructuredProperty("subject", s)) - .collect(Collectors.toList())); - } - if (r.getLanguage() != null) { - metadata.add(XmlSerializationUtils.mapQualifier("language", r.getLanguage())); - } - if (r.getRelevantdate() != null) { - metadata.addAll( - r.getRelevantdate().stream() - .map(s -> XmlSerializationUtils.mapStructuredProperty("relevantdate", s)) - .collect(Collectors.toList())); - } - if (r.getPublisher() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("publisher", r.getPublisher().getValue())); - } - if (r.getSource() != null) { - metadata.addAll( - r.getSource().stream() - .map(c -> XmlSerializationUtils.asXmlElement("source", c.getValue())) - .collect(Collectors.toList())); - } - if (r.getFormat() != null) { - metadata.addAll( - r.getFormat().stream() - .map(c -> XmlSerializationUtils.asXmlElement("format", c.getValue())) - .collect(Collectors.toList())); - } - if (r.getResulttype() != null) { - metadata.add(XmlSerializationUtils.mapQualifier("resulttype", r.getResulttype())); - } - if (r.getResourcetype() != null) { - metadata.add(XmlSerializationUtils.mapQualifier("resourcetype", r.getResourcetype())); - } - } - - switch (type) { - case publication: - final Publication pub = (Publication) entity; - - if (pub.getJournal() != null) { - final Journal j = pub.getJournal(); - metadata.add(XmlSerializationUtils.mapJournal(j)); - } - - break; - case dataset: - final Dataset d = (Dataset) entity; - if (d.getDevice() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("device", d.getDevice().getValue())); - } - if (d.getLastmetadataupdate() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "lastmetadataupdate", d.getLastmetadataupdate().getValue())); - } - if (d.getMetadataversionnumber() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "metadataversionnumber", d.getMetadataversionnumber().getValue())); - } - if (d.getSize() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("size", d.getSize().getValue())); - } - if (d.getStoragedate() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("storagedate", d.getStoragedate().getValue())); - } - if (d.getVersion() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("version", d.getVersion().getValue())); - } - // TODO d.getGeolocation() - - break; - case otherresearchproduct: - final OtherResearchProduct orp = (OtherResearchProduct) entity; - - if (orp.getContactperson() != null) { - metadata.addAll( - orp.getContactperson().stream() - .map(c -> XmlSerializationUtils.asXmlElement("contactperson", c.getValue())) - .collect(Collectors.toList())); - } - - if (orp.getContactgroup() != null) { - metadata.addAll( - orp.getContactgroup().stream() - .map(c -> XmlSerializationUtils.asXmlElement("contactgroup", c.getValue())) - .collect(Collectors.toList())); - } - if (orp.getTool() != null) { - metadata.addAll( - orp.getTool().stream() - .map(c -> XmlSerializationUtils.asXmlElement("tool", c.getValue())) - .collect(Collectors.toList())); - } - break; - case software: - 
final Software s = (Software) entity; - - if (s.getDocumentationUrl() != null) { - metadata.addAll( - s.getDocumentationUrl().stream() - .map(c -> XmlSerializationUtils.asXmlElement("documentationUrl", c.getValue())) - .collect(Collectors.toList())); - } - if (s.getLicense() != null) { - metadata.addAll( - s.getLicense().stream() - .map(l -> XmlSerializationUtils.mapStructuredProperty("license", l)) - .collect(Collectors.toList())); - } - if (s.getCodeRepositoryUrl() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "codeRepositoryUrl", s.getCodeRepositoryUrl().getValue())); - } - if (s.getProgrammingLanguage() != null) { - metadata.add( - XmlSerializationUtils.mapQualifier( - "programmingLanguage", s.getProgrammingLanguage())); - } - break; - case datasource: - final Datasource ds = (Datasource) entity; - - if (ds.getDatasourcetype() != null) { - mapDatasourceType(metadata, ds.getDatasourcetype()); - } - if (ds.getOpenairecompatibility() != null) { - metadata.add( - XmlSerializationUtils.mapQualifier( - "openairecompatibility", ds.getOpenairecompatibility())); - } - if (ds.getOfficialname() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("officialname", ds.getOfficialname().getValue())); - } - if (ds.getEnglishname() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("englishname", ds.getEnglishname().getValue())); - } - if (ds.getWebsiteurl() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("websiteurl", ds.getWebsiteurl().getValue())); - } - if (ds.getLogourl() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("logourl", ds.getLogourl().getValue())); - } - if (ds.getContactemail() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("contactemail", ds.getContactemail().getValue())); - } - if (ds.getNamespaceprefix() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "namespaceprefix", ds.getNamespaceprefix().getValue())); - } - if (ds.getLatitude() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("latitude", ds.getLatitude().getValue())); - } - if (ds.getLongitude() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("longitude", ds.getLongitude().getValue())); - } - if (ds.getDateofvalidation() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "dateofvalidation", ds.getDateofvalidation().getValue())); - } - if (ds.getDescription() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("description", ds.getDescription().getValue())); - } - if (ds.getOdnumberofitems() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "odnumberofitems", ds.getOdnumberofitems().getValue())); - } - if (ds.getOdnumberofitemsdate() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "odnumberofitemsdate", ds.getOdnumberofitemsdate().getValue())); - } - if (ds.getOdpolicies() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("odpolicies", ds.getOdpolicies().getValue())); - } - if (ds.getOdlanguages() != null) { - metadata.addAll( - ds.getOdlanguages().stream() - .map(c -> XmlSerializationUtils.asXmlElement("odlanguages", c.getValue())) - .collect(Collectors.toList())); - } - if (ds.getOdcontenttypes() != null) { - metadata.addAll( - ds.getOdcontenttypes().stream() - .map(c -> XmlSerializationUtils.asXmlElement("odcontenttypes", c.getValue())) - .collect(Collectors.toList())); - } - if (ds.getAccessinfopackage() != null) { - metadata.addAll( - ds.getAccessinfopackage().stream() - .map(c -> 
XmlSerializationUtils.asXmlElement("accessinfopackage", c.getValue())) - .collect(Collectors.toList())); - } - if (ds.getReleaseenddate() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "releasestartdate", ds.getReleaseenddate().getValue())); - } - if (ds.getReleaseenddate() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "releaseenddate", ds.getReleaseenddate().getValue())); - } - if (ds.getMissionstatementurl() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "missionstatementurl", ds.getMissionstatementurl().getValue())); - } - if (ds.getDataprovider() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "dataprovider", ds.getDataprovider().getValue().toString())); - } - if (ds.getServiceprovider() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "serviceprovider", ds.getServiceprovider().getValue().toString())); - } - if (ds.getDatabaseaccesstype() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "databaseaccesstype", ds.getDatabaseaccesstype().getValue())); - } - if (ds.getDatauploadtype() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "datauploadtype", ds.getDatauploadtype().getValue())); - } - if (ds.getDatabaseaccessrestriction() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "databaseaccessrestriction", ds.getDatabaseaccessrestriction().getValue())); - } - if (ds.getDatauploadrestriction() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "datauploadrestriction", ds.getDatauploadrestriction().getValue())); - } - if (ds.getVersioning() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "versioning", ds.getVersioning().getValue().toString())); - } - if (ds.getCitationguidelineurl() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "citationguidelineurl", ds.getCitationguidelineurl().getValue())); - } - if (ds.getQualitymanagementkind() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "qualitymanagementkind", ds.getQualitymanagementkind().getValue())); - } - if (ds.getPidsystems() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("pidsystems", ds.getPidsystems().getValue())); - } - if (ds.getCertificates() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("certificates", ds.getCertificates().getValue())); - } - if (ds.getPolicies() != null) { - metadata.addAll( - ds.getPolicies().stream() - .map(kv -> XmlSerializationUtils.mapKeyValue("policies", kv)) - .collect(Collectors.toList())); - } - if (ds.getJournal() != null) { - metadata.add(XmlSerializationUtils.mapJournal(ds.getJournal())); - } - if (ds.getSubjects() != null) { - metadata.addAll( - ds.getSubjects().stream() - .map(sp -> XmlSerializationUtils.mapStructuredProperty("subjects", sp)) - .collect(Collectors.toList())); - } - - break; - case organization: - final Organization o = (Organization) entity; - - if (o.getLegalshortname() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "legalshortname", o.getLegalshortname().getValue())); - } - if (o.getLegalname() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("legalname", o.getLegalname().getValue())); - } - if (o.getAlternativeNames() != null) { - metadata.addAll( - o.getAlternativeNames().stream() - .map(c -> XmlSerializationUtils.asXmlElement("alternativeNames", c.getValue())) - .collect(Collectors.toList())); - } - if (o.getWebsiteurl() != null) { - metadata.add( - 
XmlSerializationUtils.asXmlElement("websiteurl", o.getWebsiteurl().getValue())); - } - if (o.getLogourl() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("websiteurl", o.getLogourl().getValue())); - } - - if (o.getEclegalbody() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("eclegalbody", o.getEclegalbody().getValue())); - } - if (o.getEclegalperson() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("eclegalperson", o.getEclegalperson().getValue())); - } - if (o.getEcnonprofit() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("ecnonprofit", o.getEcnonprofit().getValue())); - } - if (o.getEcresearchorganization() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "ecresearchorganization", o.getEcresearchorganization().getValue())); - } - if (o.getEchighereducation() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "echighereducation", o.getEchighereducation().getValue())); - } - if (o.getEcinternationalorganization() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "ecinternationalorganizationeurinterests", - o.getEcinternationalorganization().getValue())); - } - if (o.getEcinternationalorganization() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "ecinternationalorganization", o.getEcinternationalorganization().getValue())); - } - if (o.getEcenterprise() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("ecenterprise", o.getEcenterprise().getValue())); - } - if (o.getEcsmevalidated() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "ecsmevalidated", o.getEcsmevalidated().getValue())); - } - if (o.getEcnutscode() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("ecnutscode", o.getEcnutscode().getValue())); - } - if (o.getCountry() != null) { - metadata.add(XmlSerializationUtils.mapQualifier("country", o.getCountry())); - } - - break; - case project: - final Project p = (Project) entity; - - if (p.getWebsiteurl() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("websiteurl", p.getWebsiteurl().getValue())); - } - if (p.getCode() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("code", p.getCode().getValue())); - } - if (p.getAcronym() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("acronym", p.getAcronym().getValue())); - } - if (p.getTitle() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("title", p.getTitle().getValue())); - } - if (p.getStartdate() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("startdate", p.getStartdate().getValue())); - } - if (p.getEnddate() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("enddate", p.getEnddate().getValue())); - } - if (p.getCallidentifier() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "callidentifier", p.getCallidentifier().getValue())); - } - if (p.getKeywords() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("keywords", p.getKeywords().getValue())); - } - if (p.getDuration() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("duration", p.getDuration().getValue())); - } - if (p.getEcarticle29_3() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("ecarticle29_3", p.getEcarticle29_3().getValue())); - } - if (p.getSubjects() != null) { - metadata.addAll( - p.getSubjects().stream() - .map(sp -> XmlSerializationUtils.mapStructuredProperty("subject", sp)) - .collect(Collectors.toList())); - } - if (p.getContracttype() != null) { - 
metadata.add(XmlSerializationUtils.mapQualifier("contracttype", p.getContracttype())); - } - if (p.getEcsc39() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("ecsc39", p.getEcsc39().getValue())); - } - if (p.getContactfullname() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement( - "contactfullname", p.getContactfullname().getValue())); - } - if (p.getContactfax() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("contactfax", p.getContactfax().getValue())); - } - if (p.getContactphone() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("contactphone", p.getContactphone().getValue())); - } - if (p.getContactemail() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("contactemail", p.getContactemail().getValue())); - } - if (p.getSummary() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("summary", p.getSummary().getValue())); - } - if (p.getCurrency() != null) { - metadata.add(XmlSerializationUtils.asXmlElement("currency", p.getCurrency().getValue())); - } - if (p.getTotalcost() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("totalcost", p.getTotalcost().toString())); - } - if (p.getFundedamount() != null) { - metadata.add( - XmlSerializationUtils.asXmlElement("fundedamount", p.getFundedamount().toString())); - } - if (p.getFundingtree() != null) { - metadata.addAll( - p.getFundingtree().stream().map(ft -> ft.getValue()).collect(Collectors.toList())); - } - - break; - default: - throw new IllegalArgumentException("invalid entity type: " + type); - } - - return metadata; - } - - private void mapDatasourceType(List metadata, final Qualifier dsType) { - metadata.add(XmlSerializationUtils.mapQualifier("datasourcetype", dsType)); - - if (specialDatasourceTypes.contains(dsType.getClassid())) { - dsType.setClassid("other"); - dsType.setClassname("other"); - } - metadata.add(XmlSerializationUtils.mapQualifier("datasourcetypeui", dsType)); - } - - private String mapRelation(Tuple2 link, TemplateFactory templateFactory, Set contexts) { - final Relation rel = link.getRelation(); - final RelatedEntity re = link.getRelatedEntity(); - final String targetType = link.getRelatedEntity().getType(); - - final List metadata = Lists.newArrayList(); - switch (EntityType.valueOf(targetType)) { - case publication: - case dataset: - case otherresearchproduct: - case software: - if (re.getTitle() != null && isNotBlank(re.getTitle().getValue())) { - metadata.add(XmlSerializationUtils.mapStructuredProperty("title", re.getTitle())); - } - if (isNotBlank(re.getDateofacceptance())) { - metadata.add( - XmlSerializationUtils.asXmlElement("dateofacceptance", re.getDateofacceptance())); - } - if (isNotBlank(re.getPublisher())) { - metadata.add(XmlSerializationUtils.asXmlElement("publisher", re.getPublisher())); - } - if (isNotBlank(re.getCodeRepositoryUrl())) { - metadata.add( - XmlSerializationUtils.asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl())); - } - if (re.getResulttype() != null & re.getResulttype().isBlank()) { - metadata.add(XmlSerializationUtils.mapQualifier("resulttype", re.getResulttype())); - } - if (re.getCollectedfrom() != null) { - metadata.addAll( - re.getCollectedfrom().stream() - .map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv)) - .collect(Collectors.toList())); - } - if (re.getPid() != null) { - metadata.addAll( - re.getPid().stream() - .map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p)) - .collect(Collectors.toList())); - } - break; - case datasource: - if 
(isNotBlank(re.getOfficialname())) { - metadata.add(XmlSerializationUtils.asXmlElement("officialname", re.getOfficialname())); - } - if (re.getDatasourcetype() != null & !re.getDatasourcetype().isBlank()) { - mapDatasourceType(metadata, re.getDatasourcetype()); - } - if (re.getOpenairecompatibility() != null & !re.getOpenairecompatibility().isBlank()) { - metadata.add( - XmlSerializationUtils.mapQualifier( - "openairecompatibility", re.getOpenairecompatibility())); - } - break; - case organization: - if (isNotBlank(re.getLegalname())) { - metadata.add(XmlSerializationUtils.asXmlElement("legalname", re.getLegalname())); - } - if (isNotBlank(re.getLegalshortname())) { - metadata.add( - XmlSerializationUtils.asXmlElement("legalshortname", re.getLegalshortname())); - } - if (re.getCountry() != null & !re.getCountry().isBlank()) { - metadata.add(XmlSerializationUtils.mapQualifier("country", re.getCountry())); - } - break; - case project: - if (isNotBlank(re.getProjectTitle())) { - metadata.add(XmlSerializationUtils.asXmlElement("title", re.getProjectTitle())); - } - if (isNotBlank(re.getCode())) { - metadata.add(XmlSerializationUtils.asXmlElement("code", re.getCode())); - } - if (isNotBlank(re.getAcronym())) { - metadata.add(XmlSerializationUtils.asXmlElement("acronym", re.getAcronym())); - } - if (re.getContracttype() != null & !re.getContracttype().isBlank()) { - metadata.add(XmlSerializationUtils.mapQualifier("contracttype", re.getContracttype())); - } - if (re.getFundingtree() != null & contexts != null) { - metadata.addAll( - re.getFundingtree().stream() - .peek(ft -> fillContextMap(ft, contexts)) - .map(ft -> getRelFundingTree(ft)) - .collect(Collectors.toList())); - } - break; - default: - throw new IllegalArgumentException("invalid target type: " + targetType); - } - final DataInfo info = rel.getDataInfo(); - final String scheme = ModelSupport.getScheme(re.getType(), targetType); - - if (StringUtils.isBlank(scheme)) { - throw new IllegalArgumentException( - String.format("missing scheme for: <%s - %s>", re.getType(), targetType)); - } - - final String accumulatorName = - getRelDescriptor(rel.getRelType(), rel.getSubRelType(), rel.getRelClass()); - if (accumulators.containsKey(accumulatorName)) { - accumulators.get(accumulatorName).add(1); - } - - return templateFactory.getRel( - targetType, rel.getTarget(), Sets.newHashSet(metadata), rel.getRelClass(), scheme, info); - } - - private List listChildren( - final OafEntity entity, JoinedEntity je, TemplateFactory templateFactory) { - - final List children = Lists.newArrayList(); - EntityType entityType = EntityType.valueOf(je.getEntity().getType()); - - children.addAll( - je.getLinks().stream() - .filter(link -> REL_SUBTYPE_DEDUP.equalsIgnoreCase(link.getRelation().getSubRelType())) - .map(link -> mapRelation(link, templateFactory, null)) - .collect(Collectors.toCollection(ArrayList::new))); - - if (MainEntityType.result.toString().equals(ModelSupport.getMainType(entityType))) { - final List instances = ((Result) entity).getInstance(); - if (instances != null) { - for (final Instance instance : ((Result) entity).getInstance()) { - - final List fields = Lists.newArrayList(); - - if (instance.getAccessright() != null && !instance.getAccessright().isBlank()) { - fields.add( - XmlSerializationUtils.mapQualifier("accessright", instance.getAccessright())); - } - if (instance.getCollectedfrom() != null) { - fields.add( - XmlSerializationUtils.mapKeyValue("collectedfrom", instance.getCollectedfrom())); - } - if (instance.getHostedby() != null) { - 
fields.add(XmlSerializationUtils.mapKeyValue("hostedby", instance.getHostedby())); - } - if (instance.getDateofacceptance() != null - && isNotBlank(instance.getDateofacceptance().getValue())) { - fields.add( - XmlSerializationUtils.asXmlElement( - "dateofacceptance", instance.getDateofacceptance().getValue())); - } - if (instance.getInstancetype() != null && !instance.getInstancetype().isBlank()) { - fields.add( - XmlSerializationUtils.mapQualifier("instancetype", instance.getInstancetype())); - } - if (isNotBlank(instance.getDistributionlocation())) { - fields.add( - XmlSerializationUtils.asXmlElement( - "distributionlocation", instance.getDistributionlocation())); - } - if (instance.getRefereed() != null && isNotBlank(instance.getRefereed().getValue())) { - fields.add( - XmlSerializationUtils.asXmlElement("refereed", instance.getRefereed().getValue())); - } - if (instance.getProcessingchargeamount() != null - && isNotBlank(instance.getProcessingchargeamount().getValue())) { - fields.add( - XmlSerializationUtils.asXmlElement( - "processingchargeamount", instance.getProcessingchargeamount().getValue())); - } - if (instance.getProcessingchargecurrency() != null - && isNotBlank(instance.getProcessingchargecurrency().getValue())) { - fields.add( - XmlSerializationUtils.asXmlElement( - "processingchargecurrency", instance.getProcessingchargecurrency().getValue())); - } - - children.add( - templateFactory.getInstance( - instance.getHostedby().getKey(), fields, instance.getUrl())); - } - } - final List ext = ((Result) entity).getExternalReference(); - if (ext != null) { - for (final ExternalReference er : ((Result) entity).getExternalReference()) { - - final List fields = Lists.newArrayList(); - - if (isNotBlank(er.getSitename())) { - fields.add(XmlSerializationUtils.asXmlElement("sitename", er.getSitename())); - } - if (isNotBlank(er.getLabel())) { - fields.add(XmlSerializationUtils.asXmlElement("label", er.getLabel())); - } - if (isNotBlank(er.getUrl())) { - fields.add(XmlSerializationUtils.asXmlElement("url", er.getUrl())); - } - if (isNotBlank(er.getDescription())) { - fields.add(XmlSerializationUtils.asXmlElement("description", er.getDescription())); - } - if (isNotBlank(er.getUrl())) { - fields.add(XmlSerializationUtils.mapQualifier("qualifier", er.getQualifier())); - } - if (isNotBlank(er.getRefidentifier())) { - fields.add(XmlSerializationUtils.asXmlElement("refidentifier", er.getRefidentifier())); - } - if (isNotBlank(er.getQuery())) { - fields.add(XmlSerializationUtils.asXmlElement("query", er.getQuery())); - } - - children.add(templateFactory.getChild("externalreference", null, fields)); - } - } - } - - return children; - } - - private List listExtraInfo(OafEntity entity) { - final List extraInfo = entity.getExtraInfo(); - return extraInfo != null - ? 
extraInfo.stream() - .map(e -> XmlSerializationUtils.mapExtraInfo(e)) - .collect(Collectors.toList()) - : Lists.newArrayList(); - } - - private List buildContexts(final String type, final Set contexts) { - final List res = Lists.newArrayList(); - - if ((contextMapper != null) - && !contextMapper.isEmpty() - && MainEntityType.result.toString().equals(type)) { - - XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot"); - - for (final String context : contexts) { - - String id = ""; - for (final String token : Splitter.on("::").split(context)) { - id += token; - - final ContextDef def = contextMapper.get(id); - - if (def == null) { - continue; - // throw new IllegalStateException(String.format("cannot find context for id - // '%s'", - // id)); - } - - if (def.getName().equals("context")) { - final String xpath = "//context/@id='" + def.getId() + "'"; - if (!document.gotoRoot().rawXpathBoolean(xpath, new Object())) { - document = addContextDef(document.gotoRoot(), def); - } - } - - if (def.getName().equals("category")) { - final String rootId = substringBefore(def.getId(), "::"); - document = - addContextDef( - document.gotoRoot().gotoTag("//context[./@id='" + rootId + "']", new Object()), - def); - } - - if (def.getName().equals("concept")) { - document = addContextDef(document, def).gotoParent(); - } - id += "::"; - } - } - final Transformer transformer = getTransformer(); - for (final org.w3c.dom.Element x : document.gotoRoot().getChildElement()) { - try { - res.add(asStringElement(x, transformer)); - } catch (final TransformerException e) { - throw new RuntimeException(e); - } - } - } - - return res; - } - - private Transformer getTransformer() { - try { - Transformer transformer = TransformerFactory.newInstance().newTransformer(); - transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); - return transformer; - } catch (TransformerConfigurationException e) { - throw new IllegalStateException("unable to create javax.xml.transform.Transformer", e); - } - } - - private XMLTag addContextDef(final XMLTag tag, final ContextDef def) { - tag.addTag(def.getName()).addAttribute("id", def.getId()).addAttribute("label", def.getLabel()); - if ((def.getType() != null) && !def.getType().isEmpty()) { - tag.addAttribute("type", def.getType()); - } - return tag; - } - - private String asStringElement(final org.w3c.dom.Element element, final Transformer transformer) - throws TransformerException { - final StringWriter buffer = new StringWriter(); - transformer.transform(new DOMSource(element), new StreamResult(buffer)); - return buffer.toString(); - } - - private void fillContextMap(final String xmlTree, final Set contexts) { - - Document fundingPath; - try { - fundingPath = new SAXReader().read(new StringReader(xmlTree)); - } catch (final DocumentException e) { - throw new RuntimeException(e); - } - try { - final Node funder = fundingPath.selectSingleNode("//funder"); - - if (funder != null) { - - final String funderShortName = funder.valueOf("./shortname"); - contexts.add(funderShortName); - - contextMapper.put( - funderShortName, - new ContextDef(funderShortName, funder.valueOf("./name"), "context", "funding")); - final Node level0 = fundingPath.selectSingleNode("//funding_level_0"); - if (level0 != null) { - final String level0Id = Joiner.on("::").join(funderShortName, level0.valueOf("./name")); - contextMapper.put( - level0Id, new ContextDef(level0Id, level0.valueOf("./description"), "category", "")); - final Node level1 = fundingPath.selectSingleNode("//funding_level_1"); - 
if (level1 == null) { - contexts.add(level0Id); - } else { - final String level1Id = Joiner.on("::").join(level0Id, level1.valueOf("./name")); - contextMapper.put( - level1Id, new ContextDef(level1Id, level1.valueOf("./description"), "concept", "")); - final Node level2 = fundingPath.selectSingleNode("//funding_level_2"); - if (level2 == null) { - contexts.add(level1Id); - } else { - final String level2Id = Joiner.on("::").join(level1Id, level2.valueOf("./name")); - contextMapper.put( - level2Id, - new ContextDef(level2Id, level2.valueOf("./description"), "concept", "")); - contexts.add(level2Id); - } - } - } - } - } catch (final NullPointerException e) { - throw new IllegalArgumentException("malformed funding path: " + xmlTree, e); - } - } - - @SuppressWarnings("unchecked") - protected static String getRelFundingTree(final String xmlTree) { - String funding = ""; - try { - final Document ftree = new SAXReader().read(new StringReader(xmlTree)); - funding = ""; - - funding += getFunderElement(ftree); - - for (final Object o : - Lists.reverse( - ftree.selectNodes("//fundingtree//*[starts-with(local-name(),'funding_level_')]"))) { - final Element e = (Element) o; - final String _id = e.valueOf("./id"); - funding += - "<" - + e.getName() - + " name=\"" - + XmlSerializationUtils.escapeXml(e.valueOf("./name")) - + "\">" - + XmlSerializationUtils.escapeXml(_id) - + ""; - } - } catch (final DocumentException e) { - throw new IllegalArgumentException( - "unable to parse funding tree: " + xmlTree + "\n" + e.getMessage()); - } finally { - funding += ""; - } - return funding; - } - - private static String getFunderElement(final Document ftree) { - final String funderId = ftree.valueOf("//fundingtree/funder/id"); - final String funderShortName = ftree.valueOf("//fundingtree/funder/shortname"); - final String funderName = ftree.valueOf("//fundingtree/funder/name"); - final String funderJurisdiction = ftree.valueOf("//fundingtree/funder/jurisdiction"); - - return ""; - } + public static final String REL_SUBTYPE_DEDUP = "dedup"; + private Map accumulators; + + private Set specialDatasourceTypes; + + private ContextMapper contextMapper; + + private String schemaLocation; + + private boolean indent = false; + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public XmlRecordFactory( + final ContextMapper contextMapper, + final boolean indent, + final String schemaLocation, + final String otherDatasourceTypesUForUI) { + + this(Maps.newHashMap(), contextMapper, indent, schemaLocation, otherDatasourceTypesUForUI); + } + + public XmlRecordFactory( + final Map accumulators, + final ContextMapper contextMapper, + final boolean indent, + final String schemaLocation, + final String otherDatasourceTypesUForUI) { + + this.accumulators = accumulators; + this.contextMapper = contextMapper; + this.schemaLocation = schemaLocation; + this.specialDatasourceTypes = Sets.newHashSet(Splitter.on(",").trimResults().split(otherDatasourceTypesUForUI)); + + this.indent = indent; + } + + public String build(final JoinedEntity je) { + + final Set contexts = Sets.newHashSet(); + + final OafEntity entity = toOafEntity(je.getEntity()); + TemplateFactory templateFactory = new TemplateFactory(); + try { + final EntityType type = EntityType.valueOf(je.getEntity().getType()); + final List metadata = metadata(type, entity, contexts); + + // rels has to be processed before the contexts because they enrich the contextMap with + // the + // funding info. 
+ final List relations = je + .getLinks() + .stream() + .filter(t -> !REL_SUBTYPE_DEDUP.equalsIgnoreCase(t.getRelation().getSubRelType())) + .map(link -> mapRelation(link, templateFactory, contexts)) + .collect(Collectors.toCollection(ArrayList::new)); + + final String mainType = ModelSupport.getMainType(type); + metadata.addAll(buildContexts(mainType, contexts)); + metadata.add(XmlSerializationUtils.parseDataInfo(entity.getDataInfo())); + + final String body = templateFactory + .buildBody( + mainType, + metadata, + relations, + listChildren(entity, je, templateFactory), + listExtraInfo(entity)); + + return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent); + } catch (final Throwable e) { + throw new RuntimeException(String.format("error building record '%s'", entity.getId()), e); + } + } + + private static OafEntity toOafEntity(TypedRow typedRow) { + return parseOaf(typedRow.getOaf(), typedRow.getType()); + } + + private static OafEntity parseOaf(final String json, final String type) { + try { + switch (EntityType.valueOf(type)) { + case publication: + return OBJECT_MAPPER.readValue(json, Publication.class); + case dataset: + return OBJECT_MAPPER.readValue(json, Dataset.class); + case otherresearchproduct: + return OBJECT_MAPPER.readValue(json, OtherResearchProduct.class); + case software: + return OBJECT_MAPPER.readValue(json, Software.class); + case datasource: + return OBJECT_MAPPER.readValue(json, Datasource.class); + case organization: + return OBJECT_MAPPER.readValue(json, Organization.class); + case project: + return OBJECT_MAPPER.readValue(json, Project.class); + default: + throw new IllegalArgumentException("invalid type: " + type); + } + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + } + + private String printXML(String xml, boolean indent) { + try { + final Document doc = new SAXReader().read(new StringReader(xml)); + OutputFormat format = indent ? OutputFormat.createPrettyPrint() : OutputFormat.createCompactFormat(); + format.setExpandEmptyElements(false); + format.setSuppressDeclaration(true); + StringWriter sw = new StringWriter(); + XMLWriter writer = new XMLWriter(sw, format); + writer.write(doc); + return sw.toString(); + } catch (IOException | DocumentException e) { + throw new IllegalArgumentException("Unable to indent XML. 
Invalid record:\n" + xml, e); + } + } + + private List metadata( + final EntityType type, final OafEntity entity, final Set contexts) { + + final List metadata = Lists.newArrayList(); + + if (entity.getCollectedfrom() != null) { + metadata + .addAll( + entity + .getCollectedfrom() + .stream() + .map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv)) + .collect(Collectors.toList())); + } + if (entity.getOriginalId() != null) { + metadata + .addAll( + entity + .getOriginalId() + .stream() + .map(s -> XmlSerializationUtils.asXmlElement("originalId", s)) + .collect(Collectors.toList())); + } + if (entity.getPid() != null) { + metadata + .addAll( + entity + .getPid() + .stream() + .map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p)) + .collect(Collectors.toList())); + } + + if (ModelSupport.isResult(type)) { + final Result r = (Result) entity; + + if (r.getContext() != null) { + contexts.addAll(r.getContext().stream().map(c -> c.getId()).collect(Collectors.toList())); + /* FIXME: Workaround for CLARIN mining issue: #3670#note-29 */ + if (contexts.contains("dh-ch::subcommunity::2")) { + contexts.add("clarin"); + } + } + + if (r.getTitle() != null) { + metadata + .addAll( + r + .getTitle() + .stream() + .map(t -> XmlSerializationUtils.mapStructuredProperty("title", t)) + .collect(Collectors.toList())); + } + if (r.getBestaccessright() != null) { + metadata.add(XmlSerializationUtils.mapQualifier("bestaccessright", r.getBestaccessright())); + } + if (r.getAuthor() != null) { + metadata + .addAll( + r + .getAuthor() + .stream() + .map( + a -> { + final StringBuilder sb = new StringBuilder(" isNotBlank(sp.getQualifier().getClassid()) + && isNotBlank(sp.getValue())) + .forEach( + sp -> { + String pidType = XmlSerializationUtils + .escapeXml( + sp.getQualifier().getClassid()) + .replaceAll("\\W", ""); + String pidValue = XmlSerializationUtils.escapeXml(sp.getValue()); + + // ugly hack: some records + // provide swapped pidtype and + // pidvalue + if (authorPidTypes.contains(pidValue.toLowerCase().trim())) { + sb.append(String.format(" %s=\"%s\"", pidValue, pidType)); + } else { + pidType = pidType.replaceAll("\\W", "").replaceAll("\\d", ""); + if (isNotBlank(pidType)) { + sb + .append( + String + .format( + " %s=\"%s\"", + pidType, + pidValue + .toLowerCase() + .replaceAll("orcid", ""))); + } + } + }); + } + sb + .append( + ">" + XmlSerializationUtils.escapeXml(a.getFullname()) + ""); + return sb.toString(); + }) + .collect(Collectors.toList())); + } + if (r.getContributor() != null) { + metadata + .addAll( + r + .getContributor() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("contributor", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getCountry() != null) { + metadata + .addAll( + r + .getCountry() + .stream() + .map(c -> XmlSerializationUtils.mapQualifier("country", c)) + .collect(Collectors.toList())); + } + if (r.getCoverage() != null) { + metadata + .addAll( + r + .getCoverage() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("coverage", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getDateofacceptance() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "dateofacceptance", r.getDateofacceptance().getValue())); + } + if (r.getDescription() != null) { + metadata + .addAll( + r + .getDescription() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("description", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getEmbargoenddate() != null) { + metadata + .add( + 
XmlSerializationUtils.asXmlElement("embargoenddate", r.getEmbargoenddate().getValue())); + } + if (r.getSubject() != null) { + metadata + .addAll( + r + .getSubject() + .stream() + .map(s -> XmlSerializationUtils.mapStructuredProperty("subject", s)) + .collect(Collectors.toList())); + } + if (r.getLanguage() != null) { + metadata.add(XmlSerializationUtils.mapQualifier("language", r.getLanguage())); + } + if (r.getRelevantdate() != null) { + metadata + .addAll( + r + .getRelevantdate() + .stream() + .map(s -> XmlSerializationUtils.mapStructuredProperty("relevantdate", s)) + .collect(Collectors.toList())); + } + if (r.getPublisher() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("publisher", r.getPublisher().getValue())); + } + if (r.getSource() != null) { + metadata + .addAll( + r + .getSource() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("source", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getFormat() != null) { + metadata + .addAll( + r + .getFormat() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("format", c.getValue())) + .collect(Collectors.toList())); + } + if (r.getResulttype() != null) { + metadata.add(XmlSerializationUtils.mapQualifier("resulttype", r.getResulttype())); + } + if (r.getResourcetype() != null) { + metadata.add(XmlSerializationUtils.mapQualifier("resourcetype", r.getResourcetype())); + } + } + + switch (type) { + case publication: + final Publication pub = (Publication) entity; + + if (pub.getJournal() != null) { + final Journal j = pub.getJournal(); + metadata.add(XmlSerializationUtils.mapJournal(j)); + } + + break; + case dataset: + final Dataset d = (Dataset) entity; + if (d.getDevice() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("device", d.getDevice().getValue())); + } + if (d.getLastmetadataupdate() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "lastmetadataupdate", d.getLastmetadataupdate().getValue())); + } + if (d.getMetadataversionnumber() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "metadataversionnumber", d.getMetadataversionnumber().getValue())); + } + if (d.getSize() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("size", d.getSize().getValue())); + } + if (d.getStoragedate() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("storagedate", d.getStoragedate().getValue())); + } + if (d.getVersion() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("version", d.getVersion().getValue())); + } + // TODO d.getGeolocation() + + break; + case otherresearchproduct: + final OtherResearchProduct orp = (OtherResearchProduct) entity; + + if (orp.getContactperson() != null) { + metadata + .addAll( + orp + .getContactperson() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("contactperson", c.getValue())) + .collect(Collectors.toList())); + } + + if (orp.getContactgroup() != null) { + metadata + .addAll( + orp + .getContactgroup() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("contactgroup", c.getValue())) + .collect(Collectors.toList())); + } + if (orp.getTool() != null) { + metadata + .addAll( + orp + .getTool() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("tool", c.getValue())) + .collect(Collectors.toList())); + } + break; + case software: + final Software s = (Software) entity; + + if (s.getDocumentationUrl() != null) { + metadata + .addAll( + s + .getDocumentationUrl() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("documentationUrl", 
c.getValue())) + .collect(Collectors.toList())); + } + if (s.getLicense() != null) { + metadata + .addAll( + s + .getLicense() + .stream() + .map(l -> XmlSerializationUtils.mapStructuredProperty("license", l)) + .collect(Collectors.toList())); + } + if (s.getCodeRepositoryUrl() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "codeRepositoryUrl", s.getCodeRepositoryUrl().getValue())); + } + if (s.getProgrammingLanguage() != null) { + metadata + .add( + XmlSerializationUtils + .mapQualifier( + "programmingLanguage", s.getProgrammingLanguage())); + } + break; + case datasource: + final Datasource ds = (Datasource) entity; + + if (ds.getDatasourcetype() != null) { + mapDatasourceType(metadata, ds.getDatasourcetype()); + } + if (ds.getOpenairecompatibility() != null) { + metadata + .add( + XmlSerializationUtils + .mapQualifier( + "openairecompatibility", ds.getOpenairecompatibility())); + } + if (ds.getOfficialname() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("officialname", ds.getOfficialname().getValue())); + } + if (ds.getEnglishname() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("englishname", ds.getEnglishname().getValue())); + } + if (ds.getWebsiteurl() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("websiteurl", ds.getWebsiteurl().getValue())); + } + if (ds.getLogourl() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("logourl", ds.getLogourl().getValue())); + } + if (ds.getContactemail() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("contactemail", ds.getContactemail().getValue())); + } + if (ds.getNamespaceprefix() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "namespaceprefix", ds.getNamespaceprefix().getValue())); + } + if (ds.getLatitude() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("latitude", ds.getLatitude().getValue())); + } + if (ds.getLongitude() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("longitude", ds.getLongitude().getValue())); + } + if (ds.getDateofvalidation() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "dateofvalidation", ds.getDateofvalidation().getValue())); + } + if (ds.getDescription() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("description", ds.getDescription().getValue())); + } + if (ds.getOdnumberofitems() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "odnumberofitems", ds.getOdnumberofitems().getValue())); + } + if (ds.getOdnumberofitemsdate() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "odnumberofitemsdate", ds.getOdnumberofitemsdate().getValue())); + } + if (ds.getOdpolicies() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("odpolicies", ds.getOdpolicies().getValue())); + } + if (ds.getOdlanguages() != null) { + metadata + .addAll( + ds + .getOdlanguages() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("odlanguages", c.getValue())) + .collect(Collectors.toList())); + } + if (ds.getOdcontenttypes() != null) { + metadata + .addAll( + ds + .getOdcontenttypes() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("odcontenttypes", c.getValue())) + .collect(Collectors.toList())); + } + if (ds.getAccessinfopackage() != null) { + metadata + .addAll( + ds + .getAccessinfopackage() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("accessinfopackage", c.getValue())) + .collect(Collectors.toList())); + } + if 
(ds.getReleaseenddate() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "releasestartdate", ds.getReleaseenddate().getValue())); + } + if (ds.getReleaseenddate() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "releaseenddate", ds.getReleaseenddate().getValue())); + } + if (ds.getMissionstatementurl() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "missionstatementurl", ds.getMissionstatementurl().getValue())); + } + if (ds.getDataprovider() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "dataprovider", ds.getDataprovider().getValue().toString())); + } + if (ds.getServiceprovider() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "serviceprovider", ds.getServiceprovider().getValue().toString())); + } + if (ds.getDatabaseaccesstype() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "databaseaccesstype", ds.getDatabaseaccesstype().getValue())); + } + if (ds.getDatauploadtype() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "datauploadtype", ds.getDatauploadtype().getValue())); + } + if (ds.getDatabaseaccessrestriction() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "databaseaccessrestriction", ds.getDatabaseaccessrestriction().getValue())); + } + if (ds.getDatauploadrestriction() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "datauploadrestriction", ds.getDatauploadrestriction().getValue())); + } + if (ds.getVersioning() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "versioning", ds.getVersioning().getValue().toString())); + } + if (ds.getCitationguidelineurl() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "citationguidelineurl", ds.getCitationguidelineurl().getValue())); + } + if (ds.getQualitymanagementkind() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "qualitymanagementkind", ds.getQualitymanagementkind().getValue())); + } + if (ds.getPidsystems() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("pidsystems", ds.getPidsystems().getValue())); + } + if (ds.getCertificates() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("certificates", ds.getCertificates().getValue())); + } + if (ds.getPolicies() != null) { + metadata + .addAll( + ds + .getPolicies() + .stream() + .map(kv -> XmlSerializationUtils.mapKeyValue("policies", kv)) + .collect(Collectors.toList())); + } + if (ds.getJournal() != null) { + metadata.add(XmlSerializationUtils.mapJournal(ds.getJournal())); + } + if (ds.getSubjects() != null) { + metadata + .addAll( + ds + .getSubjects() + .stream() + .map(sp -> XmlSerializationUtils.mapStructuredProperty("subjects", sp)) + .collect(Collectors.toList())); + } + + break; + case organization: + final Organization o = (Organization) entity; + + if (o.getLegalshortname() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "legalshortname", o.getLegalshortname().getValue())); + } + if (o.getLegalname() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("legalname", o.getLegalname().getValue())); + } + if (o.getAlternativeNames() != null) { + metadata + .addAll( + o + .getAlternativeNames() + .stream() + .map(c -> XmlSerializationUtils.asXmlElement("alternativeNames", c.getValue())) + .collect(Collectors.toList())); + } + if (o.getWebsiteurl() != null) { + metadata + .add( + 
XmlSerializationUtils.asXmlElement("websiteurl", o.getWebsiteurl().getValue())); + } + if (o.getLogourl() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("websiteurl", o.getLogourl().getValue())); + } + + if (o.getEclegalbody() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("eclegalbody", o.getEclegalbody().getValue())); + } + if (o.getEclegalperson() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("eclegalperson", o.getEclegalperson().getValue())); + } + if (o.getEcnonprofit() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("ecnonprofit", o.getEcnonprofit().getValue())); + } + if (o.getEcresearchorganization() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "ecresearchorganization", o.getEcresearchorganization().getValue())); + } + if (o.getEchighereducation() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "echighereducation", o.getEchighereducation().getValue())); + } + if (o.getEcinternationalorganization() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "ecinternationalorganizationeurinterests", + o.getEcinternationalorganization().getValue())); + } + if (o.getEcinternationalorganization() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "ecinternationalorganization", o.getEcinternationalorganization().getValue())); + } + if (o.getEcenterprise() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("ecenterprise", o.getEcenterprise().getValue())); + } + if (o.getEcsmevalidated() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "ecsmevalidated", o.getEcsmevalidated().getValue())); + } + if (o.getEcnutscode() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("ecnutscode", o.getEcnutscode().getValue())); + } + if (o.getCountry() != null) { + metadata.add(XmlSerializationUtils.mapQualifier("country", o.getCountry())); + } + + break; + case project: + final Project p = (Project) entity; + + if (p.getWebsiteurl() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("websiteurl", p.getWebsiteurl().getValue())); + } + if (p.getCode() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("code", p.getCode().getValue())); + } + if (p.getAcronym() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("acronym", p.getAcronym().getValue())); + } + if (p.getTitle() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("title", p.getTitle().getValue())); + } + if (p.getStartdate() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("startdate", p.getStartdate().getValue())); + } + if (p.getEnddate() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("enddate", p.getEnddate().getValue())); + } + if (p.getCallidentifier() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "callidentifier", p.getCallidentifier().getValue())); + } + if (p.getKeywords() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("keywords", p.getKeywords().getValue())); + } + if (p.getDuration() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("duration", p.getDuration().getValue())); + } + if (p.getEcarticle29_3() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("ecarticle29_3", p.getEcarticle29_3().getValue())); + } + if (p.getSubjects() != null) { + metadata + .addAll( + p + .getSubjects() + .stream() + .map(sp -> XmlSerializationUtils.mapStructuredProperty("subject", sp)) + 
.collect(Collectors.toList())); + } + if (p.getContracttype() != null) { + metadata.add(XmlSerializationUtils.mapQualifier("contracttype", p.getContracttype())); + } + if (p.getEcsc39() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("ecsc39", p.getEcsc39().getValue())); + } + if (p.getContactfullname() != null) { + metadata + .add( + XmlSerializationUtils + .asXmlElement( + "contactfullname", p.getContactfullname().getValue())); + } + if (p.getContactfax() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("contactfax", p.getContactfax().getValue())); + } + if (p.getContactphone() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("contactphone", p.getContactphone().getValue())); + } + if (p.getContactemail() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("contactemail", p.getContactemail().getValue())); + } + if (p.getSummary() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("summary", p.getSummary().getValue())); + } + if (p.getCurrency() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("currency", p.getCurrency().getValue())); + } + if (p.getTotalcost() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("totalcost", p.getTotalcost().toString())); + } + if (p.getFundedamount() != null) { + metadata + .add( + XmlSerializationUtils.asXmlElement("fundedamount", p.getFundedamount().toString())); + } + if (p.getFundingtree() != null) { + metadata + .addAll( + p.getFundingtree().stream().map(ft -> ft.getValue()).collect(Collectors.toList())); + } + + break; + default: + throw new IllegalArgumentException("invalid entity type: " + type); + } + + return metadata; + } + + private void mapDatasourceType(List metadata, final Qualifier dsType) { + metadata.add(XmlSerializationUtils.mapQualifier("datasourcetype", dsType)); + + if (specialDatasourceTypes.contains(dsType.getClassid())) { + dsType.setClassid("other"); + dsType.setClassname("other"); + } + metadata.add(XmlSerializationUtils.mapQualifier("datasourcetypeui", dsType)); + } + + private String mapRelation(Tuple2 link, TemplateFactory templateFactory, Set contexts) { + final Relation rel = link.getRelation(); + final RelatedEntity re = link.getRelatedEntity(); + final String targetType = link.getRelatedEntity().getType(); + + final List metadata = Lists.newArrayList(); + switch (EntityType.valueOf(targetType)) { + case publication: + case dataset: + case otherresearchproduct: + case software: + if (re.getTitle() != null && isNotBlank(re.getTitle().getValue())) { + metadata.add(XmlSerializationUtils.mapStructuredProperty("title", re.getTitle())); + } + if (isNotBlank(re.getDateofacceptance())) { + metadata + .add( + XmlSerializationUtils.asXmlElement("dateofacceptance", re.getDateofacceptance())); + } + if (isNotBlank(re.getPublisher())) { + metadata.add(XmlSerializationUtils.asXmlElement("publisher", re.getPublisher())); + } + if (isNotBlank(re.getCodeRepositoryUrl())) { + metadata + .add( + XmlSerializationUtils.asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl())); + } + if (re.getResulttype() != null && !re.getResulttype().isBlank()) { + metadata.add(XmlSerializationUtils.mapQualifier("resulttype", re.getResulttype())); + } + if (re.getCollectedfrom() != null) { + metadata + .addAll( + re + .getCollectedfrom() + .stream() + .map(kv -> XmlSerializationUtils.mapKeyValue("collectedfrom", kv)) + .collect(Collectors.toList())); + } + if (re.getPid() != null) { + metadata + .addAll( + re + .getPid() + .stream() + .map(p -> 
XmlSerializationUtils.mapStructuredProperty("pid", p)) + .collect(Collectors.toList())); + } + break; + case datasource: + if (isNotBlank(re.getOfficialname())) { + metadata.add(XmlSerializationUtils.asXmlElement("officialname", re.getOfficialname())); + } + if (re.getDatasourcetype() != null & !re.getDatasourcetype().isBlank()) { + mapDatasourceType(metadata, re.getDatasourcetype()); + } + if (re.getOpenairecompatibility() != null & !re.getOpenairecompatibility().isBlank()) { + metadata + .add( + XmlSerializationUtils + .mapQualifier( + "openairecompatibility", re.getOpenairecompatibility())); + } + break; + case organization: + if (isNotBlank(re.getLegalname())) { + metadata.add(XmlSerializationUtils.asXmlElement("legalname", re.getLegalname())); + } + if (isNotBlank(re.getLegalshortname())) { + metadata + .add( + XmlSerializationUtils.asXmlElement("legalshortname", re.getLegalshortname())); + } + if (re.getCountry() != null & !re.getCountry().isBlank()) { + metadata.add(XmlSerializationUtils.mapQualifier("country", re.getCountry())); + } + break; + case project: + if (isNotBlank(re.getProjectTitle())) { + metadata.add(XmlSerializationUtils.asXmlElement("title", re.getProjectTitle())); + } + if (isNotBlank(re.getCode())) { + metadata.add(XmlSerializationUtils.asXmlElement("code", re.getCode())); + } + if (isNotBlank(re.getAcronym())) { + metadata.add(XmlSerializationUtils.asXmlElement("acronym", re.getAcronym())); + } + if (re.getContracttype() != null & !re.getContracttype().isBlank()) { + metadata.add(XmlSerializationUtils.mapQualifier("contracttype", re.getContracttype())); + } + if (re.getFundingtree() != null & contexts != null) { + metadata + .addAll( + re + .getFundingtree() + .stream() + .peek(ft -> fillContextMap(ft, contexts)) + .map(ft -> getRelFundingTree(ft)) + .collect(Collectors.toList())); + } + break; + default: + throw new IllegalArgumentException("invalid target type: " + targetType); + } + final DataInfo info = rel.getDataInfo(); + final String scheme = ModelSupport.getScheme(re.getType(), targetType); + + if (StringUtils.isBlank(scheme)) { + throw new IllegalArgumentException( + String.format("missing scheme for: <%s - %s>", re.getType(), targetType)); + } + + final String accumulatorName = getRelDescriptor(rel.getRelType(), rel.getSubRelType(), rel.getRelClass()); + if (accumulators.containsKey(accumulatorName)) { + accumulators.get(accumulatorName).add(1); + } + + return templateFactory + .getRel( + targetType, rel.getTarget(), Sets.newHashSet(metadata), rel.getRelClass(), scheme, info); + } + + private List listChildren( + final OafEntity entity, JoinedEntity je, TemplateFactory templateFactory) { + + final List children = Lists.newArrayList(); + EntityType entityType = EntityType.valueOf(je.getEntity().getType()); + + children + .addAll( + je + .getLinks() + .stream() + .filter(link -> REL_SUBTYPE_DEDUP.equalsIgnoreCase(link.getRelation().getSubRelType())) + .map(link -> mapRelation(link, templateFactory, null)) + .collect(Collectors.toCollection(ArrayList::new))); + + if (MainEntityType.result.toString().equals(ModelSupport.getMainType(entityType))) { + final List instances = ((Result) entity).getInstance(); + if (instances != null) { + for (final Instance instance : ((Result) entity).getInstance()) { + + final List fields = Lists.newArrayList(); + + if (instance.getAccessright() != null && !instance.getAccessright().isBlank()) { + fields + .add( + XmlSerializationUtils.mapQualifier("accessright", instance.getAccessright())); + } + if 
(instance.getCollectedfrom() != null) { + fields + .add( + XmlSerializationUtils.mapKeyValue("collectedfrom", instance.getCollectedfrom())); + } + if (instance.getHostedby() != null) { + fields.add(XmlSerializationUtils.mapKeyValue("hostedby", instance.getHostedby())); + } + if (instance.getDateofacceptance() != null + && isNotBlank(instance.getDateofacceptance().getValue())) { + fields + .add( + XmlSerializationUtils + .asXmlElement( + "dateofacceptance", instance.getDateofacceptance().getValue())); + } + if (instance.getInstancetype() != null && !instance.getInstancetype().isBlank()) { + fields + .add( + XmlSerializationUtils.mapQualifier("instancetype", instance.getInstancetype())); + } + if (isNotBlank(instance.getDistributionlocation())) { + fields + .add( + XmlSerializationUtils + .asXmlElement( + "distributionlocation", instance.getDistributionlocation())); + } + if (instance.getRefereed() != null && isNotBlank(instance.getRefereed().getValue())) { + fields + .add( + XmlSerializationUtils.asXmlElement("refereed", instance.getRefereed().getValue())); + } + if (instance.getProcessingchargeamount() != null + && isNotBlank(instance.getProcessingchargeamount().getValue())) { + fields + .add( + XmlSerializationUtils + .asXmlElement( + "processingchargeamount", instance.getProcessingchargeamount().getValue())); + } + if (instance.getProcessingchargecurrency() != null + && isNotBlank(instance.getProcessingchargecurrency().getValue())) { + fields + .add( + XmlSerializationUtils + .asXmlElement( + "processingchargecurrency", instance.getProcessingchargecurrency().getValue())); + } + + children + .add( + templateFactory + .getInstance( + instance.getHostedby().getKey(), fields, instance.getUrl())); + } + } + final List ext = ((Result) entity).getExternalReference(); + if (ext != null) { + for (final ExternalReference er : ((Result) entity).getExternalReference()) { + + final List fields = Lists.newArrayList(); + + if (isNotBlank(er.getSitename())) { + fields.add(XmlSerializationUtils.asXmlElement("sitename", er.getSitename())); + } + if (isNotBlank(er.getLabel())) { + fields.add(XmlSerializationUtils.asXmlElement("label", er.getLabel())); + } + if (isNotBlank(er.getUrl())) { + fields.add(XmlSerializationUtils.asXmlElement("url", er.getUrl())); + } + if (isNotBlank(er.getDescription())) { + fields.add(XmlSerializationUtils.asXmlElement("description", er.getDescription())); + } + if (er.getQualifier() != null && !er.getQualifier().isBlank()) { + fields.add(XmlSerializationUtils.mapQualifier("qualifier", er.getQualifier())); + } + if (isNotBlank(er.getRefidentifier())) { + fields.add(XmlSerializationUtils.asXmlElement("refidentifier", er.getRefidentifier())); + } + if (isNotBlank(er.getQuery())) { + fields.add(XmlSerializationUtils.asXmlElement("query", er.getQuery())); + } + + children.add(templateFactory.getChild("externalreference", null, fields)); + } + } + } + + return children; + } + + private List listExtraInfo(OafEntity entity) { + final List extraInfo = entity.getExtraInfo(); + return extraInfo != null + ? 
extraInfo + .stream() + .map(e -> XmlSerializationUtils.mapExtraInfo(e)) + .collect(Collectors.toList()) + : Lists.newArrayList(); + } + + private List buildContexts(final String type, final Set contexts) { + final List res = Lists.newArrayList(); + + if ((contextMapper != null) + && !contextMapper.isEmpty() + && MainEntityType.result.toString().equals(type)) { + + XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot"); + + for (final String context : contexts) { + + String id = ""; + for (final String token : Splitter.on("::").split(context)) { + id += token; + + final ContextDef def = contextMapper.get(id); + + if (def == null) { + continue; + // throw new IllegalStateException(String.format("cannot find context for id + // '%s'", + // id)); + } + + if (def.getName().equals("context")) { + final String xpath = "//context/@id='" + def.getId() + "'"; + if (!document.gotoRoot().rawXpathBoolean(xpath, new Object())) { + document = addContextDef(document.gotoRoot(), def); + } + } + + if (def.getName().equals("category")) { + final String rootId = substringBefore(def.getId(), "::"); + document = addContextDef( + document.gotoRoot().gotoTag("//context[./@id='" + rootId + "']", new Object()), + def); + } + + if (def.getName().equals("concept")) { + document = addContextDef(document, def).gotoParent(); + } + id += "::"; + } + } + final Transformer transformer = getTransformer(); + for (final org.w3c.dom.Element x : document.gotoRoot().getChildElement()) { + try { + res.add(asStringElement(x, transformer)); + } catch (final TransformerException e) { + throw new RuntimeException(e); + } + } + } + + return res; + } + + private Transformer getTransformer() { + try { + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + return transformer; + } catch (TransformerConfigurationException e) { + throw new IllegalStateException("unable to create javax.xml.transform.Transformer", e); + } + } + + private XMLTag addContextDef(final XMLTag tag, final ContextDef def) { + tag.addTag(def.getName()).addAttribute("id", def.getId()).addAttribute("label", def.getLabel()); + if ((def.getType() != null) && !def.getType().isEmpty()) { + tag.addAttribute("type", def.getType()); + } + return tag; + } + + private String asStringElement(final org.w3c.dom.Element element, final Transformer transformer) + throws TransformerException { + final StringWriter buffer = new StringWriter(); + transformer.transform(new DOMSource(element), new StreamResult(buffer)); + return buffer.toString(); + } + + private void fillContextMap(final String xmlTree, final Set contexts) { + + Document fundingPath; + try { + fundingPath = new SAXReader().read(new StringReader(xmlTree)); + } catch (final DocumentException e) { + throw new RuntimeException(e); + } + try { + final Node funder = fundingPath.selectSingleNode("//funder"); + + if (funder != null) { + + final String funderShortName = funder.valueOf("./shortname"); + contexts.add(funderShortName); + + contextMapper + .put( + funderShortName, + new ContextDef(funderShortName, funder.valueOf("./name"), "context", "funding")); + final Node level0 = fundingPath.selectSingleNode("//funding_level_0"); + if (level0 != null) { + final String level0Id = Joiner.on("::").join(funderShortName, level0.valueOf("./name")); + contextMapper + .put( + level0Id, new ContextDef(level0Id, level0.valueOf("./description"), "category", "")); + final Node level1 = 
fundingPath.selectSingleNode("//funding_level_1"); + if (level1 == null) { + contexts.add(level0Id); + } else { + final String level1Id = Joiner.on("::").join(level0Id, level1.valueOf("./name")); + contextMapper + .put( + level1Id, new ContextDef(level1Id, level1.valueOf("./description"), "concept", "")); + final Node level2 = fundingPath.selectSingleNode("//funding_level_2"); + if (level2 == null) { + contexts.add(level1Id); + } else { + final String level2Id = Joiner.on("::").join(level1Id, level2.valueOf("./name")); + contextMapper + .put( + level2Id, + new ContextDef(level2Id, level2.valueOf("./description"), "concept", "")); + contexts.add(level2Id); + } + } + } + } + } catch (final NullPointerException e) { + throw new IllegalArgumentException("malformed funding path: " + xmlTree, e); + } + } + + @SuppressWarnings("unchecked") + protected static String getRelFundingTree(final String xmlTree) { + String funding = ""; + try { + final Document ftree = new SAXReader().read(new StringReader(xmlTree)); + funding = ""; + + funding += getFunderElement(ftree); + + for (final Object o : Lists + .reverse( + ftree.selectNodes("//fundingtree//*[starts-with(local-name(),'funding_level_')]"))) { + final Element e = (Element) o; + final String _id = e.valueOf("./id"); + funding += "<" + + e.getName() + + " name=\"" + + XmlSerializationUtils.escapeXml(e.valueOf("./name")) + + "\">" + + XmlSerializationUtils.escapeXml(_id) + + ""; + } + } catch (final DocumentException e) { + throw new IllegalArgumentException( + "unable to parse funding tree: " + xmlTree + "\n" + e.getMessage()); + } finally { + funding += ""; + } + return funding; + } + + private static String getFunderElement(final Document ftree) { + final String funderId = ftree.valueOf("//fundingtree/funder/id"); + final String funderShortName = ftree.valueOf("//fundingtree/funder/shortname"); + final String funderName = ftree.valueOf("//fundingtree/funder/name"); + final String funderJurisdiction = ftree.valueOf("//fundingtree/funder/jurisdiction"); + + return ""; + } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java index 0b3109bde..bc3b3107d 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.oa.provision.utils; import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix; @@ -8,149 +9,151 @@ import eu.dnetlib.dhp.schema.oaf.*; public class XmlSerializationUtils { - // XML 1.0 - // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] - private static final String xml10pattern = - "[^" + "\u0009\r\n" + "\u0020-\uD7FF" + "\uE000-\uFFFD" + "\ud800\udc00-\udbff\udfff" + "]"; + // XML 1.0 + // #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] + private static final String xml10pattern = "[^" + "\u0009\r\n" + "\u0020-\uD7FF" + "\uE000-\uFFFD" + + "\ud800\udc00-\udbff\udfff" + "]"; - public static String mapJournal(Journal j) { - final String attrs = - new StringBuilder() - .append(attr("issn", j.getIssnPrinted())) - .append(attr("eissn", j.getIssnOnline())) - .append(attr("lissn", j.getIssnLinking())) - .append(attr("ep", j.getEp())) - .append(attr("iss", j.getIss())) - .append(attr("sp", 
j.getSp())) - .append(attr("vol", j.getVol())) - .toString() - .trim(); + public static String mapJournal(Journal j) { + final String attrs = new StringBuilder() + .append(attr("issn", j.getIssnPrinted())) + .append(attr("eissn", j.getIssnOnline())) + .append(attr("lissn", j.getIssnLinking())) + .append(attr("ep", j.getEp())) + .append(attr("iss", j.getIss())) + .append(attr("sp", j.getSp())) + .append(attr("vol", j.getVol())) + .toString() + .trim(); - return new StringBuilder() - .append("") - .append(escapeXml(j.getName())) - .append("") - .toString(); - } + return new StringBuilder() + .append("") + .append(escapeXml(j.getName())) + .append("") + .toString(); + } - private static String attr(final String name, final String value) { - return isNotBlank(value) ? name + "=\"" + escapeXml(value) + "\" " : ""; - } + private static String attr(final String name, final String value) { + return isNotBlank(value) ? name + "=\"" + escapeXml(value) + "\" " : ""; + } - public static String mapStructuredProperty(String name, StructuredProperty t) { - return asXmlElement( - name, t.getValue(), t.getQualifier(), t.getDataInfo() != null ? t.getDataInfo() : null); - } + public static String mapStructuredProperty(String name, StructuredProperty t) { + return asXmlElement( + name, t.getValue(), t.getQualifier(), t.getDataInfo() != null ? t.getDataInfo() : null); + } - public static String mapQualifier(String name, Qualifier q) { - return asXmlElement(name, "", q, null); - } + public static String mapQualifier(String name, Qualifier q) { + return asXmlElement(name, "", q, null); + } - public static String escapeXml(final String value) { - return value - .replaceAll("&", "&") - .replaceAll("<", "<") - .replaceAll(">", ">") - .replaceAll("\"", """) - .replaceAll("'", "'") - .replaceAll(xml10pattern, ""); - } + public static String escapeXml(final String value) { + return value + .replaceAll("&", "&") + .replaceAll("<", "<") + .replaceAll(">", ">") + .replaceAll("\"", """) + .replaceAll("'", "'") + .replaceAll(xml10pattern, ""); + } - public static String parseDataInfo(final DataInfo dataInfo) { - return new StringBuilder() - .append("") - .append(asXmlElement("inferred", dataInfo.getInferred() + "")) - .append(asXmlElement("deletedbyinference", dataInfo.getDeletedbyinference() + "")) - .append(asXmlElement("trust", dataInfo.getTrust() + "")) - .append(asXmlElement("inferenceprovenance", dataInfo.getInferenceprovenance() + "")) - .append(asXmlElement("provenanceaction", null, dataInfo.getProvenanceaction(), null)) - .append("") - .toString(); - } + public static String parseDataInfo(final DataInfo dataInfo) { + return new StringBuilder() + .append("") + .append(asXmlElement("inferred", dataInfo.getInferred() + "")) + .append(asXmlElement("deletedbyinference", dataInfo.getDeletedbyinference() + "")) + .append(asXmlElement("trust", dataInfo.getTrust() + "")) + .append(asXmlElement("inferenceprovenance", dataInfo.getInferenceprovenance() + "")) + .append(asXmlElement("provenanceaction", null, dataInfo.getProvenanceaction(), null)) + .append("") + .toString(); + } - private static StringBuilder dataInfoAsAttributes(final StringBuilder sb, final DataInfo info) { - return sb.append( - attr("inferred", info.getInferred() != null ? info.getInferred().toString() : "")) - .append(attr("inferenceprovenance", info.getInferenceprovenance())) - .append( - attr( - "provenanceaction", - info.getProvenanceaction() != null ? 
info.getProvenanceaction().getClassid() : "")) - .append(attr("trust", info.getTrust())); - } + private static StringBuilder dataInfoAsAttributes(final StringBuilder sb, final DataInfo info) { + return sb + .append( + attr("inferred", info.getInferred() != null ? info.getInferred().toString() : "")) + .append(attr("inferenceprovenance", info.getInferenceprovenance())) + .append( + attr( + "provenanceaction", + info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : "")) + .append(attr("trust", info.getTrust())); + } - public static String mapKeyValue(final String name, final KeyValue kv) { - return new StringBuilder() - .append("<") - .append(name) - .append(" name=\"") - .append(escapeXml(kv.getValue())) - .append("\" id=\"") - .append(escapeXml(removePrefix(kv.getKey()))) - .append("\"/>") - .toString(); - } + public static String mapKeyValue(final String name, final KeyValue kv) { + return new StringBuilder() + .append("<") + .append(name) + .append(" name=\"") + .append(escapeXml(kv.getValue())) + .append("\" id=\"") + .append(escapeXml(removePrefix(kv.getKey()))) + .append("\"/>") + .toString(); + } - public static String mapExtraInfo(final ExtraInfo e) { - return new StringBuilder("") - .append(e.getValue()) - .append("") - .toString(); - } + public static String mapExtraInfo(final ExtraInfo e) { + return new StringBuilder("") + .append(e.getValue()) + .append("") + .toString(); + } - public static String asXmlElement(final String name, final String value) { - return asXmlElement(name, value, null, null); - } + public static String asXmlElement(final String name, final String value) { + return asXmlElement(name, value, null, null); + } - public static String asXmlElement( - final String name, final String value, final Qualifier q, final DataInfo info) { - StringBuilder sb = new StringBuilder(); - sb.append("<"); - sb.append(name); - if (q != null) { - sb.append(getAttributes(q)); - } - if (info != null) { - sb.append(" ") - .append(attr("inferred", info.getInferred() != null ? info.getInferred().toString() : "")) - .append(attr("inferenceprovenance", info.getInferenceprovenance())) - .append( - attr( - "provenanceaction", - info.getProvenanceaction() != null - ? info.getProvenanceaction().getClassid() - : "")) - .append(attr("trust", info.getTrust())); - } - if (isBlank(value)) { - sb.append("/>"); - return sb.toString(); - } + public static String asXmlElement( + final String name, final String value, final Qualifier q, final DataInfo info) { + StringBuilder sb = new StringBuilder(); + sb.append("<"); + sb.append(name); + if (q != null) { + sb.append(getAttributes(q)); + } + if (info != null) { + sb + .append(" ") + .append(attr("inferred", info.getInferred() != null ? info.getInferred().toString() : "")) + .append(attr("inferenceprovenance", info.getInferenceprovenance())) + .append( + attr( + "provenanceaction", + info.getProvenanceaction() != null + ? 
info.getProvenanceaction().getClassid() + : "")) + .append(attr("trust", info.getTrust())); + } + if (isBlank(value)) { + sb.append("/>"); + return sb.toString(); + } - sb.append(">"); - sb.append(escapeXml(value)); - sb.append(""); + sb.append(">"); + sb.append(escapeXml(value)); + sb.append(""); - return sb.toString(); - } + return sb.toString(); + } - public static String getAttributes(final Qualifier q) { - if (q == null || q.isBlank()) return ""; + public static String getAttributes(final Qualifier q) { + if (q == null || q.isBlank()) + return ""; - return new StringBuilder(" ") - .append(attr("classid", q.getClassid())) - .append(attr("classname", q.getClassname())) - .append(attr("schemeid", q.getSchemeid())) - .append(attr("schemename", q.getSchemename())) - .toString(); - } + return new StringBuilder(" ") + .append(attr("classid", q.getClassid())) + .append(attr("classname", q.getClassname())) + .append(attr("schemeid", q.getSchemeid())) + .append(attr("schemename", q.getSchemename())) + .toString(); + } } diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/GraphJoinerTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/GraphJoinerTest.java index 21feb1637..8afe03d6d 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/GraphJoinerTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/GraphJoinerTest.java @@ -1,39 +1,42 @@ + package eu.dnetlib.dhp.oa.provision; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; + import org.junit.jupiter.api.BeforeEach; public class GraphJoinerTest { - private ClassLoader cl = getClass().getClassLoader(); - private Path workingDir; - private Path inputDir; - private Path outputDir; + private ClassLoader cl = getClass().getClassLoader(); + private Path workingDir; + private Path inputDir; + private Path outputDir; - @BeforeEach - public void before() throws IOException { - workingDir = Files.createTempDirectory("promote_action_set"); - inputDir = workingDir.resolve("input"); - outputDir = workingDir.resolve("output"); - } + @BeforeEach + public void before() throws IOException { + workingDir = Files.createTempDirectory("promote_action_set"); + inputDir = workingDir.resolve("input"); + outputDir = workingDir.resolve("output"); + } - private static void copyFiles(Path source, Path target) throws IOException { - Files.list(source) - .forEach( - f -> { - try { - if (Files.isDirectory(f)) { - Path subTarget = Files.createDirectories(target.resolve(f.getFileName())); - copyFiles(f, subTarget); - } else { - Files.copy(f, target.resolve(f.getFileName())); - } - } catch (IOException e) { - e.printStackTrace(); - throw new RuntimeException(e); - } - }); - } + private static void copyFiles(Path source, Path target) throws IOException { + Files + .list(source) + .forEach( + f -> { + try { + if (Files.isDirectory(f)) { + Path subTarget = Files.createDirectories(target.resolve(f.getFileName())); + copyFiles(f, subTarget); + } else { + Files.copy(f, target.resolve(f.getFileName())); + } + } catch (IOException e) { + e.printStackTrace(); + throw new RuntimeException(e); + } + }); + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/PropagationConstant.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/PropagationConstant.java index 1a0d98182..a2fb0c0ef 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/PropagationConstant.java +++ 
b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/PropagationConstant.java @@ -1,12 +1,9 @@ + package eu.dnetlib.dhp; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; -import eu.dnetlib.dhp.schema.oaf.*; import java.io.IOException; import java.util.*; + import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.spark.api.java.function.MapFunction; @@ -14,190 +11,189 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; +import eu.dnetlib.dhp.schema.oaf.*; + public class PropagationConstant { - public static final String INSTITUTIONAL_REPO_TYPE = "pubsrepository::institutional"; + public static final String INSTITUTIONAL_REPO_TYPE = "pubsrepository::institutional"; - public static final String PROPAGATION_DATA_INFO_TYPE = "propagation"; + public static final String PROPAGATION_DATA_INFO_TYPE = "propagation"; - public static final String TRUE = "true"; + public static final String TRUE = "true"; - public static final String DNET_COUNTRY_SCHEMA = "dnet:countries"; - public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions"; - public static final String DNET_SCHEMA_ID = "dnet:provenanceActions"; + public static final String DNET_COUNTRY_SCHEMA = "dnet:countries"; + public static final String DNET_SCHEMA_NAME = "dnet:provenanceActions"; + public static final String DNET_SCHEMA_ID = "dnet:provenanceActions"; - public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_ID = "country:instrepos"; - public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME = - "Propagation of country to result collected from datasources of type institutional repositories"; + public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_ID = "country:instrepos"; + public static final String PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME = "Propagation of country to result collected from datasources of type institutional repositories"; - public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID = - "result:organization:instrepo"; - public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME = - "Propagation of affiliation to result collected from datasources of type institutional repository"; + public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID = "result:organization:instrepo"; + public static final String PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME = "Propagation of affiliation to result collected from datasources of type institutional repository"; - public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID = - "result:project:semrel"; - public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME = - "Propagation of result to project through semantic relation"; + public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID = "result:project:semrel"; + public static final String PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME = "Propagation of result to project through semantic relation"; - public 
static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID = - "result:community:semrel"; - public static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME = - " Propagation of result belonging to community through semantic relation"; + public static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID = "result:community:semrel"; + public static final String PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME = " Propagation of result belonging to community through semantic relation"; - public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID = - "result:community:organization"; - public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME = - " Propagation of result belonging to community through organization"; + public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID = "result:community:organization"; + public static final String PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME = " Propagation of result belonging to community through organization"; - public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID = - "authorpid:result"; - public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME = - "Propagation of authors pid to result through semantic relations"; + public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID = "authorpid:result"; + public static final String PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME = "Propagation of authors pid to result through semantic relations"; - public static final String RELATION_DATASOURCE_ORGANIZATION_REL_CLASS = "provides"; + public static final String RELATION_DATASOURCE_ORGANIZATION_REL_CLASS = "provides"; - public static final String RELATION_RESULTORGANIZATION_REL_TYPE = "resultOrganization"; - public static final String RELATION_RESULTORGANIZATION_SUBREL_TYPE = "affiliation"; - public static final String RELATION_ORGANIZATION_RESULT_REL_CLASS = "isAuthorInstitutionOf"; - public static final String RELATION_RESULT_ORGANIZATION_REL_CLASS = "hasAuthorInstitution"; + public static final String RELATION_RESULTORGANIZATION_REL_TYPE = "resultOrganization"; + public static final String RELATION_RESULTORGANIZATION_SUBREL_TYPE = "affiliation"; + public static final String RELATION_ORGANIZATION_RESULT_REL_CLASS = "isAuthorInstitutionOf"; + public static final String RELATION_RESULT_ORGANIZATION_REL_CLASS = "hasAuthorInstitution"; - public static final String RELATION_RESULTRESULT_REL_TYPE = "resultResult"; + public static final String RELATION_RESULTRESULT_REL_TYPE = "resultResult"; - public static final String RELATION_RESULTPROJECT_REL_TYPE = "resultProject"; - public static final String RELATION_RESULTPROJECT_SUBREL_TYPE = "outcome"; - public static final String RELATION_RESULT_PROJECT_REL_CLASS = "isProducedBy"; - public static final String RELATION_PROJECT_RESULT_REL_CLASS = "produces"; + public static final String RELATION_RESULTPROJECT_REL_TYPE = "resultProject"; + public static final String RELATION_RESULTPROJECT_SUBREL_TYPE = "outcome"; + public static final String RELATION_RESULT_PROJECT_REL_CLASS = "isProducedBy"; + public static final String RELATION_PROJECT_RESULT_REL_CLASS = "produces"; - public static final String RELATION_REPRESENTATIVERESULT_RESULT_CLASS = "merges"; + public static final String RELATION_REPRESENTATIVERESULT_RESULT_CLASS = "merges"; - public static final String PROPAGATION_AUTHOR_PID = "ORCID"; + public static final String PROPAGATION_AUTHOR_PID = "ORCID"; - public static 
final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static Country getCountry(String classid, String classname) { - Country nc = new Country(); - nc.setClassid(classid); - nc.setClassname(classname); - nc.setSchemename(DNET_COUNTRY_SCHEMA); - nc.setSchemeid(DNET_COUNTRY_SCHEMA); - nc.setDataInfo( - getDataInfo( - PROPAGATION_DATA_INFO_TYPE, - PROPAGATION_COUNTRY_INSTREPO_CLASS_ID, - PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME)); - return nc; - } + public static Country getCountry(String classid, String classname) { + Country nc = new Country(); + nc.setClassid(classid); + nc.setClassname(classname); + nc.setSchemename(DNET_COUNTRY_SCHEMA); + nc.setSchemeid(DNET_COUNTRY_SCHEMA); + nc + .setDataInfo( + getDataInfo( + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_COUNTRY_INSTREPO_CLASS_ID, + PROPAGATION_COUNTRY_INSTREPO_CLASS_NAME)); + return nc; + } - public static DataInfo getDataInfo( - String inference_provenance, String inference_class_id, String inference_class_name) { - DataInfo di = new DataInfo(); - di.setInferred(true); - di.setDeletedbyinference(false); - di.setTrust("0.85"); - di.setInferenceprovenance(inference_provenance); - di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name)); - return di; - } + public static DataInfo getDataInfo( + String inference_provenance, String inference_class_id, String inference_class_name) { + DataInfo di = new DataInfo(); + di.setInferred(true); + di.setDeletedbyinference(false); + di.setTrust("0.85"); + di.setInferenceprovenance(inference_provenance); + di.setProvenanceaction(getQualifier(inference_class_id, inference_class_name)); + return di; + } - public static Qualifier getQualifier(String inference_class_id, String inference_class_name) { - Qualifier pa = new Qualifier(); - pa.setClassid(inference_class_id); - pa.setClassname(inference_class_name); - pa.setSchemeid(DNET_SCHEMA_ID); - pa.setSchemename(DNET_SCHEMA_NAME); - return pa; - } + public static Qualifier getQualifier(String inference_class_id, String inference_class_name) { + Qualifier pa = new Qualifier(); + pa.setClassid(inference_class_id); + pa.setClassname(inference_class_name); + pa.setSchemeid(DNET_SCHEMA_ID); + pa.setSchemename(DNET_SCHEMA_NAME); + return pa; + } - public static Relation getRelation( - String source, - String target, - String rel_class, - String rel_type, - String subrel_type, - String inference_provenance, - String inference_class_id, - String inference_class_name) { - Relation r = new Relation(); - r.setSource(source); - r.setTarget(target); - r.setRelClass(rel_class); - r.setRelType(rel_type); - r.setSubRelType(subrel_type); - r.setDataInfo(getDataInfo(inference_provenance, inference_class_id, inference_class_name)); - return r; - } + public static Relation getRelation( + String source, + String target, + String rel_class, + String rel_type, + String subrel_type, + String inference_provenance, + String inference_class_id, + String inference_class_name) { + Relation r = new Relation(); + r.setSource(source); + r.setTarget(target); + r.setRelClass(rel_class); + r.setRelType(rel_type); + r.setSubRelType(subrel_type); + r.setDataInfo(getDataInfo(inference_provenance, inference_class_id, inference_class_name)); + return r; + } - public static String getConstraintList(String text, List constraints) { - String ret = " and (" + text + constraints.get(0) + "'"; - for (int i = 1; i < constraints.size(); i++) { - ret += " OR " + text + constraints.get(i) + "'"; - } - ret 
+= ")"; - return ret; - } + public static String getConstraintList(String text, List constraints) { + String ret = " and (" + text + constraints.get(0) + "'"; + for (int i = 1; i < constraints.size(); i++) { + ret += " OR " + text + constraints.get(i) + "'"; + } + ret += ")"; + return ret; + } - public static void createOutputDirs(String outputPath, FileSystem fs) throws IOException { - if (fs.exists(new Path(outputPath))) { - fs.delete(new Path(outputPath), true); - } - fs.mkdirs(new Path(outputPath)); - } + public static void createOutputDirs(String outputPath, FileSystem fs) throws IOException { + if (fs.exists(new Path(outputPath))) { + fs.delete(new Path(outputPath), true); + } + fs.mkdirs(new Path(outputPath)); + } - public static void removeOutputDir(SparkSession spark, String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + public static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } - public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) { - return Optional.ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - } + public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) { + return Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + } - public static Boolean isTest(ArgumentApplicationParser parser) { - return Optional.ofNullable(parser.get("isTest")) - .map(Boolean::valueOf) - .orElse(Boolean.FALSE); - } + public static Boolean isTest(ArgumentApplicationParser parser) { + return Optional + .ofNullable(parser.get("isTest")) + .map(Boolean::valueOf) + .orElse(Boolean.FALSE); + } - public static void createCfHbforresult(SparkSession spark) { - String query; - query = - "SELECT id, inst.collectedfrom.key cf , inst.hostedby.key hb " - + "FROM ( SELECT id, instance " - + "FROM result " - + " WHERE datainfo.deletedbyinference = false) ds " - + "LATERAL VIEW EXPLODE(instance) i AS inst"; - org.apache.spark.sql.Dataset cfhb = spark.sql(query); - cfhb.createOrReplaceTempView("cfhb"); - } + public static void createCfHbforresult(SparkSession spark) { + String query; + query = "SELECT id, inst.collectedfrom.key cf , inst.hostedby.key hb " + + "FROM ( SELECT id, instance " + + "FROM result " + + " WHERE datainfo.deletedbyinference = false) ds " + + "LATERAL VIEW EXPLODE(instance) i AS inst"; + org.apache.spark.sql.Dataset cfhb = spark.sql(query); + cfhb.createOrReplaceTempView("cfhb"); + } - public static org.apache.spark.sql.Dataset readPathEntity( - SparkSession spark, String inputEntityPath, Class resultClazz) { + public static org.apache.spark.sql.Dataset readPathEntity( + SparkSession spark, String inputEntityPath, Class resultClazz) { - return spark.read() - .textFile(inputEntityPath) - .map( - (MapFunction) - value -> OBJECT_MAPPER.readValue(value, resultClazz), - Encoders.bean(resultClazz)); - } + return spark + .read() + .textFile(inputEntityPath) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, resultClazz), + Encoders.bean(resultClazz)); + } - public static org.apache.spark.sql.Dataset readRelations( - SparkSession spark, String inputPath) { - return spark.read() - .textFile(inputPath) - .map( - (MapFunction) - value -> OBJECT_MAPPER.readValue(value, Relation.class), - Encoders.bean(Relation.class)); - } + public static org.apache.spark.sql.Dataset readRelations( + SparkSession spark, String 
inputPath) { + return spark + .read() + .textFile(inputPath) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, Relation.class), + Encoders.bean(Relation.class)); + } - public static org.apache.spark.sql.Dataset readResultCommunityList( - SparkSession spark, String possibleUpdatesPath) { - return spark.read() - .textFile(possibleUpdatesPath) - .map( - value -> OBJECT_MAPPER.readValue(value, ResultCommunityList.class), - Encoders.bean(ResultCommunityList.class)); - } + public static org.apache.spark.sql.Dataset readResultCommunityList( + SparkSession spark, String possibleUpdatesPath) { + return spark + .read() + .textFile(possibleUpdatesPath) + .map( + value -> OBJECT_MAPPER.readValue(value, ResultCommunityList.class), + Encoders.bean(ResultCommunityList.class)); + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/QueryInformationSystem.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/QueryInformationSystem.java index a33919d19..c29043a2d 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/QueryInformationSystem.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/QueryInformationSystem.java @@ -1,19 +1,20 @@ + package eu.dnetlib.dhp; +import java.util.List; + import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import java.util.List; public class QueryInformationSystem { - private static final String XQUERY = - "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType')" - + " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri']" - + " and $x//CONFIGURATION/context/param[./@name='status']/text() != 'hidden'" - + " return $x//CONFIGURATION/context/@id/string()"; + private static final String XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType')" + + " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri']" + + " and $x//CONFIGURATION/context/param[./@name='status']/text() != 'hidden'" + + " return $x//CONFIGURATION/context/@id/string()"; - public static List getCommunityList(final String isLookupUrl) throws ISLookUpException { - ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); - return isLookUp.quickSearchProfile(XQUERY); - } + public static List getCommunityList(final String isLookupUrl) throws ISLookUpException { + ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); + return isLookUp.quickSearchProfile(XQUERY); + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java index 32c893261..271cc6bb3 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/CountrySbs.java @@ -1,24 +1,25 @@ + package eu.dnetlib.dhp.countrypropagation; import java.io.Serializable; public class CountrySbs implements Serializable { - private String classid; - private String classname; + private String classid; + private String classname; - public String getClassid() { - return classid; - } + public String getClassid() { + return classid; + } - public void setClassid(String classid) { - this.classid = classid; - } + public void setClassid(String classid) { + this.classid = classid; + } - public 
String getClassname() { - return classname; - } + public String getClassname() { + return classname; + } - public void setClassname(String classname) { - this.classname = classname; - } + public void setClassname(String classname) { + this.classname = classname; + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java index 0ac8b108e..642192f73 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/DatasourceCountry.java @@ -1,24 +1,25 @@ + package eu.dnetlib.dhp.countrypropagation; import java.io.Serializable; public class DatasourceCountry implements Serializable { - private String dataSourceId; - private CountrySbs country; + private String dataSourceId; + private CountrySbs country; - public String getDataSourceId() { - return dataSourceId; - } + public String getDataSourceId() { + return dataSourceId; + } - public void setDataSourceId(String dataSourceId) { - this.dataSourceId = dataSourceId; - } + public void setDataSourceId(String dataSourceId) { + this.dataSourceId = dataSourceId; + } - public CountrySbs getCountry() { - return country; - } + public CountrySbs getCountry() { + return country; + } - public void setCountry(CountrySbs country) { - this.country = country; - } + public void setCountry(CountrySbs country) { + this.country = country; + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java index 0604bb019..56185eb72 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java @@ -1,13 +1,12 @@ + package eu.dnetlib.dhp.countrypropagation; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.Arrays; import java.util.List; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -18,118 +17,125 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.*; + /** - * For the association of the country to the datasource The association is computed only for - * datasource of specific type or having whitelisted ids The country is registered in the - * Organization associated to the Datasource, so the relation provides between Datasource and - * Organization is exploited to get the country for the datasource + * For the association of the country to the datasource The association is computed only for datasource of specific type + * or having whitelisted ids The country is registered in the Organization associated to the Datasource, so the relation + * provides between Datasource and 
Organization is exploited to get the country for the datasource */ public class PrepareDatasourceCountryAssociation { - private static final Logger log = - LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final Logger log = LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - PrepareDatasourceCountryAssociation.class.getResourceAsStream( - "/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json")); + String jsonConfiguration = IOUtils + .toString( + PrepareDatasourceCountryAssociation.class + .getResourceAsStream( + "/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean isSparkSessionManaged = isSparkSessionManaged(parser); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - final String outputPath = parser.get("outputPath"); - log.info("outputPath {}: ", outputPath); + final String outputPath = parser.get("outputPath"); + log.info("outputPath {}: ", outputPath); - SparkConf conf = new SparkConf(); - conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); - runWithSparkHiveSession( - conf, - isSparkSessionManaged, - spark -> { - removeOutputDir(spark, outputPath); - prepareDatasourceCountryAssociation( - spark, - Arrays.asList(parser.get("whitelist").split(";")), - Arrays.asList(parser.get("allowedtypes").split(";")), - inputPath, - outputPath); - }); - } + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + prepareDatasourceCountryAssociation( + spark, + Arrays.asList(parser.get("whitelist").split(";")), + Arrays.asList(parser.get("allowedtypes").split(";")), + inputPath, + outputPath); + }); + } - private static void prepareDatasourceCountryAssociation( - SparkSession spark, - List whitelist, - List allowedtypes, - String inputPath, - String outputPath) { - String whitelisted = ""; - for (String i : whitelist) { - whitelisted += " OR id = '" + i + "'"; - } - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + private static void prepareDatasourceCountryAssociation( + SparkSession spark, + List whitelist, + List allowedtypes, + String inputPath, + String outputPath) { + String whitelisted = ""; + for (String i : whitelist) { + whitelisted += " OR id = '" + i + "'"; + } + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - Dataset datasource = - spark.createDataset( - sc.textFile(inputPath + "/datasource") - .map(item -> OBJECT_MAPPER.readValue(item, Datasource.class)) - .rdd(), - Encoders.bean(Datasource.class)); + 
Dataset datasource = spark + .createDataset( + sc + .textFile(inputPath + "/datasource") + .map(item -> OBJECT_MAPPER.readValue(item, Datasource.class)) + .rdd(), + Encoders.bean(Datasource.class)); - Dataset relation = - spark.createDataset( - sc.textFile(inputPath + "/relation") - .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)) - .rdd(), - Encoders.bean(Relation.class)); + Dataset relation = spark + .createDataset( + sc + .textFile(inputPath + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)) + .rdd(), + Encoders.bean(Relation.class)); - Dataset organization = - spark.createDataset( - sc.textFile(inputPath + "/organization") - .map(item -> OBJECT_MAPPER.readValue(item, Organization.class)) - .rdd(), - Encoders.bean(Organization.class)); + Dataset organization = spark + .createDataset( + sc + .textFile(inputPath + "/organization") + .map(item -> OBJECT_MAPPER.readValue(item, Organization.class)) + .rdd(), + Encoders.bean(Organization.class)); - datasource.createOrReplaceTempView("datasource"); - relation.createOrReplaceTempView("relation"); - organization.createOrReplaceTempView("organization"); + datasource.createOrReplaceTempView("datasource"); + relation.createOrReplaceTempView("relation"); + organization.createOrReplaceTempView("organization"); - String query = - "SELECT source dataSourceId, named_struct('classid', country.classid, 'classname', country.classname) country " - + "FROM ( SELECT id " - + " FROM datasource " - + " WHERE (datainfo.deletedbyinference = false " - + whitelisted - + ") " - + getConstraintList("datasourcetype.classid = '", allowedtypes) - + ") d " - + "JOIN ( SELECT source, target " - + " FROM relation " - + " WHERE relclass = '" - + RELATION_DATASOURCE_ORGANIZATION_REL_CLASS - + "' " - + " AND datainfo.deletedbyinference = false ) rel " - + "ON d.id = rel.source " - + "JOIN (SELECT id, country " - + " FROM organization " - + " WHERE datainfo.deletedbyinference = false " - + " AND length(country.classid)>0) o " - + "ON o.id = rel.target"; + String query = "SELECT source dataSourceId, named_struct('classid', country.classid, 'classname', country.classname) country " + + "FROM ( SELECT id " + + " FROM datasource " + + " WHERE (datainfo.deletedbyinference = false " + + whitelisted + + ") " + + getConstraintList("datasourcetype.classid = '", allowedtypes) + + ") d " + + "JOIN ( SELECT source, target " + + " FROM relation " + + " WHERE relclass = '" + + RELATION_DATASOURCE_ORGANIZATION_REL_CLASS + + "' " + + " AND datainfo.deletedbyinference = false ) rel " + + "ON d.id = rel.source " + + "JOIN (SELECT id, country " + + " FROM organization " + + " WHERE datainfo.deletedbyinference = false " + + " AND length(country.classid)>0) o " + + "ON o.id = rel.target"; - spark.sql(query) - .as(Encoders.bean(DatasourceCountry.class)) - .toJavaRDD() - .map(c -> OBJECT_MAPPER.writeValueAsString(c)) - .saveAsTextFile(outputPath, GzipCodec.class); - } + spark + .sql(query) + .as(Encoders.bean(DatasourceCountry.class)) + .toJavaRDD() + .map(c -> OBJECT_MAPPER.writeValueAsString(c)) + .saveAsTextFile(outputPath, GzipCodec.class); + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java index 69f7a59e9..8c29424f2 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java +++ 
b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/ResultCountrySet.java @@ -1,25 +1,26 @@ + package eu.dnetlib.dhp.countrypropagation; import java.io.Serializable; import java.util.ArrayList; public class ResultCountrySet implements Serializable { - private String resultId; - private ArrayList countrySet; + private String resultId; + private ArrayList countrySet; - public String getResultId() { - return resultId; - } + public String getResultId() { + return resultId; + } - public void setResultId(String resultId) { - this.resultId = resultId; - } + public void setResultId(String resultId) { + this.resultId = resultId; + } - public ArrayList getCountrySet() { - return countrySet; - } + public ArrayList getCountrySet() { + return countrySet; + } - public void setCountrySet(ArrayList countrySet) { - this.countrySet = countrySet; - } + public void setCountrySet(ArrayList countrySet) { + this.countrySet = countrySet; + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob2.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob2.java index cd37e79eb..15f9e9b60 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob2.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob2.java @@ -1,12 +1,11 @@ + package eu.dnetlib.dhp.countrypropagation; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.*; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaSparkContext; @@ -16,194 +15,193 @@ import org.apache.spark.sql.*; import org.apache.spark.sql.Dataset; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; public class SparkCountryPropagationJob2 { - private static final Logger log = LoggerFactory.getLogger(SparkCountryPropagationJob2.class); + private static final Logger log = LoggerFactory.getLogger(SparkCountryPropagationJob2.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - SparkCountryPropagationJob2.class.getResourceAsStream( - "/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json")); + String jsonConfiguration = IOUtils + .toString( + SparkCountryPropagationJob2.class + .getResourceAsStream( + "/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean isSparkSessionManaged = isSparkSessionManaged(parser); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean 
isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - final String datasourcecountrypath = parser.get("preparedInfoPath"); - log.info("preparedInfoPath: {}", datasourcecountrypath); + final String datasourcecountrypath = parser.get("preparedInfoPath"); + log.info("preparedInfoPath: {}", datasourcecountrypath); - final String resultClassName = parser.get("resultTableName"); - log.info("resultTableName: {}", resultClassName); + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); - final Boolean saveGraph = - Optional.ofNullable(parser.get("saveGraph")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("saveGraph: {}", saveGraph); + final Boolean saveGraph = Optional + .ofNullable(parser.get("saveGraph")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("saveGraph: {}", saveGraph); - Class resultClazz = - (Class) Class.forName(resultClassName); + Class resultClazz = (Class) Class.forName(resultClassName); - SparkConf conf = new SparkConf(); - conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); - runWithSparkHiveSession( - conf, - isSparkSessionManaged, - spark -> { - execPropagation( - spark, - datasourcecountrypath, - inputPath, - outputPath, - resultClazz, - saveGraph); - }); - } + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + execPropagation( + spark, + datasourcecountrypath, + inputPath, + outputPath, + resultClazz, + saveGraph); + }); + } - private static void execPropagation( - SparkSession spark, - String datasourcecountrypath, - String inputPath, - String outputPath, - Class resultClazz, - boolean saveGraph) { - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + private static void execPropagation( + SparkSession spark, + String datasourcecountrypath, + String inputPath, + String outputPath, + Class resultClazz, + boolean saveGraph) { + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - // Load file with preprocessed association datasource - country - Dataset datasourcecountryassoc = - readAssocDatasourceCountry(spark, datasourcecountrypath); - // broadcasting the result of the preparation step - Broadcast> broadcast_datasourcecountryassoc = - sc.broadcast(datasourcecountryassoc); + // Load file with preprocessed association datasource - country + Dataset datasourcecountryassoc = readAssocDatasourceCountry(spark, datasourcecountrypath); + // broadcasting the result of the preparation step + Broadcast> broadcast_datasourcecountryassoc = sc.broadcast(datasourcecountryassoc); - Dataset potentialUpdates = - getPotentialResultToUpdate( - spark, inputPath, resultClazz, broadcast_datasourcecountryassoc) - .as(Encoders.bean(ResultCountrySet.class)); + Dataset potentialUpdates = getPotentialResultToUpdate( + spark, inputPath, resultClazz, broadcast_datasourcecountryassoc) + .as(Encoders.bean(ResultCountrySet.class)); - if (saveGraph) { - updateResultTable(spark, potentialUpdates, inputPath, 
resultClazz, outputPath); - } - } + if (saveGraph) { + updateResultTable(spark, potentialUpdates, inputPath, resultClazz, outputPath); + } + } - private static void updateResultTable( - SparkSession spark, - Dataset potentialUpdates, - String inputPath, - Class resultClazz, - String outputPath) { + private static void updateResultTable( + SparkSession spark, + Dataset potentialUpdates, + String inputPath, + Class resultClazz, + String outputPath) { - log.info("Reading Graph table from: {}", inputPath); - Dataset result = readPathEntity(spark, inputPath, resultClazz); + log.info("Reading Graph table from: {}", inputPath); + Dataset result = readPathEntity(spark, inputPath, resultClazz); - Dataset> result_pair = - result.map( - r -> new Tuple2<>(r.getId(), r), - Encoders.tuple(Encoders.STRING(), Encoders.bean(resultClazz))); + Dataset> result_pair = result + .map( + r -> new Tuple2<>(r.getId(), r), + Encoders.tuple(Encoders.STRING(), Encoders.bean(resultClazz))); - Dataset new_table = - result_pair - .joinWith( - potentialUpdates, - result_pair.col("_1").equalTo(potentialUpdates.col("resultId")), - "left_outer") - .map( - (MapFunction, ResultCountrySet>, R>) - value -> { - R r = value._1()._2(); - Optional potentialNewCountries = - Optional.ofNullable(value._2()); - if (potentialNewCountries.isPresent()) { - HashSet countries = new HashSet<>(); - for (Qualifier country : r.getCountry()) { - countries.add(country.getClassid()); - } - Result res = new Result(); - res.setId(r.getId()); - List countryList = new ArrayList<>(); - for (CountrySbs country : - potentialNewCountries - .get() - .getCountrySet()) { - if (!countries.contains(country.getClassid())) { - countryList.add( - getCountry( - country.getClassid(), - country.getClassname())); - } - } - res.setCountry(countryList); - r.mergeFrom(res); - } - return r; - }, - Encoders.bean(resultClazz)); + Dataset new_table = result_pair + .joinWith( + potentialUpdates, + result_pair.col("_1").equalTo(potentialUpdates.col("resultId")), + "left_outer") + .map( + (MapFunction, ResultCountrySet>, R>) value -> { + R r = value._1()._2(); + Optional potentialNewCountries = Optional.ofNullable(value._2()); + if (potentialNewCountries.isPresent()) { + HashSet countries = new HashSet<>(); + for (Qualifier country : r.getCountry()) { + countries.add(country.getClassid()); + } + Result res = new Result(); + res.setId(r.getId()); + List countryList = new ArrayList<>(); + for (CountrySbs country : potentialNewCountries + .get() + .getCountrySet()) { + if (!countries.contains(country.getClassid())) { + countryList + .add( + getCountry( + country.getClassid(), + country.getClassname())); + } + } + res.setCountry(countryList); + r.mergeFrom(res); + } + return r; + }, + Encoders.bean(resultClazz)); - log.info("Saving graph table to path: {}", outputPath); - // log.info("number of saved recordsa: {}", new_table.count()); - new_table.toJSON().write().option("compression", "gzip").text(outputPath); - } + log.info("Saving graph table to path: {}", outputPath); + // log.info("number of saved recordsa: {}", new_table.count()); + new_table.toJSON().write().option("compression", "gzip").text(outputPath); + } - private static Dataset getPotentialResultToUpdate( - SparkSession spark, - String inputPath, - Class resultClazz, - Broadcast> broadcast_datasourcecountryassoc) { + private static Dataset getPotentialResultToUpdate( + SparkSession spark, + String inputPath, + Class resultClazz, + Broadcast> broadcast_datasourcecountryassoc) { - Dataset result = readPathEntity(spark, 
inputPath, resultClazz); - result.createOrReplaceTempView("result"); - // log.info("number of results: {}", result.count()); - createCfHbforresult(spark); - return countryPropagationAssoc(spark, broadcast_datasourcecountryassoc); - } + Dataset result = readPathEntity(spark, inputPath, resultClazz); + result.createOrReplaceTempView("result"); + // log.info("number of results: {}", result.count()); + createCfHbforresult(spark); + return countryPropagationAssoc(spark, broadcast_datasourcecountryassoc); + } - private static Dataset countryPropagationAssoc( - SparkSession spark, - Broadcast> broadcast_datasourcecountryassoc) { + private static Dataset countryPropagationAssoc( + SparkSession spark, + Broadcast> broadcast_datasourcecountryassoc) { - Dataset datasource_country = broadcast_datasourcecountryassoc.value(); - datasource_country.createOrReplaceTempView("datasource_country"); - log.info("datasource_country number : {}", datasource_country.count()); + Dataset datasource_country = broadcast_datasourcecountryassoc.value(); + datasource_country.createOrReplaceTempView("datasource_country"); + log.info("datasource_country number : {}", datasource_country.count()); - String query = - "SELECT id resultId, collect_set(country) countrySet " - + "FROM ( SELECT id, country " - + "FROM datasource_country " - + "JOIN cfhb " - + " ON cf = dataSourceId " - + "UNION ALL " - + "SELECT id , country " - + "FROM datasource_country " - + "JOIN cfhb " - + " ON hb = dataSourceId ) tmp " - + "GROUP BY id"; - Dataset potentialUpdates = spark.sql(query); - // log.info("potential update number : {}", potentialUpdates.count()); - return potentialUpdates; - } + String query = "SELECT id resultId, collect_set(country) countrySet " + + "FROM ( SELECT id, country " + + "FROM datasource_country " + + "JOIN cfhb " + + " ON cf = dataSourceId " + + "UNION ALL " + + "SELECT id , country " + + "FROM datasource_country " + + "JOIN cfhb " + + " ON hb = dataSourceId ) tmp " + + "GROUP BY id"; + Dataset potentialUpdates = spark.sql(query); + // log.info("potential update number : {}", potentialUpdates.count()); + return potentialUpdates; + } - private static Dataset readAssocDatasourceCountry( - SparkSession spark, String relationPath) { - return spark.read() - .textFile(relationPath) - .map( - value -> OBJECT_MAPPER.readValue(value, DatasourceCountry.class), - Encoders.bean(DatasourceCountry.class)); - } + private static Dataset readAssocDatasourceCountry( + SparkSession spark, String relationPath) { + return spark + .read() + .textFile(relationPath) + .map( + value -> OBJECT_MAPPER.readValue(value, DatasourceCountry.class), + Encoders.bean(DatasourceCountry.class)); + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java index 048e8ae46..c1644a589 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/AutoritativeAuthor.java @@ -1,40 +1,41 @@ + package eu.dnetlib.dhp.orcidtoresultfromsemrel; public class AutoritativeAuthor { - String name; - String surname; - String fullname; - String orcid; + String name; + String surname; + String fullname; + String orcid; - public String getName() { - return name; - } + public String getName() { + return name; + } - public void setName(String name) { - this.name 
= name; - } + public void setName(String name) { + this.name = name; + } - public String getSurname() { - return surname; - } + public String getSurname() { + return surname; + } - public void setSurname(String surname) { - this.surname = surname; - } + public void setSurname(String surname) { + this.surname = surname; + } - public String getFullname() { - return fullname; - } + public String getFullname() { + return fullname; + } - public void setFullname(String fullname) { - this.fullname = fullname; - } + public void setFullname(String fullname) { + this.fullname = fullname; + } - public String getOrcid() { - return orcid; - } + public String getOrcid() { + return orcid; + } - public void setOrcid(String orcid) { - this.orcid = orcid; - } + public void setOrcid(String orcid) { + this.orcid = orcid; + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java index 9bc34eb73..1baec07c5 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java @@ -1,15 +1,12 @@ + package eu.dnetlib.dhp.orcidtoresultfromsemrel; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.gson.Gson; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Result; import java.util.Arrays; import java.util.List; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -20,115 +17,121 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.gson.Gson; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Result; + public class PrepareResultOrcidAssociationStep1 { - private static final Logger log = - LoggerFactory.getLogger(PrepareResultOrcidAssociationStep1.class); + private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep1.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - SparkOrcidToResultFromSemRelJob3.class.getResourceAsStream( - "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json")); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SparkOrcidToResultFromSemRelJob3.class + .getResourceAsStream( + "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean isSparkSessionManaged = 
isSparkSessionManaged(parser); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - final String resultClassName = parser.get("resultTableName"); - log.info("resultTableName: {}", resultClassName); + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); - final List allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";")); - log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel)); + final List allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";")); + log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel)); - final String resultType = - resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase(); - log.info("resultType: {}", resultType); + final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase(); + log.info("resultType: {}", resultType); - Class resultClazz = - (Class) Class.forName(resultClassName); + Class resultClazz = (Class) Class.forName(resultClassName); - SparkConf conf = new SparkConf(); - conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); - runWithSparkHiveSession( - conf, - isSparkSessionManaged, - spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - prepareInfo( - spark, inputPath, outputPath, resultClazz, resultType, allowedsemrel); - }); - } + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + prepareInfo( + spark, inputPath, outputPath, resultClazz, resultType, allowedsemrel); + }); + } - private static void prepareInfo( - SparkSession spark, - String inputPath, - String outputPath, - Class resultClazz, - String resultType, - List allowedsemrel) { + private static void prepareInfo( + SparkSession spark, + String inputPath, + String outputPath, + Class resultClazz, + String resultType, + List allowedsemrel) { - // read the relation table and the table related to the result it is using - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - org.apache.spark.sql.Dataset relation = - spark.createDataset( - sc.textFile(inputPath + "/relation") - .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)) - .rdd(), - Encoders.bean(Relation.class)); - relation.createOrReplaceTempView("relation"); + // read the relation table and the table related to the result it is using + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + org.apache.spark.sql.Dataset relation = spark + .createDataset( + sc + .textFile(inputPath + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)) + .rdd(), + Encoders.bean(Relation.class)); + relation.createOrReplaceTempView("relation"); - log.info("Reading Graph table from: {}", inputPath + "/" + resultType); - Dataset result = readPathEntity(spark, inputPath + "/" + resultType, resultClazz); + 
log.info("Reading Graph table from: {}", inputPath + "/" + resultType); + Dataset result = readPathEntity(spark, inputPath + "/" + resultType, resultClazz); - result.createOrReplaceTempView("result"); + result.createOrReplaceTempView("result"); - getPossibleResultOrcidAssociation(spark, allowedsemrel, outputPath + "/" + resultType); - } + getPossibleResultOrcidAssociation(spark, allowedsemrel, outputPath + "/" + resultType); + } - private static void getPossibleResultOrcidAssociation( - SparkSession spark, List allowedsemrel, String outputPath) { - String query = - " select target resultId, author authorList" - + " from (select id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author " - + " from ( " - + " select id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid " - + " from result " - + " lateral view explode (author) a as MyT " - + " lateral view explode (MyT.pid) p as MyP " - + " where MyP.qualifier.classid = 'ORCID') tmp " - + " group by id) r_t " - + " join (" - + " select source, target " - + " from relation " - + " where datainfo.deletedbyinference = false " - + getConstraintList(" relclass = '", allowedsemrel) - + ") rel_rel " - + " on source = id"; + private static void getPossibleResultOrcidAssociation( + SparkSession spark, List allowedsemrel, String outputPath) { + String query = " select target resultId, author authorList" + + " from (select id, collect_set(named_struct('name', name, 'surname', surname, 'fullname', fullname, 'orcid', orcid)) author " + + " from ( " + + " select id, MyT.fullname, MyT.name, MyT.surname, MyP.value orcid " + + " from result " + + " lateral view explode (author) a as MyT " + + " lateral view explode (MyT.pid) p as MyP " + + " where MyP.qualifier.classid = 'ORCID') tmp " + + " group by id) r_t " + + " join (" + + " select source, target " + + " from relation " + + " where datainfo.deletedbyinference = false " + + getConstraintList(" relclass = '", allowedsemrel) + + ") rel_rel " + + " on source = id"; - spark.sql(query) - .as(Encoders.bean(ResultOrcidList.class)) - .toJavaRDD() - .map(r -> OBJECT_MAPPER.writeValueAsString(r)) - .saveAsTextFile(outputPath, GzipCodec.class); - // .toJSON() - // .write() - // .mode(SaveMode.Append) - // .option("compression","gzip") - // .text(outputPath) - // ; - } + spark + .sql(query) + .as(Encoders.bean(ResultOrcidList.class)) + .toJavaRDD() + .map(r -> OBJECT_MAPPER.writeValueAsString(r)) + .saveAsTextFile(outputPath, GzipCodec.class); + // .toJSON() + // .write() + // .mode(SaveMode.Append) + // .option("compression","gzip") + // .text(outputPath) + // ; + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java index 658c97f6c..a8380e8b9 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java @@ -1,101 +1,107 @@ + package eu.dnetlib.dhp.orcidtoresultfromsemrel; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; import java.util.HashSet; import 
java.util.Set; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; import org.apache.spark.sql.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; import scala.Tuple2; public class PrepareResultOrcidAssociationStep2 { - private static final Logger log = - LoggerFactory.getLogger(PrepareResultOrcidAssociationStep2.class); + private static final Logger log = LoggerFactory.getLogger(PrepareResultOrcidAssociationStep2.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - PrepareResultOrcidAssociationStep2.class.getResourceAsStream( - "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json")); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + PrepareResultOrcidAssociationStep2.class + .getResourceAsStream( + "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean isSparkSessionManaged = isSparkSessionManaged(parser); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - SparkConf conf = new SparkConf(); + SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - mergeInfo(spark, inputPath, outputPath); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + mergeInfo(spark, inputPath, outputPath); + }); + } - private static void mergeInfo(SparkSession spark, String inputPath, String outputPath) { + private static void mergeInfo(SparkSession spark, String inputPath, String outputPath) { - Dataset resultOrcidAssoc = - readAssocResultOrcidList(spark, inputPath + "/publication") - .union(readAssocResultOrcidList(spark, inputPath + "/dataset")) - .union(readAssocResultOrcidList(spark, inputPath + "/otherresearchproduct")) - .union(readAssocResultOrcidList(spark, inputPath + "/software")); + Dataset resultOrcidAssoc = readAssocResultOrcidList(spark, inputPath + "/publication") + .union(readAssocResultOrcidList(spark, inputPath + "/dataset")) + .union(readAssocResultOrcidList(spark, inputPath + "/otherresearchproduct")) + .union(readAssocResultOrcidList(spark, inputPath + "/software")); - resultOrcidAssoc - .toJavaRDD() - .mapToPair(r -> new Tuple2<>(r.getResultId(), r)) - .reduceByKey( - (a, b) -> { - if (a == null) { - return b; - 
} - if (b == null) { - return a; - } - Set orcid_set = new HashSet<>(); - a.getAuthorList().stream().forEach(aa -> orcid_set.add(aa.getOrcid())); + resultOrcidAssoc + .toJavaRDD() + .mapToPair(r -> new Tuple2<>(r.getResultId(), r)) + .reduceByKey( + (a, b) -> { + if (a == null) { + return b; + } + if (b == null) { + return a; + } + Set orcid_set = new HashSet<>(); + a.getAuthorList().stream().forEach(aa -> orcid_set.add(aa.getOrcid())); - b.getAuthorList().stream() - .forEach( - aa -> { - if (!orcid_set.contains(aa.getOrcid())) { - a.getAuthorList().add(aa); - orcid_set.add(aa.getOrcid()); - } - }); - return a; - }) - .map(c -> c._2()) - .map(r -> OBJECT_MAPPER.writeValueAsString(r)) - .saveAsTextFile(outputPath, GzipCodec.class); - } + b + .getAuthorList() + .stream() + .forEach( + aa -> { + if (!orcid_set.contains(aa.getOrcid())) { + a.getAuthorList().add(aa); + orcid_set.add(aa.getOrcid()); + } + }); + return a; + }) + .map(c -> c._2()) + .map(r -> OBJECT_MAPPER.writeValueAsString(r)) + .saveAsTextFile(outputPath, GzipCodec.class); + } - private static Dataset readAssocResultOrcidList( - SparkSession spark, String relationPath) { - return spark.read() - .textFile(relationPath) - .map( - value -> OBJECT_MAPPER.readValue(value, ResultOrcidList.class), - Encoders.bean(ResultOrcidList.class)); - } + private static Dataset readAssocResultOrcidList( + SparkSession spark, String relationPath) { + return spark + .read() + .textFile(relationPath) + .map( + value -> OBJECT_MAPPER.readValue(value, ResultOrcidList.class), + Encoders.bean(ResultOrcidList.class)); + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/ResultOrcidList.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/ResultOrcidList.java index 9e2bc6e31..54b415d1c 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/ResultOrcidList.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/ResultOrcidList.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.orcidtoresultfromsemrel; import java.io.Serializable; @@ -5,22 +6,22 @@ import java.util.ArrayList; import java.util.List; public class ResultOrcidList implements Serializable { - String resultId; - List authorList = new ArrayList<>(); + String resultId; + List authorList = new ArrayList<>(); - public String getResultId() { - return resultId; - } + public String getResultId() { + return resultId; + } - public void setResultId(String resultId) { - this.resultId = resultId; - } + public void setResultId(String resultId) { + this.resultId = resultId; + } - public List getAuthorList() { - return authorList; - } + public List getAuthorList() { + return authorList; + } - public void setAuthorList(List authorList) { - this.authorList = authorList; - } + public void setAuthorList(List authorList) { + this.authorList = authorList; + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob3.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob3.java index 75527552a..997b58bf2 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob3.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob3.java @@ -1,15 +1,12 @@ + package 
eu.dnetlib.dhp.orcidtoresultfromsemrel; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Author; -import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import java.util.List; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; @@ -20,174 +17,181 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + public class SparkOrcidToResultFromSemRelJob3 { - private static final Logger log = - LoggerFactory.getLogger(SparkOrcidToResultFromSemRelJob3.class); + private static final Logger log = LoggerFactory.getLogger(SparkOrcidToResultFromSemRelJob3.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - SparkOrcidToResultFromSemRelJob3.class.getResourceAsStream( - "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json")); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + SparkOrcidToResultFromSemRelJob3.class + .getResourceAsStream( + "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean isSparkSessionManaged = isSparkSessionManaged(parser); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + final String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - final String possibleUpdates = parser.get("possibleUpdatesPath"); - log.info("possibleUpdatesPath: {}", possibleUpdates); + final String possibleUpdates = parser.get("possibleUpdatesPath"); + log.info("possibleUpdatesPath: {}", possibleUpdates); - final String resultClassName = parser.get("resultTableName"); - log.info("resultTableName: {}", resultClassName); + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); - final Boolean saveGraph = - Optional.ofNullable(parser.get("saveGraph")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("saveGraph: {}", saveGraph); + final Boolean saveGraph = Optional + .ofNullable(parser.get("saveGraph")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + 
log.info("saveGraph: {}", saveGraph); - Class resultClazz = - (Class) Class.forName(resultClassName); + Class resultClazz = (Class) Class.forName(resultClassName); - SparkConf conf = new SparkConf(); - conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); - runWithSparkHiveSession( - conf, - isSparkSessionManaged, - spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - if (saveGraph) - execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz); - }); - } + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + if (saveGraph) + execPropagation(spark, possibleUpdates, inputPath, outputPath, resultClazz); + }); + } - private static void execPropagation( - SparkSession spark, - String possibleUpdatesPath, - String inputPath, - String outputPath, - Class resultClazz) { + private static void execPropagation( + SparkSession spark, + String possibleUpdatesPath, + String inputPath, + String outputPath, + Class resultClazz) { - // read possible updates (resultId and list of possible orcid to add - Dataset possible_updates = - readAssocResultOrcidList(spark, possibleUpdatesPath); - // read the result we have been considering - Dataset result = readPathEntity(spark, inputPath, resultClazz); - // make join result left_outer with possible updates + // read possible updates (resultId and list of possible orcid to add + Dataset possible_updates = readAssocResultOrcidList(spark, possibleUpdatesPath); + // read the result we have been considering + Dataset result = readPathEntity(spark, inputPath, resultClazz); + // make join result left_outer with possible updates - result.joinWith( - possible_updates, - result.col("id").equalTo(possible_updates.col("resultId")), - "left_outer") - .map( - value -> { - R ret = value._1(); - Optional rol = Optional.ofNullable(value._2()); - if (rol.isPresent()) { - List toenrich_author = ret.getAuthor(); - List autoritativeAuthors = - rol.get().getAuthorList(); - for (Author author : toenrich_author) { - if (!containsAllowedPid(author)) { - enrichAuthor(author, autoritativeAuthors); - } - } - } + result + .joinWith( + possible_updates, + result.col("id").equalTo(possible_updates.col("resultId")), + "left_outer") + .map( + value -> { + R ret = value._1(); + Optional rol = Optional.ofNullable(value._2()); + if (rol.isPresent()) { + List toenrich_author = ret.getAuthor(); + List autoritativeAuthors = rol.get().getAuthorList(); + for (Author author : toenrich_author) { + if (!containsAllowedPid(author)) { + enrichAuthor(author, autoritativeAuthors); + } + } + } - return ret; - }, - Encoders.bean(resultClazz)) - .toJSON() - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .text(outputPath); - } + return ret; + }, + Encoders.bean(resultClazz)) + .toJSON() + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(outputPath); + } - private static Dataset readAssocResultOrcidList( - SparkSession spark, String relationPath) { - return spark.read() - .textFile(relationPath) - .map( - value -> OBJECT_MAPPER.readValue(value, ResultOrcidList.class), - Encoders.bean(ResultOrcidList.class)); - } + private static Dataset readAssocResultOrcidList( + SparkSession spark, String relationPath) { + return spark + .read() + .textFile(relationPath) + .map( + value -> OBJECT_MAPPER.readValue(value, 
ResultOrcidList.class), + Encoders.bean(ResultOrcidList.class)); + } - private static void enrichAuthor(Author a, List au) { - for (AutoritativeAuthor aa : au) { - if (enrichAuthor(aa, a)) { - return; - } - } - } + private static void enrichAuthor(Author a, List au) { + for (AutoritativeAuthor aa : au) { + if (enrichAuthor(aa, a)) { + return; + } + } + } - private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author) { - boolean toaddpid = false; + private static boolean enrichAuthor(AutoritativeAuthor autoritative_author, Author author) { + boolean toaddpid = false; - if (StringUtils.isNoneEmpty(autoritative_author.getSurname())) { - if (StringUtils.isNoneEmpty(author.getSurname())) { - if (autoritative_author - .getSurname() - .trim() - .equalsIgnoreCase(author.getSurname().trim())) { + if (StringUtils.isNoneEmpty(autoritative_author.getSurname())) { + if (StringUtils.isNoneEmpty(author.getSurname())) { + if (autoritative_author + .getSurname() + .trim() + .equalsIgnoreCase(author.getSurname().trim())) { - // have the same surname. Check the name - if (StringUtils.isNoneEmpty(autoritative_author.getName())) { - if (StringUtils.isNoneEmpty(author.getName())) { - if (autoritative_author - .getName() - .trim() - .equalsIgnoreCase(author.getName().trim())) { - toaddpid = true; - } - // they could be differently written (i.e. only the initials of the name - // in one of the two - if (autoritative_author - .getName() - .trim() - .substring(0, 0) - .equalsIgnoreCase(author.getName().trim().substring(0, 0))) { - toaddpid = true; - } - } - } - } - } - } - if (toaddpid) { - StructuredProperty p = new StructuredProperty(); - p.setValue(autoritative_author.getOrcid()); - p.setQualifier(getQualifier(PROPAGATION_AUTHOR_PID, PROPAGATION_AUTHOR_PID)); - p.setDataInfo( - getDataInfo( - PROPAGATION_DATA_INFO_TYPE, - PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID, - PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME)); - author.addPid(p); - } - return toaddpid; - } + // have the same surname. Check the name + if (StringUtils.isNoneEmpty(autoritative_author.getName())) { + if (StringUtils.isNoneEmpty(author.getName())) { + if (autoritative_author + .getName() + .trim() + .equalsIgnoreCase(author.getName().trim())) { + toaddpid = true; + } + // they could be differently written (i.e. 
only the initials of the name + // in one of the two + if (autoritative_author + .getName() + .trim() + .substring(0, 0) + .equalsIgnoreCase(author.getName().trim().substring(0, 0))) { + toaddpid = true; + } + } + } + } + } + } + if (toaddpid) { + StructuredProperty p = new StructuredProperty(); + p.setValue(autoritative_author.getOrcid()); + p.setQualifier(getQualifier(PROPAGATION_AUTHOR_PID, PROPAGATION_AUTHOR_PID)); + p + .setDataInfo( + getDataInfo( + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_ID, + PROPAGATION_ORCID_TO_RESULT_FROM_SEM_REL_CLASS_NAME)); + author.addPid(p); + } + return toaddpid; + } - private static boolean containsAllowedPid(Author a) { - for (StructuredProperty pid : a.getPid()) { - if (PROPAGATION_AUTHOR_PID.equals(pid.getQualifier().getClassid())) { - return true; - } - } - return false; - } + private static boolean containsAllowedPid(Author a) { + for (StructuredProperty pid : a.getPid()) { + if (PROPAGATION_AUTHOR_PID.equals(pid.getQualifier().getClassid())) { + return true; + } + } + return false; + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java index cf970048d..b8579156b 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java @@ -1,16 +1,13 @@ + package eu.dnetlib.dhp.projecttoresult; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.PropagationConstant.getConstraintList; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.gson.Gson; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation; -import eu.dnetlib.dhp.schema.oaf.Relation; import java.util.Arrays; import java.util.List; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -19,134 +16,141 @@ import org.apache.spark.sql.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.gson.Gson; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation; +import eu.dnetlib.dhp.schema.oaf.Relation; + public class PrepareProjectResultsAssociation { - private static final Logger log = - LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final Logger log = LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - PrepareProjectResultsAssociation.class.getResourceAsStream( - "/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json")); + String jsonConfiguration = IOUtils + .toString( + PrepareProjectResultsAssociation.class + .getResourceAsStream( + 
"/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean isSparkSessionManaged = isSparkSessionManaged(parser); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - final String potentialUpdatePath = parser.get("potentialUpdatePath"); - log.info("potentialUpdatePath {}: ", potentialUpdatePath); + final String potentialUpdatePath = parser.get("potentialUpdatePath"); + log.info("potentialUpdatePath {}: ", potentialUpdatePath); - String alreadyLinkedPath = parser.get("alreadyLinkedPath"); - log.info("alreadyLinkedPath: {} ", alreadyLinkedPath); + String alreadyLinkedPath = parser.get("alreadyLinkedPath"); + log.info("alreadyLinkedPath: {} ", alreadyLinkedPath); - final List allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";")); - log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel)); + final List allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";")); + log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel)); - SparkConf conf = new SparkConf(); - conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); - runWithSparkHiveSession( - conf, - isSparkSessionManaged, - spark -> { - // removeOutputDir(spark, potentialUpdatePath); - // removeOutputDir(spark, alreadyLinkedPath); - prepareResultProjProjectResults( - spark, - inputPath, - potentialUpdatePath, - alreadyLinkedPath, - allowedsemrel); - }); - } + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + // removeOutputDir(spark, potentialUpdatePath); + // removeOutputDir(spark, alreadyLinkedPath); + prepareResultProjProjectResults( + spark, + inputPath, + potentialUpdatePath, + alreadyLinkedPath, + allowedsemrel); + }); + } - private static void prepareResultProjProjectResults( - SparkSession spark, - String inputPath, - String potentialUpdatePath, - String alreadyLinkedPath, - List allowedsemrel) { - JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - Dataset relation = - spark.createDataset( - sc.textFile(inputPath) - .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)) - .rdd(), - Encoders.bean(Relation.class)); + private static void prepareResultProjProjectResults( + SparkSession spark, + String inputPath, + String potentialUpdatePath, + String alreadyLinkedPath, + List allowedsemrel) { + JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + Dataset relation = spark + .createDataset( + sc + .textFile(inputPath) + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)) + .rdd(), + Encoders.bean(Relation.class)); - relation.createOrReplaceTempView("relation"); + relation.createOrReplaceTempView("relation"); - String query = - "SELECT source, target " - + " FROM relation " - + " WHERE datainfo.deletedbyinference = false " - + " AND relClass = '" - + RELATION_RESULT_PROJECT_REL_CLASS - + "'"; + String query = "SELECT source, target " + 
+ " FROM relation " + + " WHERE datainfo.deletedbyinference = false " + + " AND relClass = '" + + RELATION_RESULT_PROJECT_REL_CLASS + + "'"; - Dataset resproj_relation = spark.sql(query); - resproj_relation.createOrReplaceTempView("resproj_relation"); + Dataset resproj_relation = spark.sql(query); + resproj_relation.createOrReplaceTempView("resproj_relation"); - query = - "SELECT resultId, collect_set(projectId) projectSet " - + "FROM ( " - + "SELECT r1.target resultId, r2.target projectId " - + " FROM (SELECT source, target " - + " FROM relation " - + " WHERE datainfo.deletedbyinference = false " - + getConstraintList(" relClass = '", allowedsemrel) - + " ) r1" - + " JOIN resproj_relation r2 " - + " ON r1.source = r2.source " - + " ) tmp " - + "GROUP BY resultId "; - // query = - // "SELECT projectId, collect_set(resId) resultSet " - // + "FROM (" - // + " SELECT r1.target resId, r2.target projectId " - // + " FROM (SELECT source, target " - // + " FROM relation " - // + " WHERE datainfo.deletedbyinference = false " - // + getConstraintList(" relClass = '", allowedsemrel) - // + ") r1" - // + " JOIN resproj_relation r2 " - // + " ON r1.source = r2.source " - // + " ) tmp " - // + "GROUP BY projectId "; + query = "SELECT resultId, collect_set(projectId) projectSet " + + "FROM ( " + + "SELECT r1.target resultId, r2.target projectId " + + " FROM (SELECT source, target " + + " FROM relation " + + " WHERE datainfo.deletedbyinference = false " + + getConstraintList(" relClass = '", allowedsemrel) + + " ) r1" + + " JOIN resproj_relation r2 " + + " ON r1.source = r2.source " + + " ) tmp " + + "GROUP BY resultId "; + // query = + // "SELECT projectId, collect_set(resId) resultSet " + // + "FROM (" + // + " SELECT r1.target resId, r2.target projectId " + // + " FROM (SELECT source, target " + // + " FROM relation " + // + " WHERE datainfo.deletedbyinference = false " + // + getConstraintList(" relClass = '", allowedsemrel) + // + ") r1" + // + " JOIN resproj_relation r2 " + // + " ON r1.source = r2.source " + // + " ) tmp " + // + "GROUP BY projectId "; - spark.sql(query) - .as(Encoders.bean(ResultProjectSet.class)) - // .toJSON() - // .write() - // .mode(SaveMode.Overwrite) - // .option("compression", "gzip") - // .text(potentialUpdatePath); - .toJavaRDD() - .map(r -> OBJECT_MAPPER.writeValueAsString(r)) - .saveAsTextFile(potentialUpdatePath, GzipCodec.class); + spark + .sql(query) + .as(Encoders.bean(ResultProjectSet.class)) + // .toJSON() + // .write() + // .mode(SaveMode.Overwrite) + // .option("compression", "gzip") + // .text(potentialUpdatePath); + .toJavaRDD() + .map(r -> OBJECT_MAPPER.writeValueAsString(r)) + .saveAsTextFile(potentialUpdatePath, GzipCodec.class); - query = - "SELECT source resultId, collect_set(target) projectSet " - + "FROM resproj_relation " - + "GROUP BY source"; + query = "SELECT source resultId, collect_set(target) projectSet " + + "FROM resproj_relation " + + "GROUP BY source"; - spark.sql(query) - .as(Encoders.bean(ResultProjectSet.class)) - // .toJSON() - // .write() - // .mode(SaveMode.Overwrite) - // .option("compression", "gzip") - // .text(alreadyLinkedPath); - .toJavaRDD() - .map(r -> OBJECT_MAPPER.writeValueAsString(r)) - .saveAsTextFile(alreadyLinkedPath, GzipCodec.class); - } + spark + .sql(query) + .as(Encoders.bean(ResultProjectSet.class)) + // .toJSON() + // .write() + // .mode(SaveMode.Overwrite) + // .option("compression", "gzip") + // .text(alreadyLinkedPath); + .toJavaRDD() + .map(r -> OBJECT_MAPPER.writeValueAsString(r)) + 
.saveAsTextFile(alreadyLinkedPath, GzipCodec.class); + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/ResultProjectSet.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/ResultProjectSet.java index 183ae1489..1d5280874 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/ResultProjectSet.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/ResultProjectSet.java @@ -1,25 +1,26 @@ + package eu.dnetlib.dhp.projecttoresult; import java.io.Serializable; import java.util.ArrayList; public class ResultProjectSet implements Serializable { - private String resultId; - private ArrayList projectSet; + private String resultId; + private ArrayList projectSet; - public String getResultId() { - return resultId; - } + public String getResultId() { + return resultId; + } - public void setResultId(String resultId) { - this.resultId = resultId; - } + public void setResultId(String resultId) { + this.resultId = resultId; + } - public ArrayList getProjectSet() { - return projectSet; - } + public ArrayList getProjectSet() { + return projectSet; + } - public void setProjectSet(ArrayList project) { - this.projectSet = project; - } + public void setProjectSet(ArrayList project) { + this.projectSet = project; + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob3.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob3.java index e32242a90..4be072901 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob3.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob3.java @@ -1,149 +1,159 @@ + package eu.dnetlib.dhp.projecttoresult; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation; -import eu.dnetlib.dhp.schema.oaf.Relation; import java.util.ArrayList; import java.util.List; import java.util.Optional; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.sql.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.countrypropagation.PrepareDatasourceCountryAssociation; +import eu.dnetlib.dhp.schema.oaf.Relation; + public class SparkResultToProjectThroughSemRelJob3 { - private static final Logger log = - LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final Logger log = LoggerFactory.getLogger(PrepareDatasourceCountryAssociation.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - SparkResultToProjectThroughSemRelJob3.class.getResourceAsStream( - "/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json")); + String jsonConfiguration = IOUtils 
+ .toString( + SparkResultToProjectThroughSemRelJob3.class + .getResourceAsStream( + "/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean isSparkSessionManaged = isSparkSessionManaged(parser); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String outputPath = parser.get("outputPath"); - log.info("outputPath {}: ", outputPath); + final String outputPath = parser.get("outputPath"); + log.info("outputPath {}: ", outputPath); - final String potentialUpdatePath = parser.get("potentialUpdatePath"); - log.info("potentialUpdatePath {}: ", potentialUpdatePath); + final String potentialUpdatePath = parser.get("potentialUpdatePath"); + log.info("potentialUpdatePath {}: ", potentialUpdatePath); - final String alreadyLinkedPath = parser.get("alreadyLinkedPath"); - log.info("alreadyLinkedPath {}: ", alreadyLinkedPath); + final String alreadyLinkedPath = parser.get("alreadyLinkedPath"); + log.info("alreadyLinkedPath {}: ", alreadyLinkedPath); - final Boolean saveGraph = Boolean.valueOf(parser.get("saveGraph")); - log.info("saveGraph: {}", saveGraph); + final Boolean saveGraph = Boolean.valueOf(parser.get("saveGraph")); + log.info("saveGraph: {}", saveGraph); - SparkConf conf = new SparkConf(); + SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - execPropagation( - spark, outputPath, alreadyLinkedPath, potentialUpdatePath, saveGraph); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + execPropagation( + spark, outputPath, alreadyLinkedPath, potentialUpdatePath, saveGraph); + }); + } - private static void execPropagation( - SparkSession spark, - String outputPath, - String alreadyLinkedPath, - String potentialUpdatePath, - Boolean saveGraph) { + private static void execPropagation( + SparkSession spark, + String outputPath, + String alreadyLinkedPath, + String potentialUpdatePath, + Boolean saveGraph) { - Dataset toaddrelations = - readAssocResultProjects(spark, potentialUpdatePath); - Dataset alreadyLinked = readAssocResultProjects(spark, alreadyLinkedPath); + Dataset toaddrelations = readAssocResultProjects(spark, potentialUpdatePath); + Dataset alreadyLinked = readAssocResultProjects(spark, alreadyLinkedPath); - if (saveGraph) { - getNewRelations(alreadyLinked, toaddrelations) - .toJSON() - .write() - .mode(SaveMode.Append) - .option("compression", "gzip") - .text(outputPath); - } - } + if (saveGraph) { + getNewRelations(alreadyLinked, toaddrelations) + .toJSON() + .write() + .mode(SaveMode.Append) + .option("compression", "gzip") + .text(outputPath); + } + } - private static Dataset getNewRelations( - Dataset alreadyLinked, Dataset toaddrelations) { + private static Dataset getNewRelations( + Dataset alreadyLinked, Dataset toaddrelations) { - return toaddrelations - .joinWith( - alreadyLinked, - toaddrelations.col("resultId").equalTo(alreadyLinked.col("resultId")), - "left_outer") - .flatMap( - value -> { - List new_relations = new ArrayList<>(); - ResultProjectSet 
potential_update = value._1(); - Optional already_linked = - Optional.ofNullable(value._2()); - if (already_linked.isPresent()) { - already_linked.get().getProjectSet().stream() - .forEach( - (p -> { - if (potential_update - .getProjectSet() - .contains(p)) { - potential_update.getProjectSet().remove(p); - } - })); - } - String resId = potential_update.getResultId(); - potential_update.getProjectSet().stream() - .forEach( - pId -> { - new_relations.add( - getRelation( - resId, - pId, - RELATION_RESULT_PROJECT_REL_CLASS, - RELATION_RESULTPROJECT_REL_TYPE, - RELATION_RESULTPROJECT_SUBREL_TYPE, - PROPAGATION_DATA_INFO_TYPE, - PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID, - PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME)); - new_relations.add( - getRelation( - pId, - resId, - RELATION_PROJECT_RESULT_REL_CLASS, - RELATION_RESULTPROJECT_REL_TYPE, - RELATION_RESULTPROJECT_SUBREL_TYPE, - PROPAGATION_DATA_INFO_TYPE, - PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID, - PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME)); - }); - return new_relations.iterator(); - }, - Encoders.bean(Relation.class)); - } + return toaddrelations + .joinWith( + alreadyLinked, + toaddrelations.col("resultId").equalTo(alreadyLinked.col("resultId")), + "left_outer") + .flatMap( + value -> { + List new_relations = new ArrayList<>(); + ResultProjectSet potential_update = value._1(); + Optional already_linked = Optional.ofNullable(value._2()); + if (already_linked.isPresent()) { + already_linked + .get() + .getProjectSet() + .stream() + .forEach( + (p -> { + if (potential_update + .getProjectSet() + .contains(p)) { + potential_update.getProjectSet().remove(p); + } + })); + } + String resId = potential_update.getResultId(); + potential_update + .getProjectSet() + .stream() + .forEach( + pId -> { + new_relations + .add( + getRelation( + resId, + pId, + RELATION_RESULT_PROJECT_REL_CLASS, + RELATION_RESULTPROJECT_REL_TYPE, + RELATION_RESULTPROJECT_SUBREL_TYPE, + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID, + PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME)); + new_relations + .add( + getRelation( + pId, + resId, + RELATION_PROJECT_RESULT_REL_CLASS, + RELATION_RESULTPROJECT_REL_TYPE, + RELATION_RESULTPROJECT_SUBREL_TYPE, + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_ID, + PROPAGATION_RELATION_RESULT_PROJECT_SEM_REL_CLASS_NAME)); + }); + return new_relations.iterator(); + }, + Encoders.bean(Relation.class)); + } - private static Dataset readAssocResultProjects( - SparkSession spark, String potentialUpdatePath) { - return spark.read() - .textFile(potentialUpdatePath) - .map( - value -> OBJECT_MAPPER.readValue(value, ResultProjectSet.class), - Encoders.bean(ResultProjectSet.class)); - } + private static Dataset readAssocResultProjects( + SparkSession spark, String potentialUpdatePath) { + return spark + .read() + .textFile(potentialUpdatePath) + .map( + value -> OBJECT_MAPPER.readValue(value, ResultProjectSet.class), + Encoders.bean(ResultProjectSet.class)); + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/OrganizationMap.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/OrganizationMap.java index 9a42f3f7e..7d786058a 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/OrganizationMap.java +++ 
b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/OrganizationMap.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.resulttocommunityfromorganization; import java.util.ArrayList; @@ -6,15 +7,15 @@ import java.util.List; public class OrganizationMap extends HashMap> { - public OrganizationMap() { - super(); - } + public OrganizationMap() { + super(); + } - public List get(String key) { + public List get(String key) { - if (super.get(key) == null) { - return new ArrayList<>(); - } - return super.get(key); - } + if (super.get(key) == null) { + return new ArrayList<>(); + } + return super.get(key); + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java index 9e62f9b4f..fbe598e89 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java @@ -1,129 +1,133 @@ + package eu.dnetlib.dhp.resulttocommunityfromorganization; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.gson.Gson; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; import java.util.*; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.sql.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.gson.Gson; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; + public class PrepareResultCommunitySet { - private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySet.class); + private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySet.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - PrepareResultCommunitySet.class.getResourceAsStream( - "/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json")); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + PrepareResultCommunitySet.class + .getResourceAsStream( + "/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean isSparkSessionManaged = isSparkSessionManaged(parser); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("sourcePath"); + 
log.info("inputPath: {}", inputPath); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - final OrganizationMap organizationMap = - new Gson() - .fromJson( - parser.get("organizationtoresultcommunitymap"), - OrganizationMap.class); - log.info("organizationMap: {}", new Gson().toJson(organizationMap)); + final OrganizationMap organizationMap = new Gson() + .fromJson( + parser.get("organizationtoresultcommunitymap"), + OrganizationMap.class); + log.info("organizationMap: {}", new Gson().toJson(organizationMap)); - SparkConf conf = new SparkConf(); - conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); - runWithSparkHiveSession( - conf, - isSparkSessionManaged, - spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - prepareInfo(spark, inputPath, outputPath, organizationMap); - }); - } + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + prepareInfo(spark, inputPath, outputPath, organizationMap); + }); + } - private static void prepareInfo( - SparkSession spark, - String inputPath, - String outputPath, - OrganizationMap organizationMap) { - Dataset relation = readRelations(spark, inputPath); - relation.createOrReplaceTempView("relation"); + private static void prepareInfo( + SparkSession spark, + String inputPath, + String outputPath, + OrganizationMap organizationMap) { + Dataset relation = readRelations(spark, inputPath); + relation.createOrReplaceTempView("relation"); - String query = - "SELECT result_organization.source resultId, result_organization.target orgId, org_set merges " - + "FROM (SELECT source, target " - + " FROM relation " - + " WHERE datainfo.deletedbyinference = false " - + " AND relClass = '" - + RELATION_RESULT_ORGANIZATION_REL_CLASS - + "') result_organization " - + "LEFT JOIN (SELECT source, collect_set(target) org_set " - + " FROM relation " - + " WHERE datainfo.deletedbyinference = false " - + " AND relClass = '" - + RELATION_REPRESENTATIVERESULT_RESULT_CLASS - + "' " - + " GROUP BY source) organization_organization " - + "ON result_organization.target = organization_organization.source "; + String query = "SELECT result_organization.source resultId, result_organization.target orgId, org_set merges " + + "FROM (SELECT source, target " + + " FROM relation " + + " WHERE datainfo.deletedbyinference = false " + + " AND relClass = '" + + RELATION_RESULT_ORGANIZATION_REL_CLASS + + "') result_organization " + + "LEFT JOIN (SELECT source, collect_set(target) org_set " + + " FROM relation " + + " WHERE datainfo.deletedbyinference = false " + + " AND relClass = '" + + RELATION_REPRESENTATIVERESULT_RESULT_CLASS + + "' " + + " GROUP BY source) organization_organization " + + "ON result_organization.target = organization_organization.source "; - org.apache.spark.sql.Dataset result_organizationset = - spark.sql(query).as(Encoders.bean(ResultOrganizations.class)); + org.apache.spark.sql.Dataset result_organizationset = spark + .sql(query) + .as(Encoders.bean(ResultOrganizations.class)); - result_organizationset - .map( - value -> { - String rId = value.getResultId(); - Optional> orgs = Optional.ofNullable(value.getMerges()); - String oTarget = value.getOrgId(); - Set communitySet = new HashSet<>(); - if 
(organizationMap.containsKey(oTarget)) { - communitySet.addAll(organizationMap.get(oTarget)); - } - if (orgs.isPresent()) - // try{ - for (String oId : orgs.get()) { - if (organizationMap.containsKey(oId)) { - communitySet.addAll(organizationMap.get(oId)); - } - } - // }catch(Exception e){ - // - // } - if (communitySet.size() > 0) { - ResultCommunityList rcl = new ResultCommunityList(); - rcl.setResultId(rId); - ArrayList communityList = new ArrayList<>(); - communityList.addAll(communitySet); - rcl.setCommunityList(communityList); - return rcl; - } - return null; - }, - Encoders.bean(ResultCommunityList.class)) - .filter(r -> r != null) - .toJSON() - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .text(outputPath); - } + result_organizationset + .map( + value -> { + String rId = value.getResultId(); + Optional> orgs = Optional.ofNullable(value.getMerges()); + String oTarget = value.getOrgId(); + Set communitySet = new HashSet<>(); + if (organizationMap.containsKey(oTarget)) { + communitySet.addAll(organizationMap.get(oTarget)); + } + if (orgs.isPresent()) + // try{ + for (String oId : orgs.get()) { + if (organizationMap.containsKey(oId)) { + communitySet.addAll(organizationMap.get(oId)); + } + } + // }catch(Exception e){ + // + // } + if (communitySet.size() > 0) { + ResultCommunityList rcl = new ResultCommunityList(); + rcl.setResultId(rId); + ArrayList communityList = new ArrayList<>(); + communityList.addAll(communitySet); + rcl.setCommunityList(communityList); + return rcl; + } + return null; + }, + Encoders.bean(ResultCommunityList.class)) + .filter(r -> r != null) + .toJSON() + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(outputPath); + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultCommunityList.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultCommunityList.java index 50d9a6d7a..e3275745d 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultCommunityList.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultCommunityList.java @@ -1,25 +1,26 @@ + package eu.dnetlib.dhp.resulttocommunityfromorganization; import java.io.Serializable; import java.util.ArrayList; public class ResultCommunityList implements Serializable { - private String resultId; - private ArrayList communityList; + private String resultId; + private ArrayList communityList; - public String getResultId() { - return resultId; - } + public String getResultId() { + return resultId; + } - public void setResultId(String resultId) { - this.resultId = resultId; - } + public void setResultId(String resultId) { + this.resultId = resultId; + } - public ArrayList getCommunityList() { - return communityList; - } + public ArrayList getCommunityList() { + return communityList; + } - public void setCommunityList(ArrayList communityList) { - this.communityList = communityList; - } + public void setCommunityList(ArrayList communityList) { + this.communityList = communityList; + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultOrganizations.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultOrganizations.java index 53a5fccdf..3ea9d41d6 100644 --- 
a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultOrganizations.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultOrganizations.java @@ -1,34 +1,35 @@ + package eu.dnetlib.dhp.resulttocommunityfromorganization; import java.io.Serializable; import java.util.ArrayList; public class ResultOrganizations implements Serializable { - private String resultId; - private String orgId; - private ArrayList merges; + private String resultId; + private String orgId; + private ArrayList merges; - public String getResultId() { - return resultId; - } + public String getResultId() { + return resultId; + } - public void setResultId(String resultId) { - this.resultId = resultId; - } + public void setResultId(String resultId) { + this.resultId = resultId; + } - public String getOrgId() { - return orgId; - } + public String getOrgId() { + return orgId; + } - public void setOrgId(String orgId) { - this.orgId = orgId; - } + public void setOrgId(String orgId) { + this.orgId = orgId; + } - public ArrayList getMerges() { - return merges; - } + public ArrayList getMerges() { + return merges; + } - public void setMerges(ArrayList merges) { - this.merges = merges; - } + public void setMerges(ArrayList merges) { + this.merges = merges; + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob2.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob2.java index 3c5b0a04c..74931a537 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob2.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob2.java @@ -1,13 +1,12 @@ + package eu.dnetlib.dhp.resulttocommunityfromorganization; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.*; import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.sql.Encoders; @@ -16,111 +15,119 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.*; + public class SparkResultToCommunityFromOrganizationJob2 { - private static final Logger log = - LoggerFactory.getLogger(SparkResultToCommunityFromOrganizationJob2.class); + private static final Logger log = LoggerFactory.getLogger(SparkResultToCommunityFromOrganizationJob2.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - SparkResultToCommunityFromOrganizationJob2.class.getResourceAsStream( - "/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json")); + public static void main(String[] args) throws Exception { + String 
jsonConfiguration = IOUtils + .toString( + SparkResultToCommunityFromOrganizationJob2.class + .getResourceAsStream( + "/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean isSparkSessionManaged = isSparkSessionManaged(parser); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - final String possibleupdatespath = parser.get("preparedInfoPath"); - log.info("preparedInfoPath: {}", possibleupdatespath); + final String possibleupdatespath = parser.get("preparedInfoPath"); + log.info("preparedInfoPath: {}", possibleupdatespath); - final String resultClassName = parser.get("resultTableName"); - log.info("resultTableName: {}", resultClassName); + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); - final Boolean saveGraph = - Optional.ofNullable(parser.get("saveGraph")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("saveGraph: {}", saveGraph); + final Boolean saveGraph = Optional + .ofNullable(parser.get("saveGraph")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("saveGraph: {}", saveGraph); - Class resultClazz = - (Class) Class.forName(resultClassName); + Class resultClazz = (Class) Class.forName(resultClassName); - SparkConf conf = new SparkConf(); - conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); - runWithSparkHiveSession( - conf, - isSparkSessionManaged, - spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath); - }); - } + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + execPropagation(spark, inputPath, outputPath, resultClazz, possibleupdatespath); + }); + } - private static void execPropagation( - SparkSession spark, - String inputPath, - String outputPath, - Class resultClazz, - String possibleUpdatesPath) { - org.apache.spark.sql.Dataset possibleUpdates = - readResultCommunityList(spark, possibleUpdatesPath); - org.apache.spark.sql.Dataset result = readPathEntity(spark, inputPath, resultClazz); + private static void execPropagation( + SparkSession spark, + String inputPath, + String outputPath, + Class resultClazz, + String possibleUpdatesPath) { + org.apache.spark.sql.Dataset possibleUpdates = readResultCommunityList( + spark, possibleUpdatesPath); + org.apache.spark.sql.Dataset result = readPathEntity(spark, inputPath, resultClazz); - result.joinWith( - possibleUpdates, - result.col("id").equalTo(possibleUpdates.col("resultId")), - "left_outer") - .map( - value -> { - R ret = value._1(); - 
Optional rcl = Optional.ofNullable(value._2()); - if (rcl.isPresent()) { - ArrayList communitySet = rcl.get().getCommunityList(); - List contextList = - ret.getContext().stream() - .map(con -> con.getId()) - .collect(Collectors.toList()); - Result res = new Result(); - res.setId(ret.getId()); - List propagatedContexts = new ArrayList<>(); - for (String cId : communitySet) { - if (!contextList.contains(cId)) { - Context newContext = new Context(); - newContext.setId(cId); - newContext.setDataInfo( - Arrays.asList( - getDataInfo( - PROPAGATION_DATA_INFO_TYPE, - PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID, - PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME))); - propagatedContexts.add(newContext); - } - } - res.setContext(propagatedContexts); - ret.mergeFrom(res); - } - return ret; - }, - Encoders.bean(resultClazz)) - .toJSON() - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .text(outputPath); - } + result + .joinWith( + possibleUpdates, + result.col("id").equalTo(possibleUpdates.col("resultId")), + "left_outer") + .map( + value -> { + R ret = value._1(); + Optional rcl = Optional.ofNullable(value._2()); + if (rcl.isPresent()) { + ArrayList communitySet = rcl.get().getCommunityList(); + List contextList = ret + .getContext() + .stream() + .map(con -> con.getId()) + .collect(Collectors.toList()); + Result res = new Result(); + res.setId(ret.getId()); + List propagatedContexts = new ArrayList<>(); + for (String cId : communitySet) { + if (!contextList.contains(cId)) { + Context newContext = new Context(); + newContext.setId(cId); + newContext + .setDataInfo( + Arrays + .asList( + getDataInfo( + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_ID, + PROPAGATION_RESULT_COMMUNITY_ORGANIZATION_CLASS_NAME))); + propagatedContexts.add(newContext); + } + } + res.setContext(propagatedContexts); + ret.mergeFrom(res); + } + return ret; + }, + Encoders.bean(resultClazz)) + .toJSON() + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(outputPath); + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java index cbb9b580e..5aef1c370 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java @@ -1,17 +1,12 @@ + package eu.dnetlib.dhp.resulttocommunityfromsemrel; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.gson.Gson; -import eu.dnetlib.dhp.QueryInformationSystem; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Result; import java.util.Arrays; import java.util.List; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -23,154 +18,158 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.gson.Gson; 
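Note on the import hunks in this and the surrounding files: besides re-indentation, the patch regroups imports. Judging only from the hunks themselves, the order being enforced appears to be: static imports, then java.* packages, then third-party org.* packages, then com.* packages, then the project's own eu.dnetlib.* packages, with a blank line between groups. For reference, the post-patch header of PrepareResultCommunitySetStep1 (abridged) illustrates the grouping:

package eu.dnetlib.dhp.resulttocommunityfromsemrel;

import static eu.dnetlib.dhp.PropagationConstant.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

import java.util.Arrays;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.gson.Gson;

import eu.dnetlib.dhp.QueryInformationSystem;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;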
+ +import eu.dnetlib.dhp.QueryInformationSystem; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Result; + public class PrepareResultCommunitySetStep1 { - private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySetStep1.class); + private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySetStep1.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - PrepareResultCommunitySetStep1.class.getResourceAsStream( - "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json")); + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + PrepareResultCommunitySetStep1.class + .getResourceAsStream( + "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean isSparkSessionManaged = isSparkSessionManaged(parser); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - final String resultClassName = parser.get("resultTableName"); - log.info("resultTableName: {}", resultClassName); + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); - SparkConf conf = new SparkConf(); - conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); - final List allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";")); - log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel)); + final List allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";")); + log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel)); - final String isLookupUrl = parser.get("isLookupUrl"); - log.info("isLookupUrl: {}", isLookupUrl); + final String isLookupUrl = parser.get("isLookupUrl"); + log.info("isLookupUrl: {}", isLookupUrl); - final List communityIdList = QueryInformationSystem.getCommunityList(isLookupUrl); - log.info("communityIdList: {}", new Gson().toJson(communityIdList)); + final List communityIdList = QueryInformationSystem.getCommunityList(isLookupUrl); + log.info("communityIdList: {}", new Gson().toJson(communityIdList)); - final String resultType = - resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase(); - log.info("resultType: {}", resultType); + final String resultType = 
resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase(); + log.info("resultType: {}", resultType); - Class resultClazz = - (Class) Class.forName(resultClassName); + Class resultClazz = (Class) Class.forName(resultClassName); - runWithSparkHiveSession( - conf, - isSparkSessionManaged, - spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - prepareInfo( - spark, - inputPath, - outputPath, - allowedsemrel, - resultClazz, - resultType, - communityIdList); - }); - } + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + prepareInfo( + spark, + inputPath, + outputPath, + allowedsemrel, + resultClazz, + resultType, + communityIdList); + }); + } - private static void prepareInfo( - SparkSession spark, - String inputPath, - String outputPath, - List allowedsemrel, - Class resultClazz, - String resultType, - List communityIdList) { - // read the relation table and the table related to the result it is using - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - org.apache.spark.sql.Dataset relation = - spark.createDataset( - sc.textFile(inputPath + "/relation") - .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)) - .rdd(), - Encoders.bean(Relation.class)); - relation.createOrReplaceTempView("relation"); + private static void prepareInfo( + SparkSession spark, + String inputPath, + String outputPath, + List allowedsemrel, + Class resultClazz, + String resultType, + List communityIdList) { + // read the relation table and the table related to the result it is using + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + org.apache.spark.sql.Dataset relation = spark + .createDataset( + sc + .textFile(inputPath + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)) + .rdd(), + Encoders.bean(Relation.class)); + relation.createOrReplaceTempView("relation"); - log.info("Reading Graph table from: {}", inputPath + "/" + resultType); - Dataset result = readPathEntity(spark, inputPath + "/" + resultType, resultClazz); + log.info("Reading Graph table from: {}", inputPath + "/" + resultType); + Dataset result = readPathEntity(spark, inputPath + "/" + resultType, resultClazz); - result.createOrReplaceTempView("result"); + result.createOrReplaceTempView("result"); - getPossibleResultcommunityAssociation( - spark, allowedsemrel, outputPath + "/" + resultType, communityIdList); - } + getPossibleResultcommunityAssociation( + spark, allowedsemrel, outputPath + "/" + resultType, communityIdList); + } - private static void getPossibleResultcommunityAssociation( - SparkSession spark, - List allowedsemrel, - String outputPath, - List communityIdList) { + private static void getPossibleResultcommunityAssociation( + SparkSession spark, + List allowedsemrel, + String outputPath, + List communityIdList) { - String communitylist = getConstraintList(" co.id = '", communityIdList); - String semrellist = getConstraintList(" relClass = '", allowedsemrel); + String communitylist = getConstraintList(" co.id = '", communityIdList); + String semrellist = getConstraintList(" relClass = '", allowedsemrel); - /* - associates to each result the set of community contexts they are associated to - select id, collect_set(co.id) community_context " + - " from result " + - " lateral view explode (context) c as co " + - " where datainfo.deletedbyinference = false "+ communitylist + - " group by id + /* + * associates to each result the set of 
community contexts they are associated to select id, collect_set(co.id) + * community_context " + " from result " + " lateral view explode (context) c as co " + + * " where datainfo.deletedbyinference = false "+ communitylist + " group by id associates to each target + * of a relation with allowed semantics the set of community context it could possibly inherit from the source + * of the relation + */ + String query = "Select target resultId, community_context " + + "from (select id, collect_set(co.id) community_context " + + " from result " + + " lateral view explode (context) c as co " + + " where datainfo.deletedbyinference = false " + + communitylist + + " group by id) p " + + "JOIN " + + "(select source, target " + + "from relation " + + "where datainfo.deletedbyinference = false " + + semrellist + + ") r " + + "ON p.id = r.source"; - associates to each target of a relation with allowed semantics the set of community context it could possibly - inherit from the source of the relation - */ - String query = - "Select target resultId, community_context " - + "from (select id, collect_set(co.id) community_context " - + " from result " - + " lateral view explode (context) c as co " - + " where datainfo.deletedbyinference = false " - + communitylist - + " group by id) p " - + "JOIN " - + "(select source, target " - + "from relation " - + "where datainfo.deletedbyinference = false " - + semrellist - + ") r " - + "ON p.id = r.source"; + org.apache.spark.sql.Dataset result_context = spark.sql(query); + result_context.createOrReplaceTempView("result_context"); - org.apache.spark.sql.Dataset result_context = spark.sql(query); - result_context.createOrReplaceTempView("result_context"); + // ( target, (mes, dh-ch-, ni)) + /* + * a dataset for example could be linked to more than one publication. For each publication linked to that + * dataset the previous query will produce a row: targetId set of community context the target could possibly + * inherit with the following query there will be a single row for each result linked to more than one result of + * the result type currently being used + */ + query = "select resultId , collect_set(co) communityList " + + "from result_context " + + "lateral view explode (community_context) c as co " + + "where length(co) > 0 " + + "group by resultId"; - // ( target, (mes, dh-ch-, ni)) - /* - a dataset for example could be linked to more than one publication. 
For each publication linked to that dataset - the previous query will produce a row: targetId set of community context the target could possibly inherit - with the following query there will be a single row for each result linked to more than one result of the result type - currently being used - */ - query = - "select resultId , collect_set(co) communityList " - + "from result_context " - + "lateral view explode (community_context) c as co " - + "where length(co) > 0 " - + "group by resultId"; - - spark.sql(query) - .as(Encoders.bean(ResultCommunityList.class)) - .toJavaRDD() - .map(r -> OBJECT_MAPPER.writeValueAsString(r)) - .saveAsTextFile(outputPath, GzipCodec.class); - } + spark + .sql(query) + .as(Encoders.bean(ResultCommunityList.class)) + .toJavaRDD() + .map(r -> OBJECT_MAPPER.writeValueAsString(r)) + .saveAsTextFile(outputPath, GzipCodec.class); + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java index 3579db9e6..cbd7e5e50 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java @@ -1,13 +1,12 @@ + package eu.dnetlib.dhp.resulttocommunityfromsemrel; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; import java.util.HashSet; import java.util.Set; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; @@ -16,89 +15,98 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; import scala.Tuple2; public class PrepareResultCommunitySetStep2 { - private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySetStep2.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySetStep2.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - PrepareResultCommunitySetStep2.class.getResourceAsStream( - "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json")); + String jsonConfiguration = IOUtils + .toString( + PrepareResultCommunitySetStep2.class + .getResourceAsStream( + "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean 
isSparkSessionManaged = isSparkSessionManaged(parser); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - SparkConf conf = new SparkConf(); + SparkConf conf = new SparkConf(); - runWithSparkSession( - conf, - isSparkSessionManaged, - spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - mergeInfo(spark, inputPath, outputPath); - }); - } + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + mergeInfo(spark, inputPath, outputPath); + }); + } - private static void mergeInfo(SparkSession spark, String inputPath, String outputPath) { + private static void mergeInfo(SparkSession spark, String inputPath, String outputPath) { - Dataset resultOrcidAssocCommunityList = - readResultCommunityList(spark, inputPath + "/publication") - .union(readResultCommunityList(spark, inputPath + "/dataset")) - .union(readResultCommunityList(spark, inputPath + "/otherresearchproduct")) - .union(readResultCommunityList(spark, inputPath + "/software")); + Dataset resultOrcidAssocCommunityList = readResultCommunityList( + spark, inputPath + "/publication") + .union(readResultCommunityList(spark, inputPath + "/dataset")) + .union(readResultCommunityList(spark, inputPath + "/otherresearchproduct")) + .union(readResultCommunityList(spark, inputPath + "/software")); - resultOrcidAssocCommunityList - .toJavaRDD() - .mapToPair(r -> new Tuple2<>(r.getResultId(), r)) - .reduceByKey( - (a, b) -> { - if (a == null) { - return b; - } - if (b == null) { - return a; - } - Set community_set = new HashSet<>(); + resultOrcidAssocCommunityList + .toJavaRDD() + .mapToPair(r -> new Tuple2<>(r.getResultId(), r)) + .reduceByKey( + (a, b) -> { + if (a == null) { + return b; + } + if (b == null) { + return a; + } + Set community_set = new HashSet<>(); - a.getCommunityList().stream().forEach(aa -> community_set.add(aa)); + a.getCommunityList().stream().forEach(aa -> community_set.add(aa)); - b.getCommunityList().stream() - .forEach( - aa -> { - if (!community_set.contains(aa)) { - a.getCommunityList().add(aa); - community_set.add(aa); - } - }); - return a; - }) - .map(c -> c._2()) - .map(r -> OBJECT_MAPPER.writeValueAsString(r)) - .saveAsTextFile(outputPath, GzipCodec.class); - } + b + .getCommunityList() + .stream() + .forEach( + aa -> { + if (!community_set.contains(aa)) { + a.getCommunityList().add(aa); + community_set.add(aa); + } + }); + return a; + }) + .map(c -> c._2()) + .map(r -> OBJECT_MAPPER.writeValueAsString(r)) + .saveAsTextFile(outputPath, GzipCodec.class); + } - private static Dataset readResultCommunityList( - SparkSession spark, String relationPath) { - return spark.read() - .textFile(relationPath) - .map( - value -> OBJECT_MAPPER.readValue(value, ResultCommunityList.class), - Encoders.bean(ResultCommunityList.class)); - } + private static Dataset readResultCommunityList( + SparkSession spark, String relationPath) { + return spark + .read() + .textFile(relationPath) + .map( + value -> OBJECT_MAPPER.readValue(value, 
ResultCommunityList.class), + Encoders.bean(ResultCommunityList.class)); + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob4.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob4.java index 4e72fac27..b513ddd79 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob4.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob4.java @@ -1,15 +1,12 @@ + package eu.dnetlib.dhp.resulttocommunityfromsemrel; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.ximpleware.extended.xpath.parser; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.*; import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.sql.Encoders; @@ -18,119 +15,130 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.ximpleware.extended.xpath.parser; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; +import eu.dnetlib.dhp.schema.oaf.*; + public class SparkResultToCommunityThroughSemRelJob4 { - private static final Logger log = - LoggerFactory.getLogger(SparkResultToCommunityThroughSemRelJob4.class); + private static final Logger log = LoggerFactory.getLogger(SparkResultToCommunityThroughSemRelJob4.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - SparkResultToCommunityThroughSemRelJob4.class.getResourceAsStream( - "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json")); + String jsonConfiguration = IOUtils + .toString( + SparkResultToCommunityThroughSemRelJob4.class + .getResourceAsStream( + "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean isSparkSessionManaged = isSparkSessionManaged(parser); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - 
final String preparedInfoPath = parser.get("preparedInfoPath"); - log.info("preparedInfoPath: {}", preparedInfoPath); + final String preparedInfoPath = parser.get("preparedInfoPath"); + log.info("preparedInfoPath: {}", preparedInfoPath); - SparkConf conf = new SparkConf(); - conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); - final String resultClassName = parser.get("resultTableName"); - log.info("resultTableName: {}", resultClassName); + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); - final Boolean saveGraph = - Optional.ofNullable(parser.get("saveGraph")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("saveGraph: {}", saveGraph); + final Boolean saveGraph = Optional + .ofNullable(parser.get("saveGraph")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("saveGraph: {}", saveGraph); - Class resultClazz = - (Class) Class.forName(resultClassName); + Class resultClazz = (Class) Class.forName(resultClassName); - runWithSparkHiveSession( - conf, - isSparkSessionManaged, - spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - if (saveGraph) { - execPropagation( - spark, inputPath, outputPath, preparedInfoPath, resultClazz); - } - }); - } + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + if (saveGraph) { + execPropagation( + spark, inputPath, outputPath, preparedInfoPath, resultClazz); + } + }); + } - private static void execPropagation( - SparkSession spark, - String inputPath, - String outputPath, - String preparedInfoPath, - Class resultClazz) { + private static void execPropagation( + SparkSession spark, + String inputPath, + String outputPath, + String preparedInfoPath, + Class resultClazz) { - org.apache.spark.sql.Dataset possibleUpdates = - readResultCommunityList(spark, preparedInfoPath); - org.apache.spark.sql.Dataset result = readPathEntity(spark, inputPath, resultClazz); + org.apache.spark.sql.Dataset possibleUpdates = readResultCommunityList( + spark, preparedInfoPath); + org.apache.spark.sql.Dataset result = readPathEntity(spark, inputPath, resultClazz); - result.joinWith( - possibleUpdates, - result.col("id").equalTo(possibleUpdates.col("resultId")), - "left_outer") - .map( - value -> { - R ret = value._1(); - Optional rcl = Optional.ofNullable(value._2()); - if (rcl.isPresent()) { - Set context_set = new HashSet<>(); - ret.getContext().stream().forEach(c -> context_set.add(c.getId())); - List contextList = - rcl.get().getCommunityList().stream() - .map( - c -> { - if (!context_set.contains(c)) { - Context newContext = new Context(); - newContext.setId(c); - newContext.setDataInfo( - Arrays.asList( - getDataInfo( - PROPAGATION_DATA_INFO_TYPE, - PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, - PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME))); - return newContext; - } - return null; - }) - .filter(c -> c != null) - .collect(Collectors.toList()); - Result r = new Result(); - r.setId(ret.getId()); - r.setContext(contextList); - ret.mergeFrom(r); - } + result + .joinWith( + possibleUpdates, + result.col("id").equalTo(possibleUpdates.col("resultId")), + "left_outer") + .map( + value -> { + R ret = value._1(); + Optional rcl = Optional.ofNullable(value._2()); + if (rcl.isPresent()) { + Set context_set = new HashSet<>(); + ret.getContext().stream().forEach(c -> 
context_set.add(c.getId())); + List contextList = rcl + .get() + .getCommunityList() + .stream() + .map( + c -> { + if (!context_set.contains(c)) { + Context newContext = new Context(); + newContext.setId(c); + newContext + .setDataInfo( + Arrays + .asList( + getDataInfo( + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_ID, + PROPAGATION_RESULT_COMMUNITY_SEMREL_CLASS_NAME))); + return newContext; + } + return null; + }) + .filter(c -> c != null) + .collect(Collectors.toList()); + Result r = new Result(); + r.setId(ret.getId()); + r.setContext(contextList); + ret.mergeFrom(r); + } - return ret; - }, - Encoders.bean(resultClazz)) - .toJSON() - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .text(outputPath); - } + return ret; + }, + Encoders.bean(resultClazz)) + .toJSON() + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(outputPath); + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/DatasourceOrganization.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/DatasourceOrganization.java index e13e2a68d..e6b13dfa4 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/DatasourceOrganization.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/DatasourceOrganization.java @@ -1,25 +1,26 @@ + package eu.dnetlib.dhp.resulttoorganizationfrominstrepo; import java.io.Serializable; public class DatasourceOrganization implements Serializable { - private String datasourceId; - private String organizationId; + private String datasourceId; + private String organizationId; - public String getDatasourceId() { - return datasourceId; - } + public String getDatasourceId() { + return datasourceId; + } - public void setDatasourceId(String datasourceId) { - this.datasourceId = datasourceId; - } + public void setDatasourceId(String datasourceId) { + this.datasourceId = datasourceId; + } - public String getOrganizationId() { - return organizationId; - } + public String getOrganizationId() { + return organizationId; + } - public void setOrganizationId(String organizationId) { - this.organizationId = organizationId; - } + public void setOrganizationId(String organizationId) { + this.organizationId = organizationId; + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java index 095f476cf..02faf0086 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java @@ -1,13 +1,9 @@ + package eu.dnetlib.dhp.resulttoorganizationfrominstrepo; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Datasource; -import eu.dnetlib.dhp.schema.oaf.Organization; -import eu.dnetlib.dhp.schema.oaf.Relation; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.compress.GzipCodec; import 
org.apache.spark.SparkConf; @@ -18,121 +14,131 @@ import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Datasource; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.Relation; + public class PrepareResultInstRepoAssociation { - private static final Logger log = - LoggerFactory.getLogger(PrepareResultInstRepoAssociation.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final Logger log = LoggerFactory.getLogger(PrepareResultInstRepoAssociation.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - PrepareResultInstRepoAssociation.class.getResourceAsStream( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json")); + String jsonConfiguration = IOUtils + .toString( + PrepareResultInstRepoAssociation.class + .getResourceAsStream( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean isSparkSessionManaged = isSparkSessionManaged(parser); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - final String datasourceOrganizationPath = parser.get("datasourceOrganizationPath"); - log.info("datasourceOrganizationPath {}: ", datasourceOrganizationPath); + final String datasourceOrganizationPath = parser.get("datasourceOrganizationPath"); + log.info("datasourceOrganizationPath {}: ", datasourceOrganizationPath); - final String alreadyLinkedPath = parser.get("alreadyLinkedPath"); - log.info("alreadyLinkedPath {}: ", alreadyLinkedPath); + final String alreadyLinkedPath = parser.get("alreadyLinkedPath"); + log.info("alreadyLinkedPath {}: ", alreadyLinkedPath); - SparkConf conf = new SparkConf(); - conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); - runWithSparkHiveSession( - conf, - isSparkSessionManaged, - spark -> { - readNeededResources(spark, inputPath); - prepareDatasourceOrganizationAssociations( - spark, datasourceOrganizationPath, alreadyLinkedPath); - prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath); - }); - } + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + readNeededResources(spark, inputPath); + prepareDatasourceOrganizationAssociations( + spark, datasourceOrganizationPath, alreadyLinkedPath); + prepareAlreadyLinkedAssociation(spark, alreadyLinkedPath); + }); + } - private static void prepareAlreadyLinkedAssociation( - SparkSession spark, String alreadyLinkedPath) { - String query = - "Select source resultId, collect_set(target) 
organizationSet " - + "from relation " - + "where datainfo.deletedbyinference = false " - + "and relClass = '" - + RELATION_RESULT_ORGANIZATION_REL_CLASS - + "' " - + "group by source"; + private static void prepareAlreadyLinkedAssociation( + SparkSession spark, String alreadyLinkedPath) { + String query = "Select source resultId, collect_set(target) organizationSet " + + "from relation " + + "where datainfo.deletedbyinference = false " + + "and relClass = '" + + RELATION_RESULT_ORGANIZATION_REL_CLASS + + "' " + + "group by source"; - spark.sql(query) - .as(Encoders.bean(ResultOrganizationSet.class)) - .toJavaRDD() - .map(r -> OBJECT_MAPPER.writeValueAsString(r)) - .saveAsTextFile(alreadyLinkedPath, GzipCodec.class); - } + spark + .sql(query) + .as(Encoders.bean(ResultOrganizationSet.class)) + .toJavaRDD() + .map(r -> OBJECT_MAPPER.writeValueAsString(r)) + .saveAsTextFile(alreadyLinkedPath, GzipCodec.class); + } - private static void readNeededResources(SparkSession spark, String inputPath) { - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + private static void readNeededResources(SparkSession spark, String inputPath) { + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - org.apache.spark.sql.Dataset datasource = - spark.createDataset( - sc.textFile(inputPath + "/datasource") - .map(item -> new ObjectMapper().readValue(item, Datasource.class)) - .rdd(), - Encoders.bean(Datasource.class)); + org.apache.spark.sql.Dataset datasource = spark + .createDataset( + sc + .textFile(inputPath + "/datasource") + .map(item -> new ObjectMapper().readValue(item, Datasource.class)) + .rdd(), + Encoders.bean(Datasource.class)); - org.apache.spark.sql.Dataset relation = - spark.createDataset( - sc.textFile(inputPath + "/relation") - .map(item -> new ObjectMapper().readValue(item, Relation.class)) - .rdd(), - Encoders.bean(Relation.class)); + org.apache.spark.sql.Dataset relation = spark + .createDataset( + sc + .textFile(inputPath + "/relation") + .map(item -> new ObjectMapper().readValue(item, Relation.class)) + .rdd(), + Encoders.bean(Relation.class)); - org.apache.spark.sql.Dataset organization = - spark.createDataset( - sc.textFile(inputPath + "/organization") - .map(item -> new ObjectMapper().readValue(item, Organization.class)) - .rdd(), - Encoders.bean(Organization.class)); + org.apache.spark.sql.Dataset organization = spark + .createDataset( + sc + .textFile(inputPath + "/organization") + .map(item -> new ObjectMapper().readValue(item, Organization.class)) + .rdd(), + Encoders.bean(Organization.class)); - datasource.createOrReplaceTempView("datasource"); - relation.createOrReplaceTempView("relation"); - organization.createOrReplaceTempView("organization"); - } + datasource.createOrReplaceTempView("datasource"); + relation.createOrReplaceTempView("relation"); + organization.createOrReplaceTempView("organization"); + } - private static void prepareDatasourceOrganizationAssociations( - SparkSession spark, String datasourceOrganizationPath, String alreadyLinkedPath) { + private static void prepareDatasourceOrganizationAssociations( + SparkSession spark, String datasourceOrganizationPath, String alreadyLinkedPath) { - String query = - "SELECT source datasourceId, target organizationId " - + "FROM ( SELECT id " - + "FROM datasource " - + "WHERE datasourcetype.classid = '" - + INSTITUTIONAL_REPO_TYPE - + "' " - + "AND datainfo.deletedbyinference = false ) d " - + "JOIN ( SELECT source, target " - + "FROM relation " - + "WHERE relclass = '" - + 
RELATION_DATASOURCE_ORGANIZATION_REL_CLASS - + "' " - + "AND datainfo.deletedbyinference = false ) rel " - + "ON d.id = rel.source "; + String query = "SELECT source datasourceId, target organizationId " + + "FROM ( SELECT id " + + "FROM datasource " + + "WHERE datasourcetype.classid = '" + + INSTITUTIONAL_REPO_TYPE + + "' " + + "AND datainfo.deletedbyinference = false ) d " + + "JOIN ( SELECT source, target " + + "FROM relation " + + "WHERE relclass = '" + + RELATION_DATASOURCE_ORGANIZATION_REL_CLASS + + "' " + + "AND datainfo.deletedbyinference = false ) rel " + + "ON d.id = rel.source "; - spark.sql(query) - .as(Encoders.bean(DatasourceOrganization.class)) - .toJSON() - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .text(datasourceOrganizationPath); - } + spark + .sql(query) + .as(Encoders.bean(DatasourceOrganization.class)) + .toJSON() + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(datasourceOrganizationPath); + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultOrganizationSet.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultOrganizationSet.java index bad581c1d..3bce14cdb 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultOrganizationSet.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/ResultOrganizationSet.java @@ -1,25 +1,26 @@ + package eu.dnetlib.dhp.resulttoorganizationfrominstrepo; import java.io.Serializable; import java.util.ArrayList; public class ResultOrganizationSet implements Serializable { - private String resultId; - private ArrayList organizationSet; + private String resultId; + private ArrayList organizationSet; - public String getResultId() { - return resultId; - } + public String getResultId() { + return resultId; + } - public void setResultId(String resultId) { - this.resultId = resultId; - } + public void setResultId(String resultId) { + this.resultId = resultId; + } - public ArrayList getOrganizationSet() { - return organizationSet; - } + public ArrayList getOrganizationSet() { + return organizationSet; + } - public void setOrganizationSet(ArrayList organizationSet) { - this.organizationSet = organizationSet; - } + public void setOrganizationSet(ArrayList organizationSet) { + this.organizationSet = organizationSet; + } } diff --git a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob2.java b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob2.java index db8b99ac7..6bdfa36dd 100644 --- a/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob2.java +++ b/dhp-workflows/dhp-propagation/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob2.java @@ -1,12 +1,11 @@ + package eu.dnetlib.dhp.resulttoorganizationfrominstrepo; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.*; import java.util.*; + import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; 
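Note on the result-to-organization hunks around this point: prepareAlreadyLinkedAssociation above materialises, for every result, the set of organizations it is already linked to, and the SparkResultToOrganizationFromIstRepoJob2 hunks below read those records back as ResultOrganizationSet JSON so that only new links are added to the graph. The following is a minimal, self-contained sketch of that aggregation step, not part of the patch: 'RESULT_ORG' is a placeholder for the RELATION_RESULT_ORGANIZATION_REL_CLASS constant, and the datainfo.deletedbyinference filter is omitted for brevity.

package sketch;

import org.apache.spark.sql.SparkSession;

public class AlreadyLinkedSketch {

	public static void main(String[] args) {
		SparkSession spark = SparkSession
			.builder()
			.appName("already-linked-sketch")
			.master("local[*]")
			.getOrCreate();

		// toy stand-in for the `relation` view registered by readNeededResources;
		// the real table also carries a datainfo.deletedbyinference flag
		spark
			.sql(
				"SELECT * FROM VALUES "
					+ "('result1', 'org1', 'RESULT_ORG'), "
					+ "('result1', 'org2', 'RESULT_ORG'), "
					+ "('result2', 'org1', 'RESULT_ORG') "
					+ "AS relation(source, target, relClass)")
			.createOrReplaceTempView("relation");

		// same aggregation shape as prepareAlreadyLinkedAssociation:
		// one row per result with the set of organizations already linked to it
		spark
			.sql(
				"SELECT source resultId, collect_set(target) organizationSet "
					+ "FROM relation "
					+ "WHERE relClass = 'RESULT_ORG' "
					+ "GROUP BY source")
			.show(false);

		spark.stop();
	}
}

In the patched code the aggregated rows are written as gzip-compressed JSON to alreadyLinkedPath and later read back with OBJECT_MAPPER.readValue(value, ResultOrganizationSet.class) before being handed to getNewRelations together with the potential updates.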
import org.apache.spark.api.java.JavaSparkContext; @@ -16,277 +15,279 @@ import org.apache.spark.sql.*; import org.apache.spark.sql.Dataset; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.*; import scala.Tuple2; public class SparkResultToOrganizationFromIstRepoJob2 { - private static final Logger log = - LoggerFactory.getLogger(SparkResultToOrganizationFromIstRepoJob2.class); + private static final Logger log = LoggerFactory.getLogger(SparkResultToOrganizationFromIstRepoJob2.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - public static void main(String[] args) throws Exception { + public static void main(String[] args) throws Exception { - String jsonConfiguration = - IOUtils.toString( - SparkResultToOrganizationFromIstRepoJob2.class.getResourceAsStream( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json")); + String jsonConfiguration = IOUtils + .toString( + SparkResultToOrganizationFromIstRepoJob2.class + .getResourceAsStream( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json")); - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - parser.parseArgument(args); + parser.parseArgument(args); - Boolean isSparkSessionManaged = isSparkSessionManaged(parser); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - String inputPath = parser.get("sourcePath"); - log.info("inputPath: {}", inputPath); + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); - final String outputPath = parser.get("outputPath"); - log.info("outputPath: {}", outputPath); + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); - final String datasourceorganization = parser.get("datasourceOrganizationPath"); - log.info("datasourceOrganizationPath: {}", datasourceorganization); + final String datasourceorganization = parser.get("datasourceOrganizationPath"); + log.info("datasourceOrganizationPath: {}", datasourceorganization); - final String alreadylinked = parser.get("alreadyLinkedPath"); - log.info("alreadyLinkedPath: {}", alreadylinked); + final String alreadylinked = parser.get("alreadyLinkedPath"); + log.info("alreadyLinkedPath: {}", alreadylinked); - final String resultClassName = parser.get("resultTableName"); - log.info("resultTableName: {}", resultClassName); + final String resultClassName = parser.get("resultTableName"); + log.info("resultTableName: {}", resultClassName); - final String resultType = - resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase(); - log.info("resultType: {}", resultType); + final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase(); + log.info("resultType: {}", resultType); - final Boolean writeUpdates = - Optional.ofNullable(parser.get("writeUpdate")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("writeUpdate: {}", writeUpdates); + final Boolean writeUpdates = Optional + 
.ofNullable(parser.get("writeUpdate")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("writeUpdate: {}", writeUpdates); - final Boolean saveGraph = - Optional.ofNullable(parser.get("saveGraph")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("saveGraph: {}", saveGraph); + final Boolean saveGraph = Optional + .ofNullable(parser.get("saveGraph")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("saveGraph: {}", saveGraph); - Class resultClazz = - (Class) Class.forName(resultClassName); + Class resultClazz = (Class) Class.forName(resultClassName); - SparkConf conf = new SparkConf(); - conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); + SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); - runWithSparkHiveSession( - conf, - isSparkSessionManaged, - spark -> { - if (isTest(parser)) { - removeOutputDir(spark, outputPath); - } - execPropagation( - spark, - datasourceorganization, - alreadylinked, - inputPath, - outputPath, - resultClazz, - resultType, - writeUpdates, - saveGraph); - }); - } + runWithSparkHiveSession( + conf, + isSparkSessionManaged, + spark -> { + if (isTest(parser)) { + removeOutputDir(spark, outputPath); + } + execPropagation( + spark, + datasourceorganization, + alreadylinked, + inputPath, + outputPath, + resultClazz, + resultType, + writeUpdates, + saveGraph); + }); + } - private static void execPropagation( - SparkSession spark, - String datasourceorganization, - String alreadylinked, - String inputPath, - String outputPath, - Class resultClazz, - String resultType, - Boolean writeUpdates, - Boolean saveGraph) { - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + private static void execPropagation( + SparkSession spark, + String datasourceorganization, + String alreadylinked, + String inputPath, + String outputPath, + Class resultClazz, + String resultType, + Boolean writeUpdates, + Boolean saveGraph) { + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - org.apache.spark.sql.Dataset datasourceorganizationassoc = - readAssocDatasourceOrganization(spark, datasourceorganization); + org.apache.spark.sql.Dataset datasourceorganizationassoc = readAssocDatasourceOrganization( + spark, datasourceorganization); - // broadcasting the result of the preparation step - Broadcast> - broadcast_datasourceorganizationassoc = sc.broadcast(datasourceorganizationassoc); + // broadcasting the result of the preparation step + Broadcast> broadcast_datasourceorganizationassoc = sc + .broadcast(datasourceorganizationassoc); - org.apache.spark.sql.Dataset potentialUpdates = - getPotentialRelations( - spark, - inputPath, - resultClazz, - broadcast_datasourceorganizationassoc) - .as(Encoders.bean(ResultOrganizationSet.class)); + org.apache.spark.sql.Dataset potentialUpdates = getPotentialRelations( + spark, + inputPath, + resultClazz, + broadcast_datasourceorganizationassoc) + .as(Encoders.bean(ResultOrganizationSet.class)); - if (writeUpdates) { - createUpdateForRelationWrite(potentialUpdates, outputPath + "/" + resultType); - } + if (writeUpdates) { + createUpdateForRelationWrite(potentialUpdates, outputPath + "/" + resultType); + } - if (saveGraph) { - getNewRelations( - spark.read() - .textFile(alreadylinked) - .map( - value -> - OBJECT_MAPPER.readValue( - value, ResultOrganizationSet.class), - Encoders.bean(ResultOrganizationSet.class)), - potentialUpdates) - .toJSON() - .write() - .mode(SaveMode.Append) - .option("compression", "gzip") - 
.text(outputPath); - } - } + if (saveGraph) { + getNewRelations( + spark + .read() + .textFile(alreadylinked) + .map( + value -> OBJECT_MAPPER + .readValue( + value, ResultOrganizationSet.class), + Encoders.bean(ResultOrganizationSet.class)), + potentialUpdates) + .toJSON() + .write() + .mode(SaveMode.Append) + .option("compression", "gzip") + .text(outputPath); + } + } - private static Dataset getNewRelations( - Dataset alreadyLinked, - Dataset potentialUpdates) { + private static Dataset getNewRelations( + Dataset alreadyLinked, + Dataset potentialUpdates) { - return potentialUpdates - .joinWith( - alreadyLinked, - potentialUpdates.col("resultId").equalTo(alreadyLinked.col("resultId")), - "left_outer") - .flatMap( - (FlatMapFunction< - Tuple2, - Relation>) - value -> { - List new_relations = new ArrayList<>(); - ResultOrganizationSet potential_update = value._1(); - Optional already_linked = - Optional.ofNullable(value._2()); - List organization_list = - potential_update.getOrganizationSet(); - if (already_linked.isPresent()) { - already_linked.get().getOrganizationSet().stream() - .forEach( - rId -> { - if (organization_list.contains(rId)) { - organization_list.remove(rId); - } - }); - } - String resultId = potential_update.getResultId(); - organization_list.stream() - .forEach( - orgId -> { - new_relations.add( - getRelation( - orgId, - resultId, - RELATION_ORGANIZATION_RESULT_REL_CLASS, - RELATION_RESULTORGANIZATION_REL_TYPE, - RELATION_RESULTORGANIZATION_SUBREL_TYPE, - PROPAGATION_DATA_INFO_TYPE, - PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, - PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); - new_relations.add( - getRelation( - resultId, - orgId, - RELATION_RESULT_ORGANIZATION_REL_CLASS, - RELATION_RESULTORGANIZATION_REL_TYPE, - RELATION_RESULTORGANIZATION_SUBREL_TYPE, - PROPAGATION_DATA_INFO_TYPE, - PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, - PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); - }); - return new_relations.iterator(); - }, - Encoders.bean(Relation.class)); - } + return potentialUpdates + .joinWith( + alreadyLinked, + potentialUpdates.col("resultId").equalTo(alreadyLinked.col("resultId")), + "left_outer") + .flatMap( + (FlatMapFunction, Relation>) value -> { + List new_relations = new ArrayList<>(); + ResultOrganizationSet potential_update = value._1(); + Optional already_linked = Optional.ofNullable(value._2()); + List organization_list = potential_update.getOrganizationSet(); + if (already_linked.isPresent()) { + already_linked + .get() + .getOrganizationSet() + .stream() + .forEach( + rId -> { + if (organization_list.contains(rId)) { + organization_list.remove(rId); + } + }); + } + String resultId = potential_update.getResultId(); + organization_list + .stream() + .forEach( + orgId -> { + new_relations + .add( + getRelation( + orgId, + resultId, + RELATION_ORGANIZATION_RESULT_REL_CLASS, + RELATION_RESULTORGANIZATION_REL_TYPE, + RELATION_RESULTORGANIZATION_SUBREL_TYPE, + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, + PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); + new_relations + .add( + getRelation( + resultId, + orgId, + RELATION_RESULT_ORGANIZATION_REL_CLASS, + RELATION_RESULTORGANIZATION_REL_TYPE, + RELATION_RESULTORGANIZATION_SUBREL_TYPE, + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, + PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); + }); + return 
new_relations.iterator(); + }, + Encoders.bean(Relation.class)); + } - private static - org.apache.spark.sql.Dataset getPotentialRelations( - SparkSession spark, - String inputPath, - Class resultClazz, - Broadcast> - broadcast_datasourceorganizationassoc) { - org.apache.spark.sql.Dataset result = readPathEntity(spark, inputPath, resultClazz); - result.createOrReplaceTempView("result"); - createCfHbforresult(spark); + private static org.apache.spark.sql.Dataset getPotentialRelations( + SparkSession spark, + String inputPath, + Class resultClazz, + Broadcast> broadcast_datasourceorganizationassoc) { + org.apache.spark.sql.Dataset result = readPathEntity(spark, inputPath, resultClazz); + result.createOrReplaceTempView("result"); + createCfHbforresult(spark); - return organizationPropagationAssoc(spark, broadcast_datasourceorganizationassoc); - } + return organizationPropagationAssoc(spark, broadcast_datasourceorganizationassoc); + } - private static org.apache.spark.sql.Dataset - readAssocDatasourceOrganization( - SparkSession spark, String datasourcecountryorganization) { - return spark.read() - .textFile(datasourcecountryorganization) - .map( - value -> OBJECT_MAPPER.readValue(value, DatasourceOrganization.class), - Encoders.bean(DatasourceOrganization.class)); - } + private static org.apache.spark.sql.Dataset readAssocDatasourceOrganization( + SparkSession spark, String datasourcecountryorganization) { + return spark + .read() + .textFile(datasourcecountryorganization) + .map( + value -> OBJECT_MAPPER.readValue(value, DatasourceOrganization.class), + Encoders.bean(DatasourceOrganization.class)); + } - private static void createUpdateForRelationWrite( - Dataset toupdaterelation, String outputPath) { - toupdaterelation - .flatMap( - s -> { - List relationList = new ArrayList<>(); - List orgs = s.getOrganizationSet(); - String resId = s.getResultId(); - for (String org : orgs) { - relationList.add( - getRelation( - org, - resId, - RELATION_ORGANIZATION_RESULT_REL_CLASS, - RELATION_RESULTORGANIZATION_REL_TYPE, - RELATION_RESULTORGANIZATION_SUBREL_TYPE, - PROPAGATION_DATA_INFO_TYPE, - PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, - PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); - relationList.add( - getRelation( - resId, - org, - RELATION_RESULT_ORGANIZATION_REL_CLASS, - RELATION_RESULTORGANIZATION_REL_TYPE, - RELATION_RESULTORGANIZATION_SUBREL_TYPE, - PROPAGATION_DATA_INFO_TYPE, - PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, - PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); - } - return relationList.iterator(); - }, - Encoders.bean(Relation.class)) - .toJSON() - .write() - .mode(SaveMode.Overwrite) - .option("compression", "gzip") - .text(outputPath); - } + private static void createUpdateForRelationWrite( + Dataset toupdaterelation, String outputPath) { + toupdaterelation + .flatMap( + s -> { + List relationList = new ArrayList<>(); + List orgs = s.getOrganizationSet(); + String resId = s.getResultId(); + for (String org : orgs) { + relationList + .add( + getRelation( + org, + resId, + RELATION_ORGANIZATION_RESULT_REL_CLASS, + RELATION_RESULTORGANIZATION_REL_TYPE, + RELATION_RESULTORGANIZATION_SUBREL_TYPE, + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, + PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); + relationList + .add( + getRelation( + resId, + org, + RELATION_RESULT_ORGANIZATION_REL_CLASS, + RELATION_RESULTORGANIZATION_REL_TYPE, + 
RELATION_RESULTORGANIZATION_SUBREL_TYPE, + PROPAGATION_DATA_INFO_TYPE, + PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_ID, + PROPAGATION_RELATION_RESULT_ORGANIZATION_INST_REPO_CLASS_NAME)); + } + return relationList.iterator(); + }, + Encoders.bean(Relation.class)) + .toJSON() + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(outputPath); + } - private static org.apache.spark.sql.Dataset organizationPropagationAssoc( - SparkSession spark, - Broadcast> - broadcast_datasourceorganizationassoc) { - org.apache.spark.sql.Dataset datasourceorganization = - broadcast_datasourceorganizationassoc.value(); - datasourceorganization.createOrReplaceTempView("rels"); - String query = - "SELECT id resultId, collect_set(organizationId) organizationSet " - + "FROM ( SELECT id, organizationId " - + "FROM rels " - + "JOIN cfhb " - + " ON cf = datasourceId " - + "UNION ALL " - + "SELECT id , organizationId " - + "FROM rels " - + "JOIN cfhb " - + " ON hb = datasourceId ) tmp " - + "GROUP BY id"; - return spark.sql(query).as(Encoders.bean(ResultOrganizationSet.class)); - } + private static org.apache.spark.sql.Dataset organizationPropagationAssoc( + SparkSession spark, + Broadcast> broadcast_datasourceorganizationassoc) { + org.apache.spark.sql.Dataset datasourceorganization = broadcast_datasourceorganizationassoc + .value(); + datasourceorganization.createOrReplaceTempView("rels"); + String query = "SELECT id resultId, collect_set(organizationId) organizationSet " + + "FROM ( SELECT id, organizationId " + + "FROM rels " + + "JOIN cfhb " + + " ON cf = datasourceId " + + "UNION ALL " + + "SELECT id , organizationId " + + "FROM rels " + + "JOIN cfhb " + + " ON hb = datasourceId ) tmp " + + "GROUP BY id"; + return spark.sql(query).as(Encoders.bean(ResultOrganizationSet.class)); + } } diff --git a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/countrypropagation/CountryPropagationJobTest.java b/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/countrypropagation/CountryPropagationJobTest.java index 675bb3917..3bc499233 100644 --- a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/countrypropagation/CountryPropagationJobTest.java +++ b/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/countrypropagation/CountryPropagationJobTest.java @@ -1,13 +1,12 @@ + package eu.dnetlib.dhp.countrypropagation; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.schema.oaf.Country; -import eu.dnetlib.dhp.schema.oaf.Software; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; + import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -19,231 +18,252 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Country; +import eu.dnetlib.dhp.schema.oaf.Software; import scala.Tuple2; public class CountryPropagationJobTest { - private static final Logger log = LoggerFactory.getLogger(CountryPropagationJobTest.class); + private static final Logger log = LoggerFactory.getLogger(CountryPropagationJobTest.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final ClassLoader cl = 
CountryPropagationJobTest.class.getClassLoader(); + private static final ClassLoader cl = CountryPropagationJobTest.class.getClassLoader(); - private static SparkSession spark; + private static SparkSession spark; - private static Path workingDir; + private static Path workingDir; - @BeforeAll - public static void beforeAll() throws IOException { - workingDir = Files.createTempDirectory(CountryPropagationJobTest.class.getSimpleName()); - log.info("using work dir {}", workingDir); + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(CountryPropagationJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); - SparkConf conf = new SparkConf(); - conf.setAppName(CountryPropagationJobTest.class.getSimpleName()); + SparkConf conf = new SparkConf(); + conf.setAppName(CountryPropagationJobTest.class.getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - spark = - SparkSession.builder() - .appName(CountryPropagationJobTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } + spark = SparkSession + .builder() + .appName(CountryPropagationJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } - @Test - public void testCountryPropagationSoftware() throws Exception { - SparkCountryPropagationJob2.main( - new String[] { - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource("/eu/dnetlib/dhp/countrypropagation/sample/software") - .getPath(), - "-hive_metastore_uris", - "", - "-saveGraph", - "true", - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Software", - "-outputPath", - workingDir.toString() + "/software", - "-preparedInfoPath", - getClass() - .getResource("/eu/dnetlib/dhp/countrypropagation/preparedInfo") - .getPath(), - }); + @Test + public void testCountryPropagationSoftware() throws Exception { + SparkCountryPropagationJob2 + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass() + .getResource("/eu/dnetlib/dhp/countrypropagation/sample/software") + .getPath(), + "-hive_metastore_uris", + "", + "-saveGraph", + "true", + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.Software", + "-outputPath", + workingDir.toString() + "/software", + "-preparedInfoPath", + getClass() + .getResource("/eu/dnetlib/dhp/countrypropagation/preparedInfo") + .getPath(), + }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/software") - .map(item -> OBJECT_MAPPER.readValue(item, Software.class)); + JavaRDD tmp = sc + 
.textFile(workingDir.toString() + "/software") + .map(item -> OBJECT_MAPPER.readValue(item, Software.class)); - // tmp.map(s -> new Gson().toJson(s)).foreach(s -> System.out.println(s)); + // tmp.map(s -> new Gson().toJson(s)).foreach(s -> System.out.println(s)); - Assertions.assertEquals(10, tmp.count()); + Assertions.assertEquals(10, tmp.count()); - Dataset verificationDs = - spark.createDataset(tmp.rdd(), Encoders.bean(Software.class)); + Dataset verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Software.class)); - Assertions.assertEquals(6, verificationDs.filter("size(country) > 0").count()); - Assertions.assertEquals(3, verificationDs.filter("size(country) = 1").count()); - Assertions.assertEquals(3, verificationDs.filter("size(country) = 2").count()); - Assertions.assertEquals(0, verificationDs.filter("size(country) > 2").count()); + Assertions.assertEquals(6, verificationDs.filter("size(country) > 0").count()); + Assertions.assertEquals(3, verificationDs.filter("size(country) = 1").count()); + Assertions.assertEquals(3, verificationDs.filter("size(country) = 2").count()); + Assertions.assertEquals(0, verificationDs.filter("size(country) > 2").count()); - Dataset countryExploded = - verificationDs - .flatMap(row -> row.getCountry().iterator(), Encoders.bean(Country.class)) - .map(c -> c.getClassid(), Encoders.STRING()); + Dataset countryExploded = verificationDs + .flatMap(row -> row.getCountry().iterator(), Encoders.bean(Country.class)) + .map(c -> c.getClassid(), Encoders.STRING()); - Assertions.assertEquals(9, countryExploded.count()); + Assertions.assertEquals(9, countryExploded.count()); - Assertions.assertEquals(1, countryExploded.filter("value = 'FR'").count()); - Assertions.assertEquals(1, countryExploded.filter("value = 'TR'").count()); - Assertions.assertEquals(2, countryExploded.filter("value = 'IT'").count()); - Assertions.assertEquals(1, countryExploded.filter("value = 'US'").count()); - Assertions.assertEquals(1, countryExploded.filter("value = 'MX'").count()); - Assertions.assertEquals(1, countryExploded.filter("value = 'CH'").count()); - Assertions.assertEquals(2, countryExploded.filter("value = 'JP'").count()); + Assertions.assertEquals(1, countryExploded.filter("value = 'FR'").count()); + Assertions.assertEquals(1, countryExploded.filter("value = 'TR'").count()); + Assertions.assertEquals(2, countryExploded.filter("value = 'IT'").count()); + Assertions.assertEquals(1, countryExploded.filter("value = 'US'").count()); + Assertions.assertEquals(1, countryExploded.filter("value = 'MX'").count()); + Assertions.assertEquals(1, countryExploded.filter("value = 'CH'").count()); + Assertions.assertEquals(2, countryExploded.filter("value = 'JP'").count()); - Dataset> countryExplodedWithCountryclassid = - verificationDs.flatMap( - row -> { - List> prova = new ArrayList(); - List country_list = row.getCountry(); - country_list.stream() - .forEach( - c -> - prova.add( - new Tuple2<>( - row.getId(), c.getClassid()))); - return prova.iterator(); - }, - Encoders.tuple(Encoders.STRING(), Encoders.STRING())); + Dataset> countryExplodedWithCountryclassid = verificationDs + .flatMap( + row -> { + List> prova = new ArrayList(); + List country_list = row.getCountry(); + country_list + .stream() + .forEach( + c -> prova + .add( + new Tuple2<>( + row.getId(), c.getClassid()))); + return prova.iterator(); + }, + Encoders.tuple(Encoders.STRING(), Encoders.STRING())); - Assertions.assertEquals(9, countryExplodedWithCountryclassid.count()); + Assertions.assertEquals(9, 
countryExplodedWithCountryclassid.count()); - countryExplodedWithCountryclassid.show(false); - Assertions.assertEquals( - 1, - countryExplodedWithCountryclassid - .filter( - "_1 = '50|od______1582::6e7a9b21a2feef45673890432af34244' and _2 = 'FR' ") - .count()); - Assertions.assertEquals( - 1, - countryExplodedWithCountryclassid - .filter( - "_1 = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523' and _2 = 'TR' ") - .count()); - Assertions.assertEquals( - 2, - countryExplodedWithCountryclassid - .filter( - "_1 = '50|od______1106::2b7ca9726230be8e862be224fd463ac4' and (_2 = 'IT' or _2 = 'MX') ") - .count()); - Assertions.assertEquals( - 2, - countryExplodedWithCountryclassid - .filter( - "_1 = '50|od_______935::46a0ad9964171c3dd13373f5427b9a1c' and (_2 = 'IT' or _2 = 'US') ") - .count()); - Assertions.assertEquals( - 1, - countryExplodedWithCountryclassid - .filter( - "_1 = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and _2 = 'JP'") - .count()); - Assertions.assertEquals( - 2, - countryExplodedWithCountryclassid - .filter( - "_1 = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6' and (_2 = 'CH' or _2 = 'JP') ") - .count()); + countryExplodedWithCountryclassid.show(false); + Assertions + .assertEquals( + 1, + countryExplodedWithCountryclassid + .filter( + "_1 = '50|od______1582::6e7a9b21a2feef45673890432af34244' and _2 = 'FR' ") + .count()); + Assertions + .assertEquals( + 1, + countryExplodedWithCountryclassid + .filter( + "_1 = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523' and _2 = 'TR' ") + .count()); + Assertions + .assertEquals( + 2, + countryExplodedWithCountryclassid + .filter( + "_1 = '50|od______1106::2b7ca9726230be8e862be224fd463ac4' and (_2 = 'IT' or _2 = 'MX') ") + .count()); + Assertions + .assertEquals( + 2, + countryExplodedWithCountryclassid + .filter( + "_1 = '50|od_______935::46a0ad9964171c3dd13373f5427b9a1c' and (_2 = 'IT' or _2 = 'US') ") + .count()); + Assertions + .assertEquals( + 1, + countryExplodedWithCountryclassid + .filter( + "_1 = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and _2 = 'JP'") + .count()); + Assertions + .assertEquals( + 2, + countryExplodedWithCountryclassid + .filter( + "_1 = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6' and (_2 = 'CH' or _2 = 'JP') ") + .count()); - Dataset> countryExplodedWithCountryclassname = - verificationDs.flatMap( - row -> { - List> prova = new ArrayList(); - List country_list = row.getCountry(); - country_list.stream() - .forEach( - c -> - prova.add( - new Tuple2<>( - row.getId(), - c.getClassname()))); - return prova.iterator(); - }, - Encoders.tuple(Encoders.STRING(), Encoders.STRING())); + Dataset> countryExplodedWithCountryclassname = verificationDs + .flatMap( + row -> { + List> prova = new ArrayList(); + List country_list = row.getCountry(); + country_list + .stream() + .forEach( + c -> prova + .add( + new Tuple2<>( + row.getId(), + c.getClassname()))); + return prova.iterator(); + }, + Encoders.tuple(Encoders.STRING(), Encoders.STRING())); - countryExplodedWithCountryclassname.show(false); - Assertions.assertEquals( - 1, - countryExplodedWithCountryclassname - .filter( - "_1 = '50|od______1582::6e7a9b21a2feef45673890432af34244' and _2 = 'France' ") - .count()); - Assertions.assertEquals( - 1, - countryExplodedWithCountryclassname - .filter( - "_1 = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523' and _2 = 'Turkey' ") - .count()); - Assertions.assertEquals( - 2, - countryExplodedWithCountryclassname - .filter( - "_1 = '50|od______1106::2b7ca9726230be8e862be224fd463ac4' and (_2 = 
'Italy' or _2 = 'Mexico') ") - .count()); - Assertions.assertEquals( - 2, - countryExplodedWithCountryclassname - .filter( - "_1 = '50|od_______935::46a0ad9964171c3dd13373f5427b9a1c' and (_2 = 'Italy' or _2 = 'United States') ") - .count()); - Assertions.assertEquals( - 1, - countryExplodedWithCountryclassname - .filter( - "_1 = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and _2 = 'Japan' ") - .count()); - Assertions.assertEquals( - 2, - countryExplodedWithCountryclassname - .filter( - "_1 = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6' and (_2 = 'Switzerland' or _2 = 'Japan') ") - .count()); + countryExplodedWithCountryclassname.show(false); + Assertions + .assertEquals( + 1, + countryExplodedWithCountryclassname + .filter( + "_1 = '50|od______1582::6e7a9b21a2feef45673890432af34244' and _2 = 'France' ") + .count()); + Assertions + .assertEquals( + 1, + countryExplodedWithCountryclassname + .filter( + "_1 = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523' and _2 = 'Turkey' ") + .count()); + Assertions + .assertEquals( + 2, + countryExplodedWithCountryclassname + .filter( + "_1 = '50|od______1106::2b7ca9726230be8e862be224fd463ac4' and (_2 = 'Italy' or _2 = 'Mexico') ") + .count()); + Assertions + .assertEquals( + 2, + countryExplodedWithCountryclassname + .filter( + "_1 = '50|od_______935::46a0ad9964171c3dd13373f5427b9a1c' and (_2 = 'Italy' or _2 = 'United States') ") + .count()); + Assertions + .assertEquals( + 1, + countryExplodedWithCountryclassname + .filter( + "_1 = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and _2 = 'Japan' ") + .count()); + Assertions + .assertEquals( + 2, + countryExplodedWithCountryclassname + .filter( + "_1 = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6' and (_2 = 'Switzerland' or _2 = 'Japan') ") + .count()); - Dataset> countryExplodedWithCountryProvenance = - verificationDs.flatMap( - row -> { - List> prova = new ArrayList(); - List country_list = row.getCountry(); - country_list.stream() - .forEach( - c -> - prova.add( - new Tuple2<>( - row.getId(), - c.getDataInfo() - .getInferenceprovenance()))); - return prova.iterator(); - }, - Encoders.tuple(Encoders.STRING(), Encoders.STRING())); + Dataset> countryExplodedWithCountryProvenance = verificationDs + .flatMap( + row -> { + List> prova = new ArrayList(); + List country_list = row.getCountry(); + country_list + .stream() + .forEach( + c -> prova + .add( + new Tuple2<>( + row.getId(), + c + .getDataInfo() + .getInferenceprovenance()))); + return prova.iterator(); + }, + Encoders.tuple(Encoders.STRING(), Encoders.STRING())); - Assertions.assertEquals( - 7, countryExplodedWithCountryProvenance.filter("_2 = 'propagation'").count()); - } + Assertions + .assertEquals( + 7, countryExplodedWithCountryProvenance.filter("_2 = 'propagation'").count()); + } } diff --git a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java b/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java index 4292f3b05..d18acd550 100644 --- a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java +++ b/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/OrcidPropagationJobTest.java @@ -1,10 +1,10 @@ + package eu.dnetlib.dhp.orcidtoresultfromsemrel; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.schema.oaf.Dataset; import java.io.IOException; import java.nio.file.Files; import 
java.nio.file.Path; + import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -19,234 +19,242 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Dataset; + public class OrcidPropagationJobTest { - private static final Logger log = LoggerFactory.getLogger(OrcidPropagationJobTest.class); + private static final Logger log = LoggerFactory.getLogger(OrcidPropagationJobTest.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final ClassLoader cl = OrcidPropagationJobTest.class.getClassLoader(); + private static final ClassLoader cl = OrcidPropagationJobTest.class.getClassLoader(); - private static SparkSession spark; + private static SparkSession spark; - private static Path workingDir; + private static Path workingDir; - @BeforeAll - public static void beforeAll() throws IOException { - workingDir = Files.createTempDirectory(OrcidPropagationJobTest.class.getSimpleName()); - log.info("using work dir {}", workingDir); + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(OrcidPropagationJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); - SparkConf conf = new SparkConf(); - conf.setAppName(OrcidPropagationJobTest.class.getSimpleName()); + SparkConf conf = new SparkConf(); + conf.setAppName(OrcidPropagationJobTest.class.getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - spark = - SparkSession.builder() - .appName(OrcidPropagationJobTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } + spark = SparkSession + .builder() + .appName(OrcidPropagationJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } - @Test - public void noUpdateTest() throws Exception { - SparkOrcidToResultFromSemRelJob3.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/noupdate") - .getPath(), - "-hive_metastore_uris", - "", - "-saveGraph", - "true", - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Dataset", - "-outputPath", - workingDir.toString() + "/dataset", - "-possibleUpdatesPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc") - .getPath() - }); + @Test + public void noUpdateTest() throws Exception { + SparkOrcidToResultFromSemRelJob3 + .main( + 
new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass() + .getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/noupdate") + .getPath(), + "-hive_metastore_uris", + "", + "-saveGraph", + "true", + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", + workingDir.toString() + "/dataset", + "-possibleUpdatesPath", + getClass() + .getResource( + "/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc") + .getPath() + }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/dataset") - .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); - // tmp.map(s -> new Gson().toJson(s)).foreach(s -> System.out.println(s)); + // tmp.map(s -> new Gson().toJson(s)).foreach(s -> System.out.println(s)); - Assertions.assertEquals(10, tmp.count()); + Assertions.assertEquals(10, tmp.count()); - org.apache.spark.sql.Dataset verificationDataset = - spark.createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); - verificationDataset.createOrReplaceTempView("dataset"); + verificationDataset.createOrReplaceTempView("dataset"); - String query = - "select id " - + "from dataset " - + "lateral view explode(author) a as MyT " - + "lateral view explode(MyT.pid) p as MyP " - + "where MyP.datainfo.inferenceprovenance = 'propagation'"; + String query = "select id " + + "from dataset " + + "lateral view explode(author) a as MyT " + + "lateral view explode(MyT.pid) p as MyP " + + "where MyP.datainfo.inferenceprovenance = 'propagation'"; - Assertions.assertEquals(0, spark.sql(query).count()); - } + Assertions.assertEquals(0, spark.sql(query).count()); + } - @Test - public void oneUpdateTest() throws Exception { - SparkOrcidToResultFromSemRelJob3.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/oneupdate") - .getPath(), - "-hive_metastore_uris", - "", - "-saveGraph", - "true", - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Dataset", - "-outputPath", - workingDir.toString() + "/dataset", - "-possibleUpdatesPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc") - .getPath() - }); + @Test + public void oneUpdateTest() throws Exception { + SparkOrcidToResultFromSemRelJob3 + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass() + .getResource("/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/oneupdate") + .getPath(), + "-hive_metastore_uris", + "", + "-saveGraph", + "true", + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", + workingDir.toString() + "/dataset", + "-possibleUpdatesPath", + getClass() + .getResource( + "/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc") + .getPath() + }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - JavaRDD tmp = - 
sc.textFile(workingDir.toString() + "/dataset") - .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); - // tmp.map(s -> new Gson().toJson(s)).foreach(s -> System.out.println(s)); + // tmp.map(s -> new Gson().toJson(s)).foreach(s -> System.out.println(s)); - Assertions.assertEquals(10, tmp.count()); + Assertions.assertEquals(10, tmp.count()); - org.apache.spark.sql.Dataset verificationDataset = - spark.createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); - verificationDataset.createOrReplaceTempView("dataset"); + verificationDataset.createOrReplaceTempView("dataset"); - String query = - "select id, MyT.name name, MyT.surname surname, MyP.value pid, MyP.qualifier.classid pidType " - + "from dataset " - + "lateral view explode(author) a as MyT " - + "lateral view explode(MyT.pid) p as MyP " - + "where MyP.datainfo.inferenceprovenance = 'propagation'"; + String query = "select id, MyT.name name, MyT.surname surname, MyP.value pid, MyP.qualifier.classid pidType " + + "from dataset " + + "lateral view explode(author) a as MyT " + + "lateral view explode(MyT.pid) p as MyP " + + "where MyP.datainfo.inferenceprovenance = 'propagation'"; - org.apache.spark.sql.Dataset propagatedAuthors = spark.sql(query); + org.apache.spark.sql.Dataset propagatedAuthors = spark.sql(query); - Assertions.assertEquals(1, propagatedAuthors.count()); + Assertions.assertEquals(1, propagatedAuthors.count()); - Assertions.assertEquals( - 1, - propagatedAuthors - .filter( - "id = '50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e' " - + "and name = 'Vajinder' and surname = 'Kumar' and pidType = 'ORCID'") - .count()); + Assertions + .assertEquals( + 1, + propagatedAuthors + .filter( + "id = '50|dedup_wf_001::95b033c0c3961f6a1cdcd41a99a9632e' " + + "and name = 'Vajinder' and surname = 'Kumar' and pidType = 'ORCID'") + .count()); - Assertions.assertEquals(1, propagatedAuthors.filter("pid = '0000-0002-8825-3517'").count()); - } + Assertions.assertEquals(1, propagatedAuthors.filter("pid = '0000-0002-8825-3517'").count()); + } - @Test - public void twoUpdatesTest() throws Exception { - SparkOrcidToResultFromSemRelJob3.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates") - .getPath(), - "-hive_metastore_uris", - "", - "-saveGraph", - "true", - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Dataset", - "-outputPath", - workingDir.toString() + "/dataset", - "-possibleUpdatesPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc") - .getPath() - }); + @Test + public void twoUpdatesTest() throws Exception { + SparkOrcidToResultFromSemRelJob3 + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass() + .getResource( + "/eu/dnetlib/dhp/orcidtoresultfromsemrel/sample/twoupdates") + .getPath(), + "-hive_metastore_uris", + "", + "-saveGraph", + "true", + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", + workingDir.toString() + "/dataset", + "-possibleUpdatesPath", + getClass() + .getResource( + 
"/eu/dnetlib/dhp/orcidtoresultfromsemrel/preparedInfo/mergedOrcidAssoc") + .getPath() + }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/dataset") - .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); - Assertions.assertEquals(10, tmp.count()); + Assertions.assertEquals(10, tmp.count()); - org.apache.spark.sql.Dataset verificationDataset = - spark.createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); - verificationDataset.createOrReplaceTempView("dataset"); + verificationDataset.createOrReplaceTempView("dataset"); - String query = - "select id, MyT.name name, MyT.surname surname, MyP.value pid, MyP.qualifier.classid pidType " - + "from dataset " - + "lateral view explode(author) a as MyT " - + "lateral view explode(MyT.pid) p as MyP " - + "where MyP.datainfo.inferenceprovenance = 'propagation'"; + String query = "select id, MyT.name name, MyT.surname surname, MyP.value pid, MyP.qualifier.classid pidType " + + "from dataset " + + "lateral view explode(author) a as MyT " + + "lateral view explode(MyT.pid) p as MyP " + + "where MyP.datainfo.inferenceprovenance = 'propagation'"; - org.apache.spark.sql.Dataset propagatedAuthors = spark.sql(query); + org.apache.spark.sql.Dataset propagatedAuthors = spark.sql(query); - Assertions.assertEquals(2, propagatedAuthors.count()); + Assertions.assertEquals(2, propagatedAuthors.count()); - Assertions.assertEquals( - 1, propagatedAuthors.filter("name = 'Marc' and surname = 'Schmidtmann'").count()); - Assertions.assertEquals( - 1, propagatedAuthors.filter("name = 'Ruediger' and surname = 'Beckhaus'").count()); + Assertions + .assertEquals( + 1, propagatedAuthors.filter("name = 'Marc' and surname = 'Schmidtmann'").count()); + Assertions + .assertEquals( + 1, propagatedAuthors.filter("name = 'Ruediger' and surname = 'Beckhaus'").count()); - query = - "select id, MyT.name name, MyT.surname surname, MyP.value pid ,MyP.qualifier.classid pidType " - + "from dataset " - + "lateral view explode(author) a as MyT " - + "lateral view explode(MyT.pid) p as MyP "; + query = "select id, MyT.name name, MyT.surname surname, MyP.value pid ,MyP.qualifier.classid pidType " + + "from dataset " + + "lateral view explode(author) a as MyT " + + "lateral view explode(MyT.pid) p as MyP "; - org.apache.spark.sql.Dataset authorsExplodedPids = spark.sql(query); + org.apache.spark.sql.Dataset authorsExplodedPids = spark.sql(query); - Assertions.assertEquals( - 2, authorsExplodedPids.filter("name = 'Marc' and surname = 'Schmidtmann'").count()); - Assertions.assertEquals( - 1, - authorsExplodedPids - .filter( - "name = 'Marc' and surname = 'Schmidtmann' and pidType = 'MAG Identifier'") - .count()); - } + Assertions + .assertEquals( + 2, authorsExplodedPids.filter("name = 'Marc' and surname = 'Schmidtmann'").count()); + Assertions + .assertEquals( + 1, + authorsExplodedPids + .filter( + "name = 'Marc' and surname = 'Schmidtmann' and pidType = 'MAG Identifier'") + .count()); + } } diff --git a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/projecttoresult/ProjectPropagationJobTest.java 
b/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/projecttoresult/ProjectPropagationJobTest.java index 7a742e4db..ac28e9d4b 100644 --- a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/projecttoresult/ProjectPropagationJobTest.java +++ b/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/projecttoresult/ProjectPropagationJobTest.java @@ -1,10 +1,10 @@ + package eu.dnetlib.dhp.projecttoresult; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.schema.oaf.Relation; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; + import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -19,244 +19,252 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Relation; + public class ProjectPropagationJobTest { - private static final Logger log = LoggerFactory.getLogger(ProjectPropagationJobTest.class); + private static final Logger log = LoggerFactory.getLogger(ProjectPropagationJobTest.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final ClassLoader cl = ProjectPropagationJobTest.class.getClassLoader(); + private static final ClassLoader cl = ProjectPropagationJobTest.class.getClassLoader(); - private static SparkSession spark; + private static SparkSession spark; - private static Path workingDir; + private static Path workingDir; - @BeforeAll - public static void beforeAll() throws IOException { - workingDir = Files.createTempDirectory(ProjectPropagationJobTest.class.getSimpleName()); - log.info("using work dir {}", workingDir); + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(ProjectPropagationJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); - SparkConf conf = new SparkConf(); - conf.setAppName(ProjectPropagationJobTest.class.getSimpleName()); + SparkConf conf = new SparkConf(); + conf.setAppName(ProjectPropagationJobTest.class.getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - spark = - SparkSession.builder() - .appName(ProjectPropagationJobTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } + spark = SparkSession + .builder() + .appName(ProjectPropagationJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } - /** - * There are no new relations to be added. 
All the possible relations have already been linked - * with the project in the graph - * - * @throws Exception - */ - @Test - public void NoUpdateTest() throws Exception { + /** + * There are no new relations to be added. All the possible relations have already been linked with the project in + * the graph + * + * @throws Exception + */ + @Test + public void NoUpdateTest() throws Exception { - SparkResultToProjectThroughSemRelJob3.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - // "-sourcePath", - // getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/relation").getPath(), - "-hive_metastore_uris", - "", - "-saveGraph", - "true", - "-outputPath", - workingDir.toString() + "/relation", - "-potentialUpdatePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates") - .getPath(), - "-alreadyLinkedPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked") - .getPath(), - }); + SparkResultToProjectThroughSemRelJob3 + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + // "-sourcePath", + // getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/relation").getPath(), + "-hive_metastore_uris", + "", + "-saveGraph", + "true", + "-outputPath", + workingDir.toString() + "/relation", + "-potentialUpdatePath", + getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/noupdates/potentialUpdates") + .getPath(), + "-alreadyLinkedPath", + getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked") + .getPath(), + }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/relation") - .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); - Assertions.assertEquals(0, tmp.count()); - } + Assertions.assertEquals(0, tmp.count()); + } - /** - * All the possible updates will produce a new relation. No relations are already linked in the - * grpha - * - * @throws Exception - */ - @Test - public void UpdateTenTest() throws Exception { - SparkResultToProjectThroughSemRelJob3.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - // "-sourcePath", - // getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/relation").getPath(), - "-hive_metastore_uris", - "", - "-saveGraph", - "true", - "-outputPath", - workingDir.toString() + "/relation", - "-potentialUpdatePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates") - .getPath(), - "-alreadyLinkedPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked") - .getPath(), - }); + /** + * All the possible updates will produce a new relation. 
No relations are already linked in the grpha + * + * @throws Exception + */ + @Test + public void UpdateTenTest() throws Exception { + SparkResultToProjectThroughSemRelJob3 + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + // "-sourcePath", + // getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/relation").getPath(), + "-hive_metastore_uris", + "", + "-saveGraph", + "true", + "-outputPath", + workingDir.toString() + "/relation", + "-potentialUpdatePath", + getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/tenupdates/potentialUpdates") + .getPath(), + "-alreadyLinkedPath", + getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked") + .getPath(), + }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/relation") - .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); - // got 20 new relations because "produces" and "isProducedBy" are added - Assertions.assertEquals(10, tmp.count()); + // got 20 new relations because "produces" and "isProducedBy" are added + Assertions.assertEquals(10, tmp.count()); - Dataset verificationDs = - spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); + Dataset verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); - Assertions.assertEquals(5, verificationDs.filter("relClass = 'produces'").count()); - Assertions.assertEquals(5, verificationDs.filter("relClass = 'isProducedBy'").count()); + Assertions.assertEquals(5, verificationDs.filter("relClass = 'produces'").count()); + Assertions.assertEquals(5, verificationDs.filter("relClass = 'isProducedBy'").count()); - Assertions.assertEquals( - 5, - verificationDs - .filter( - r -> - r.getSource().substring(0, 2).equals("50") - && r.getTarget().substring(0, 2).equals("40") - && r.getRelClass().equals("isProducedBy")) - .count()); - Assertions.assertEquals( - 5, - verificationDs - .filter( - r -> - r.getSource().substring(0, 2).equals("40") - && r.getTarget().substring(0, 2).equals("50") - && r.getRelClass().equals("produces")) - .count()); + Assertions + .assertEquals( + 5, + verificationDs + .filter( + r -> r.getSource().substring(0, 2).equals("50") + && r.getTarget().substring(0, 2).equals("40") + && r.getRelClass().equals("isProducedBy")) + .count()); + Assertions + .assertEquals( + 5, + verificationDs + .filter( + r -> r.getSource().substring(0, 2).equals("40") + && r.getTarget().substring(0, 2).equals("50") + && r.getRelClass().equals("produces")) + .count()); - verificationDs.createOrReplaceTempView("temporary"); + verificationDs.createOrReplaceTempView("temporary"); - Assertions.assertEquals( - 10, - spark.sql( - "Select * from temporary where datainfo.inferenceprovenance = 'propagation'") - .count()); - } + Assertions + .assertEquals( + 10, + spark + .sql( + "Select * from temporary where datainfo.inferenceprovenance = 'propagation'") + .count()); + } - /** - * One of the relations in the possible updates is already linked to the project in the graph. - * All the others are not. 
There will be 9 new associations leading to 18 new relations - * - * @throws Exception - */ - @Test - public void UpdateMixTest() throws Exception { - SparkResultToProjectThroughSemRelJob3.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - // "-sourcePath", - // getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/relation").getPath(), - "-hive_metastore_uris", - "", - "-saveGraph", - "true", - "-outputPath", - workingDir.toString() + "/relation", - "-potentialUpdatePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates") - .getPath(), - "-alreadyLinkedPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked") - .getPath(), - }); + /** + * One of the relations in the possible updates is already linked to the project in the graph. All the others are + * not. There will be 9 new associations leading to 18 new relations + * + * @throws Exception + */ + @Test + public void UpdateMixTest() throws Exception { + SparkResultToProjectThroughSemRelJob3 + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + // "-sourcePath", + // getClass().getResource("/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/relation").getPath(), + "-hive_metastore_uris", + "", + "-saveGraph", + "true", + "-outputPath", + workingDir.toString() + "/relation", + "-potentialUpdatePath", + getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/updatesmixed/potentialUpdates") + .getPath(), + "-alreadyLinkedPath", + getClass() + .getResource( + "/eu/dnetlib/dhp/projecttoresult/preparedInfo/alreadyLinked") + .getPath(), + }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/relation") - .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); - // JavaRDD tmp = sc.textFile("/tmp/relation") - // .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + // JavaRDD tmp = sc.textFile("/tmp/relation") + // .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); - // got 20 new relations because "produces" and "isProducedBy" are added - Assertions.assertEquals(8, tmp.count()); + // got 20 new relations because "produces" and "isProducedBy" are added + Assertions.assertEquals(8, tmp.count()); - Dataset verificationDs = - spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); + Dataset verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); - Assertions.assertEquals(4, verificationDs.filter("relClass = 'produces'").count()); - Assertions.assertEquals(4, verificationDs.filter("relClass = 'isProducedBy'").count()); + Assertions.assertEquals(4, verificationDs.filter("relClass = 'produces'").count()); + Assertions.assertEquals(4, verificationDs.filter("relClass = 'isProducedBy'").count()); - Assertions.assertEquals( - 4, - verificationDs - .filter( - r -> - r.getSource().substring(0, 2).equals("50") - && r.getTarget().substring(0, 2).equals("40") - && r.getRelClass().equals("isProducedBy")) - .count()); - Assertions.assertEquals( - 4, - verificationDs - .filter( - r -> - r.getSource().substring(0, 2).equals("40") - && 
r.getTarget().substring(0, 2).equals("50") - && r.getRelClass().equals("produces")) - .count()); + Assertions + .assertEquals( + 4, + verificationDs + .filter( + r -> r.getSource().substring(0, 2).equals("50") + && r.getTarget().substring(0, 2).equals("40") + && r.getRelClass().equals("isProducedBy")) + .count()); + Assertions + .assertEquals( + 4, + verificationDs + .filter( + r -> r.getSource().substring(0, 2).equals("40") + && r.getTarget().substring(0, 2).equals("50") + && r.getRelClass().equals("produces")) + .count()); - verificationDs.createOrReplaceTempView("temporary"); + verificationDs.createOrReplaceTempView("temporary"); - Assertions.assertEquals( - 8, - spark.sql( - "Select * from temporary where datainfo.inferenceprovenance = 'propagation'") - .count()); - } + Assertions + .assertEquals( + 8, + spark + .sql( + "Select * from temporary where datainfo.inferenceprovenance = 'propagation'") + .count()); + } } diff --git a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultToCommunityJobTest.java b/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultToCommunityJobTest.java index 78b311bc1..0dd8c6bd4 100644 --- a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultToCommunityJobTest.java +++ b/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttocommunityfromorganization/ResultToCommunityJobTest.java @@ -1,13 +1,12 @@ + package eu.dnetlib.dhp.resulttocommunityfromorganization; import static org.apache.spark.sql.functions.*; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.orcidtoresultfromsemrel.OrcidPropagationJobTest; -import eu.dnetlib.dhp.schema.oaf.Dataset; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; + import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -22,295 +21,320 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.orcidtoresultfromsemrel.OrcidPropagationJobTest; +import eu.dnetlib.dhp.schema.oaf.Dataset; + public class ResultToCommunityJobTest { - private static final Logger log = LoggerFactory.getLogger(ResultToCommunityJobTest.class); + private static final Logger log = LoggerFactory.getLogger(ResultToCommunityJobTest.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final ClassLoader cl = ResultToCommunityJobTest.class.getClassLoader(); + private static final ClassLoader cl = ResultToCommunityJobTest.class.getClassLoader(); - private static SparkSession spark; + private static SparkSession spark; - private static Path workingDir; + private static Path workingDir; - @BeforeAll - public static void beforeAll() throws IOException { - workingDir = Files.createTempDirectory(ResultToCommunityJobTest.class.getSimpleName()); - log.info("using work dir {}", workingDir); + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files.createTempDirectory(ResultToCommunityJobTest.class.getSimpleName()); + log.info("using work dir {}", workingDir); - SparkConf conf = new SparkConf(); - conf.setAppName(ResultToCommunityJobTest.class.getSimpleName()); + SparkConf conf = new SparkConf(); + 
conf.setAppName(ResultToCommunityJobTest.class.getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - spark = - SparkSession.builder() - .appName(OrcidPropagationJobTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } + spark = SparkSession + .builder() + .appName(OrcidPropagationJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } - @Test - public void test1() throws Exception { - SparkResultToCommunityFromOrganizationJob2.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource("/eu/dnetlib/dhp/resulttocommunityfromorganization/sample") - .getPath(), - "-hive_metastore_uris", - "", - "-saveGraph", - "true", - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Dataset", - "-outputPath", - workingDir.toString() + "/dataset", - "-preparedInfoPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttocommunityfromorganization/preparedInfo") - .getPath() - }); + @Test + public void test1() throws Exception { + SparkResultToCommunityFromOrganizationJob2 + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass() + .getResource("/eu/dnetlib/dhp/resulttocommunityfromorganization/sample") + .getPath(), + "-hive_metastore_uris", + "", + "-saveGraph", + "true", + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", + workingDir.toString() + "/dataset", + "-preparedInfoPath", + getClass() + .getResource( + "/eu/dnetlib/dhp/resulttocommunityfromorganization/preparedInfo") + .getPath() + }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/dataset") - .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); - Assertions.assertEquals(10, tmp.count()); - org.apache.spark.sql.Dataset verificationDataset = - spark.createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); - verificationDataset.createOrReplaceTempView("dataset"); + verificationDataset.createOrReplaceTempView("dataset"); - String query = - "select id, MyT.id community " - + "from dataset " - + "lateral view explode(context) c as MyT " - + "lateral view explode(MyT.datainfo) d as MyD " - + "where 
MyD.inferenceprovenance = 'propagation'"; + String query = "select id, MyT.id community " + + "from dataset " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'propagation'"; - org.apache.spark.sql.Dataset resultExplodedProvenance = spark.sql(query); - Assertions.assertEquals(5, resultExplodedProvenance.count()); - Assertions.assertEquals( - 0, - resultExplodedProvenance - .filter("id = '50|dedup_wf_001::afaf128022d29872c4dad402b2db04fe'") - .count()); - Assertions.assertEquals( - 1, - resultExplodedProvenance - .filter("id = '50|dedup_wf_001::3f62cfc27024d564ea86760c494ba93b'") - .count()); - Assertions.assertEquals( - "beopen", - resultExplodedProvenance - .select("community") - .where( - resultExplodedProvenance - .col("id") - .equalTo( - "50|dedup_wf_001::3f62cfc27024d564ea86760c494ba93b")) - .collectAsList() - .get(0) - .getString(0)); + org.apache.spark.sql.Dataset resultExplodedProvenance = spark.sql(query); + Assertions.assertEquals(5, resultExplodedProvenance.count()); + Assertions + .assertEquals( + 0, + resultExplodedProvenance + .filter("id = '50|dedup_wf_001::afaf128022d29872c4dad402b2db04fe'") + .count()); + Assertions + .assertEquals( + 1, + resultExplodedProvenance + .filter("id = '50|dedup_wf_001::3f62cfc27024d564ea86760c494ba93b'") + .count()); + Assertions + .assertEquals( + "beopen", + resultExplodedProvenance + .select("community") + .where( + resultExplodedProvenance + .col("id") + .equalTo( + "50|dedup_wf_001::3f62cfc27024d564ea86760c494ba93b")) + .collectAsList() + .get(0) + .getString(0)); - Assertions.assertEquals( - 2, - resultExplodedProvenance - .filter("id = '50|od________18::8887b1df8b563c4ea851eb9c882c9d7b'") - .count()); - Assertions.assertEquals( - "mes", - resultExplodedProvenance - .select("community") - .where( - resultExplodedProvenance - .col("id") - .equalTo( - "50|od________18::8887b1df8b563c4ea851eb9c882c9d7b")) - .sort(desc("community")) - .collectAsList() - .get(0) - .getString(0)); - Assertions.assertEquals( - "euromarine", - resultExplodedProvenance - .select("community") - .where( - resultExplodedProvenance - .col("id") - .equalTo( - "50|od________18::8887b1df8b563c4ea851eb9c882c9d7b")) - .sort(desc("community")) - .collectAsList() - .get(1) - .getString(0)); + Assertions + .assertEquals( + 2, + resultExplodedProvenance + .filter("id = '50|od________18::8887b1df8b563c4ea851eb9c882c9d7b'") + .count()); + Assertions + .assertEquals( + "mes", + resultExplodedProvenance + .select("community") + .where( + resultExplodedProvenance + .col("id") + .equalTo( + "50|od________18::8887b1df8b563c4ea851eb9c882c9d7b")) + .sort(desc("community")) + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "euromarine", + resultExplodedProvenance + .select("community") + .where( + resultExplodedProvenance + .col("id") + .equalTo( + "50|od________18::8887b1df8b563c4ea851eb9c882c9d7b")) + .sort(desc("community")) + .collectAsList() + .get(1) + .getString(0)); - Assertions.assertEquals( - 1, - resultExplodedProvenance - .filter("id = '50|doajarticles::8d817039a63710fcf97e30f14662c6c8'") - .count()); - Assertions.assertEquals( - "mes", - resultExplodedProvenance - .select("community") - .where( - resultExplodedProvenance - .col("id") - .equalTo( - "50|doajarticles::8d817039a63710fcf97e30f14662c6c8")) - .sort(desc("community")) - .collectAsList() - .get(0) - .getString(0)); + Assertions + .assertEquals( + 1, + resultExplodedProvenance + .filter("id = 
'50|doajarticles::8d817039a63710fcf97e30f14662c6c8'") + .count()); + Assertions + .assertEquals( + "mes", + resultExplodedProvenance + .select("community") + .where( + resultExplodedProvenance + .col("id") + .equalTo( + "50|doajarticles::8d817039a63710fcf97e30f14662c6c8")) + .sort(desc("community")) + .collectAsList() + .get(0) + .getString(0)); - Assertions.assertEquals( - 1, - resultExplodedProvenance - .filter("id = '50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6'") - .count()); - Assertions.assertEquals( - "mes", - resultExplodedProvenance - .select("community") - .where( - resultExplodedProvenance - .col("id") - .equalTo( - "50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6")) - .sort(desc("community")) - .collectAsList() - .get(0) - .getString(0)); + Assertions + .assertEquals( + 1, + resultExplodedProvenance + .filter("id = '50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6'") + .count()); + Assertions + .assertEquals( + "mes", + resultExplodedProvenance + .select("community") + .where( + resultExplodedProvenance + .col("id") + .equalTo( + "50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6")) + .sort(desc("community")) + .collectAsList() + .get(0) + .getString(0)); - /* - {"communityList":["euromarine","mes"],"resultId":"50|doajarticles::8d817039a63710fcf97e30f14662c6c8"} "context" ["id": euromarine] updates = 1 - {"communityList":["euromarine","mes"],"resultId":"50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6"} context = [ni, euromarine] updates = 1 + /* + * {"communityList":["euromarine","mes"],"resultId":"50|doajarticles::8d817039a63710fcf97e30f14662c6c8"} + * "context" ["id": euromarine] updates = 1 + * {"communityList":["euromarine","mes"],"resultId":"50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6"} context + * = [ni, euromarine] updates = 1 + */ - */ + query = "select id, MyT.id community " + + "from dataset " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD "; - query = - "select id, MyT.id community " - + "from dataset " - + "lateral view explode(context) c as MyT " - + "lateral view explode(MyT.datainfo) d as MyD "; + org.apache.spark.sql.Dataset resultCommunityId = spark.sql(query); - org.apache.spark.sql.Dataset resultCommunityId = spark.sql(query); + Assertions.assertEquals(10, resultCommunityId.count()); - Assertions.assertEquals(10, resultCommunityId.count()); + Assertions + .assertEquals( + 1, + resultCommunityId + .filter("id = '50|dedup_wf_001::afaf128022d29872c4dad402b2db04fe'") + .count()); + Assertions + .assertEquals( + "beopen", + resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|dedup_wf_001::afaf128022d29872c4dad402b2db04fe")) + .collectAsList() + .get(0) + .getString(0)); - Assertions.assertEquals( - 1, - resultCommunityId - .filter("id = '50|dedup_wf_001::afaf128022d29872c4dad402b2db04fe'") - .count()); - Assertions.assertEquals( - "beopen", - resultCommunityId - .select("community") - .where( - resultCommunityId - .col("id") - .equalTo( - "50|dedup_wf_001::afaf128022d29872c4dad402b2db04fe")) - .collectAsList() - .get(0) - .getString(0)); + Assertions + .assertEquals( + 1, + resultCommunityId + .filter("id = '50|dedup_wf_001::3f62cfc27024d564ea86760c494ba93b'") + .count()); - Assertions.assertEquals( - 1, - resultCommunityId - .filter("id = '50|dedup_wf_001::3f62cfc27024d564ea86760c494ba93b'") - .count()); + Assertions + .assertEquals( + 3, + resultCommunityId + .filter("id = '50|od________18::8887b1df8b563c4ea851eb9c882c9d7b'") + .count()); + 
Assertions + .assertEquals( + "beopen", + resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|od________18::8887b1df8b563c4ea851eb9c882c9d7b")) + .sort(desc("community")) + .collectAsList() + .get(2) + .getString(0)); - Assertions.assertEquals( - 3, - resultCommunityId - .filter("id = '50|od________18::8887b1df8b563c4ea851eb9c882c9d7b'") - .count()); - Assertions.assertEquals( - "beopen", - resultCommunityId - .select("community") - .where( - resultCommunityId - .col("id") - .equalTo( - "50|od________18::8887b1df8b563c4ea851eb9c882c9d7b")) - .sort(desc("community")) - .collectAsList() - .get(2) - .getString(0)); + Assertions + .assertEquals( + 2, + resultCommunityId + .filter("id = '50|doajarticles::8d817039a63710fcf97e30f14662c6c8'") + .count()); + Assertions + .assertEquals( + "euromarine", + resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|doajarticles::8d817039a63710fcf97e30f14662c6c8")) + .sort(desc("community")) + .collectAsList() + .get(1) + .getString(0)); - Assertions.assertEquals( - 2, - resultCommunityId - .filter("id = '50|doajarticles::8d817039a63710fcf97e30f14662c6c8'") - .count()); - Assertions.assertEquals( - "euromarine", - resultCommunityId - .select("community") - .where( - resultCommunityId - .col("id") - .equalTo( - "50|doajarticles::8d817039a63710fcf97e30f14662c6c8")) - .sort(desc("community")) - .collectAsList() - .get(1) - .getString(0)); - - Assertions.assertEquals( - 3, - resultCommunityId - .filter("id = '50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6'") - .count()); - Assertions.assertEquals( - "euromarine", - resultCommunityId - .select("community") - .where( - resultCommunityId - .col("id") - .equalTo( - "50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6")) - .sort(desc("community")) - .collectAsList() - .get(2) - .getString(0)); - Assertions.assertEquals( - "ni", - resultCommunityId - .select("community") - .where( - resultCommunityId - .col("id") - .equalTo( - "50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6")) - .sort(desc("community")) - .collectAsList() - .get(0) - .getString(0)); - } + Assertions + .assertEquals( + 3, + resultCommunityId + .filter("id = '50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6'") + .count()); + Assertions + .assertEquals( + "euromarine", + resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6")) + .sort(desc("community")) + .collectAsList() + .get(2) + .getString(0)); + Assertions + .assertEquals( + "ni", + resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|doajarticles::3c98f0632f1875b4979e552ba3aa01e6")) + .sort(desc("community")) + .collectAsList() + .get(0) + .getString(0)); + } } diff --git a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java b/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java index f8806d8bb..e0ee12be6 100644 --- a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java +++ b/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java @@ -1,14 +1,13 @@ + package eu.dnetlib.dhp.resulttocommunityfromsemrel; import static org.apache.spark.sql.functions.desc; -import com.fasterxml.jackson.databind.ObjectMapper; 
-import eu.dnetlib.dhp.orcidtoresultfromsemrel.OrcidPropagationJobTest; -import eu.dnetlib.dhp.schema.oaf.Dataset; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.List; + import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -23,258 +22,269 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.orcidtoresultfromsemrel.OrcidPropagationJobTest; +import eu.dnetlib.dhp.schema.oaf.Dataset; + public class ResultToCommunityJobTest { - private static final Logger log = - LoggerFactory.getLogger( - eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class); + private static final Logger log = LoggerFactory + .getLogger( + eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final ClassLoader cl = - eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class - .getClassLoader(); + private static final ClassLoader cl = eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class + .getClassLoader(); - private static SparkSession spark; + private static SparkSession spark; - private static Path workingDir; + private static Path workingDir; - @BeforeAll - public static void beforeAll() throws IOException { - workingDir = - Files.createTempDirectory( - eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class - .getSimpleName()); - log.info("using work dir {}", workingDir); + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files + .createTempDirectory( + eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class + .getSimpleName()); + log.info("using work dir {}", workingDir); - SparkConf conf = new SparkConf(); - conf.setAppName( - eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class - .getSimpleName()); + SparkConf conf = new SparkConf(); + conf + .setAppName( + eu.dnetlib.dhp.resulttocommunityfromsemrel.ResultToCommunityJobTest.class + .getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - spark = - SparkSession.builder() - .appName(OrcidPropagationJobTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } + spark = SparkSession + .builder() + .appName(OrcidPropagationJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } - @Test - public void test1() throws Exception { - 
SparkResultToCommunityThroughSemRelJob4.main( - new String[] { - "-isTest", Boolean.TRUE.toString(), - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttocommunityfromsemrel/sample") - .getPath(), - "-hive_metastore_uris", "", - "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", - "-outputPath", workingDir.toString() + "/dataset", - "-preparedInfoPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo") - .getPath() - }); + @Test + public void test1() throws Exception { + SparkResultToCommunityThroughSemRelJob4 + .main( + new String[] { + "-isTest", Boolean.TRUE.toString(), + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", + getClass() + .getResource( + "/eu/dnetlib/dhp/resulttocommunityfromsemrel/sample") + .getPath(), + "-hive_metastore_uris", "", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Dataset", + "-outputPath", workingDir.toString() + "/dataset", + "-preparedInfoPath", + getClass() + .getResource( + "/eu/dnetlib/dhp/resulttocommunityfromsemrel/preparedInfo") + .getPath() + }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/dataset") - .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/dataset") + .map(item -> OBJECT_MAPPER.readValue(item, Dataset.class)); - Assertions.assertEquals(10, tmp.count()); - org.apache.spark.sql.Dataset verificationDataset = - spark.createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); + Assertions.assertEquals(10, tmp.count()); + org.apache.spark.sql.Dataset verificationDataset = spark + .createDataset(tmp.rdd(), Encoders.bean(Dataset.class)); - verificationDataset.createOrReplaceTempView("dataset"); + verificationDataset.createOrReplaceTempView("dataset"); - String query = - "select id, MyT.id community " - + "from dataset " - + "lateral view explode(context) c as MyT " - + "lateral view explode(MyT.datainfo) d as MyD " - + "where MyD.inferenceprovenance = 'propagation'"; + String query = "select id, MyT.id community " + + "from dataset " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD " + + "where MyD.inferenceprovenance = 'propagation'"; - org.apache.spark.sql.Dataset resultExplodedProvenance = spark.sql(query); - Assertions.assertEquals(5, resultExplodedProvenance.count()); + org.apache.spark.sql.Dataset resultExplodedProvenance = spark.sql(query); + Assertions.assertEquals(5, resultExplodedProvenance.count()); - Assertions.assertEquals( - 0, - resultExplodedProvenance - .filter("id = '50|dedup_wf_001::2305908abeca9da37eaf3bddcaf81b7b'") - .count()); + Assertions + .assertEquals( + 0, + resultExplodedProvenance + .filter("id = '50|dedup_wf_001::2305908abeca9da37eaf3bddcaf81b7b'") + .count()); - Assertions.assertEquals( - 1, - resultExplodedProvenance - .filter("id = '50|dedup_wf_001::0489ae524201eedaa775da282dce35e7'") - .count()); - Assertions.assertEquals( - "dh-ch", - resultExplodedProvenance - .select("community") - .where( - resultExplodedProvenance - .col("id") - .equalTo( - "50|dedup_wf_001::0489ae524201eedaa775da282dce35e7")) - .collectAsList() - .get(0) - .getString(0)); + Assertions + .assertEquals( + 1, + resultExplodedProvenance + .filter("id = '50|dedup_wf_001::0489ae524201eedaa775da282dce35e7'") + .count()); + Assertions + 
.assertEquals( + "dh-ch", + resultExplodedProvenance + .select("community") + .where( + resultExplodedProvenance + .col("id") + .equalTo( + "50|dedup_wf_001::0489ae524201eedaa775da282dce35e7")) + .collectAsList() + .get(0) + .getString(0)); - Assertions.assertEquals( - 3, - resultExplodedProvenance - .filter("id = '50|dedup_wf_001::0a60e33b4f0986ebd9819451f2d87a28'") - .count()); - List rowList = - resultExplodedProvenance - .select("community") - .where( - resultExplodedProvenance - .col("id") - .equalTo( - "50|dedup_wf_001::0a60e33b4f0986ebd9819451f2d87a28")) - .sort(desc("community")) - .collectAsList(); - Assertions.assertEquals("mes", rowList.get(0).getString(0)); - Assertions.assertEquals("fam", rowList.get(1).getString(0)); - Assertions.assertEquals("ee", rowList.get(2).getString(0)); + Assertions + .assertEquals( + 3, + resultExplodedProvenance + .filter("id = '50|dedup_wf_001::0a60e33b4f0986ebd9819451f2d87a28'") + .count()); + List rowList = resultExplodedProvenance + .select("community") + .where( + resultExplodedProvenance + .col("id") + .equalTo( + "50|dedup_wf_001::0a60e33b4f0986ebd9819451f2d87a28")) + .sort(desc("community")) + .collectAsList(); + Assertions.assertEquals("mes", rowList.get(0).getString(0)); + Assertions.assertEquals("fam", rowList.get(1).getString(0)); + Assertions.assertEquals("ee", rowList.get(2).getString(0)); - Assertions.assertEquals( - 1, - resultExplodedProvenance - .filter("id = '50|dedup_wf_001::0ae02edb5598a5545d10b107fcf48dcc'") - .count()); - Assertions.assertEquals( - "aginfra", - resultExplodedProvenance - .select("community") - .where( - resultExplodedProvenance - .col("id") - .equalTo( - "50|dedup_wf_001::0ae02edb5598a5545d10b107fcf48dcc")) - .collectAsList() - .get(0) - .getString(0)); + Assertions + .assertEquals( + 1, + resultExplodedProvenance + .filter("id = '50|dedup_wf_001::0ae02edb5598a5545d10b107fcf48dcc'") + .count()); + Assertions + .assertEquals( + "aginfra", + resultExplodedProvenance + .select("community") + .where( + resultExplodedProvenance + .col("id") + .equalTo( + "50|dedup_wf_001::0ae02edb5598a5545d10b107fcf48dcc")) + .collectAsList() + .get(0) + .getString(0)); - query = - "select id, MyT.id community " - + "from dataset " - + "lateral view explode(context) c as MyT " - + "lateral view explode(MyT.datainfo) d as MyD "; + query = "select id, MyT.id community " + + "from dataset " + + "lateral view explode(context) c as MyT " + + "lateral view explode(MyT.datainfo) d as MyD "; - org.apache.spark.sql.Dataset resultCommunityId = spark.sql(query); + org.apache.spark.sql.Dataset resultCommunityId = spark.sql(query); - Assertions.assertEquals(10, resultCommunityId.count()); + Assertions.assertEquals(10, resultCommunityId.count()); - Assertions.assertEquals( - 2, - resultCommunityId - .filter("id = '50|dedup_wf_001::0489ae524201eedaa775da282dce35e7'") - .count()); - rowList = - resultCommunityId - .select("community") - .where( - resultCommunityId - .col("id") - .equalTo( - "50|dedup_wf_001::0489ae524201eedaa775da282dce35e7")) - .sort(desc("community")) - .collectAsList(); - Assertions.assertEquals("dh-ch", rowList.get(0).getString(0)); - Assertions.assertEquals("beopen", rowList.get(1).getString(0)); + Assertions + .assertEquals( + 2, + resultCommunityId + .filter("id = '50|dedup_wf_001::0489ae524201eedaa775da282dce35e7'") + .count()); + rowList = resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|dedup_wf_001::0489ae524201eedaa775da282dce35e7")) + .sort(desc("community")) + 
.collectAsList(); + Assertions.assertEquals("dh-ch", rowList.get(0).getString(0)); + Assertions.assertEquals("beopen", rowList.get(1).getString(0)); - Assertions.assertEquals( - 3, - resultCommunityId - .filter("id = '50|dedup_wf_001::0a60e33b4f0986ebd9819451f2d87a28'") - .count()); - rowList = - resultCommunityId - .select("community") - .where( - resultCommunityId - .col("id") - .equalTo( - "50|dedup_wf_001::0a60e33b4f0986ebd9819451f2d87a28")) - .sort(desc("community")) - .collectAsList(); - Assertions.assertEquals("mes", rowList.get(0).getString(0)); - Assertions.assertEquals("fam", rowList.get(1).getString(0)); - Assertions.assertEquals("ee", rowList.get(2).getString(0)); + Assertions + .assertEquals( + 3, + resultCommunityId + .filter("id = '50|dedup_wf_001::0a60e33b4f0986ebd9819451f2d87a28'") + .count()); + rowList = resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|dedup_wf_001::0a60e33b4f0986ebd9819451f2d87a28")) + .sort(desc("community")) + .collectAsList(); + Assertions.assertEquals("mes", rowList.get(0).getString(0)); + Assertions.assertEquals("fam", rowList.get(1).getString(0)); + Assertions.assertEquals("ee", rowList.get(2).getString(0)); - Assertions.assertEquals( - 2, - resultCommunityId - .filter("id = '50|dedup_wf_001::0ae02edb5598a5545d10b107fcf48dcc'") - .count()); - rowList = - resultCommunityId - .select("community") - .where( - resultCommunityId - .col("id") - .equalTo( - "50|dedup_wf_001::0ae02edb5598a5545d10b107fcf48dcc")) - .sort(desc("community")) - .collectAsList(); - Assertions.assertEquals("beopen", rowList.get(0).getString(0)); - Assertions.assertEquals("aginfra", rowList.get(1).getString(0)); + Assertions + .assertEquals( + 2, + resultCommunityId + .filter("id = '50|dedup_wf_001::0ae02edb5598a5545d10b107fcf48dcc'") + .count()); + rowList = resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|dedup_wf_001::0ae02edb5598a5545d10b107fcf48dcc")) + .sort(desc("community")) + .collectAsList(); + Assertions.assertEquals("beopen", rowList.get(0).getString(0)); + Assertions.assertEquals("aginfra", rowList.get(1).getString(0)); - Assertions.assertEquals( - 2, - resultCommunityId - .filter("id = '50|dedup_wf_001::2305908abeca9da37eaf3bddcaf81b7b'") - .count()); - rowList = - resultCommunityId - .select("community") - .where( - resultCommunityId - .col("id") - .equalTo( - "50|dedup_wf_001::2305908abeca9da37eaf3bddcaf81b7b")) - .sort(desc("community")) - .collectAsList(); - Assertions.assertEquals("euromarine", rowList.get(1).getString(0)); - Assertions.assertEquals("ni", rowList.get(0).getString(0)); + Assertions + .assertEquals( + 2, + resultCommunityId + .filter("id = '50|dedup_wf_001::2305908abeca9da37eaf3bddcaf81b7b'") + .count()); + rowList = resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|dedup_wf_001::2305908abeca9da37eaf3bddcaf81b7b")) + .sort(desc("community")) + .collectAsList(); + Assertions.assertEquals("euromarine", rowList.get(1).getString(0)); + Assertions.assertEquals("ni", rowList.get(0).getString(0)); - Assertions.assertEquals( - 1, - resultCommunityId - .filter("id = '50|doajarticles::8d817039a63710fcf97e30f14662c6c8'") - .count()); - Assertions.assertEquals( - "euromarine", - resultCommunityId - .select("community") - .where( - resultCommunityId - .col("id") - .equalTo( - "50|doajarticles::8d817039a63710fcf97e30f14662c6c8")) - .collectAsList() - .get(0) - .getString(0)); - } + Assertions + 
.assertEquals( + 1, + resultCommunityId + .filter("id = '50|doajarticles::8d817039a63710fcf97e30f14662c6c8'") + .count()); + Assertions + .assertEquals( + "euromarine", + resultCommunityId + .select("community") + .where( + resultCommunityId + .col("id") + .equalTo( + "50|doajarticles::8d817039a63710fcf97e30f14662c6c8")) + .collectAsList() + .get(0) + .getString(0)); + } } diff --git a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/Result2OrganizationJobTest.java b/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/Result2OrganizationJobTest.java index 2b2f64a33..01bf64dd9 100644 --- a/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/Result2OrganizationJobTest.java +++ b/dhp-workflows/dhp-propagation/src/test/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/Result2OrganizationJobTest.java @@ -1,10 +1,10 @@ + package eu.dnetlib.dhp.resulttoorganizationfrominstrepo; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.schema.oaf.Relation; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; + import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -19,269 +19,286 @@ import org.junit.jupiter.api.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Relation; + public class Result2OrganizationJobTest { - private static final Logger log = LoggerFactory.getLogger(Result2OrganizationJobTest.class); + private static final Logger log = LoggerFactory.getLogger(Result2OrganizationJobTest.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final ClassLoader cl = Result2OrganizationJobTest.class.getClassLoader(); + private static final ClassLoader cl = Result2OrganizationJobTest.class.getClassLoader(); - private static SparkSession spark; + private static SparkSession spark; - private static Path workingDir; + private static Path workingDir; - @BeforeAll - public static void beforeAll() throws IOException { - workingDir = - Files.createTempDirectory( - SparkResultToOrganizationFromIstRepoJob2.class.getSimpleName()); - log.info("using work dir {}", workingDir); + @BeforeAll + public static void beforeAll() throws IOException { + workingDir = Files + .createTempDirectory( + SparkResultToOrganizationFromIstRepoJob2.class.getSimpleName()); + log.info("using work dir {}", workingDir); - SparkConf conf = new SparkConf(); - conf.setAppName(SparkResultToOrganizationFromIstRepoJob2.class.getSimpleName()); + SparkConf conf = new SparkConf(); + conf.setAppName(SparkResultToOrganizationFromIstRepoJob2.class.getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", workingDir.toString()); - conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", workingDir.toString()); + conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString()); - spark = - 
SparkSession.builder() - .appName(SparkResultToOrganizationFromIstRepoJob2.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } + spark = SparkSession + .builder() + .appName(SparkResultToOrganizationFromIstRepoJob2.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } - @AfterAll - public static void afterAll() throws IOException { - FileUtils.deleteDirectory(workingDir.toFile()); - spark.stop(); - } + @AfterAll + public static void afterAll() throws IOException { + FileUtils.deleteDirectory(workingDir.toFile()); + spark.stop(); + } - /** - * No modifications done to the sample sets, so that no possible updates are created - * - * @throws Exception - */ - @Test - public void NoUpdateTest() throws Exception { - SparkResultToOrganizationFromIstRepoJob2.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix") - .getPath(), - "-hive_metastore_uris", - "", - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Software", - "-writeUpdate", - "false", - "-saveGraph", - "true", - "-outputPath", - workingDir.toString() + "/relation", - "-datasourceOrganizationPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization") - .getPath(), - "-alreadyLinkedPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked") - .getPath(), - }); + /** + * No modifications done to the sample sets, so that no possible updates are created + * + * @throws Exception + */ + @Test + public void NoUpdateTest() throws Exception { + SparkResultToOrganizationFromIstRepoJob2 + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass() + .getResource( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix") + .getPath(), + "-hive_metastore_uris", + "", + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.Software", + "-writeUpdate", + "false", + "-saveGraph", + "true", + "-outputPath", + workingDir.toString() + "/relation", + "-datasourceOrganizationPath", + getClass() + .getResource( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/datasourceOrganization") + .getPath(), + "-alreadyLinkedPath", + getClass() + .getResource( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/noupdate/preparedInfo/alreadyLinked") + .getPath(), + }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/relation") - .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); - Assertions.assertEquals(0, tmp.count()); - } + Assertions.assertEquals(0, tmp.count()); + } - /** - * Testing set with modified association between datasource and organization. Copied some - * hostedby collectedfrom from the software sample set. 
No intersection with the already linked - * (all the possible new relations, will became new relations) - * - * @throws Exception - */ - @Test - public void UpdateNoMixTest() throws Exception { - SparkResultToOrganizationFromIstRepoJob2.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix") - .getPath(), - "-hive_metastore_uris", - "", - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Software", - "-writeUpdate", - "false", - "-saveGraph", - "true", - "-outputPath", - workingDir.toString() + "/relation", - "-datasourceOrganizationPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization") - .getPath(), - "-alreadyLinkedPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked") - .getPath(), - }); + /** + * Testing set with modified association between datasource and organization. Copied some hostedby collectedfrom + * from the software sample set. No intersection with the already linked (all the possible new relations, will + * became new relations) + * + * @throws Exception + */ + @Test + public void UpdateNoMixTest() throws Exception { + SparkResultToOrganizationFromIstRepoJob2 + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass() + .getResource( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/noupdate_updatenomix") + .getPath(), + "-hive_metastore_uris", + "", + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.Software", + "-writeUpdate", + "false", + "-saveGraph", + "true", + "-outputPath", + workingDir.toString() + "/relation", + "-datasourceOrganizationPath", + getClass() + .getResource( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/datasourceOrganization") + .getPath(), + "-alreadyLinkedPath", + getClass() + .getResource( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatenomix/preparedInfo/alreadyLinked") + .getPath(), + }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/relation") - .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); - Assertions.assertEquals(20, tmp.count()); + Assertions.assertEquals(20, tmp.count()); - Dataset verificationDs = - spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); - Assertions.assertEquals( - 8, - verificationDs - .filter("target = '20|dedup_wf_001::5168917a6aeeea55269daeac1af2ecd2'") - .count()); - Assertions.assertEquals( - 1, - verificationDs - .filter("target = '20|opendoar____::124266ebc4ece2934eb80edfda3f2091'") - .count()); - Assertions.assertEquals( - 1, - verificationDs - .filter("target = '20|opendoar____::4429502fa1936b0941f4647b69b844c8'") - .count()); + Dataset verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); + Assertions + .assertEquals( + 8, + verificationDs + .filter("target = '20|dedup_wf_001::5168917a6aeeea55269daeac1af2ecd2'") + .count()); + Assertions + .assertEquals( + 1, + verificationDs + .filter("target = 
'20|opendoar____::124266ebc4ece2934eb80edfda3f2091'") + .count()); + Assertions + .assertEquals( + 1, + verificationDs + .filter("target = '20|opendoar____::4429502fa1936b0941f4647b69b844c8'") + .count()); - Assertions.assertEquals( - 2, - verificationDs - .filter( - "source = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and " - + "(target = '20|opendoar____::124266ebc4ece2934eb80edfda3f2091' " - + "or target = '20|dedup_wf_001::5168917a6aeeea55269daeac1af2ecd2')") - .count()); - } + Assertions + .assertEquals( + 2, + verificationDs + .filter( + "source = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218' and " + + "(target = '20|opendoar____::124266ebc4ece2934eb80edfda3f2091' " + + "or target = '20|dedup_wf_001::5168917a6aeeea55269daeac1af2ecd2')") + .count()); + } - @Test - public void UpdateMixTest() throws Exception { - SparkResultToOrganizationFromIstRepoJob2.main( - new String[] { - "-isTest", - Boolean.TRUE.toString(), - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-sourcePath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix") - .getPath(), - "-hive_metastore_uris", - "", - "-resultTableName", - "eu.dnetlib.dhp.schema.oaf.Software", - "-writeUpdate", - "false", - "-saveGraph", - "true", - "-outputPath", - workingDir.toString() + "/relation", - "-datasourceOrganizationPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization") - .getPath(), - "-alreadyLinkedPath", - getClass() - .getResource( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked") - .getPath(), - }); + @Test + public void UpdateMixTest() throws Exception { + SparkResultToOrganizationFromIstRepoJob2 + .main( + new String[] { + "-isTest", + Boolean.TRUE.toString(), + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-sourcePath", + getClass() + .getResource( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/sample/updatemix") + .getPath(), + "-hive_metastore_uris", + "", + "-resultTableName", + "eu.dnetlib.dhp.schema.oaf.Software", + "-writeUpdate", + "false", + "-saveGraph", + "true", + "-outputPath", + workingDir.toString() + "/relation", + "-datasourceOrganizationPath", + getClass() + .getResource( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/datasourceOrganization") + .getPath(), + "-alreadyLinkedPath", + getClass() + .getResource( + "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/updatemix/preparedInfo/alreadyLinked") + .getPath(), + }); - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - JavaRDD tmp = - sc.textFile(workingDir.toString() + "/relation") - .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); + JavaRDD tmp = sc + .textFile(workingDir.toString() + "/relation") + .map(item -> OBJECT_MAPPER.readValue(item, Relation.class)); - Dataset verificationDs = - spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); + Dataset verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); - Assertions.assertEquals(8, verificationDs.count()); + Assertions.assertEquals(8, verificationDs.count()); - Assertions.assertEquals( - 2, - verificationDs - .filter("source = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6'") - .count()); - Assertions.assertEquals( - 1, - verificationDs - .filter("source = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218'") - .count()); - 
Assertions.assertEquals( - 1, - verificationDs - .filter("source = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523'") - .count()); + Assertions + .assertEquals( + 2, + verificationDs + .filter("source = '50|od_______109::f375befa62a741e9250e55bcfa88f9a6'") + .count()); + Assertions + .assertEquals( + 1, + verificationDs + .filter("source = '50|dedup_wf_001::b67bc915603fc01e445f2b5888ba7218'") + .count()); + Assertions + .assertEquals( + 1, + verificationDs + .filter("source = '50|dedup_wf_001::40ea2f24181f6ae77b866ebcbffba523'") + .count()); - Assertions.assertEquals( - 1, - verificationDs - .filter("source = '20|wt__________::a72760363ca885e6bef165804770e00c'") - .count()); + Assertions + .assertEquals( + 1, + verificationDs + .filter("source = '20|wt__________::a72760363ca885e6bef165804770e00c'") + .count()); - Assertions.assertEquals( - 4, - verificationDs - .filter( - "relclass = 'hasAuthorInstitution' and substring(source, 1,2) = '50'") - .count()); - Assertions.assertEquals( - 4, - verificationDs - .filter( - "relclass = 'isAuthorInstitutionOf' and substring(source, 1,2) = '20'") - .count()); + Assertions + .assertEquals( + 4, + verificationDs + .filter( + "relclass = 'hasAuthorInstitution' and substring(source, 1,2) = '50'") + .count()); + Assertions + .assertEquals( + 4, + verificationDs + .filter( + "relclass = 'isAuthorInstitutionOf' and substring(source, 1,2) = '20'") + .count()); - Assertions.assertEquals( - 4, - verificationDs - .filter( - "relclass = 'hasAuthorInstitution' and " - + "substring(source, 1,2) = '50' and substring(target, 1, 2) = '20'") - .count()); - Assertions.assertEquals( - 4, - verificationDs - .filter( - "relclass = 'isAuthorInstitutionOf' and " - + "substring(source, 1,2) = '20' and substring(target, 1, 2) = '50'") - .count()); - } + Assertions + .assertEquals( + 4, + verificationDs + .filter( + "relclass = 'hasAuthorInstitution' and " + + "substring(source, 1,2) = '50' and substring(target, 1, 2) = '20'") + .count()); + Assertions + .assertEquals( + 4, + verificationDs + .filter( + "relclass = 'isAuthorInstitutionOf' and " + + "substring(source, 1,2) = '20' and substring(target, 1, 2) = '50'") + .count()); + } }
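The reformatted tests above all follow one verification pattern: run the Spark job with "-isTest" and "-isSparkSessionManaged false", read the serialized output back from the working directory as typed beans, and assert on counts (optionally via a temporary SQL view restricted to datainfo.inferenceprovenance = 'propagation'). The sketch below restates that pattern in one place; it is not part of the commit, and the class name, method name, and path parameter are illustrative placeholders.

	package eu.dnetlib.dhp.example;

	import org.apache.spark.api.java.JavaRDD;
	import org.apache.spark.api.java.JavaSparkContext;
	import org.apache.spark.sql.Dataset;
	import org.apache.spark.sql.Encoders;
	import org.apache.spark.sql.SparkSession;
	import org.junit.jupiter.api.Assertions;

	import com.fasterxml.jackson.databind.ObjectMapper;

	import eu.dnetlib.dhp.schema.oaf.Relation;

	public class PropagationVerificationSketch {

		private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

		/**
		 * Reads the relations written by a propagation job and checks that the expected
		 * number of them carries the 'propagation' inference provenance.
		 */
		static void verifyRelations(SparkSession spark, String outputPath, long expected) {
			final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

			// deserialize the job output back into typed beans, as the tests above do
			JavaRDD<Relation> tmp = sc
				.textFile(outputPath)
				.map(item -> OBJECT_MAPPER.readValue(item, Relation.class));

			Assertions.assertEquals(expected, tmp.count());

			// expose the relations as a SQL view and verify the inference provenance
			Dataset<Relation> verificationDs = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
			verificationDs.createOrReplaceTempView("relation");

			Assertions
				.assertEquals(
					expected,
					spark
						.sql("select * from relation where datainfo.inferenceprovenance = 'propagation'")
						.count());
		}
	}

Each test then only differs in the job class it invokes, the prepared-info resources passed on the command line, and the expected counts per relClass (for example 5 'produces' plus 5 'isProducedBy' in UpdateTenTest, 4 of each in UpdateMixTest).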