merge from beta

deduptesting
Claudio Atzori 1 year ago
commit 0419953470

.gitignore

@ -3,10 +3,10 @@
*.iws
*.ipr
*.iml
*.ipr
*.iws
*~
.vscode
.metals
.bloop
.classpath
/*/.classpath
/*/*/.classpath
@ -24,4 +24,5 @@
spark-warehouse
/**/job-override.properties
/**/*.log
/**/.factorypath
/**/.scalafmt.conf

@ -0,0 +1,21 @@
style = defaultWithAlign
align.openParenCallSite = false
align.openParenDefnSite = false
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
continuationIndent.callSite = 2
continuationIndent.defnSite = 2
danglingParentheses = true
indentOperator = spray
maxColumn = 120
newlines.alwaysBeforeTopLevelStatements = true
project.excludeFilters = [".*\\.sbt"]
rewrite.rules = [AvoidInfix]
rewrite.rules = [ExpandImportSelectors]
rewrite.rules = [RedundantBraces]
rewrite.rules = [RedundantParens]
rewrite.rules = [SortImports]
rewrite.rules = [SortModifiers]
rewrite.rules = [PreferCurlyFors]
spaces.inImportCurlyBraces = false
unindentTopLevelOperators = true

@ -1,2 +1,2 @@
# dnet-hadoop
Dnet-hadoop is a tool for
Dnet-hadoop is the project that defines all the OOZIE workflows for the OpenAIRE Graph construction, processing and provisioning.

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-assembly-resources</artifactId>

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-properties-maven-plugin</artifactId>

@ -8,8 +8,6 @@ import java.util.List;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.maven.plugin.AbstractMojo;
import org.apache.maven.plugin.MojoExecutionException;
import org.apache.maven.plugin.MojoFailureException;
/**
* Generates oozie properties which were not provided from commandline.
@ -27,7 +25,7 @@ public class GenerateOoziePropertiesMojo extends AbstractMojo {
};
@Override
public void execute() throws MojoExecutionException, MojoFailureException {
public void execute() {
if (System.getProperties().containsKey(PROPERTY_NAME_WF_SOURCE_DIR)
&& !System.getProperties().containsKey(PROPERTY_NAME_SANDBOX_NAME)) {
String generatedSandboxName = generateSandboxName(
@ -46,24 +44,24 @@ public class GenerateOoziePropertiesMojo extends AbstractMojo {
/**
* Generates sandbox name from workflow source directory.
*
* @param wfSourceDir
* @param wfSourceDir workflow source directory
* @return generated sandbox name
*/
private String generateSandboxName(String wfSourceDir) {
// utilize all dir names until finding one of the limiters
List<String> sandboxNameParts = new ArrayList<String>();
List<String> sandboxNameParts = new ArrayList<>();
String[] tokens = StringUtils.split(wfSourceDir, File.separatorChar);
ArrayUtils.reverse(tokens);
if (tokens.length > 0) {
for (String token : tokens) {
for (String limiter : limiters) {
if (limiter.equals(token)) {
return sandboxNameParts.size() > 0
return !sandboxNameParts.isEmpty()
? StringUtils.join(sandboxNameParts.toArray())
: null;
}
}
if (sandboxNameParts.size() > 0) {
if (!sandboxNameParts.isEmpty()) {
sandboxNameParts.add(0, File.separator);
}
sandboxNameParts.add(0, token);
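
A self-contained sketch of the sandbox-name derivation shown in this hunk; the limiter tokens below are illustrative placeholders (the real array lives in GenerateOoziePropertiesMojo, outside this hunk), and the sample inputs mirror the unit tests further down.

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;

public class SandboxNameSketch {

    // Illustrative limiters; the actual values are defined in the mojo.
    private static final String[] LIMITERS = { "dhp", "dnetlib", "eu" };

    // Mirrors generateSandboxName(): walk the path segments from the deepest one
    // upwards and keep them until a limiter token is met.
    static String generateSandboxName(String wfSourceDir) {
        List<String> parts = new ArrayList<>();
        String[] tokens = StringUtils.split(wfSourceDir, File.separatorChar);
        ArrayUtils.reverse(tokens);
        for (String token : tokens) {
            for (String limiter : LIMITERS) {
                if (limiter.equals(token)) {
                    return parts.isEmpty() ? null : StringUtils.join(parts.toArray());
                }
            }
            if (!parts.isEmpty()) {
                parts.add(0, File.separator);
            }
            parts.add(0, token);
        }
        return parts.isEmpty() ? null : StringUtils.join(parts.toArray());
    }

    public static void main(String[] args) {
        // assumes a unix-style separator, as in the tests below
        System.out.println(generateSandboxName("eu/dnetlib/dhp/wf/transformers")); // wf/transformers
        System.out.println(generateSandboxName("eu/dnetlib/dhp/"));                // null
    }
}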

@ -16,6 +16,7 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@ -289,7 +290,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo {
*/
protected List<String> getEscapeChars(String escapeChars) {
List<String> tokens = getListFromCSV(escapeChars);
List<String> realTokens = new ArrayList<String>();
List<String> realTokens = new ArrayList<>();
for (String token : tokens) {
String realToken = getRealToken(token);
realTokens.add(realToken);
@ -324,7 +325,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo {
* @return content
*/
protected String getContent(String comment, Properties properties, List<String> escapeTokens) {
List<String> names = new ArrayList<String>(properties.stringPropertyNames());
List<String> names = new ArrayList<>(properties.stringPropertyNames());
Collections.sort(names);
StringBuilder sb = new StringBuilder();
if (!StringUtils.isBlank(comment)) {
@ -352,7 +353,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo {
throws MojoExecutionException {
try {
String content = getContent(comment, properties, escapeTokens);
FileUtils.writeStringToFile(file, content, ENCODING_UTF8);
FileUtils.writeStringToFile(file, content, StandardCharsets.UTF_8);
} catch (IOException e) {
throw new MojoExecutionException("Error creating properties file", e);
}
@ -399,9 +400,9 @@ public class WritePredefinedProjectProperties extends AbstractMojo {
*/
protected static final List<String> getListFromCSV(String csv) {
if (StringUtils.isBlank(csv)) {
return new ArrayList<String>();
return new ArrayList<>();
}
List<String> list = new ArrayList<String>();
List<String> list = new ArrayList<>();
String[] tokens = StringUtils.split(csv, ",");
for (String token : tokens) {
list.add(token.trim());
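
The CSV helper is easy to exercise on its own; a minimal sketch with a hypothetical class name:

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.lang.StringUtils;

public class CsvListSketch {

    // Same behaviour as getListFromCSV(): blank input yields an empty list,
    // otherwise split on commas and trim each token.
    static List<String> getListFromCSV(String csv) {
        List<String> list = new ArrayList<>();
        if (StringUtils.isBlank(csv)) {
            return list;
        }
        for (String token : StringUtils.split(csv, ",")) {
            list.add(token.trim());
        }
        return list;
    }

    public static void main(String[] args) {
        System.out.println(getListFromCSV(" a, b ,c ")); // [a, b, c]
    }
}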

@ -9,18 +9,18 @@ import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
/** @author mhorst, claudio.atzori */
public class GenerateOoziePropertiesMojoTest {
class GenerateOoziePropertiesMojoTest {
private final GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo();
@BeforeEach
public void clearSystemProperties() {
void clearSystemProperties() {
System.clearProperty(PROPERTY_NAME_SANDBOX_NAME);
System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR);
}
@Test
public void testExecuteEmpty() throws Exception {
void testExecuteEmpty() throws Exception {
// execute
mojo.execute();
@ -29,7 +29,7 @@ public class GenerateOoziePropertiesMojoTest {
}
@Test
public void testExecuteSandboxNameAlreadySet() throws Exception {
void testExecuteSandboxNameAlreadySet() throws Exception {
// given
String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers";
String sandboxName = "originalSandboxName";
@ -44,7 +44,7 @@ public class GenerateOoziePropertiesMojoTest {
}
@Test
public void testExecuteEmptyWorkflowSourceDir() throws Exception {
void testExecuteEmptyWorkflowSourceDir() throws Exception {
// given
String workflowSourceDir = "";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
@ -57,7 +57,7 @@ public class GenerateOoziePropertiesMojoTest {
}
@Test
public void testExecuteNullSandboxNameGenerated() throws Exception {
void testExecuteNullSandboxNameGenerated() throws Exception {
// given
String workflowSourceDir = "eu/dnetlib/dhp/";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
@ -70,7 +70,7 @@ public class GenerateOoziePropertiesMojoTest {
}
@Test
public void testExecute() throws Exception {
void testExecute() throws Exception {
// given
String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
@ -83,7 +83,7 @@ public class GenerateOoziePropertiesMojoTest {
}
@Test
public void testExecuteWithoutRoot() throws Exception {
void testExecuteWithoutRoot() throws Exception {
// given
String workflowSourceDir = "wf/transformers";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);

@ -20,7 +20,7 @@ import org.mockito.junit.jupiter.MockitoExtension;
/** @author mhorst, claudio.atzori */
@ExtendWith(MockitoExtension.class)
public class WritePredefinedProjectPropertiesTest {
class WritePredefinedProjectPropertiesTest {
@Mock
private MavenProject mavenProject;
@ -39,7 +39,7 @@ public class WritePredefinedProjectPropertiesTest {
// ----------------------------------- TESTS ---------------------------------------------
@Test
public void testExecuteEmpty() throws Exception {
void testExecuteEmpty() throws Exception {
// execute
mojo.execute();
@ -50,7 +50,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteWithProjectProperties() throws Exception {
void testExecuteWithProjectProperties() throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -70,7 +70,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test()
public void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) {
void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -84,7 +84,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception {
void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -108,7 +108,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception {
void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -132,7 +132,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception {
void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -164,7 +164,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder)
void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder)
throws Exception {
// given
String key = "projectPropertyKey";
@ -194,7 +194,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteIncludingPropertyKeysFromBlankLocation() {
void testExecuteIncludingPropertyKeysFromBlankLocation() {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -214,7 +214,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder)
void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder)
throws Exception {
// given
String key = "projectPropertyKey";
@ -247,7 +247,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder)
void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder)
throws Exception {
// given
String key = "projectPropertyKey";
@ -273,7 +273,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception {
void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception {
// given
mojo.setQuiet(true);
mojo.setIncludePropertyKeysFromFiles(new String[] {
@ -290,7 +290,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteIncludingPropertyKeysFromInvalidFile() {
void testExecuteIncludingPropertyKeysFromInvalidFile() {
// given
mojo.setIncludePropertyKeysFromFiles(new String[] {
"invalid location"
@ -301,7 +301,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception {
void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception {
// given
mojo.setIncludeEnvironmentVariables(true);
@ -318,7 +318,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception {
void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception {
// given
String key = "systemPropertyKey";
String value = "systemPropertyValue";
@ -337,7 +337,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder)
void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder)
throws Exception {
// given
String key = "systemPropertyKey ";

@ -5,7 +5,7 @@
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-code-style</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
<packaging>jar</packaging>
@ -15,16 +15,27 @@
<snapshotRepository>
<id>dnet45-snapshots</id>
<name>DNet45 Snapshots</name>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots</url>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots</url>
<layout>default</layout>
</snapshotRepository>
<repository>
<id>dnet45-releases</id>
<url>http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases</url>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
</repository>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-build/dhp-code-style</url>
</site>
</distributionManagement>
<build>
<extensions>
<extension>
<groupId>org.apache.maven.wagon</groupId>
<artifactId>wagon-ssh</artifactId>
<version>2.10</version>
</extension>
</extensions>
<pluginManagement>
<plugins>
<plugin>
@ -35,14 +46,19 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId>
<version>3.7.1</version>
<version>3.9.1</version>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
</properties>
</project>

@ -0,0 +1,21 @@
style = defaultWithAlign
align.openParenCallSite = false
align.openParenDefnSite = false
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
continuationIndent.callSite = 2
continuationIndent.defnSite = 2
danglingParentheses = true
indentOperator = spray
maxColumn = 120
newlines.alwaysBeforeTopLevelStatements = true
project.excludeFilters = [".*\\.sbt"]
rewrite.rules = [AvoidInfix]
rewrite.rules = [ExpandImportSelectors]
rewrite.rules = [RedundantBraces]
rewrite.rules = [RedundantParens]
rewrite.rules = [SortImports]
rewrite.rules = [SortModifiers]
rewrite.rules = [PreferCurlyFors]
spaces.inImportCurlyBraces = false
unindentTopLevelOperators = true

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu ref="modules" />
<menu ref="reports"/>
</body>
</project>

@ -4,12 +4,15 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.4-SNAPSHOT</version>
<version>1.2.5-SNAPSHOT</version>
</parent>
<artifactId>dhp-build</artifactId>
<packaging>pom</packaging>
<description>This module is a container for the build tools used in dnet-hadoop</description>
<properties>
<maven.javadoc.skip>true</maven.javadoc.skip>
</properties>
<modules>
<module>dhp-code-style</module>
@ -17,4 +20,12 @@
<module>dhp-build-properties-maven-plugin</module>
</modules>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-build/</url>
</site>
</distributionManagement>
</project>

@ -0,0 +1,22 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu ref="modules" />
<menu ref="reports"/>
</body>
</project>

@ -5,14 +5,59 @@
<parent>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId>
<version>1.2.4-SNAPSHOT</version>
<relativePath>../</relativePath>
<version>1.2.5-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>dhp-common</artifactId>
<packaging>jar</packaging>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-common</url>
</site>
</distributionManagement>
<description>This module contains common utilities meant to be used across the dnet-hadoop submodules</description>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>${net.alchim31.maven.version}</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
<execution>
<id>scala-doc</id>
<phase>process-resources</phase> <!-- or wherever -->
<goals>
<goal>doc</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
@ -20,6 +65,15 @@
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</dependency>
<dependency>
<groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId>
</dependency>
<dependency>
<groupId>me.xuender</groupId>
<artifactId>unidecode</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
@ -29,12 +83,6 @@
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
@ -59,11 +107,6 @@
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/com.rabbitmq/amqp-client -->
<dependency>
<groupId>com.rabbitmq</groupId>
<artifactId>amqp-client</artifactId>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
@ -104,10 +147,24 @@
<artifactId>dnet-pace-core</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>com.opencsv</groupId>
<artifactId>opencsv</artifactId>
</dependency>
</dependencies>

@ -1,48 +0,0 @@
package eu.dnetlib.collector.worker.model;
import java.util.HashMap;
import java.util.Map;
public class ApiDescriptor {
private String id;
private String baseUrl;
private String protocol;
private Map<String, String> params = new HashMap<>();
public String getBaseUrl() {
return baseUrl;
}
public void setBaseUrl(final String baseUrl) {
this.baseUrl = baseUrl;
}
public String getId() {
return id;
}
public void setId(final String id) {
this.id = id;
}
public Map<String, String> getParams() {
return params;
}
public void setParams(final HashMap<String, String> params) {
this.params = params;
}
public String getProtocol() {
return protocol;
}
public void setProtocol(final String protocol) {
this.protocol = protocol;
}
}

@ -1,119 +0,0 @@
package eu.dnetlib.data.mdstore.manager.common.model;
import java.io.Serializable;
import java.util.UUID;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
@Entity
@Table(name = "mdstores")
public class MDStore implements Serializable {
/** */
private static final long serialVersionUID = 3160530489149700055L;
@Id
@Column(name = "id")
private String id;
@Column(name = "format")
private String format;
@Column(name = "layout")
private String layout;
@Column(name = "interpretation")
private String interpretation;
@Column(name = "datasource_name")
private String datasourceName;
@Column(name = "datasource_id")
private String datasourceId;
@Column(name = "api_id")
private String apiId;
public String getId() {
return id;
}
public void setId(final String id) {
this.id = id;
}
public String getFormat() {
return format;
}
public void setFormat(final String format) {
this.format = format;
}
public String getLayout() {
return layout;
}
public void setLayout(final String layout) {
this.layout = layout;
}
public String getInterpretation() {
return interpretation;
}
public void setInterpretation(final String interpretation) {
this.interpretation = interpretation;
}
public String getDatasourceName() {
return datasourceName;
}
public void setDatasourceName(final String datasourceName) {
this.datasourceName = datasourceName;
}
public String getDatasourceId() {
return datasourceId;
}
public void setDatasourceId(final String datasourceId) {
this.datasourceId = datasourceId;
}
public String getApiId() {
return apiId;
}
public void setApiId(final String apiId) {
this.apiId = apiId;
}
public static MDStore newInstance(
final String format, final String layout, final String interpretation) {
return newInstance(format, layout, interpretation, null, null, null);
}
public static MDStore newInstance(
final String format,
final String layout,
final String interpretation,
final String dsName,
final String dsId,
final String apiId) {
final MDStore md = new MDStore();
md.setId("md-" + UUID.randomUUID());
md.setFormat(format);
md.setLayout(layout);
md.setInterpretation(interpretation);
md.setDatasourceName(dsName);
md.setDatasourceId(dsId);
md.setApiId(apiId);
return md;
}
}

@ -1,51 +0,0 @@
package eu.dnetlib.data.mdstore.manager.common.model;
import java.io.Serializable;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
@Entity
@Table(name = "mdstore_current_versions")
public class MDStoreCurrentVersion implements Serializable {
/** */
private static final long serialVersionUID = -4757725888593745773L;
@Id
@Column(name = "mdstore")
private String mdstore;
@Column(name = "current_version")
private String currentVersion;
public String getMdstore() {
return mdstore;
}
public void setMdstore(final String mdstore) {
this.mdstore = mdstore;
}
public String getCurrentVersion() {
return currentVersion;
}
public void setCurrentVersion(final String currentVersion) {
this.currentVersion = currentVersion;
}
public static MDStoreCurrentVersion newInstance(final String mdId, final String versionId) {
final MDStoreCurrentVersion cv = new MDStoreCurrentVersion();
cv.setMdstore(mdId);
cv.setCurrentVersion(versionId);
return cv;
}
public static MDStoreCurrentVersion newInstance(final MDStoreVersion v) {
return newInstance(v.getMdstore(), v.getId());
}
}

@ -1,99 +0,0 @@
package eu.dnetlib.data.mdstore.manager.common.model;
import java.io.Serializable;
import java.util.Date;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
import javax.persistence.Temporal;
import javax.persistence.TemporalType;
@Entity
@Table(name = "mdstore_versions")
public class MDStoreVersion implements Serializable {
/** */
private static final long serialVersionUID = -4763494442274298339L;
@Id
@Column(name = "id")
private String id;
@Column(name = "mdstore")
private String mdstore;
@Column(name = "writing")
private boolean writing;
@Column(name = "readcount")
private int readCount = 0;
@Column(name = "lastupdate")
@Temporal(TemporalType.TIMESTAMP)
private Date lastUpdate;
@Column(name = "size")
private long size = 0;
public static MDStoreVersion newInstance(final String mdId, final boolean writing) {
final MDStoreVersion t = new MDStoreVersion();
t.setId(mdId + "-" + new Date().getTime());
t.setMdstore(mdId);
t.setLastUpdate(null);
t.setWriting(writing);
t.setReadCount(0);
t.setSize(0);
return t;
}
public String getId() {
return id;
}
public void setId(final String id) {
this.id = id;
}
public String getMdstore() {
return mdstore;
}
public void setMdstore(final String mdstore) {
this.mdstore = mdstore;
}
public boolean isWriting() {
return writing;
}
public void setWriting(final boolean writing) {
this.writing = writing;
}
public int getReadCount() {
return readCount;
}
public void setReadCount(final int readCount) {
this.readCount = readCount;
}
public Date getLastUpdate() {
return lastUpdate;
}
public void setLastUpdate(final Date lastUpdate) {
this.lastUpdate = lastUpdate;
}
public long getSize() {
return size;
}
public void setSize(final long size) {
this.size = size;
}
}

@ -1,143 +0,0 @@
package eu.dnetlib.data.mdstore.manager.common.model;
import java.io.Serializable;
import java.util.Date;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
import javax.persistence.Temporal;
import javax.persistence.TemporalType;
@Entity
@Table(name = "mdstores_with_info")
public class MDStoreWithInfo implements Serializable {
/** */
private static final long serialVersionUID = -8445784770687571492L;
@Id
@Column(name = "id")
private String id;
@Column(name = "format")
private String format;
@Column(name = "layout")
private String layout;
@Column(name = "interpretation")
private String interpretation;
@Column(name = "datasource_name")
private String datasourceName;
@Column(name = "datasource_id")
private String datasourceId;
@Column(name = "api_id")
private String apiId;
@Column(name = "current_version")
private String currentVersion;
@Column(name = "lastupdate")
@Temporal(TemporalType.TIMESTAMP)
private Date lastUpdate;
@Column(name = "size")
private long size = 0;
@Column(name = "n_versions")
private long numberOfVersions = 0;
public String getId() {
return id;
}
public void setId(final String id) {
this.id = id;
}
public String getFormat() {
return format;
}
public void setFormat(final String format) {
this.format = format;
}
public String getLayout() {
return layout;
}
public void setLayout(final String layout) {
this.layout = layout;
}
public String getInterpretation() {
return interpretation;
}
public void setInterpretation(final String interpretation) {
this.interpretation = interpretation;
}
public String getDatasourceName() {
return datasourceName;
}
public void setDatasourceName(final String datasourceName) {
this.datasourceName = datasourceName;
}
public String getDatasourceId() {
return datasourceId;
}
public void setDatasourceId(final String datasourceId) {
this.datasourceId = datasourceId;
}
public String getApiId() {
return apiId;
}
public void setApiId(final String apiId) {
this.apiId = apiId;
}
public String getCurrentVersion() {
return currentVersion;
}
public void setCurrentVersion(final String currentVersion) {
this.currentVersion = currentVersion;
}
public Date getLastUpdate() {
return lastUpdate;
}
public void setLastUpdate(final Date lastUpdate) {
this.lastUpdate = lastUpdate;
}
public long getSize() {
return size;
}
public void setSize(final long size) {
this.size = size;
}
public long getNumberOfVersions() {
return numberOfVersions;
}
public void setNumberOfVersions(final long numberOfVersions) {
this.numberOfVersions = numberOfVersions;
}
}

@ -1,10 +1,7 @@
package eu.dnetlib.dhp.application;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Serializable;
import java.io.StringWriter;
import java.io.*;
import java.util.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
@ -12,17 +9,21 @@ import java.util.zip.GZIPOutputStream;
import org.apache.commons.cli.*;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
public class ArgumentApplicationParser implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ArgumentApplicationParser.class);
private final Options options = new Options();
private final Map<String, String> objectMap = new HashMap<>();
private final List<String> compressedValues = new ArrayList<>();
public ArgumentApplicationParser(final String json_configuration) throws Exception {
public ArgumentApplicationParser(final String json_configuration) throws IOException {
final ObjectMapper mapper = new ObjectMapper();
final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class);
createOptionMap(configuration);
@ -33,7 +34,6 @@ public class ArgumentApplicationParser implements Serializable {
}
private void createOptionMap(final OptionsParameter[] configuration) {
Arrays
.stream(configuration)
.map(
@ -47,10 +47,6 @@ public class ArgumentApplicationParser implements Serializable {
return o;
})
.forEach(options::addOption);
// HelpFormatter formatter = new HelpFormatter();
// formatter.printHelp("myapp", null, options, null, true);
}
public static String decompressValue(final String abstractCompressed) {
@ -60,13 +56,13 @@ public class ArgumentApplicationParser implements Serializable {
final StringWriter stringWriter = new StringWriter();
IOUtils.copy(gis, stringWriter);
return stringWriter.toString();
} catch (Throwable e) {
System.out.println("Wrong value to decompress:" + abstractCompressed);
throw new RuntimeException(e);
} catch (IOException e) {
log.error("Wrong value to decompress: {}", abstractCompressed);
throw new IllegalArgumentException(e);
}
}
public static String compressArgument(final String value) throws Exception {
public static String compressArgument(final String value) throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
GZIPOutputStream gzip = new GZIPOutputStream(out);
gzip.write(value.getBytes());
@ -74,7 +70,7 @@ public class ArgumentApplicationParser implements Serializable {
return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
}
public void parseArgument(final String[] args) throws Exception {
public void parseArgument(final String[] args) throws ParseException {
CommandLineParser parser = new BasicParser();
CommandLine cmd = parser.parse(options, args);
Arrays
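
A round-trip sketch of the gzip + Base64 scheme used for compressed argument values; the class and method names here are illustrative, not part of the parser.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.Base64;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.io.IOUtils;

public class CompressedArgumentSketch {

    // Equivalent of compressArgument(): gzip the value, then Base64-encode the bytes.
    static String compress(String value) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        try (GZIPOutputStream gzip = new GZIPOutputStream(out)) {
            gzip.write(value.getBytes(StandardCharsets.UTF_8));
        }
        return Base64.getEncoder().encodeToString(out.toByteArray());
    }

    // Equivalent of decompressValue(): Base64-decode, then gunzip back to a string.
    static String decompress(String compressed) throws IOException {
        byte[] bytes = Base64.getDecoder().decode(compressed);
        try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(bytes))) {
            StringWriter sw = new StringWriter();
            IOUtils.copy(gis, sw, StandardCharsets.UTF_8);
            return sw.toString();
        }
    }

    public static void main(String[] args) throws IOException {
        String original = "{\"abstract\": \"a long description worth compressing\"}";
        String packed = compress(original);
        System.out.println(original.equals(decompress(packed))); // true
    }
}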

@ -9,9 +9,6 @@ public class OptionsParameter {
private boolean paramRequired;
private boolean compressed;
public OptionsParameter() {
}
public String getParamName() {
return paramName;
}

@ -0,0 +1,48 @@
package eu.dnetlib.dhp.collection;
import java.util.HashMap;
import java.util.Map;
public class ApiDescriptor {
private String id;
private String baseUrl;
private String protocol;
private Map<String, String> params = new HashMap<>();
public String getBaseUrl() {
return baseUrl;
}
public void setBaseUrl(final String baseUrl) {
this.baseUrl = baseUrl;
}
public String getId() {
return id;
}
public void setId(final String id) {
this.id = id;
}
public Map<String, String> getParams() {
return params;
}
public void setParams(final Map<String, String> params) {
this.params = params;
}
public String getProtocol() {
return protocol;
}
public void setProtocol(final String protocol) {
this.protocol = protocol;
}
}

@ -0,0 +1,61 @@
package eu.dnetlib.dhp.common;
import java.util.Map;
import com.google.common.collect.Maps;
public class Constants {
public static final Map<String, String> accessRightsCoarMap = Maps.newHashMap();
public static final Map<String, String> coarCodeLabelMap = Maps.newHashMap();
public static String COAR_ACCESS_RIGHT_SCHEMA = "http://vocabularies.coar-repositories.org/documentation/access_rights/";
private Constants() {
}
static {
accessRightsCoarMap.put("OPEN", "c_abf2");
accessRightsCoarMap.put("RESTRICTED", "c_16ec");
accessRightsCoarMap.put("OPEN SOURCE", "c_abf2");
accessRightsCoarMap.put("CLOSED", "c_14cb");
accessRightsCoarMap.put("EMBARGO", "c_f1cf");
}
static {
coarCodeLabelMap.put("c_abf2", "OPEN");
coarCodeLabelMap.put("c_16ec", "RESTRICTED");
coarCodeLabelMap.put("c_14cb", "CLOSED");
coarCodeLabelMap.put("c_f1cf", "EMBARGO");
}
public static final String SEQUENCE_FILE_NAME = "/sequence_file";
public static final String REPORT_FILE_NAME = "/report";
public static final String MDSTORE_DATA_PATH = "/store";
public static final String MDSTORE_SIZE_PATH = "/size";
public static final String COLLECTION_MODE = "collectionMode";
public static final String METADATA_ENCODING = "metadataEncoding";
public static final String OOZIE_WF_PATH = "oozieWfPath";
public static final String DNET_MESSAGE_MGR_URL = "dnetMessageManagerURL";
public static final String MAX_NUMBER_OF_RETRY = "maxNumberOfRetry";
public static final String REQUEST_DELAY = "requestDelay";
public static final String RETRY_DELAY = "retryDelay";
public static final String CONNECT_TIMEOUT = "connectTimeOut";
public static final String READ_TIMEOUT = "readTimeOut";
public static final String FROM_DATE_OVERRIDE = "fromDateOverride";
public static final String UNTIL_DATE_OVERRIDE = "untilDateOverride";
public static final String CONTENT_TOTALITEMS = "TotalItems";
public static final String CONTENT_INVALIDRECORDS = "InvalidRecords";
public static final String CONTENT_TRANSFORMEDRECORDS = "transformedItems";
// IETF Draft and used by Repositories like ZENODO , not included in APACHE HTTP java packages
// see https://ietf-wg-httpapi.github.io/ratelimit-headers/draft-ietf-httpapi-ratelimit-headers.html
public static final String HTTPHEADER_IETF_DRAFT_RATELIMIT_LIMIT = "X-RateLimit-Limit";
public static final String HTTPHEADER_IETF_DRAFT_RATELIMIT_REMAINING = "X-RateLimit-Remaining";
public static final String HTTPHEADER_IETF_DRAFT_RATELIMIT_RESET = "X-RateLimit-Reset";
}
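
As a usage sketch only (not part of the diff): the draft rate-limit header names above can be read back from an Apache HttpClient response, a dependency that dhp-common declares further down in this commit. The helper class is hypothetical.

import org.apache.http.Header;
import org.apache.http.HttpResponse;

import eu.dnetlib.dhp.common.Constants;

public class RateLimitSketch {

    // Returns the remaining request budget advertised by the server (e.g. Zenodo),
    // or null when the header is absent.
    static Long remainingRequests(HttpResponse response) {
        Header h = response.getFirstHeader(Constants.HTTPHEADER_IETF_DRAFT_RATELIMIT_REMAINING);
        return h == null ? null : Long.valueOf(h.getValue());
    }
}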

@ -14,7 +14,7 @@ public class DbClient implements Closeable {
private static final Log log = LogFactory.getLog(DbClient.class);
private Connection connection;
private final Connection connection;
public DbClient(final String address, final String login, final String password) {

@ -28,7 +28,7 @@ public class HdfsSupport {
* @param configuration Configuration of hadoop env
*/
public static boolean exists(String path, Configuration configuration) {
logger.info("Removing path: {}", path);
logger.info("Checking existence for path: {}", path);
return rethrowAsRuntimeException(
() -> {
Path f = new Path(path);
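
A minimal usage sketch of the method this corrected log line belongs to, assuming HdfsSupport sits in eu.dnetlib.dhp.common alongside the other utilities in this commit; name node and path are hypothetical.

import org.apache.hadoop.conf.Configuration;

import eu.dnetlib.dhp.common.HdfsSupport;

public class HdfsSupportSketch {

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://namenode:8020"); // hypothetical name node
        // true when the path exists on the configured file system
        System.out.println(HdfsSupport.exists("/data/graph/publication", conf));
    }
}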

@ -0,0 +1,172 @@
package eu.dnetlib.dhp.common;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class MakeTarArchive implements Serializable {
private static final Logger log = LoggerFactory.getLogger(MakeTarArchive.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
MakeTarArchive.class
.getResourceAsStream(
"/eu/dnetlib/dhp/common/input_maketar_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String outputPath = parser.get("hdfsPath");
log.info("hdfsPath: {}", outputPath);
final String hdfsNameNode = parser.get("nameNode");
log.info("nameNode: {}", hdfsNameNode);
final String inputPath = parser.get("sourcePath");
log.info("input path : {}", inputPath);
final int gBperSplit = Optional
.ofNullable(parser.get("splitSize"))
.map(Integer::valueOf)
.orElse(10);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
makeTArArchive(fileSystem, inputPath, outputPath, gBperSplit);
}
public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath, int gBperSplit)
throws IOException {
RemoteIterator<LocatedFileStatus> dirIterator = fileSystem.listLocatedStatus(new Path(inputPath));
while (dirIterator.hasNext()) {
LocatedFileStatus fileStatus = dirIterator.next();
Path p = fileStatus.getPath();
String pathString = p.toString();
String entity = pathString.substring(pathString.lastIndexOf("/") + 1);
MakeTarArchive.tarMaxSize(fileSystem, pathString, outputPath + "/" + entity, entity, gBperSplit);
}
}
private static TarArchiveOutputStream getTar(FileSystem fileSystem, String outputPath) throws IOException {
Path hdfsWritePath = new Path(outputPath);
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, true);
}
return new TarArchiveOutputStream(fileSystem.create(hdfsWritePath).getWrappedStream());
}
private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dirName)
throws IOException {
Path hdfsWritePath = new Path(outputPath);
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, true);
}
try (TarArchiveOutputStream ar = new TarArchiveOutputStream(
fileSystem.create(hdfsWritePath).getWrappedStream())) {
RemoteIterator<LocatedFileStatus> iterator = fileSystem
.listFiles(
new Path(inputPath), true);
while (iterator.hasNext()) {
writeCurrentFile(fileSystem, dirName, iterator, ar, 0);
}
}
}
public static void tarMaxSize(FileSystem fileSystem, String inputPath, String outputPath, String dir_name,
int gBperSplit) throws IOException {
final long bytesPerSplit = 1024L * 1024L * 1024L * gBperSplit;
long sourceSize = fileSystem.getContentSummary(new Path(inputPath)).getSpaceConsumed();
if (sourceSize < bytesPerSplit) {
write(fileSystem, inputPath, outputPath + ".tar", dir_name);
} else {
int partNum = 0;
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
.listFiles(
new Path(inputPath), true);
boolean next = fileStatusListIterator.hasNext();
while (next) {
try (TarArchiveOutputStream ar = getTar(fileSystem, outputPath + "_" + (partNum + 1) + ".tar")) {
long currentSize = 0;
while (next && currentSize < bytesPerSplit) {
currentSize = writeCurrentFile(fileSystem, dir_name, fileStatusListIterator, ar, currentSize);
next = fileStatusListIterator.hasNext();
}
partNum += 1;
}
}
}
}
private static long writeCurrentFile(FileSystem fileSystem, String dirName,
RemoteIterator<LocatedFileStatus> fileStatusListIterator,
TarArchiveOutputStream ar, long currentSize) throws IOException {
LocatedFileStatus fileStatus = fileStatusListIterator.next();
Path p = fileStatus.getPath();
String pString = p.toString();
if (!pString.endsWith("_SUCCESS")) {
String name = pString.substring(pString.lastIndexOf("/") + 1);
if (name.startsWith("part-") & name.length() > 10) {
String tmp = name.substring(0, 10);
if (name.contains(".")) {
tmp += name.substring(name.indexOf("."));
}
name = tmp;
}
TarArchiveEntry entry = new TarArchiveEntry(dirName + "/" + name);
entry.setSize(fileStatus.getLen());
currentSize += fileStatus.getLen();
ar.putArchiveEntry(entry);
InputStream is = fileSystem.open(fileStatus.getPath());
BufferedInputStream bis = new BufferedInputStream(is);
int count;
byte[] data = new byte[1024];
while ((count = bis.read(data, 0, data.length)) != -1) {
ar.write(data, 0, count);
}
bis.close();
ar.closeArchiveEntry();
}
return currentSize;
}
}
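
A usage sketch for the archiver above, with hypothetical paths and name node: every directory under the input path becomes <output>/<dirname>.tar, and anything larger than the given split size (10 GB here) is broken into numbered parts.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import eu.dnetlib.dhp.common.MakeTarArchive;

public class MakeTarSketch {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://namenode:8020"); // hypothetical name node
        FileSystem fs = FileSystem.get(conf);
        // hypothetical input/output paths; 10 is the split size in GB
        MakeTarArchive.makeTArArchive(fs, "/data/graph", "/data/archives", 10);
    }
}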

@ -0,0 +1,122 @@
package eu.dnetlib.dhp.common;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
import org.bson.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Iterables;
import com.mongodb.BasicDBObject;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.QueryBuilder;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
public class MdstoreClient implements Closeable {
private static final Logger log = LoggerFactory.getLogger(MdstoreClient.class);
private final MongoClient client;
private final MongoDatabase db;
private static final String COLL_METADATA = "metadata";
private static final String COLL_METADATA_MANAGER = "metadataManager";
public MdstoreClient(final String baseUrl, final String dbName) {
this.client = new MongoClient(new MongoClientURI(baseUrl));
this.db = getDb(client, dbName);
}
public MongoCollection<Document> mdStore(final String mdId) {
BasicDBObject query = (BasicDBObject) QueryBuilder.start("mdId").is(mdId).get();
log.info("querying current mdId: {}", query.toJson());
final String currentId = Optional
.ofNullable(getColl(db, COLL_METADATA_MANAGER, true).find(query))
.map(FindIterable::first)
.map(d -> d.getString("currentId"))
.orElseThrow(() -> new IllegalArgumentException("cannot find current mdstore id for: " + mdId));
log.info("currentId: {}", currentId);
return getColl(db, currentId, true);
}
public Map<String, String> validCollections(
final String mdFormat, final String mdLayout, final String mdInterpretation) {
final Map<String, String> transactions = new HashMap<>();
for (final Document entry : getColl(db, COLL_METADATA_MANAGER, true).find()) {
final String mdId = entry.getString("mdId");
final String currentId = entry.getString("currentId");
if (StringUtils.isNoneBlank(mdId, currentId)) {
transactions.put(mdId, currentId);
}
}
final Map<String, String> res = new HashMap<>();
for (final Document entry : getColl(db, COLL_METADATA, true).find()) {
if (entry.getString("format").equals(mdFormat)
&& entry.getString("layout").equals(mdLayout)
&& entry.getString("interpretation").equals(mdInterpretation)
&& transactions.containsKey(entry.getString("mdId"))) {
res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId")));
}
}
return res;
}
private MongoDatabase getDb(final MongoClient client, final String dbName) {
if (!Iterables.contains(client.listDatabaseNames(), dbName)) {
final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress());
log.warn(err);
throw new IllegalArgumentException(err);
}
return client.getDatabase(dbName);
}
private MongoCollection<Document> getColl(
final MongoDatabase db, final String collName, final boolean abortIfMissing) {
if (!Iterables.contains(db.listCollectionNames(), collName)) {
final String err = String
.format(
String.format("Missing collection '%s' in database '%s'", collName, db.getName()));
log.warn(err);
if (abortIfMissing) {
throw new IllegalArgumentException(err);
} else {
return null;
}
}
return db.getCollection(collName);
}
public Iterable<String> listRecords(final String collName) {
final MongoCollection<Document> coll = getColl(db, collName, false);
return coll == null
? new ArrayList<>()
: () -> StreamSupport
.stream(coll.find().spliterator(), false)
.filter(e -> e.containsKey("body"))
.map(e -> e.getString("body"))
.iterator();
}
@Override
public void close() throws IOException {
client.close();
}
}
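
A usage sketch with a hypothetical MongoDB URL, database name and format/layout/interpretation triple: validCollections() maps each mdId to its current collection, and listRecords() streams the record bodies out of it.

import eu.dnetlib.dhp.common.MdstoreClient;

public class MdstoreClientSketch {

    public static void main(String[] args) throws Exception {
        try (MdstoreClient client = new MdstoreClient("mongodb://localhost:27017", "mdstore")) {
            client
                .validCollections("oai_dc", "store", "cleaned") // hypothetical triple
                .forEach((mdId, currentColl) -> {
                    for (String body : client.listRecords(currentColl)) {
                        System.out.println(mdId + " -> " + body.length() + " chars");
                    }
                });
        }
    }
}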

@ -1,18 +1,18 @@
package eu.dnetlib.dhp.common;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.text.WordUtils;
import com.ctc.wstx.dtd.LargePrefixedNameSet;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.hash.Hashing;
@ -24,13 +24,24 @@ import com.google.common.hash.Hashing;
*/
public class PacePerson {
private static final String UTF8 = "UTF-8";
private List<String> name = Lists.newArrayList();
private List<String> surname = Lists.newArrayList();
private List<String> fullname = Lists.newArrayList();
private final String original;
private static Set<String> particles = null;
private static Set<String> particles;
static {
try {
particles = new HashSet<>(IOUtils
.readLines(
PacePerson.class
.getResourceAsStream(
"/eu/dnetlib/dhp/common/name_particles.txt")));
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/**
* Capitalizes a string
@ -38,29 +49,20 @@ public class PacePerson {
* @param s the string to capitalize
* @return the input string with capital letter
*/
public static final String capitalize(final String s) {
public static String capitalize(final String s) {
if (particles.contains(s)) {
return s;
}
return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
}
/**
* Adds a dot to a string with length equals to 1
*/
public static final String dotAbbreviations(final String s) {
public static String dotAbbreviations(final String s) {
return s.length() == 1 ? s + "." : s;
}
public static Set<String> loadFromClasspath(final String classpath) {
final Set<String> h = new HashSet<>();
try {
for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) {
h.add(s);
}
} catch (final Throwable e) {
return new HashSet<>();
}
return h;
}
/**
* The constructor of the class. It fills the fields of the class basing on the input fullname.
*
@ -129,10 +131,6 @@ public class PacePerson {
}
private List<String> splitTerms(final String s) {
if (particles == null) {
particles = loadFromClasspath("/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt");
}
final List<String> list = Lists.newArrayList();
for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) {
if (!particles.contains(part.toLowerCase())) {
@ -188,17 +186,36 @@ public class PacePerson {
}
public List<String> getCapitalFirstnames() {
return Lists
.newArrayList(
Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize));
return Optional
.ofNullable(getNameWithAbbreviations())
.map(
name -> name
.stream()
.map(PacePerson::capitalize)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
}
public List<String> getCapitalSurname() {
return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize));
return Optional
.ofNullable(getSurname())
.map(
surname -> surname
.stream()
.map(PacePerson::capitalize)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
}
public List<String> getNameWithAbbreviations() {
return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations));
return Optional
.ofNullable(getName())
.map(
name -> name
.stream()
.map(PacePerson::dotAbbreviations)
.collect(Collectors.toList()))
.orElse(new ArrayList<>());
}
public boolean isAccurate() {
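
The two static helpers shown in this hunk can be exercised directly; a small sketch, with the expected outputs implied by the code above noted in comments:

import eu.dnetlib.dhp.common.PacePerson;

public class PacePersonSketch {

    public static void main(String[] args) {
        System.out.println(PacePerson.capitalize("claudio"));    // Claudio
        System.out.println(PacePerson.dotAbbreviations("c"));    // c.
        System.out.println(PacePerson.dotAbbreviations("name")); // name (unchanged)
    }
}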

@ -0,0 +1,45 @@
package eu.dnetlib.dhp.common.aggregation;
import java.io.Closeable;
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.message.MessageSender;
import eu.dnetlib.dhp.utils.DHPUtils;
public class AggregatorReport extends LinkedHashMap<String, String> implements Closeable {
private static final Logger log = LoggerFactory.getLogger(AggregatorReport.class);
private transient MessageSender messageSender;
public AggregatorReport() {
}
public AggregatorReport(MessageSender messageSender) {
this.messageSender = messageSender;
}
public void ongoing(Long current, Long total) {
messageSender.sendMessage(current, total);
}
@Override
public void close() throws IOException {
if (Objects.nonNull(messageSender)) {
log.info("closing report: ");
this.forEach((k, v) -> log.info("{} - {}", k, v));
Map<String, String> m = new HashMap<>();
m.put(getClass().getSimpleName().toLowerCase(), DHPUtils.MAPPER.writeValueAsString(values()));
messageSender.sendReport(m);
}
}
}
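
A usage sketch: the report is a LinkedHashMap<String, String>, so counters are simply put into it; with the no-arg constructor close() is a no-op, while a configured MessageSender would receive the entries as a JSON report.

import eu.dnetlib.dhp.common.aggregation.AggregatorReport;

public class AggregatorReportSketch {

    public static void main(String[] args) throws Exception {
        // no MessageSender here, so close() does nothing; with a sender it would
        // log each entry and forward the serialized report
        try (AggregatorReport report = new AggregatorReport()) {
            report.put("TotalItems", "1000");  // hypothetical counters
            report.put("InvalidRecords", "3");
        }
    }
}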

@ -13,9 +13,9 @@ import okio.Source;
public class InputStreamRequestBody extends RequestBody {
private InputStream inputStream;
private MediaType mediaType;
private long lenght;
private final InputStream inputStream;
private final MediaType mediaType;
private final long lenght;
public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {

@ -3,6 +3,10 @@ package eu.dnetlib.dhp.common.api;
import java.io.*;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
import org.apache.http.HttpHeaders;
import org.apache.http.entity.ContentType;
import com.google.gson.Gson;
@ -42,7 +46,7 @@ public class ZenodoAPIClient implements Serializable {
this.deposition_id = deposition_id;
}
public ZenodoAPIClient(String urlString, String access_token) throws IOException {
public ZenodoAPIClient(String urlString, String access_token) {
this.urlString = urlString;
this.access_token = access_token;
@ -50,19 +54,20 @@ public class ZenodoAPIClient implements Serializable {
/**
* Brand new deposition in Zenodo. It sets the deposition_id and the bucket where to store the files to upload
*
* @return response code
* @throws IOException
*/
public int newDeposition() throws IOException {
String json = "{}";
OkHttpClient httpClient = new OkHttpClient();
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(MEDIA_TYPE_JSON, json);
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString)
.addHeader("Content-Type", "application/json") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.post(body)
.build();
@ -86,18 +91,23 @@ public class ZenodoAPIClient implements Serializable {
/**
* Upload files in Zenodo.
*
* @param is the inputStream for the file to upload
* @param file_name the name of the file as it will appear on Zenodo
* @param len the size of the file
* @return the response code
*/
public int uploadIS(InputStream is, String file_name, long len) throws IOException {
OkHttpClient httpClient = new OkHttpClient();
OkHttpClient httpClient = new OkHttpClient.Builder()
.writeTimeout(600, TimeUnit.SECONDS)
.readTimeout(600, TimeUnit.SECONDS)
.connectTimeout(600, TimeUnit.SECONDS)
.build();
Request request = new Request.Builder()
.url(bucket + "/" + file_name)
.addHeader("Content-Type", "application/zip") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
.addHeader(HttpHeaders.CONTENT_TYPE, "application/zip") // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.put(InputStreamRequestBody.create(MEDIA_TYPE_ZIP, is, len))
.build();
@ -110,20 +120,21 @@ public class ZenodoAPIClient implements Serializable {
/**
* Associates metadata information to the current deposition
*
* @param metadata the metadata
* @return response code
* @throws IOException
*/
public int sendMretadata(String metadata) throws IOException {
OkHttpClient httpClient = new OkHttpClient();
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(MEDIA_TYPE_JSON, metadata);
RequestBody body = RequestBody.create(metadata, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id)
.addHeader("Content-Type", "application/json") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.put(body)
.build();
@ -140,6 +151,7 @@ public class ZenodoAPIClient implements Serializable {
/**
* To publish the current deposition. It works for both new deposition or new version of an old deposition
*
* @return response code
* @throws IOException
*/
@ -147,12 +159,14 @@ public class ZenodoAPIClient implements Serializable {
String json = "{}";
OkHttpClient httpClient = new OkHttpClient();
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id + "/actions/publish")
.addHeader("Authorization", "Bearer " + access_token)
.post(RequestBody.create(MEDIA_TYPE_JSON, json))
.post(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
@ -166,25 +180,28 @@ public class ZenodoAPIClient implements Serializable {
}
/**
* To create a new version of an already published deposition.
* It sets the deposition_id and the bucket to be used for the new version.
* @param concept_rec_id the concept record id of the deposition for which to create a new version. It is
* the last part of the url for the DOI Zenodo suggests to use to cite all versions:
* DOI: 10.xxx/zenodo.656930 concept_rec_id = 656930
* To create a new version of an already published deposition. It sets the deposition_id and the bucket to be used
* for the new version.
*
* @param concept_rec_id the concept record id of the deposition for which to create a new version. It is the last
* part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930
* concept_rec_id = 656930
* @return response code
* @throws IOException
* @throws MissingConceptDoiException
*/
public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException {
setDepositionId(concept_rec_id);
setDepositionId(concept_rec_id, 1);
String json = "{}";
OkHttpClient httpClient = new OkHttpClient();
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON);
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id + "/actions/newversion")
.addHeader("Authorization", "Bearer " + access_token)
.post(RequestBody.create(MEDIA_TYPE_JSON, json))
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.post(body)
.build();
try (Response response = httpClient.newCall(request).execute()) {
@ -201,9 +218,45 @@ public class ZenodoAPIClient implements Serializable {
}
}
private void setDepositionId(String concept_rec_id) throws IOException, MissingConceptDoiException {
/**
* To finish uploading a version or new deposition not published
* It sets the deposition_id and the bucket to be used
*
*
* @param deposition_id the deposition id of the not yet published upload
* concept_rec_id = 656930
* @return response code
* @throws IOException
* @throws MissingConceptDoiException
*/
public int uploadOpenDeposition(String deposition_id) throws IOException, MissingConceptDoiException {
this.deposition_id = deposition_id;
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id)
.addHeader("Authorization", "Bearer " + access_token)
.build();
try (Response response = httpClient.newCall(request).execute()) {
if (!response.isSuccessful())
throw new IOException("Unexpected code " + response + response.body().string());
ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class);
bucket = zenodoModel.getLinks().getBucket();
return response.code();
}
}
private void setDepositionId(String concept_rec_id, Integer page) throws IOException, MissingConceptDoiException {
ZenodoModelList zenodoModelList = new Gson().fromJson(getPrevDepositions(), ZenodoModelList.class);
ZenodoModelList zenodoModelList = new Gson()
.fromJson(getPrevDepositions(String.valueOf(page)), ZenodoModelList.class);
for (ZenodoModel zm : zenodoModelList) {
if (zm.getConceptrecid().equals(concept_rec_id)) {
@ -211,18 +264,25 @@ public class ZenodoAPIClient implements Serializable {
return;
}
}
throw new MissingConceptDoiException("The concept record id specified was missing in the list of depositions");
if (zenodoModelList.size() == 0)
throw new MissingConceptDoiException(
"The concept record id specified was missing in the list of depositions");
setDepositionId(concept_rec_id, page + 1);
}
private String getPrevDepositions() throws IOException {
OkHttpClient httpClient = new OkHttpClient();
private String getPrevDepositions(String page) throws IOException {
OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build();
HttpUrl.Builder urlBuilder = HttpUrl.parse(urlString).newBuilder();
urlBuilder.addQueryParameter("page", page);
String url = urlBuilder.build().toString();
Request request = new Request.Builder()
.url(urlString)
.addHeader("Content-Type", "application/json") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
.url(url)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.get()
.build();
@ -238,12 +298,14 @@ public class ZenodoAPIClient implements Serializable {
}
private String getBucket(String url) throws IOException {
OkHttpClient httpClient = new OkHttpClient();
OkHttpClient httpClient = new OkHttpClient.Builder()
.connectTimeout(600, TimeUnit.SECONDS)
.build();
Request request = new Request.Builder()
.url(url)
.addHeader("Content-Type", "application/json") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.get()
.build();
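A minimal, hypothetical usage sketch of the methods touched above; the ZenodoAPIClient constructor arguments (depositions endpoint and access token) are assumed and are not part of this diff:
ZenodoAPIClient client = new ZenodoAPIClient(
	"https://zenodo.org/api/deposit/depositions", "ACCESS_TOKEN"); // assumed constructor
// clone the published deposition identified by its concept_rec_id into a new draft version
int created = client.newVersion("656930");
// or resume a draft that was created earlier but never published
int resumed = client.uploadOpenDeposition("1234567"); // placeholder deposition_id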

@ -32,13 +32,13 @@ public class Creator {
public static Creator newInstance(String name, String affiliation, String orcid) {
Creator c = new Creator();
if (!(name == null)) {
if (name != null) {
c.name = name;
}
if (!(affiliation == null)) {
if (affiliation != null) {
c.affiliation = affiliation;
}
if (!(orcid == null)) {
if (orcid != null) {
c.orcid = orcid;
}

@ -3,17 +3,12 @@ package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
import net.minidev.json.annotate.JsonIgnore;
public class File implements Serializable {
private String checksum;
private String filename;
private long filesize;
private String id;
@JsonIgnore
// private Links links;
public String getChecksum() {
return checksum;
}
@ -46,13 +41,4 @@ public class File implements Serializable {
this.id = id;
}
// @JsonIgnore
// public Links getLinks() {
// return links;
// }
//
// @JsonIgnore
// public void setLinks(Links links) {
// this.links = links;
// }
}

@ -0,0 +1,32 @@
package eu.dnetlib.dhp.common.collection;
public class CollectorException extends Exception {
/** */
private static final long serialVersionUID = -290723075076039757L;
public CollectorException() {
super();
}
public CollectorException(
final String message,
final Throwable cause,
final boolean enableSuppression,
final boolean writableStackTrace) {
super(message, cause, enableSuppression, writableStackTrace);
}
public CollectorException(final String message, final Throwable cause) {
super(message, cause);
}
public CollectorException(final String message) {
super(message);
}
public CollectorException(final Throwable cause) {
super(cause);
}
}

@ -0,0 +1,40 @@
package eu.dnetlib.dhp.common.collection;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
public class DecompressTarGz {
public static void doExtract(FileSystem fs, String outputPath, String tarGzPath) throws IOException {
FSDataInputStream inputFileStream = fs.open(new Path(tarGzPath));
try (TarArchiveInputStream tais = new TarArchiveInputStream(
new GzipCompressorInputStream(inputFileStream))) {
TarArchiveEntry entry = null;
while ((entry = tais.getNextTarEntry()) != null) {
if (!entry.isDirectory()) {
try (
FSDataOutputStream out = fs
.create(new Path(outputPath.concat(entry.getName()).concat(".gz")));
GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
IOUtils.copy(tais, gzipOs);
}
}
}
}
}
}
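A minimal usage sketch for the helper above; the HDFS paths are placeholders and the default Hadoop configuration is assumed:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
public class DecompressTarGzExample {
	public static void main(String[] args) throws Exception {
		FileSystem fs = FileSystem.get(new Configuration());
		// every entry of the archive is re-compressed as a standalone .gz file under the output path
		DecompressTarGz.doExtract(fs, "/tmp/extracted/", "/tmp/dump.tar.gz");
	}
}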

@ -0,0 +1,56 @@
package eu.dnetlib.dhp.common.collection;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.opencsv.bean.CsvToBeanBuilder;
public class GetCSV {
public static final char DEFAULT_DELIMITER = ',';
private GetCSV() {
}
public static void getCsv(FileSystem fileSystem, BufferedReader reader, String hdfsPath,
String modelClass) throws IOException, ClassNotFoundException {
getCsv(fileSystem, reader, hdfsPath, modelClass, DEFAULT_DELIMITER);
}
public static void getCsv(FileSystem fileSystem, Reader reader, String hdfsPath,
String modelClass, char delimiter) throws IOException, ClassNotFoundException {
Path hdfsWritePath = new Path(hdfsPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, false);
}
fsDataOutputStream = fileSystem.create(hdfsWritePath);
try (BufferedWriter writer = new BufferedWriter(
new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8))) {
final ObjectMapper mapper = new ObjectMapper();
@SuppressWarnings("unchecked")
final List lines = new CsvToBeanBuilder(reader)
.withType(Class.forName(modelClass))
.withSeparator(delimiter)
.build()
.parse();
for (Object line : lines) {
writer.write(mapper.writeValueAsString(line));
writer.newLine();
}
}
}
}
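A hypothetical invocation of the helper above; the source URL, target path and bean class are placeholders:
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
public class GetCSVExample {
	public static void main(String[] args) throws Exception {
		FileSystem fs = FileSystem.get(new Configuration());
		try (BufferedReader reader = new BufferedReader(
			new InputStreamReader(new URL("https://example.org/data.csv").openStream(), StandardCharsets.UTF_8))) {
			// each CSV row is bound to the (hypothetical) bean class and written to HDFS as one JSON object per line
			GetCSV.getCsv(fs, reader, "/tmp/data.json", "eu.dnetlib.dhp.example.MyCsvModel");
		}
	}
}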

@ -0,0 +1,94 @@
package eu.dnetlib.dhp.common.collection;
/**
* Bundles the http connection parameters driving the client behaviour.
*/
public class HttpClientParams {
// Defaults
public static int _maxNumberOfRetry = 3;
public static int _requestDelay = 0; // milliseconds
public static int _retryDelay = 10; // seconds
public static int _connectTimeOut = 10; // seconds
public static int _readTimeOut = 30; // seconds
/**
* Maximum number of allowed retries before failing
*/
private int maxNumberOfRetry;
/**
* Delay between requests (milliseconds)
*/
private int requestDelay;
/**
* Time to wait after a failure before retrying (Seconds)
*/
private int retryDelay;
/**
* Connect timeout (Seconds)
*/
private int connectTimeOut;
/**
* Read timeout (Seconds)
*/
private int readTimeOut;
public HttpClientParams() {
this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut);
}
public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut,
int readTimeOut) {
this.maxNumberOfRetry = maxNumberOfRetry;
this.requestDelay = requestDelay;
this.retryDelay = retryDelay;
this.connectTimeOut = connectTimeOut;
this.readTimeOut = readTimeOut;
}
public int getMaxNumberOfRetry() {
return maxNumberOfRetry;
}
public void setMaxNumberOfRetry(int maxNumberOfRetry) {
this.maxNumberOfRetry = maxNumberOfRetry;
}
public int getRequestDelay() {
return requestDelay;
}
public void setRequestDelay(int requestDelay) {
this.requestDelay = requestDelay;
}
public int getRetryDelay() {
return retryDelay;
}
public void setRetryDelay(int retryDelay) {
this.retryDelay = retryDelay;
}
public void setConnectTimeOut(int connectTimeOut) {
this.connectTimeOut = connectTimeOut;
}
public int getConnectTimeOut() {
return connectTimeOut;
}
public int getReadTimeOut() {
return readTimeOut;
}
public void setReadTimeOut(int readTimeOut) {
this.readTimeOut = readTimeOut;
}
}
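A short sketch showing how the defaults above can be overridden before handing the parameters to a client:
HttpClientParams params = new HttpClientParams();
params.setMaxNumberOfRetry(5);
params.setRequestDelay(100); // milliseconds between consecutive requests
params.setRetryDelay(20); // seconds to wait after a failure
params.setConnectTimeOut(30); // seconds
params.setReadTimeOut(60); // seconds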

@ -0,0 +1,263 @@
package eu.dnetlib.dhp.common.collection;
import static eu.dnetlib.dhp.utils.DHPUtils.*;
import java.io.IOException;
import java.io.InputStream;
import java.net.*;
import java.util.List;
import java.util.Map;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.http.HttpHeaders;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.common.Constants;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
/**
* Migrated from https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpConnector.java
*
* @author jochen, michele, andrea, alessia, claudio, andreas
*/
public class HttpConnector2 {
private static final Logger log = LoggerFactory.getLogger(HttpConnector2.class);
private static final String REPORT_PREFIX = "http:";
private HttpClientParams clientParams;
private String responseType = null;
private static final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
public HttpConnector2() {
this(new HttpClientParams());
}
public HttpConnector2(HttpClientParams clientParams) {
this.clientParams = clientParams;
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
}
/**
* @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport)
*/
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorException {
return IOUtils.toInputStream(getInputSource(requestUrl));
}
/**
* @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport)
*/
public String getInputSource(final String requestUrl) throws CollectorException {
return attemptDownloadAsString(requestUrl, 1, new AggregatorReport());
}
/**
* Given the URL, returns its content via HTTP GET
*
* @param requestUrl the URL
* @param report the list of errors
* @return the content of the downloaded resource
* @throws CollectorException when retrying more than maxNumberOfRetry times
*/
public String getInputSource(final String requestUrl, AggregatorReport report)
throws CollectorException {
return attemptDownloadAsString(requestUrl, 1, report);
}
private String attemptDownloadAsString(final String requestUrl, final int retryNumber,
final AggregatorReport report) throws CollectorException {
try (InputStream s = attemptDownload(requestUrl, retryNumber, report)) {
return IOUtils.toString(s);
} catch (IOException e) {
log.error(e.getMessage(), e);
throw new CollectorException(e);
}
}
private InputStream attemptDownload(final String requestUrl, final int retryNumber,
final AggregatorReport report) throws CollectorException, IOException {
if (retryNumber > getClientParams().getMaxNumberOfRetry()) {
final String msg = String
.format(
"Max number of retries (%s/%s) exceeded, failing.",
retryNumber, getClientParams().getMaxNumberOfRetry());
log.error(msg);
throw new CollectorException(msg);
}
log.info("Request attempt {} [{}]", retryNumber, requestUrl);
InputStream input = null;
try {
if (getClientParams().getRequestDelay() > 0) {
backoffAndSleep(getClientParams().getRequestDelay());
}
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
urlConn.setInstanceFollowRedirects(false);
urlConn.setReadTimeout(getClientParams().getReadTimeOut() * 1000);
urlConn.setConnectTimeout(getClientParams().getConnectTimeOut() * 1000);
urlConn.addRequestProperty(HttpHeaders.USER_AGENT, userAgent);
if (log.isDebugEnabled()) {
logHeaderFields(urlConn);
}
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
String rateLimit = urlConn.getHeaderField(Constants.HTTPHEADER_IETF_DRAFT_RATELIMIT_LIMIT);
String rateRemaining = urlConn.getHeaderField(Constants.HTTPHEADER_IETF_DRAFT_RATELIMIT_REMAINING);
if ((rateLimit != null) && (rateRemaining != null) && (Integer.parseInt(rateRemaining) < 2)) {
if (retryAfter > 0) {
backoffAndSleep(retryAfter);
} else {
backoffAndSleep(1000);
}
}
if (is2xx(urlConn.getResponseCode())) {
input = urlConn.getInputStream();
responseType = urlConn.getContentType();
return input;
}
if (is3xx(urlConn.getResponseCode())) {
// REDIRECTS
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
log.info("The requested url has been moved to {}", newUrl);
report
.put(
REPORT_PREFIX + urlConn.getResponseCode(),
String.format("Moved to: %s", newUrl));
urlConn.disconnect();
if (retryAfter > 0) {
backoffAndSleep(retryAfter);
}
return attemptDownload(newUrl, retryNumber + 1, report);
}
if (is4xx(urlConn.getResponseCode()) || is5xx(urlConn.getResponseCode())) {
switch (urlConn.getResponseCode()) {
case HttpURLConnection.HTTP_NOT_FOUND:
case HttpURLConnection.HTTP_BAD_GATEWAY:
case HttpURLConnection.HTTP_UNAVAILABLE:
case HttpURLConnection.HTTP_GATEWAY_TIMEOUT:
if (retryAfter > 0) {
log
.warn(
"{} - waiting and repeating request after suggested retry-after {} sec.",
requestUrl, retryAfter);
backoffAndSleep(retryAfter * 1000);
} else {
log
.warn(
"{} - waiting and repeating request after default delay of {} sec.",
requestUrl, getClientParams().getRetryDelay());
backoffAndSleep(retryNumber * getClientParams().getRetryDelay() * 1000);
}
report.put(REPORT_PREFIX + urlConn.getResponseCode(), requestUrl);
urlConn.disconnect();
return attemptDownload(requestUrl, retryNumber + 1, report);
default:
report
.put(
REPORT_PREFIX + urlConn.getResponseCode(),
String
.format(
"%s Error: %s", requestUrl, urlConn.getResponseMessage()));
throw new CollectorException(urlConn.getResponseCode() + " error " + report);
}
}
throw new CollectorException(
String
.format(
"Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
MAPPER.writeValueAsString(report)));
} catch (MalformedURLException | UnknownHostException e) {
log.error(e.getMessage(), e);
report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException(e.getMessage(), e);
} catch (SocketTimeoutException | SocketException e) {
log.error(e.getMessage(), e);
report.put(e.getClass().getName(), e.getMessage());
backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);
return attemptDownload(requestUrl, retryNumber + 1, report);
}
}
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
log.debug("StatusCode: {}", urlConn.getResponseMessage());
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
if (e.getKey() != null) {
for (String v : e.getValue()) {
log.debug(" key: {} - value: {}", e.getKey(), v);
}
}
}
}
private void backoffAndSleep(int sleepTimeMs) throws CollectorException {
log.info("I'm going to sleep for {}ms", sleepTimeMs);
try {
Thread.sleep(sleepTimeMs);
} catch (InterruptedException e) {
log.error(e.getMessage(), e);
throw new CollectorException(e);
}
}
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
for (String key : headerMap.keySet()) {
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.RETRY_AFTER) && (!headerMap.get(key).isEmpty())
&& NumberUtils.isCreatable(headerMap.get(key).get(0))) {
return Integer.parseInt(headerMap.get(key).get(0)) + 10;
}
}
return -1;
}
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorException {
for (String key : headerMap.keySet()) {
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.LOCATION) && (headerMap.get(key).size() > 0)) {
return headerMap.get(key).get(0);
}
}
throw new CollectorException("The requested url has been MOVED, but 'location' param is MISSING");
}
private boolean is2xx(final int statusCode) {
return statusCode >= 200 && statusCode <= 299;
}
private boolean is4xx(final int statusCode) {
return statusCode >= 400 && statusCode <= 499;
}
private boolean is3xx(final int statusCode) {
return statusCode >= 300 && statusCode <= 399;
}
private boolean is5xx(final int statusCode) {
return statusCode >= 500 && statusCode <= 599;
}
public String getResponseType() {
return responseType;
}
public HttpClientParams getClientParams() {
return clientParams;
}
public void setClientParams(HttpClientParams clientParams) {
this.clientParams = clientParams;
}
}
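A minimal sketch tying the connector to the parameters above; the URL is a placeholder:
HttpClientParams params = new HttpClientParams();
params.setMaxNumberOfRetry(5);
HttpConnector2 connector = new HttpConnector2(params);
// retries with backoff on 404/502/503/504 and on socket errors,
// throws CollectorException once maxNumberOfRetry is exceeded
String body = connector.getInputSource("https://example.org/oai?verb=Identify");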

@ -0,0 +1,75 @@
package eu.dnetlib.dhp.common.rest;
import java.io.IOException;
import java.util.Arrays;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
public class DNetRestClient {
private static final Logger log = LoggerFactory.getLogger(DNetRestClient.class);
private static final ObjectMapper mapper = new ObjectMapper();
private DNetRestClient() {
}
public static <T> T doGET(final String url, Class<T> clazz) throws Exception {
final HttpGet httpGet = new HttpGet(url);
return doHTTPRequest(httpGet, clazz);
}
public static String doGET(final String url) throws IOException {
final HttpGet httpGet = new HttpGet(url);
return doHTTPRequest(httpGet);
}
public static <V> String doPOST(final String url, V objParam) throws IOException {
final HttpPost httpPost = new HttpPost(url);
if (objParam != null) {
final StringEntity entity = new StringEntity(mapper.writeValueAsString(objParam));
httpPost.setEntity(entity);
httpPost.setHeader("Accept", "application/json");
httpPost.setHeader("Content-type", "application/json");
}
return doHTTPRequest(httpPost);
}
public static <T, V> T doPOST(final String url, V objParam, Class<T> clazz) throws IOException {
return mapper.readValue(doPOST(url, objParam), clazz);
}
private static String doHTTPRequest(final HttpUriRequest r) throws IOException {
try (CloseableHttpClient client = HttpClients.createDefault()) {
log.info("performing HTTP request, method {} on URI {}", r.getMethod(), r.getURI().toString());
log
.info(
"request headers: {}",
Arrays
.asList(r.getAllHeaders())
.stream()
.map(h -> h.getName() + ":" + h.getValue())
.collect(Collectors.joining(",")));
return IOUtils.toString(client.execute(r).getEntity().getContent());
}
}
private static <T> T doHTTPRequest(final HttpUriRequest r, Class<T> clazz) throws Exception {
return mapper.readValue(doHTTPRequest(r), clazz);
}
}
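A minimal usage sketch; the endpoints are placeholders and MyDto stands for any Jackson-serializable bean:
// plain GET returning the raw response body
String raw = DNetRestClient.doGET("https://services.example.org/api/status");
// GET and POST with JSON (de)serialization onto a hypothetical DTO
MyDto item = DNetRestClient.doGET("https://services.example.org/api/items/1", MyDto.class);
String created = DNetRestClient.doPOST("https://services.example.org/api/items", item);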

@ -0,0 +1,92 @@
package eu.dnetlib.dhp.common.vocabulary;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class Vocabulary implements Serializable {
private final String id;
private final String name;
/**
* Code to Term mappings for this Vocabulary.
*/
private final Map<String, VocabularyTerm> terms = new HashMap<>();
/**
* Synonym to Code mappings for this Vocabulary.
*/
private final Map<String, String> synonyms = Maps.newHashMap();
public Vocabulary(final String id, final String name) {
this.id = id;
this.name = name;
}
public String getId() {
return id;
}
public String getName() {
return name;
}
protected Map<String, VocabularyTerm> getTerms() {
return terms;
}
public VocabularyTerm getTerm(final String id) {
return Optional.ofNullable(id).map(String::toLowerCase).map(terms::get).orElse(null);
}
protected void addTerm(final String id, final String name) {
terms.put(id.toLowerCase(), new VocabularyTerm(id, name));
}
protected boolean termExists(final String id) {
return terms.containsKey(id.toLowerCase());
}
protected void addSynonym(final String syn, final String termCode) {
synonyms.put(syn, termCode.toLowerCase());
}
public VocabularyTerm getTermBySynonym(final String syn) {
return getTerm(synonyms.get(syn.toLowerCase()));
}
public Qualifier getTermAsQualifier(final String termId) {
if (StringUtils.isBlank(termId)) {
return OafMapperUtils.unknown(getId(), getName());
} else if (termExists(termId)) {
final VocabularyTerm t = getTerm(termId);
return OafMapperUtils.qualifier(t.getId(), t.getName(), getId(), getName());
} else {
return OafMapperUtils.qualifier(termId, termId, getId(), getName());
}
}
public Qualifier getSynonymAsQualifier(final String syn) {
return Optional
.ofNullable(getTermBySynonym(syn))
.map(term -> getTermAsQualifier(term.getId()))
.orElse(null);
}
public Qualifier lookup(String id) {
return Optional
.ofNullable(getSynonymAsQualifier(id))
.orElse(getTermAsQualifier(id));
}
}
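A sketch of how terms, synonyms and qualifier lookups interact; the identifiers are invented, and the snippet assumes same-package access since addTerm/addSynonym are protected:
Vocabulary languages = new Vocabulary("dnet:languages", "Languages");
languages.addTerm("eng", "English");
languages.addSynonym("en", "eng");
Qualifier q = languages.lookup("en"); // resolves the synonym first, then falls back to the term id
// q.getClassid() -> "eng"
// q.getClassname() -> "English"
// q.getSchemeid() -> "dnet:languages"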

@ -0,0 +1,189 @@
package eu.dnetlib.dhp.common.vocabulary;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class VocabularyGroup implements Serializable {
public static final String VOCABULARIES_XQUERY = "for $x in collection('/db/DRIVER/VocabularyDSResources/VocabularyDSResourceType') \n"
+
"let $vocid := $x//VOCABULARY_NAME/@code\n" +
"let $vocname := $x//VOCABULARY_NAME/text()\n" +
"for $term in ($x//TERM)\n" +
"return concat($vocid,' @=@ ',$vocname,' @=@ ',$term/@code,' @=@ ',$term/@english_name)";
public static final String VOCABULARY_SYNONYMS_XQUERY = "for $x in collection('/db/DRIVER/VocabularyDSResources/VocabularyDSResourceType')\n"
+
"let $vocid := $x//VOCABULARY_NAME/@code\n" +
"let $vocname := $x//VOCABULARY_NAME/text()\n" +
"for $term in ($x//TERM)\n" +
"for $syn in ($term//SYNONYM/@term)\n" +
"return concat($vocid,' @=@ ',$term/@code,' @=@ ', $syn)\n";
public static VocabularyGroup loadVocsFromIS(ISLookUpService isLookUpService) throws ISLookUpException {
final VocabularyGroup vocs = new VocabularyGroup();
for (final String s : isLookUpService.quickSearchProfile(VOCABULARIES_XQUERY)) {
final String[] arr = s.split("@=@");
if (arr.length == 4) {
final String vocId = arr[0].trim();
final String vocName = arr[1].trim();
final String termId = arr[2].trim();
final String termName = arr[3].trim();
if (!vocs.vocabularyExists(vocId)) {
vocs.addVocabulary(vocId, vocName);
}
vocs.addTerm(vocId, termId, termName);
}
}
for (final String s : isLookUpService.quickSearchProfile(VOCABULARY_SYNONYMS_XQUERY)) {
final String[] arr = s.split("@=@");
if (arr.length == 3) {
final String vocId = arr[0].trim();
final String termId = arr[1].trim();
final String syn = arr[2].trim();
vocs.addSynonyms(vocId, termId, syn);
}
}
// add the term names as synonyms
vocs.vocs.values().forEach(voc -> {
voc.getTerms().values().forEach(term -> {
voc.addSynonym(term.getName().toLowerCase(), term.getId());
});
});
return vocs;
}
private final Map<String, Vocabulary> vocs = new HashMap<>();
public Set<String> vocabularyNames() {
return vocs.keySet();
}
public void addVocabulary(final String id, final String name) {
vocs.put(id.toLowerCase(), new Vocabulary(id, name));
}
public Optional<Vocabulary> find(final String vocId) {
return Optional
.ofNullable(vocId)
.map(String::toLowerCase)
.map(vocs::get);
}
public void addTerm(final String vocId, final String id, final String name) {
if (vocabularyExists(vocId)) {
vocs.get(vocId.toLowerCase()).addTerm(id, name);
}
}
public VocabularyTerm getTerm(final String vocId, final String id) {
if (termExists(vocId, id)) {
return vocs.get(vocId.toLowerCase()).getTerm(id);
} else {
return new VocabularyTerm(id, id);
}
}
public Set<String> getTerms(String vocId) {
if (!vocabularyExists(vocId)) {
return new HashSet<>();
}
return vocs
.get(vocId.toLowerCase())
.getTerms()
.values()
.stream()
.map(VocabularyTerm::getId)
.collect(Collectors.toCollection(HashSet::new));
}
public Qualifier lookup(String vocId, String id) {
return Optional
.ofNullable(getSynonymAsQualifier(vocId, id))
.orElse(getTermAsQualifier(vocId, id));
}
public Qualifier getTermAsQualifier(final String vocId, final String id) {
if (vocabularyExists(vocId)) {
return vocs.get(vocId.toLowerCase()).getTermAsQualifier(id);
}
return OafMapperUtils.qualifier(id, id, "", "");
}
public Qualifier getSynonymAsQualifier(final String vocId, final String syn) {
if (StringUtils.isBlank(vocId)) {
return OafMapperUtils.unknown("", "");
}
return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn);
}
/**
* getSynonymAsQualifierCaseSensitive
*
* Reflects the case sensitive vocabulary lookup: the vocabulary id is used as-is, without lowercasing
*/
public Qualifier getSynonymAsQualifierCaseSensitive(final String vocId, final String syn) {
if (StringUtils.isBlank(vocId)) {
return OafMapperUtils.unknown("", "");
}
return vocs.get(vocId).getSynonymAsQualifier(syn);
}
/**
* termExists
*
* two variants: with and without a case sensitive check
*/
public boolean termExists(final String vocId, final String id) {
return termExists(vocId, id, Boolean.FALSE);
}
public boolean termExists(final String vocId, final String id, final Boolean caseSensitive) {
if (Boolean.TRUE.equals(caseSensitive)) {
return vocabularyExists(vocId) && vocs.get(vocId).termExists(id);
}
return vocabularyExists(vocId) && vocs.get(vocId.toLowerCase()).termExists(id);
}
public boolean vocabularyExists(final String vocId) {
return Optional
.ofNullable(vocId)
.map(String::toLowerCase)
.map(vocs::containsKey)
.orElse(false);
}
private void addSynonyms(final String vocId, final String termId, final String syn) {
String id = Optional
.ofNullable(vocId)
.map(String::toLowerCase)
.orElseThrow(
() -> new IllegalArgumentException(
String
.format(
"empty vocabulary id for [term:%s, synonym:%s]", termId, syn)));
Optional
.ofNullable(vocs.get(id))
.orElseThrow(() -> new IllegalArgumentException("missing vocabulary id: " + vocId))
.addSynonym(syn.toLowerCase(), termId);
}
}
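A sketch of the typical lookup flow once the group has been loaded from the IS; the ISLookUpService instance, vocabulary and term ids are placeholders:
VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookUpService);
// synonyms are tried first, then term ids; unknown values fall back to a plain qualifier
Qualifier accessRight = vocs.lookup("dnet:access_modes", "OPEN");
boolean known = vocs.termExists("dnet:access_modes", "OPEN");
Set<String> terms = vocs.getTerms("dnet:access_modes");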

@ -0,0 +1,24 @@
package eu.dnetlib.dhp.common.vocabulary;
import java.io.Serializable;
public class VocabularyTerm implements Serializable {
private final String id;
private final String name;
public VocabularyTerm(final String id, final String name) {
this.id = id;
this.name = name;
}
public String getId() {
return id;
}
public String getName() {
return name;
}
}

@ -0,0 +1,63 @@
package eu.dnetlib.dhp.message;
import java.io.Serializable;
import java.util.LinkedHashMap;
import java.util.Map;
public class Message implements Serializable {
private static final long serialVersionUID = 401753881204524893L;
public static final String CURRENT_PARAM = "current";
public static final String TOTAL_PARAM = "total";
private MessageType messageType;
private String workflowId;
private Map<String, String> body;
public Message() {
}
public Message(final MessageType messageType, final String workflowId) {
this(messageType, workflowId, new LinkedHashMap<>());
}
public Message(final MessageType messageType, final String workflowId, final Map<String, String> body) {
this.messageType = messageType;
this.workflowId = workflowId;
this.body = body;
}
public MessageType getMessageType() {
return messageType;
}
public void setMessageType(MessageType messageType) {
this.messageType = messageType;
}
public String getWorkflowId() {
return workflowId;
}
public void setWorkflowId(final String workflowId) {
this.workflowId = workflowId;
}
public Map<String, String> getBody() {
return body;
}
public void setBody(final Map<String, String> body) {
this.body = body;
}
@Override
public String toString() {
return String.format("Message [type=%s, workflowId=%s, body=%s]", messageType, workflowId, body);
}
}

@ -0,0 +1,94 @@
package eu.dnetlib.dhp.message;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPut;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
public class MessageSender {
private static final Logger log = LoggerFactory.getLogger(MessageSender.class);
private static final int SOCKET_TIMEOUT_MS = 2000;
private static final int CONNECTION_REQUEST_TIMEOUT_MS = 2000;
private static final int CONNTECTION_TIMEOUT_MS = 2000;
private final ObjectMapper objectMapper = new ObjectMapper();
private final String dnetMessageEndpoint;
private final String workflowId;
private final ExecutorService executorService = Executors.newCachedThreadPool();
public MessageSender(final String dnetMessageEndpoint, final String workflowId) {
this.workflowId = workflowId;
this.dnetMessageEndpoint = dnetMessageEndpoint;
}
public void sendMessage(final Message message) {
executorService.submit(() -> _sendMessage(message));
}
public void sendMessage(final Long current, final Long total) {
sendMessage(createOngoingMessage(current, total));
}
public void sendReport(final Map<String, String> report) {
sendMessage(new Message(MessageType.REPORT, workflowId, report));
}
private Message createOngoingMessage(final Long current, final Long total) {
final Message m = new Message(MessageType.ONGOING, workflowId);
m.getBody().put(Message.CURRENT_PARAM, current.toString());
if (total != null) {
m.getBody().put(Message.TOTAL_PARAM, total.toString());
}
return m;
}
private void _sendMessage(final Message message) {
try {
final String json = objectMapper.writeValueAsString(message);
final HttpPut req = new HttpPut(dnetMessageEndpoint);
req.setEntity(new StringEntity(json, ContentType.APPLICATION_JSON));
final RequestConfig requestConfig = RequestConfig
.custom()
.setConnectTimeout(CONNTECTION_TIMEOUT_MS)
.setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MS)
.setSocketTimeout(SOCKET_TIMEOUT_MS)
.build();
try (final CloseableHttpClient client = HttpClients
.custom()
.setDefaultRequestConfig(requestConfig)
.build();
final CloseableHttpResponse response = client.execute(req)) {
log.debug("Sent Message to " + dnetMessageEndpoint);
log.debug("MESSAGE:" + message);
} catch (final Throwable e) {
log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e);
}
} catch (final JsonProcessingException e) {
log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e);
}
}
}
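A minimal sketch of how a workflow node reports progress with the classes above; endpoint and workflow id are placeholders:
MessageSender sender = new MessageSender("http://dnet.example.org/messages", "wf-1234");
// asynchronous ONGOING message carrying current/total counters
sender.sendMessage(500L, 10000L);
// final REPORT message with arbitrary key/value pairs
Map<String, String> report = new HashMap<>();
report.put("processedRecords", "10000");
sender.sendReport(report);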

@ -0,0 +1,21 @@
package eu.dnetlib.dhp.message;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.lang3.StringUtils;
public enum MessageType implements Serializable {
ONGOING, REPORT;
public MessageType from(String value) {
return Optional
.ofNullable(value)
.map(StringUtils::upperCase)
.map(MessageType::valueOf)
.orElseThrow(() -> new IllegalArgumentException("unknown message type: " + value));
}
}

@ -1,121 +0,0 @@
package eu.dnetlib.dhp.model.mdstore;
import java.io.Serializable;
import eu.dnetlib.dhp.utils.DHPUtils;
/** This class models a record inside the new Metadata store collection on HDFS * */
public class MetadataRecord implements Serializable {
/** The D-Net Identifier associated to the record */
private String id;
/** The original Identifier of the record */
private String originalId;
/** The encoding of the record, should be JSON or XML */
private String encoding;
/**
* The information about the provenance of the record see @{@link Provenance} for the model of this information
*/
private Provenance provenance;
/** The content of the metadata */
private String body;
/** the date when the record has been stored */
private long dateOfCollection;
/** the date when the record has been stored */
private long dateOfTransformation;
public MetadataRecord() {
this.dateOfCollection = System.currentTimeMillis();
}
public MetadataRecord(
String originalId,
String encoding,
Provenance provenance,
String body,
long dateOfCollection) {
this.originalId = originalId;
this.encoding = encoding;
this.provenance = provenance;
this.body = body;
this.dateOfCollection = dateOfCollection;
this.id = DHPUtils.generateIdentifier(originalId, this.provenance.getNsPrefix());
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getOriginalId() {
return originalId;
}
public void setOriginalId(String originalId) {
this.originalId = originalId;
}
public String getEncoding() {
return encoding;
}
public void setEncoding(String encoding) {
this.encoding = encoding;
}
public Provenance getProvenance() {
return provenance;
}
public void setProvenance(Provenance provenance) {
this.provenance = provenance;
}
public String getBody() {
return body;
}
public void setBody(String body) {
this.body = body;
}
public long getDateOfCollection() {
return dateOfCollection;
}
public void setDateOfCollection(long dateOfCollection) {
this.dateOfCollection = dateOfCollection;
}
public long getDateOfTransformation() {
return dateOfTransformation;
}
public void setDateOfTransformation(long dateOfTransformation) {
this.dateOfTransformation = dateOfTransformation;
}
@Override
public boolean equals(Object o) {
if (!(o instanceof MetadataRecord)) {
return false;
}
return ((MetadataRecord) o).getId().equalsIgnoreCase(id);
}
@Override
public int hashCode() {
return id.hashCode();
}
}

@ -1,52 +0,0 @@
package eu.dnetlib.dhp.model.mdstore;
import java.io.Serializable;
/**
* @author Sandro La Bruzzo
* <p>
* Provenance class models the provenance of the record in the metadataStore. It contains the identifier and the
* name of the datasource that provides the record
*/
public class Provenance implements Serializable {
private String datasourceId;
private String datasourceName;
private String nsPrefix;
public Provenance() {
}
public Provenance(String datasourceId, String datasourceName, String nsPrefix) {
this.datasourceId = datasourceId;
this.datasourceName = datasourceName;
this.nsPrefix = nsPrefix;
}
public String getDatasourceId() {
return datasourceId;
}
public void setDatasourceId(String datasourceId) {
this.datasourceId = datasourceId;
}
public String getDatasourceName() {
return datasourceName;
}
public void setDatasourceName(String datasourceName) {
this.datasourceName = datasourceName;
}
public String getNsPrefix() {
return nsPrefix;
}
public void setNsPrefix(String nsPrefix) {
this.nsPrefix = nsPrefix;
}
}

@ -18,6 +18,9 @@ public class AuthorMerger {
private static final Double THRESHOLD = 0.95;
private AuthorMerger() {
}
public static List<Author> merge(List<List<Author>> authors) {
authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
@ -32,44 +35,54 @@ public class AuthorMerger {
}
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
int pa = countAuthorsPids(a);
int pb = countAuthorsPids(b);
List<Author> base, enrich;
List<Author> base;
List<Author> enrich;
int sa = authorsSize(a);
int sb = authorsSize(b);
if (pa == pb) {
base = sa > sb ? a : b;
enrich = sa > sb ? b : a;
} else {
if (sa == sb) {
base = pa > pb ? a : b;
enrich = pa > pb ? b : a;
} else {
base = sa > sb ? a : b;
enrich = sa > sb ? b : a;
}
enrichPidFromList(base, enrich);
enrichPidFromList(base, enrich, threshold);
return base;
}
private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
return mergeAuthor(a, b, THRESHOLD);
}
private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
if (base == null || enrich == null)
return;
// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
final Map<String, Author> basePidAuthorMap = base
.stream()
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
.flatMap(
a -> a
.getPid()
.stream()
.filter(Objects::nonNull)
.map(p -> new Tuple2<>(pidToComparableString(p), a)))
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
// <pid, Author> (list of pid that are missing in the other list)
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
.stream()
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
.flatMap(
a -> a
.getPid()
.stream()
.filter(Objects::nonNull)
.filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
.map(p -> new Tuple2<>(p, a)))
.collect(Collectors.toList());
@ -83,10 +96,10 @@ public class AuthorMerger {
.max(Comparator.comparing(Tuple2::_1));
if (simAuthor.isPresent()) {
double th = THRESHOLD;
double th = threshold;
// increase the threshold if the surname is too short
if (simAuthor.get()._2().getSurname() != null
&& simAuthor.get()._2().getSurname().length() <= 3)
&& simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
th = 0.99;
if (simAuthor.get()._1() > th) {
@ -107,9 +120,9 @@ public class AuthorMerger {
}
public static String pidToComparableString(StructuredProperty pid) {
return (pid.getQualifier() != null
? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
: "")
final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase()
: "";
return (pid.getQualifier() != null ? classid : "")
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
}
@ -142,7 +155,7 @@ public class AuthorMerger {
}
private static boolean hasPid(Author a) {
if (a == null || a.getPid() == null || a.getPid().size() == 0)
if (a == null || a.getPid() == null || a.getPid().isEmpty())
return false;
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
}
@ -151,12 +164,15 @@ public class AuthorMerger {
if (StringUtils.isNotBlank(author.getSurname())) {
return new Person(author.getSurname() + ", " + author.getName(), false);
} else {
return new Person(author.getFullname(), false);
if (StringUtils.isNotBlank(author.getFullname()))
return new Person(author.getFullname(), false);
else
return new Person("", false);
}
}
private static String normalize(final String s) {
return nfd(s)
String[] normalized = nfd(s)
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError
// in case
@ -166,7 +182,12 @@ public class AuthorMerger {
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
.trim()
.split(" ");
Arrays.sort(normalized);
return String.join(" ", normalized);
}
private static String nfd(final String s) {
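The configurable threshold introduced in the AuthorMerger changes above can be exercised directly; a minimal sketch, with the author lists standing in for any two enrichable sources:
// keeps the richer list as base and copies matching pids from the other one;
// 0.95 reproduces the previous hard-coded THRESHOLD
List<Author> merged = AuthorMerger.mergeAuthor(authorsFromCrossref, authorsFromOrcid, 0.95);
// the two-argument variant keeps the default threshold
List<Author> mergedDefault = AuthorMerger.mergeAuthor(authorsFromCrossref, authorsFromOrcid);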

@ -0,0 +1,97 @@
package eu.dnetlib.dhp.oa.merge;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
public class DispatchEntitiesSparkJob {
private static final Logger log = LoggerFactory.getLogger(DispatchEntitiesSparkJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
Objects
.requireNonNull(
DispatchEntitiesSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/merge/dispatch_entities_parameters.json")));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
String graphTableClassName = parser.get("graphTableClassName");
log.info("graphTableClassName: {}", graphTableClassName);
@SuppressWarnings("unchecked")
Class<? extends OafEntity> entityClazz = (Class<? extends OafEntity>) Class.forName(graphTableClassName);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
dispatchEntities(spark, inputPath, entityClazz, outputPath);
});
}
private static <T extends Oaf> void dispatchEntities(
SparkSession spark,
String inputPath,
Class<T> clazz,
String outputPath) {
spark
.read()
.textFile(inputPath)
.filter((FilterFunction<String>) s -> isEntityType(s, clazz))
.map((MapFunction<String, String>) s -> StringUtils.substringAfter(s, "|"), Encoders.STRING())
.map(
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, clazz),
Encoders.bean(clazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
private static <T extends Oaf> boolean isEntityType(final String s, final Class<T> clazz) {
return StringUtils.substringBefore(s, "|").equals(clazz.getName());
}
}

@ -0,0 +1,203 @@
package eu.dnetlib.dhp.oa.merge;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
import java.io.IOException;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.apache.spark.sql.expressions.Aggregator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.jayway.jsonpath.Configuration;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.Option;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import scala.Tuple2;
/**
* Groups the graph content by entity identifier to ensure ID uniqueness
*/
public class GroupEntitiesSparkJob {
private static final Logger log = LoggerFactory.getLogger(GroupEntitiesSparkJob.class);
private static final String ID_JPATH = "$.id";
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
GroupEntitiesSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/merge/group_graph_entities_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String graphInputPath = parser.get("graphInputPath");
log.info("graphInputPath: {}", graphInputPath);
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.registerKryoClasses(ModelSupport.getOafModelClasses());
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
groupEntities(spark, graphInputPath, outputPath);
});
}
private static void groupEntities(
SparkSession spark,
String inputPath,
String outputPath) {
final TypedColumn<OafEntity, OafEntity> aggregator = new GroupingAggregator().toColumn();
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
spark
.read()
.textFile(toSeq(listEntityPaths(inputPath, sc)))
.map((MapFunction<String, OafEntity>) GroupEntitiesSparkJob::parseOaf, Encoders.kryo(OafEntity.class))
.filter((FilterFunction<OafEntity>) e -> StringUtils.isNotBlank(ModelSupport.idFn().apply(e)))
.groupByKey((MapFunction<OafEntity, String>) oaf -> ModelSupport.idFn().apply(oaf), Encoders.STRING())
.agg(aggregator)
.map(
(MapFunction<Tuple2<String, OafEntity>, String>) t -> t._2().getClass().getName() +
"|" + OBJECT_MAPPER.writeValueAsString(t._2()),
Encoders.STRING())
.write()
.option("compression", "gzip")
.mode(SaveMode.Overwrite)
.text(outputPath);
}
public static class GroupingAggregator extends Aggregator<OafEntity, OafEntity, OafEntity> {
@Override
public OafEntity zero() {
return null;
}
@Override
public OafEntity reduce(OafEntity b, OafEntity a) {
return mergeAndGet(b, a);
}
private OafEntity mergeAndGet(OafEntity b, OafEntity a) {
if (Objects.nonNull(a) && Objects.nonNull(b)) {
return OafMapperUtils.mergeEntities(b, a);
}
return Objects.isNull(a) ? b : a;
}
@Override
public OafEntity merge(OafEntity b, OafEntity a) {
return mergeAndGet(b, a);
}
@Override
public OafEntity finish(OafEntity j) {
return j;
}
@Override
public Encoder<OafEntity> bufferEncoder() {
return Encoders.kryo(OafEntity.class);
}
@Override
public Encoder<OafEntity> outputEncoder() {
return Encoders.kryo(OafEntity.class);
}
}
private static OafEntity parseOaf(String s) {
DocumentContext dc = JsonPath
.parse(s, Configuration.defaultConfiguration().addOptions(Option.SUPPRESS_EXCEPTIONS));
final String id = dc.read(ID_JPATH);
if (StringUtils.isNotBlank(id)) {
String prefix = StringUtils.substringBefore(id, "|");
switch (prefix) {
case "10":
return parse(s, Datasource.class);
case "20":
return parse(s, Organization.class);
case "40":
return parse(s, Project.class);
case "50":
String resultType = dc.read("$.resulttype.classid");
switch (resultType) {
case "publication":
return parse(s, Publication.class);
case "dataset":
return parse(s, eu.dnetlib.dhp.schema.oaf.Dataset.class);
case "software":
return parse(s, Software.class);
case "other":
return parse(s, OtherResearchProduct.class);
default:
throw new IllegalArgumentException(String.format("invalid resultType: '%s'", resultType));
}
default:
throw new IllegalArgumentException(String.format("invalid id prefix: '%s'", prefix));
}
} else {
throw new IllegalArgumentException(String.format("invalid oaf: '%s'", s));
}
}
private static <T extends OafEntity> OafEntity parse(String s, Class<T> clazz) {
try {
return OBJECT_MAPPER.readValue(s, clazz);
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
private static List<String> listEntityPaths(String inputPath, JavaSparkContext sc) {
return HdfsSupport
.listFiles(inputPath, sc.hadoopConfiguration())
.stream()
.filter(f -> !f.toLowerCase().contains("relation"))
.collect(Collectors.toList());
}
}
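A hypothetical standalone invocation of the job above; the long argument names are assumed to match the keys read via parser.get(...), and the paths are placeholders:
GroupEntitiesSparkJob.main(new String[] {
	"--isSparkSessionManaged", "false",
	"--graphInputPath", "/tmp/graph/raw",
	"--outputPath", "/tmp/graph/grouped_entities"
});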

@ -12,6 +12,9 @@ import com.ximpleware.VTDNav;
/** Created by sandro on 9/29/16. */
public class VtdUtilityParser {
private VtdUtilityParser() {
}
public static List<Node> getTextValuesWithAttributes(
final AutoPilot ap, final VTDNav vn, final String xpath, final List<String> attributes)
throws VtdException {

@ -1,238 +0,0 @@
package eu.dnetlib.dhp.schema.oaf;
import java.util.LinkedHashMap;
import java.util.Objects;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.clearspring.analytics.util.Lists;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class CleaningFunctions {
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
public static final String NONE = "none";
public static <T extends Oaf> T fixVocabularyNames(T value) {
if (value instanceof Datasource) {
// nothing to clean here
} else if (value instanceof Project) {
// nothing to clean here
} else if (value instanceof Organization) {
Organization o = (Organization) value;
if (Objects.nonNull(o.getCountry())) {
fixVocabName(o.getCountry(), ModelConstants.DNET_COUNTRY_TYPE);
}
} else if (value instanceof Relation) {
// nothing to clean here
} else if (value instanceof Result) {
Result r = (Result) value;
fixVocabName(r.getLanguage(), ModelConstants.DNET_LANGUAGES);
fixVocabName(r.getResourcetype(), ModelConstants.DNET_DATA_CITE_RESOURCE);
fixVocabName(r.getBestaccessright(), ModelConstants.DNET_ACCESS_MODES);
if (Objects.nonNull(r.getSubject())) {
r.getSubject().forEach(s -> fixVocabName(s.getQualifier(), ModelConstants.DNET_SUBJECT_TYPOLOGIES));
}
if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) {
fixVocabName(i.getAccessright(), ModelConstants.DNET_ACCESS_MODES);
fixVocabName(i.getRefereed(), ModelConstants.DNET_REVIEW_LEVELS);
}
}
if (Objects.nonNull(r.getAuthor())) {
r.getAuthor().forEach(a -> {
if (Objects.nonNull(a.getPid())) {
a.getPid().forEach(p -> {
fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES);
});
}
});
}
if (value instanceof Publication) {
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
} else if (value instanceof OtherResearchProduct) {
} else if (value instanceof Software) {
}
}
return value;
}
public static <T extends Oaf> T fixDefaults(T value) {
if (value instanceof Datasource) {
// nothing to clean here
} else if (value instanceof Project) {
// nothing to clean here
} else if (value instanceof Organization) {
Organization o = (Organization) value;
if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
o.setCountry(qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_COUNTRY_TYPE));
}
} else if (value instanceof Relation) {
// nothing to clean here
} else if (value instanceof Result) {
Result r = (Result) value;
if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) {
r.setPublisher(null);
}
if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) {
r
.setLanguage(
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
}
if (Objects.nonNull(r.getSubject())) {
r
.setSubject(
r
.getSubject()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getPid())) {
r
.setPid(
r
.getPid()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
.filter(sp -> NONE.equalsIgnoreCase(sp.getValue()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(CleaningFunctions::normalizePidValue)
.collect(Collectors.toList()));
}
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
r
.setResourcetype(
qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
}
if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) {
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
i.setAccessright(qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
}
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
}
if (Objects.isNull(i.getRefereed())) {
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
}
}
}
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance());
if (Objects.isNull(bestaccessrights)) {
r
.setBestaccessright(
qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
} else {
r.setBestaccessright(bestaccessrights);
}
}
if (Objects.nonNull(r.getAuthor())) {
boolean nullRank = r
.getAuthor()
.stream()
.anyMatch(a -> Objects.isNull(a.getRank()));
if (nullRank) {
int i = 1;
for (Author author : r.getAuthor()) {
author.setRank(i++);
}
}
for (Author a : r.getAuthor()) {
if (Objects.isNull(a.getPid())) {
a.setPid(Lists.newArrayList());
} else {
a
.setPid(
a
.getPid()
.stream()
.filter(p -> Objects.nonNull(p.getQualifier()))
.filter(p -> StringUtils.isNotBlank(p.getValue()))
.map(p -> {
p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
return p;
})
.collect(
Collectors
.toMap(
StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1,
LinkedHashMap::new))
.values()
.stream()
.collect(Collectors.toList()));
}
}
}
if (value instanceof Publication) {
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
} else if (value instanceof OtherResearchProduct) {
} else if (value instanceof Software) {
}
}
return value;
}
// HELPERS
private static void fixVocabName(Qualifier q, String vocabularyName) {
if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) {
q.setSchemeid(vocabularyName);
q.setSchemename(vocabularyName);
}
}
private static Qualifier qualifier(String classid, String classname, String scheme) {
return OafMapperUtils
.qualifier(
classid, classname, scheme, scheme);
}
/**
* Utility method that normalises PID values on a per-type basis.
* @param pid the PID whose value will be normalised.
* @return the PID containing the normalised value.
*/
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
String value = Optional
.ofNullable(pid.getValue())
.map(String::trim)
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
switch (pid.getQualifier().getClassid()) {
// TODO add cleaning for more PID types as needed
case "doi":
pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, ""));
break;
}
return pid;
}
}

@ -1,14 +0,0 @@
package eu.dnetlib.dhp.schema.oaf;
public class ModelHardLimits {
public static final int MAX_EXTERNAL_ENTITIES = 50;
public static final int MAX_AUTHORS = 200;
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
public static final int MAX_TITLE_LENGTH = 5000;
public static final int MAX_TITLES = 10;
public static final int MAX_ABSTRACT_LENGTH = 150000;
public static final int MAX_INSTANCES = 10;
}

@ -1,296 +0,0 @@
package eu.dnetlib.dhp.schema.oaf;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.common.LicenseComparator;
import eu.dnetlib.dhp.utils.DHPUtils;
public class OafMapperUtils {
public static KeyValue keyValue(final String k, final String v) {
final KeyValue kv = new KeyValue();
kv.setKey(k);
kv.setValue(v);
return kv;
}
public static List<KeyValue> listKeyValues(final String... s) {
if (s.length % 2 > 0) {
throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)");
}
final List<KeyValue> list = new ArrayList<>();
for (int i = 0; i < s.length; i += 2) {
list.add(keyValue(s[i], s[i + 1]));
}
return list;
}
public static <T> Field<T> field(final T value, final DataInfo info) {
if (value == null || StringUtils.isBlank(value.toString())) {
return null;
}
final Field<T> field = new Field<>();
field.setValue(value);
field.setDataInfo(info);
return field;
}
public static List<Field<String>> listFields(final DataInfo info, final String... values) {
return Arrays
.stream(values)
.map(v -> field(v, info))
.filter(Objects::nonNull)
.filter(distinctByKey(f -> f.getValue()))
.collect(Collectors.toList());
}
public static List<Field<String>> listFields(final DataInfo info, final List<String> values) {
return values
.stream()
.map(v -> field(v, info))
.filter(Objects::nonNull)
.filter(distinctByKey(f -> f.getValue()))
.collect(Collectors.toList());
}
public static Qualifier unknown(final String schemeid, final String schemename) {
return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
}
public static Qualifier qualifier(
final String classid,
final String classname,
final String schemeid,
final String schemename) {
final Qualifier q = new Qualifier();
q.setClassid(classid);
q.setClassname(classname);
q.setSchemeid(schemeid);
q.setSchemename(schemename);
return q;
}
public static StructuredProperty structuredProperty(
final String value,
final String classid,
final String classname,
final String schemeid,
final String schemename,
final DataInfo dataInfo) {
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static StructuredProperty structuredProperty(
final String value,
final Qualifier qualifier,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final StructuredProperty sp = new StructuredProperty();
sp.setValue(value);
sp.setQualifier(qualifier);
sp.setDataInfo(dataInfo);
return sp;
}
public static ExtraInfo extraInfo(
final String name,
final String value,
final String typology,
final String provenance,
final String trust) {
final ExtraInfo info = new ExtraInfo();
info.setName(name);
info.setValue(value);
info.setTypology(typology);
info.setProvenance(provenance);
info.setTrust(trust);
return info;
}
public static OAIProvenance oaiIProvenance(
final String identifier,
final String baseURL,
final String metadataNamespace,
final Boolean altered,
final String datestamp,
final String harvestDate) {
final OriginDescription desc = new OriginDescription();
desc.setIdentifier(identifier);
desc.setBaseURL(baseURL);
desc.setMetadataNamespace(metadataNamespace);
desc.setAltered(altered);
desc.setDatestamp(datestamp);
desc.setHarvestDate(harvestDate);
final OAIProvenance p = new OAIProvenance();
p.setOriginDescription(desc);
return p;
}
public static Journal journal(
final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking,
final DataInfo dataInfo) {
return journal(
name,
issnPrinted,
issnOnline,
issnLinking,
null,
null,
null,
null,
null,
null,
null,
dataInfo);
}
public static Journal journal(
final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking,
final String ep,
final String iss,
final String sp,
final String vol,
final String edition,
final String conferenceplace,
final String conferencedate,
final DataInfo dataInfo) {
if (StringUtils.isNotBlank(name)
|| StringUtils.isNotBlank(issnPrinted)
|| StringUtils.isNotBlank(issnOnline)
|| StringUtils.isNotBlank(issnLinking)) {
final Journal j = new Journal();
j.setName(name);
j.setIssnPrinted(issnPrinted);
j.setIssnOnline(issnOnline);
j.setIssnLinking(issnLinking);
j.setEp(ep);
j.setIss(iss);
j.setSp(sp);
j.setVol(vol);
j.setEdition(edition);
j.setConferenceplace(conferenceplace);
j.setConferencedate(conferencedate);
j.setDataInfo(dataInfo);
return j;
} else {
return null;
}
}
public static DataInfo dataInfo(
final Boolean deletedbyinference,
final String inferenceprovenance,
final Boolean inferred,
final Boolean invisible,
final Qualifier provenanceaction,
final String trust) {
final DataInfo d = new DataInfo();
d.setDeletedbyinference(deletedbyinference);
d.setInferenceprovenance(inferenceprovenance);
d.setInferred(inferred);
d.setInvisible(invisible);
d.setProvenanceaction(provenanceaction);
d.setTrust(trust);
return d;
}
public static String createOpenaireId(
final int prefix,
final String originalId,
final boolean to_md5) {
if (StringUtils.isBlank(originalId)) {
return null;
} else if (to_md5) {
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
final String rest = StringUtils.substringAfter(originalId, "::");
return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
} else {
return String.format("%s|%s", prefix, originalId);
}
}
public static String createOpenaireId(
final String type,
final String originalId,
final boolean to_md5) {
switch (type) {
case "datasource":
return createOpenaireId(10, originalId, to_md5);
case "organization":
return createOpenaireId(20, originalId, to_md5);
case "person":
return createOpenaireId(30, originalId, to_md5);
case "project":
return createOpenaireId(40, originalId, to_md5);
default:
return createOpenaireId(50, originalId, to_md5);
}
}
public static String asString(final Object o) {
return o == null ? "" : o.toString();
}
public static <T> Predicate<T> distinctByKey(
final Function<? super T, ?> keyExtractor) {
final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
}
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
return getBestAccessRights(instanceList);
}
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
if (instanceList != null) {
final Optional<Qualifier> min = instanceList
.stream()
.map(i -> i.getAccessright())
.min(new LicenseComparator());
final Qualifier rights = min.isPresent() ? min.get() : new Qualifier();
if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN);
}
if (StringUtils.isBlank(rights.getClassname())
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
rights.setClassname(NOT_AVAILABLE);
}
if (StringUtils.isBlank(rights.getSchemeid())) {
rights.setSchemeid(DNET_ACCESS_MODES);
}
if (StringUtils.isBlank(rights.getSchemename())) {
rights.setSchemename(DNET_ACCESS_MODES);
}
return rights;
}
return null;
}
}

@ -1,49 +0,0 @@
package eu.dnetlib.dhp.schema.oaf;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class ResultTypeComparator implements Comparator<Result> {
@Override
public int compare(Result left, Result right) {
if (left == null && right == null)
return 0;
if (left == null)
return 1;
if (right == null)
return -1;
String lClass = left.getResulttype().getClassid();
String rClass = right.getResulttype().getClassid();
if (lClass.equals(rClass))
return 0;
if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
return 1;
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
}
}
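The intended effect of this ordering is that publications win over datasets, software and other research products when duplicate results are compared. A small sketch with hypothetical instances:

// aPublication, aDataset, aSoftware are hypothetical instances of the corresponding Result subclasses
List<Result> results = new ArrayList<>(Arrays.asList(aDataset, aSoftware, aPublication));
results.sort(new ResultTypeComparator());
// results.get(0) is now aPublication; unknown resulttypes fall back to lexicographic classid order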

@ -0,0 +1,538 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
import java.time.LocalDate;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import me.xuender.unidecode.Unidecode;
public class GraphCleaningFunctions extends CleaningFunctions {
public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
public static final int ORCID_LEN = 19;
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
public static final String TITLE_TEST = "test";
public static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST);
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
public static <T extends Oaf> T fixVocabularyNames(T value) {
if (value instanceof Datasource) {
// nothing to clean here
} else if (value instanceof Project) {
// nothing to clean here
} else if (value instanceof Organization) {
Organization o = (Organization) value;
if (Objects.nonNull(o.getCountry())) {
fixVocabName(o.getCountry(), ModelConstants.DNET_COUNTRY_TYPE);
}
} else if (value instanceof Relation) {
// nothing to clean here
} else if (value instanceof Result) {
Result r = (Result) value;
fixVocabName(r.getLanguage(), ModelConstants.DNET_LANGUAGES);
fixVocabName(r.getResourcetype(), ModelConstants.DNET_DATA_CITE_RESOURCE);
fixVocabName(r.getBestaccessright(), ModelConstants.DNET_ACCESS_MODES);
if (Objects.nonNull(r.getSubject())) {
r.getSubject().forEach(s -> fixVocabName(s.getQualifier(), ModelConstants.DNET_SUBJECT_TYPOLOGIES));
}
if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) {
fixVocabName(i.getAccessright(), ModelConstants.DNET_ACCESS_MODES);
fixVocabName(i.getRefereed(), ModelConstants.DNET_REVIEW_LEVELS);
}
}
if (Objects.nonNull(r.getAuthor())) {
r.getAuthor().stream().filter(Objects::nonNull).forEach(a -> {
if (Objects.nonNull(a.getPid())) {
a.getPid().stream().filter(Objects::nonNull).forEach(p -> {
fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES);
});
}
});
}
if (value instanceof Publication) {
} else if (value instanceof Dataset) {
} else if (value instanceof OtherResearchProduct) {
} else if (value instanceof Software) {
}
}
return value;
}
public static <T extends Oaf> boolean filter(T value) {
if (Boolean.TRUE
.equals(
Optional
.ofNullable(value)
.map(
o -> Optional
.ofNullable(o.getDataInfo())
.map(
d -> Optional
.ofNullable(d.getInvisible())
.orElse(true))
.orElse(true))
.orElse(true))) {
return true;
}
if (value instanceof Datasource) {
// nothing to evaluate here
} else if (value instanceof Project) {
// nothing to evaluate here
} else if (value instanceof Organization) {
// nothing to evaluate here
} else if (value instanceof Relation) {
// nothing to clean here
} else if (value instanceof Result) {
Result r = (Result) value;
if (Objects.isNull(r.getTitle()) || r.getTitle().isEmpty()) {
return false;
}
if (value instanceof Publication) {
} else if (value instanceof Dataset) {
} else if (value instanceof OtherResearchProduct) {
} else if (value instanceof Software) {
}
}
return true;
}
public static <T extends Oaf> T cleanup(T value, VocabularyGroup vocs) {
if (value instanceof Datasource) {
// nothing to clean here
} else if (value instanceof Project) {
// nothing to clean here
} else if (value instanceof Organization) {
Organization o = (Organization) value;
if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
}
} else if (value instanceof Relation) {
Relation r = (Relation) value;
Optional<String> validationDate = doCleanDate(r.getValidationDate());
if (validationDate.isPresent()) {
r.setValidationDate(validationDate.get());
r.setValidated(true);
} else {
r.setValidationDate(null);
r.setValidated(false);
}
} else if (value instanceof Result) {
Result r = (Result) value;
if (Objects.nonNull(r.getDateofacceptance())) {
Optional<String> date = cleanDateField(r.getDateofacceptance());
if (date.isPresent()) {
r.getDateofacceptance().setValue(date.get());
} else {
r.setDateofacceptance(null);
}
}
if (Objects.nonNull(r.getRelevantdate())) {
r
.setRelevantdate(
r
.getRelevantdate()
.stream()
.filter(Objects::nonNull)
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(sp -> {
sp.setValue(GraphCleaningFunctions.cleanDate(sp.getValue()));
return sp;
})
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) {
r.setPublisher(null);
}
if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) {
r
.setLanguage(
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
}
if (Objects.nonNull(r.getSubject())) {
List<Subject> subjects = Lists
.newArrayList(
r
.getSubject()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(GraphCleaningFunctions::cleanValue)
.collect(
Collectors
.toMap(
s -> Optional
.ofNullable(s.getQualifier())
.map(q -> q.getClassid() + s.getValue())
.orElse(s.getValue()),
Function.identity(),
(s1, s2) -> Collections
.min(Lists.newArrayList(s1, s2), new SubjectProvenanceComparator())))
.values());
r.setSubject(subjects);
}
if (Objects.nonNull(r.getTitle())) {
r
.setTitle(
r
.getTitle()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(
sp -> {
final String title = sp
.getValue()
.toLowerCase();
final String decoded = Unidecode.decode(title);
if (StringUtils.contains(decoded, TITLE_TEST)) {
return decoded
.replaceAll(TITLE_FILTER_REGEX, "")
.length() > TITLE_FILTER_RESIDUAL_LENGTH;
}
return !decoded
.replaceAll("\\W|\\d", "")
.isEmpty();
})
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getFormat())) {
r
.setFormat(
r
.getFormat()
.stream()
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getDescription())) {
r
.setDescription(
r
.getDescription()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getPid())) {
r.setPid(processPidCleaning(r.getPid()));
}
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
r
.setResourcetype(
qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
}
if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) {
if (!vocs.termExists(ModelConstants.DNET_PUBLICATION_RESOURCE, i.getInstancetype().getClassid())) {
if (r instanceof Publication) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0038", "Other literature type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
} else if (r instanceof Dataset) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0039", "Other dataset type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
} else if (r instanceof Software) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0040", "Other software type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
} else if (r instanceof OtherResearchProduct) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0020", "Other ORP type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
}
}
if (Objects.nonNull(i.getPid())) {
i.setPid(processPidCleaning(i.getPid()));
}
if (Objects.nonNull(i.getAlternateIdentifier())) {
i.setAlternateIdentifier(processPidCleaning(i.getAlternateIdentifier()));
}
Optional
.ofNullable(i.getPid())
.ifPresent(pid -> {
final Set<StructuredProperty> pids = Sets.newHashSet(pid);
Optional
.ofNullable(i.getAlternateIdentifier())
.ifPresent(altId -> {
final Set<StructuredProperty> altIds = Sets.newHashSet(altId);
i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
});
});
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
i
.setAccessright(
accessRight(
ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,
ModelConstants.DNET_ACCESS_MODES));
}
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
}
if (Objects.isNull(i.getRefereed())) {
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
}
if (Objects.nonNull(i.getDateofacceptance())) {
Optional<String> date = cleanDateField(i.getDateofacceptance());
if (date.isPresent()) {
i.getDateofacceptance().setValue(date.get());
} else {
i.setDateofacceptance(null);
}
}
}
}
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance());
if (Objects.isNull(bestaccessrights)) {
r
.setBestaccessright(
qualifier(
ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,
ModelConstants.DNET_ACCESS_MODES));
} else {
r.setBestaccessright(bestaccessrights);
}
}
if (Objects.nonNull(r.getAuthor())) {
r
.setAuthor(
r
.getAuthor()
.stream()
.filter(Objects::nonNull)
.filter(a -> StringUtils.isNotBlank(a.getFullname()))
.filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", "")))
.collect(Collectors.toList()));
boolean nullRank = r
.getAuthor()
.stream()
.anyMatch(a -> Objects.isNull(a.getRank()));
if (nullRank) {
int i = 1;
for (Author author : r.getAuthor()) {
author.setRank(i++);
}
}
for (Author a : r.getAuthor()) {
if (Objects.isNull(a.getPid())) {
a.setPid(Lists.newArrayList());
} else {
a
.setPid(
a
.getPid()
.stream()
.filter(Objects::nonNull)
.filter(p -> Objects.nonNull(p.getQualifier()))
.filter(p -> StringUtils.isNotBlank(p.getValue()))
.map(p -> {
// hack to distinguish orcid from orcid_pending
String pidProvenance = getProvenance(p.getDataInfo());
if (p
.getQualifier()
.getClassid()
.toLowerCase()
.contains(ModelConstants.ORCID)) {
if (pidProvenance
.equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) {
p.getQualifier().setClassid(ModelConstants.ORCID);
} else {
p.getQualifier().setClassid(ModelConstants.ORCID_PENDING);
}
final String orcid = p
.getValue()
.trim()
.toLowerCase()
.replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4");
if (orcid.length() == ORCID_LEN) {
p.setValue(orcid);
} else {
p.setValue("");
}
}
return p;
})
.filter(p -> StringUtils.isNotBlank(p.getValue()))
.collect(
Collectors
.toMap(
p -> p.getQualifier().getClassid() + p.getValue(),
Function.identity(),
(p1, p2) -> p1,
LinkedHashMap::new))
.values()
.stream()
.collect(Collectors.toList()));
}
}
}
if (value instanceof Publication) {
} else if (value instanceof Dataset) {
} else if (value instanceof OtherResearchProduct) {
} else if (value instanceof Software) {
}
}
return value;
}
private static Optional<String> cleanDateField(Field<String> dateofacceptance) {
return Optional
.ofNullable(dateofacceptance)
.map(Field::getValue)
.map(GraphCleaningFunctions::cleanDate)
.filter(Objects::nonNull);
}
protected static Optional<String> doCleanDate(String date) {
return Optional.ofNullable(cleanDate(date));
}
public static String cleanDate(final String inputDate) {
if (StringUtils.isBlank(inputDate)) {
return null;
}
try {
final LocalDate date = DateParserUtils
.parseDate(inputDate.trim())
.toInstant()
.atZone(ZoneId.systemDefault())
.toLocalDate();
return DateTimeFormatter.ofPattern(ModelSupport.DATE_FORMAT).format(date);
} catch (DateTimeParseException e) {
return null;
}
}
// HELPERS
private static boolean isValidAuthorName(Author a) {
return !Stream
.of(a.getFullname(), a.getName(), a.getSurname())
.filter(s -> s != null && !s.isEmpty())
.collect(Collectors.joining(""))
.toLowerCase()
.matches(INVALID_AUTHOR_REGEX);
}
private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) {
return pids
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(CleaningFunctions::normalizePidValue)
.filter(CleaningFunctions::pidFilter)
.collect(Collectors.toList());
}
private static void fixVocabName(Qualifier q, String vocabularyName) {
if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) {
q.setSchemeid(vocabularyName);
q.setSchemename(vocabularyName);
}
}
private static AccessRight accessRight(String classid, String classname, String scheme) {
return OafMapperUtils
.accessRight(
classid, classname, scheme, scheme);
}
private static Qualifier qualifier(String classid, String classname, String scheme) {
return OafMapperUtils
.qualifier(
classid, classname, scheme, scheme);
}
protected static StructuredProperty cleanValue(StructuredProperty s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;
}
protected static Subject cleanValue(Subject s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;
}
protected static Field<String> cleanValue(Field<String> s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;
}
}
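Putting the three public entry points together, a typical cleaning pass over a single entity could look like the sketch below; obtaining the VocabularyGroup (for instance from the IS lookup service) is assumed and out of scope here:

// Sketch of the usual fix -> filter -> cleanup sequence for one Publication.
public static Publication clean(Publication p, VocabularyGroup vocabularies) {
    p = GraphCleaningFunctions.fixVocabularyNames(p);
    if (!GraphCleaningFunctions.filter(p)) {
        return null; // e.g. a result without any title is discarded
    }
    return GraphCleaningFunctions.cleanup(p, vocabularies);
}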

@ -1,102 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.io.Serializable;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
/**
* Factory class for OpenAIRE identifiers in the Graph
*/
public class IdentifierFactory implements Serializable {
public static final String ID_SEPARATOR = "::";
public static final String ID_PREFIX_SEPARATOR = "|";
public final static String ID_REGEX = "^[0-9][0-9]\\" + ID_PREFIX_SEPARATOR + ".{12}" + ID_SEPARATOR
+ "[a-zA-Z0-9]{32}$";
public final static String DOI_REGEX = "(^10\\.[0-9]{4,9}\\/[-._;()\\/:a-zA-Z0-9]+$)|" +
"(^10\\.1002\\/[^\\s]+$)|" +
"(^10\\.1021\\/[a-zA-Z0-9_][a-zA-Z0-9_][0-9]++$)|" +
"(^10\\.1207\\/[a-zA-Z0-9_]+\\&[0-9]+_[0-9]+$)";
public static final int ID_PREFIX_LEN = 12;
public static final String NONE = "none";
/**
* Creates an identifier from the most relevant PID (if available) in the given entity T. Returns entity.id
* when no PID is available
* @param entity the entity providing PIDs and a default ID.
* @param <T> the specific entity type. Currently Organization and Result subclasses are supported.
* @return an identifier from the most relevant PID, entity.id otherwise
*/
public static <T extends OafEntity> String createIdentifier(T entity) {
if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) {
return entity.getId();
}
return entity
.getPid()
.stream()
.filter(s -> pidFilter(s))
.min(new PidComparator<>(entity))
.map(s -> idFromPid(entity, s))
.map(IdentifierFactory::verifyIdSyntax)
.orElseGet(entity::getId);
}
protected static boolean pidFilter(StructuredProperty s) {
if (Objects.isNull(s.getQualifier()) ||
StringUtils.isBlank(StringUtils.trim(s.getValue()))) {
return false;
}
try {
switch (PidType.valueOf(s.getQualifier().getClassid())) {
case doi:
final String doi = StringUtils.trim(StringUtils.lowerCase(s.getValue()));
return doi.matches(DOI_REGEX);
default:
return true;
}
} catch (IllegalArgumentException e) {
return false;
}
}
private static String verifyIdSyntax(String s) {
if (StringUtils.isBlank(s) || !s.matches(ID_REGEX)) {
throw new RuntimeException(String.format("malformed id: '%s'", s));
} else {
return s;
}
}
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s) {
return new StringBuilder()
.append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR))
.append(ID_PREFIX_SEPARATOR)
.append(createPrefix(s.getQualifier().getClassid()))
.append(ID_SEPARATOR)
.append(DHPUtils.md5(CleaningFunctions.normalizePidValue(s).getValue()))
.toString();
}
// create the prefix (length = 12)
private static String createPrefix(String pidType) {
StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN));
while (prefix.length() < ID_PREFIX_LEN) {
prefix.append("_");
}
return prefix.substring(0, ID_PREFIX_LEN);
}
}
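An illustration of the fallback described in the Javadoc of createIdentifier; the identifier below is made up but shaped like the ID_REGEX pattern:

Publication p = new Publication();
p.setId("50|somePrefix__::0123456789abcdef0123456789abcdef"); // hypothetical id
String id = IdentifierFactory.createIdentifier(p);
// no PID is set, so id equals p.getId(); with a valid DOI pid a doi-based identifier would be minted instead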

@ -0,0 +1,517 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.sql.Array;
import java.sql.SQLException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
public class OafMapperUtils {
private OafMapperUtils() {
}
public static Oaf merge(final Oaf left, final Oaf right) {
if (ModelSupport.isSubClass(left, OafEntity.class)) {
return mergeEntities((OafEntity) left, (OafEntity) right);
} else if (ModelSupport.isSubClass(left, Relation.class)) {
((Relation) left).mergeFrom((Relation) right);
} else {
throw new IllegalArgumentException("invalid Oaf type:" + left.getClass().getCanonicalName());
}
return left;
}
public static OafEntity mergeEntities(OafEntity left, OafEntity right) {
if (ModelSupport.isSubClass(left, Result.class)) {
return mergeResults((Result) left, (Result) right);
} else if (ModelSupport.isSubClass(left, Datasource.class)) {
left.mergeFrom(right);
} else if (ModelSupport.isSubClass(left, Organization.class)) {
left.mergeFrom(right);
} else if (ModelSupport.isSubClass(left, Project.class)) {
left.mergeFrom(right);
} else {
throw new IllegalArgumentException("invalid OafEntity subtype:" + left.getClass().getCanonicalName());
}
return left;
}
public static Result mergeResults(Result left, Result right) {
final boolean leftFromDelegatedAuthority = isFromDelegatedAuthority(left);
final boolean rightFromDelegatedAuthority = isFromDelegatedAuthority(right);
if (leftFromDelegatedAuthority && !rightFromDelegatedAuthority) {
return left;
}
if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) {
return right;
}
if (new ResultTypeComparator().compare(left, right) < 0) {
left.mergeFrom(right);
return left;
} else {
right.mergeFrom(left);
return right;
}
}
private static boolean isFromDelegatedAuthority(Result r) {
return Optional
.ofNullable(r.getInstance())
.map(
instance -> instance
.stream()
.filter(i -> Objects.nonNull(i.getCollectedfrom()))
.map(i -> i.getCollectedfrom().getKey())
.anyMatch(cfId -> IdentifierFactory.delegatedAuthorityDatasourceIds().contains(cfId)))
.orElse(false);
}
public static KeyValue keyValue(final String k, final String v) {
final KeyValue kv = new KeyValue();
kv.setKey(k);
kv.setValue(v);
return kv;
}
public static List<KeyValue> listKeyValues(final String... s) {
if (s.length % 2 > 0) {
throw new IllegalArgumentException("Invalid number of parameters (k,v,k,v,....)");
}
final List<KeyValue> list = new ArrayList<>();
for (int i = 0; i < s.length; i += 2) {
list.add(keyValue(s[i], s[i + 1]));
}
return list;
}
public static <T> Field<T> field(final T value, final DataInfo info) {
if (value == null || StringUtils.isBlank(value.toString())) {
return null;
}
final Field<T> field = new Field<>();
field.setValue(value);
field.setDataInfo(info);
return field;
}
public static List<Field<String>> listFields(final DataInfo info, final String... values) {
return Arrays
.stream(values)
.map(v -> field(v, info))
.filter(Objects::nonNull)
.filter(distinctByKey(Field::getValue))
.collect(Collectors.toList());
}
public static <T> List<T> listValues(Array values) throws SQLException {
if (Objects.isNull(values)) {
return null;
}
return Arrays
.stream((T[]) values.getArray())
.filter(Objects::nonNull)
.distinct()
.collect(Collectors.toList());
}
public static List<Field<String>> listFields(final DataInfo info, final List<String> values) {
return values
.stream()
.map(v -> field(v, info))
.filter(Objects::nonNull)
.filter(distinctByKey(Field::getValue))
.collect(Collectors.toList());
}
public static Qualifier unknown(final String schemeid, final String schemename) {
return qualifier(UNKNOWN, "Unknown", schemeid, schemename);
}
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid,
final String schemename) {
return accessRight(classid, classname, schemeid, schemename, null);
}
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid,
final String schemename,
final OpenAccessRoute openAccessRoute) {
final AccessRight accessRight = new AccessRight();
accessRight.setClassid(classid);
accessRight.setClassname(classname);
accessRight.setSchemeid(schemeid);
accessRight.setSchemename(schemename);
accessRight.setOpenAccessRoute(openAccessRoute);
return accessRight;
}
public static Qualifier qualifier(
final String classid,
final String classname,
final String schemeid,
final String schemename) {
final Qualifier q = new Qualifier();
q.setClassid(classid);
q.setClassname(classname);
q.setSchemeid(schemeid);
q.setSchemename(schemename);
return q;
}
public static Qualifier qualifier(final Qualifier qualifier) {
final Qualifier q = new Qualifier();
q.setClassid(qualifier.getClassid());
q.setClassname(qualifier.getClassname());
q.setSchemeid(qualifier.getSchemeid());
q.setSchemename(qualifier.getSchemename());
return q;
}
public static Subject subject(
final String value,
final String classid,
final String classname,
final String schemeid,
final String schemename,
final DataInfo dataInfo) {
return subject(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static StructuredProperty structuredProperty(
final String value,
final String classid,
final String classname,
final String schemeid,
final String schemename,
final DataInfo dataInfo) {
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
}
public static Subject subject(
final String value,
final Qualifier qualifier,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final Subject s = new Subject();
s.setValue(value);
s.setQualifier(qualifier);
s.setDataInfo(dataInfo);
return s;
}
public static StructuredProperty structuredProperty(
final String value,
final Qualifier qualifier,
final DataInfo dataInfo) {
if (value == null) {
return null;
}
final StructuredProperty sp = new StructuredProperty();
sp.setValue(value);
sp.setQualifier(qualifier);
sp.setDataInfo(dataInfo);
return sp;
}
public static ExtraInfo extraInfo(
final String name,
final String value,
final String typology,
final String provenance,
final String trust) {
final ExtraInfo info = new ExtraInfo();
info.setName(name);
info.setValue(value);
info.setTypology(typology);
info.setProvenance(provenance);
info.setTrust(trust);
return info;
}
public static OAIProvenance oaiIProvenance(
final String identifier,
final String baseURL,
final String metadataNamespace,
final Boolean altered,
final String datestamp,
final String harvestDate) {
final OriginDescription desc = new OriginDescription();
desc.setIdentifier(identifier);
desc.setBaseURL(baseURL);
desc.setMetadataNamespace(metadataNamespace);
desc.setAltered(altered);
desc.setDatestamp(datestamp);
desc.setHarvestDate(harvestDate);
final OAIProvenance p = new OAIProvenance();
p.setOriginDescription(desc);
return p;
}
public static Journal journal(
final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking,
final DataInfo dataInfo) {
return hasIssn(issnPrinted, issnOnline, issnLinking) ? journal(
name,
issnPrinted,
issnOnline,
issnLinking,
null,
null,
null,
null,
null,
null,
null,
dataInfo) : null;
}
public static Journal journal(
final String name,
final String issnPrinted,
final String issnOnline,
final String issnLinking,
final String ep,
final String iss,
final String sp,
final String vol,
final String edition,
final String conferenceplace,
final String conferencedate,
final DataInfo dataInfo) {
if (StringUtils.isNotBlank(name) || hasIssn(issnPrinted, issnOnline, issnLinking)) {
final Journal j = new Journal();
j.setName(name);
j.setIssnPrinted(issnPrinted);
j.setIssnOnline(issnOnline);
j.setIssnLinking(issnLinking);
j.setEp(ep);
j.setIss(iss);
j.setSp(sp);
j.setVol(vol);
j.setEdition(edition);
j.setConferenceplace(conferenceplace);
j.setConferencedate(conferencedate);
j.setDataInfo(dataInfo);
return j;
} else {
return null;
}
}
private static boolean hasIssn(String issnPrinted, String issnOnline, String issnLinking) {
return StringUtils.isNotBlank(issnPrinted)
|| StringUtils.isNotBlank(issnOnline)
|| StringUtils.isNotBlank(issnLinking);
}
public static DataInfo dataInfo(
final Boolean deletedbyinference,
final String inferenceprovenance,
final Boolean inferred,
final Boolean invisible,
final Qualifier provenanceaction,
final String trust) {
final DataInfo d = new DataInfo();
d.setDeletedbyinference(deletedbyinference);
d.setInferenceprovenance(inferenceprovenance);
d.setInferred(inferred);
d.setInvisible(invisible);
d.setProvenanceaction(provenanceaction);
d.setTrust(trust);
return d;
}
public static String createOpenaireId(
final int prefix,
final String originalId,
final boolean to_md5) {
if (StringUtils.isBlank(originalId)) {
return null;
} else if (to_md5) {
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
final String rest = StringUtils.substringAfter(originalId, "::");
return String.format("%s|%s::%s", prefix, nsPrefix, IdentifierFactory.md5(rest));
} else {
return String.format("%s|%s", prefix, originalId);
}
}
public static String createOpenaireId(
final String type,
final String originalId,
final boolean to_md5) {
switch (type) {
case "datasource":
return createOpenaireId(10, originalId, to_md5);
case "organization":
return createOpenaireId(20, originalId, to_md5);
case "person":
return createOpenaireId(30, originalId, to_md5);
case "project":
return createOpenaireId(40, originalId, to_md5);
default:
return createOpenaireId(50, originalId, to_md5);
}
}
public static String asString(final Object o) {
return o == null ? "" : o.toString();
}
public static <T> Predicate<T> distinctByKey(
final Function<? super T, ?> keyExtractor) {
final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
}
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
return getBestAccessRights(instanceList);
}
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
if (instanceList != null) {
final Optional<AccessRight> min = instanceList
.stream()
.map(Instance::getAccessright)
.min(new AccessRightComparator<>());
final Qualifier rights = min.map(OafMapperUtils::qualifier).orElseGet(Qualifier::new);
if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN);
}
if (StringUtils.isBlank(rights.getClassname())
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
rights.setClassname(NOT_AVAILABLE);
}
if (StringUtils.isBlank(rights.getSchemeid())) {
rights.setSchemeid(DNET_ACCESS_MODES);
}
if (StringUtils.isBlank(rights.getSchemename())) {
rights.setSchemename(DNET_ACCESS_MODES);
}
return rights;
}
return null;
}
public static KeyValue newKeyValueInstance(String key, String value, DataInfo dataInfo) {
KeyValue kv = new KeyValue();
kv.setDataInfo(dataInfo);
kv.setKey(key);
kv.setValue(value);
return kv;
}
public static Measure newMeasureInstance(String id, String value, String key, DataInfo dataInfo) {
Measure m = new Measure();
m.setId(id);
m.setUnit(Arrays.asList(newKeyValueInstance(key, value, dataInfo)));
return m;
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final OafEntity entity) {
return getRelation(source, target, relType, subRelType, relClass, entity, null);
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final OafEntity entity,
final String validationDate) {
return getRelation(
source, target, relType, subRelType, relClass, entity.getCollectedfrom(), entity.getDataInfo(),
entity.getLastupdatetimestamp(), validationDate, null);
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final List<KeyValue> collectedfrom,
final DataInfo dataInfo,
final Long lastupdatetimestamp) {
return getRelation(
source, target, relType, subRelType, relClass, collectedfrom, dataInfo, lastupdatetimestamp, null, null);
}
public static Relation getRelation(final String source,
final String target,
final String relType,
final String subRelType,
final String relClass,
final List<KeyValue> collectedfrom,
final DataInfo dataInfo,
final Long lastupdatetimestamp,
final String validationDate,
final List<KeyValue> properties) {
final Relation rel = new Relation();
rel.setRelType(relType);
rel.setSubRelType(subRelType);
rel.setRelClass(relClass);
rel.setSource(source);
rel.setTarget(target);
rel.setCollectedfrom(collectedfrom);
rel.setDataInfo(dataInfo);
rel.setLastupdatetimestamp(lastupdatetimestamp);
rel.setValidated(StringUtils.isNotBlank(validationDate));
rel.setValidationDate(StringUtils.isNotBlank(validationDate) ? validationDate : null);
rel.setProperties(properties);
return rel;
}
public static String getProvenance(DataInfo dataInfo) {
return Optional
.ofNullable(dataInfo)
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
}
}
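A compact sketch combining a few of the builders above; all literal values are placeholders:

KeyValue collectedFrom = OafMapperUtils.keyValue("10|example_____::datasourceId", "Example Datasource");
DataInfo info = OafMapperUtils
    .dataInfo(false, "", false, false, OafMapperUtils.unknown("dnet:provenanceActions", "dnet:provenanceActions"), "0.9");
StructuredProperty doi = OafMapperUtils
    .structuredProperty("10.1000/example", "doi", "doi", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, info);
Relation rel = OafMapperUtils
    .getRelation("40|project_id", "50|result_id", "resultProject", "outcome", "isProducedBy",
        Arrays.asList(collectedFrom), info, System.currentTimeMillis());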

@ -1,27 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
public class OrganizationPidComparator implements Comparator<PidType> {
@Override
public int compare(PidType pLeft, PidType pRight) {
if (pLeft.equals(PidType.GRID))
return -1;
if (pRight.equals(PidType.GRID))
return 1;
if (pLeft.equals(PidType.mag_id))
return -1;
if (pRight.equals(PidType.mag_id))
return 1;
if (pLeft.equals(PidType.urn))
return -1;
if (pRight.equals(PidType.urn))
return 1;
return 0;
}
}

@ -1,54 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class PidComparator<T extends OafEntity> implements Comparator<StructuredProperty> {
private T entity;
public PidComparator(T entity) {
this.entity = entity;
}
@Override
public int compare(StructuredProperty left, StructuredProperty right) {
if (left == null && right == null)
return 0;
if (left == null)
return 1;
if (right == null)
return -1;
PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
if (lClass.equals(rClass))
return 0;
if (ModelSupport.isSubClass(entity, Result.class)) {
return compareResultPids(lClass, rClass);
}
if (ModelSupport.isSubClass(entity, Organization.class)) {
return compareOrganizationtPids(lClass, rClass);
}
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
}
private int compareResultPids(PidType lClass, PidType rClass) {
return new ResultPidComparator().compare(lClass, rClass);
}
private int compareOrganizationtPids(PidType lClass, PidType rClass) {
return new OrganizationPidComparator().compare(lClass, rClass);
}
}

@ -1,29 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import org.apache.commons.lang3.EnumUtils;
public enum PidType {
// Result
doi, pmid, pmc, handle, arXiv, NCID, GBIF, nct, pdb,
// Organization
GRID, mag_id, urn,
// Used by dedup
undefined, original;
public static boolean isValid(String type) {
return EnumUtils.isValidEnum(PidType.class, type);
}
public static PidType tryValueOf(String s) {
try {
return PidType.valueOf(s);
} catch (Exception e) {
return PidType.original;
}
}
}
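tryValueOf deliberately degrades unknown classids to PidType.original instead of throwing, so callers can map arbitrary pid types safely; a tiny sketch:

PidType known = PidType.tryValueOf("doi");         // PidType.doi
PidType fallback = PidType.tryValueOf("scopusid"); // PidType.original, no exception
boolean valid = PidType.isValid("pmc");            // true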

@ -1,57 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
public class ResultPidComparator implements Comparator<PidType> {
@Override
public int compare(PidType pLeft, PidType pRight) {
if (pLeft.equals(PidType.doi))
return -1;
if (pRight.equals(PidType.doi))
return 1;
if (pLeft.equals(PidType.pmid))
return -1;
if (pRight.equals(PidType.pmid))
return 1;
if (pLeft.equals(PidType.pmc))
return -1;
if (pRight.equals(PidType.pmc))
return 1;
if (pLeft.equals(PidType.handle))
return -1;
if (pRight.equals(PidType.handle))
return 1;
if (pLeft.equals(PidType.arXiv))
return -1;
if (pRight.equals(PidType.arXiv))
return 1;
if (pLeft.equals(PidType.NCID))
return -1;
if (pRight.equals(PidType.NCID))
return 1;
if (pLeft.equals(PidType.GBIF))
return -1;
if (pRight.equals(PidType.GBIF))
return 1;
if (pLeft.equals(PidType.nct))
return -1;
if (pRight.equals(PidType.nct))
return 1;
if (pLeft.equals(PidType.urn))
return -1;
if (pRight.equals(PidType.urn))
return 1;
return 0;
}
}

@ -0,0 +1,46 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
import static org.apache.commons.lang3.StringUtils.isBlank;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.oaf.Subject;
public class SubjectProvenanceComparator implements Comparator<Subject> {
@Override
public int compare(Subject left, Subject right) {
String lProv = getProvenance(left.getDataInfo());
String rProv = getProvenance(right.getDataInfo());
if (isBlank(lProv) && isBlank(rProv))
return 0;
if (isBlank(lProv))
return 1;
if (isBlank(rProv))
return -1;
if (lProv.equals(rProv))
return 0;
if (lProv.toLowerCase().contains("crosswalk"))
return -1;
if (rProv.toLowerCase().contains("crosswalk"))
return 1;
if (lProv.toLowerCase().contains("user"))
return -1;
if (rProv.toLowerCase().contains("user"))
return 1;
if (lProv.toLowerCase().contains("propagation"))
return -1;
if (rProv.toLowerCase().contains("propagation"))
return 1;
if (lProv.toLowerCase().contains("iis"))
return -1;
if (rProv.toLowerCase().contains("iis"))
return 1;
return 0;
}
}
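This comparator backs the subject de-duplication in GraphCleaningFunctions.cleanup, where subjects colliding on classid and value are resolved by keeping the one with the stronger provenance. A minimal sketch with hypothetical instances:

// sysimportSubject and inferredSubject are hypothetical Subject instances carrying different provenance actions
Subject kept = Collections.min(Lists.newArrayList(sysimportSubject, inferredSubject), new SubjectProvenanceComparator());
// a provenance containing "crosswalk" wins over user claims, propagation and IIS inference, per the ordering above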

@ -1,63 +1,112 @@
package eu.dnetlib.dhp.utils;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.codec.binary.Base64OutputStream;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import net.minidev.json.JSONArray;
import scala.collection.JavaConverters;
import scala.collection.Seq;
public class DHPUtils {
private static final Logger log = LoggerFactory.getLogger(DHPUtils.class);
private DHPUtils() {
}
public static Seq<String> toSeq(List<String> list) {
return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq();
}
public static String md5(final String s) {
try {
final MessageDigest md = MessageDigest.getInstance("MD5");
md.update(s.getBytes(StandardCharsets.UTF_8));
return new String(Hex.encodeHex(md.digest()));
} catch (final Exception e) {
System.err.println("Error creating id");
log.error("Error creating id from {}", s);
return null;
}
}
/**
* Retrieves from the metadata store manager application the list of paths associated with mdstores characterized
* by the given format, layout and interpretation
* @param mdstoreManagerUrl the URL of the mdstore manager service
* @param format the mdstore format
* @param layout the mdstore layout
* @param interpretation the mdstore interpretation
* @param includeEmpty whether to include empty mdstores
* @return the set of hdfs paths
* @throws IOException in case of HTTP communication issues
*/
public static Set<String> mdstorePaths(final String mdstoreManagerUrl,
final String format,
final String layout,
final String interpretation,
boolean includeEmpty) throws IOException {
final String url = mdstoreManagerUrl + "/mdstores/";
final ObjectMapper objectMapper = new ObjectMapper();
final HttpGet req = new HttpGet(url);
log.info("MDStoreManager request: {}", req);
try (final CloseableHttpClient client = HttpClients.createDefault()) {
try (final CloseableHttpResponse response = client.execute(req)) {
final String json = IOUtils.toString(response.getEntity().getContent());
log.info("MDStoreManager response: {}", json);
final MDStoreWithInfo[] mdstores = objectMapper.readValue(json, MDStoreWithInfo[].class);
return Arrays
.stream(mdstores)
.filter(md -> md.getFormat().equalsIgnoreCase(format))
.filter(md -> md.getLayout().equalsIgnoreCase(layout))
.filter(md -> md.getInterpretation().equalsIgnoreCase(interpretation))
.filter(md -> StringUtils.isNotBlank(md.getHdfsPath()))
.filter(md -> StringUtils.isNotBlank(md.getCurrentVersion()))
.filter(md -> includeEmpty || md.getSize() > 0)
.map(md -> md.getHdfsPath() + "/" + md.getCurrentVersion() + "/store")
.collect(Collectors.toSet());
}
}
}
public static String generateIdentifier(final String originalId, final String nsPrefix) {
return String.format("%s::%s", nsPrefix, DHPUtils.md5(originalId));
}
public static String compressString(final String input) {
try (ByteArrayOutputStream out = new ByteArrayOutputStream();
Base64OutputStream b64os = new Base64OutputStream(out)) {
GZIPOutputStream gzip = new GZIPOutputStream(b64os);
gzip.write(input.getBytes(StandardCharsets.UTF_8));
gzip.close();
return out.toString();
} catch (Throwable e) {
return null;
}
}
public static String generateUnresolvedIdentifier(final String pid, final String pidType) {
final String cleanedPid = CleaningFunctions.normalizePidValue(pidType, pid);
return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim());
}
public static String decompressString(final String input) {
byte[] byteArray = Base64.decodeBase64(input.getBytes());
int len;
try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream((byteArray)));
ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length)) {
byte[] buffer = new byte[1024];
while ((len = gis.read(buffer)) != -1) {
bos.write(buffer, 0, len);
}
return bos.toString();
} catch (Exception e) {
return null;
}
}
public static String getJPathString(final String jsonPath, final String json) {
@ -72,4 +121,72 @@ public class DHPUtils {
return "";
}
}
public static final ObjectMapper MAPPER = new ObjectMapper();
public static void writeHdfsFile(final Configuration conf, final String content, final String path)
throws IOException {
log.info("writing file {}, size {}", path, content.length());
try (FileSystem fs = FileSystem.get(conf);
BufferedOutputStream os = new BufferedOutputStream(fs.create(new Path(path)))) {
os.write(content.getBytes(StandardCharsets.UTF_8));
os.flush();
}
}
public static String readHdfsFile(Configuration conf, String path) throws IOException {
log.info("reading file {}", path);
try (FileSystem fs = FileSystem.get(conf)) {
final Path p = new Path(path);
if (!fs.exists(p)) {
throw new FileNotFoundException(path);
}
return IOUtils.toString(fs.open(p));
}
}
public static <T> T readHdfsFileAs(Configuration conf, String path, Class<T> clazz) throws IOException {
return MAPPER.readValue(readHdfsFile(conf, path), clazz);
}
public static <T> void saveDataset(final Dataset<T> mdstore, final String targetPath) {
log.info("saving dataset in: {}", targetPath);
mdstore
.write()
.mode(SaveMode.Overwrite)
.format("parquet")
.save(targetPath);
}
public static Configuration getHadoopConfiguration(String nameNode) {
// ====== Init HDFS File System Object
Configuration conf = new Configuration();
// Set FileSystem URI
conf.set("fs.defaultFS", nameNode);
// Because of Maven
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
System.setProperty("hadoop.home.dir", "/");
return conf;
}
public static void populateOOZIEEnv(final Map<String, String> report) throws IOException {
File file = new File(System.getProperty("oozie.action.output.properties"));
Properties props = new Properties();
report.forEach((k, v) -> props.setProperty(k, v));
try (OutputStream os = new FileOutputStream(file)) {
props.store(os, "");
}
}
public static void populateOOZIEEnv(final String paramName, String value) throws IOException {
Map<String, String> report = Maps.newHashMap();
report.put(paramName, value);
populateOOZIEEnv(report);
}
}
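A short sketch of the identifier and compression helpers above; the input strings are arbitrary:

String id = DHPUtils.generateIdentifier("oai:example.org:1234", "ns_prefix___"); // "ns_prefix___::" + md5 of the original id
String packed = DHPUtils.compressString("{\"a\":1}");
String unpacked = DHPUtils.decompressString(packed); // round-trips to the original string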

@ -1,11 +1,11 @@
package eu.dnetlib.dhp.utils;
import java.util.Map;
import javax.xml.ws.BindingProvider;
import org.apache.cxf.endpoint.Client;
import org.apache.cxf.frontend.ClientProxy;
import org.apache.cxf.jaxws.JaxWsProxyFactoryBean;
import org.apache.cxf.transport.http.HTTPConduit;
import org.apache.cxf.transports.http.configuration.HTTPClientPolicy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -15,8 +15,11 @@ public class ISLookupClientFactory {
private static final Logger log = LoggerFactory.getLogger(ISLookupClientFactory.class);
private static int requestTimeout = 60000 * 10;
private static int connectTimeout = 60000 * 10;
private static final int requestTimeout = 60000 * 10;
private static final int connectTimeout = 60000 * 10;
private ISLookupClientFactory() {
}
public static ISLookUpService getLookUpService(final String isLookupUrl) {
return getServiceStub(ISLookUpService.class, isLookupUrl);
@ -24,27 +27,28 @@ public class ISLookupClientFactory {
@SuppressWarnings("unchecked")
private static <T> T getServiceStub(final Class<T> clazz, final String endpoint) {
log.info(String.format("creating %s stub from %s", clazz.getName(), endpoint));
log.info("creating {} stub from {}", clazz.getName(), endpoint);
final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean();
jaxWsProxyFactory.setServiceClass(clazz);
jaxWsProxyFactory.setAddress(endpoint);
final T service = (T) jaxWsProxyFactory.create();
if (service instanceof BindingProvider) {
Client client = ClientProxy.getClient(service);
if (client != null) {
HTTPConduit conduit = (HTTPConduit) client.getConduit();
HTTPClientPolicy policy = new HTTPClientPolicy();
log
.info(
"setting timeouts for {} to requestTimeout: {}, connectTimeout: {}",
BindingProvider.class.getName(), requestTimeout, connectTimeout);
Map<String, Object> requestContext = ((BindingProvider) service).getRequestContext();
requestContext.put("com.sun.xml.internal.ws.request.timeout", requestTimeout);
requestContext.put("com.sun.xml.internal.ws.connect.timeout", connectTimeout);
requestContext.put("com.sun.xml.ws.request.timeout", requestTimeout);
requestContext.put("com.sun.xml.ws.connect.timeout", connectTimeout);
requestContext.put("javax.xml.ws.client.receiveTimeout", requestTimeout);
requestContext.put("javax.xml.ws.client.connectionTimeout", connectTimeout);
"setting connectTimeout to {}, requestTimeout to {} for service {}",
connectTimeout,
requestTimeout,
clazz.getCanonicalName());
policy.setConnectionTimeout(connectTimeout);
policy.setReceiveTimeout(requestTimeout);
conduit.setClient(policy);
}
return service;

@ -10,7 +10,7 @@ import net.sf.saxon.trans.XPathException;
public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinition {
public static String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension";
public static final String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension";
public abstract String getName();

@ -26,7 +26,7 @@ public class ExtractYear extends AbstractExtensionFunction {
@Override
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
if (arguments == null | arguments.length == 0) {
if (arguments == null || arguments.length == 0) {
return new StringValue("");
}
final Item item = arguments[0].head();
@ -63,8 +63,7 @@ public class ExtractYear extends AbstractExtensionFunction {
for (String format : dateFormats) {
try {
c.setTime(new SimpleDateFormat(format).parse(s));
String year = String.valueOf(c.get(Calendar.YEAR));
return year;
return String.valueOf(c.get(Calendar.YEAR));
} catch (ParseException e) {
}
}

@ -30,7 +30,7 @@ public class NormalizeDate extends AbstractExtensionFunction {
@Override
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
if (arguments == null | arguments.length == 0) {
if (arguments == null || arguments.length == 0) {
return new StringValue(BLANK);
}
String s = arguments[0].head().getStringValue();

@ -1,6 +1,8 @@
package eu.dnetlib.dhp.utils.saxon;
import static org.apache.commons.lang3.StringUtils.isNotBlank;
import org.apache.commons.lang3.StringUtils;
import net.sf.saxon.expr.XPathContext;
@ -26,7 +28,8 @@ public class PickFirst extends AbstractExtensionFunction {
final String s1 = getValue(arguments[0]);
final String s2 = getValue(arguments[1]);
return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : "");
final String value = isNotBlank(s1) ? s1 : isNotBlank(s2) ? s2 : "";
return new StringValue(value);
}
private String getValue(final Sequence arg) throws XPathException {

@ -12,6 +12,9 @@ import net.sf.saxon.TransformerFactoryImpl;
public class SaxonTransformerFactory {
private SaxonTransformerFactory() {
}
/**
* Creates the index record transformer from the given XSLT
*

@ -1,76 +0,0 @@
package eu.dnetlib.message;
import java.io.IOException;
import java.util.Map;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
public class Message {
private String workflowId;
private String jobName;
private MessageType type;
private Map<String, String> body;
public static Message fromJson(final String json) throws IOException {
final ObjectMapper jsonMapper = new ObjectMapper();
return jsonMapper.readValue(json, Message.class);
}
public Message() {
}
public Message(String workflowId, String jobName, MessageType type, Map<String, String> body) {
this.workflowId = workflowId;
this.jobName = jobName;
this.type = type;
this.body = body;
}
public String getWorkflowId() {
return workflowId;
}
public void setWorkflowId(String workflowId) {
this.workflowId = workflowId;
}
public String getJobName() {
return jobName;
}
public void setJobName(String jobName) {
this.jobName = jobName;
}
public MessageType getType() {
return type;
}
public void setType(MessageType type) {
this.type = type;
}
public Map<String, String> getBody() {
return body;
}
public void setBody(Map<String, String> body) {
this.body = body;
}
@Override
public String toString() {
final ObjectMapper jsonMapper = new ObjectMapper();
try {
return jsonMapper.writeValueAsString(this);
} catch (JsonProcessingException e) {
return null;
}
}
}

@ -1,47 +0,0 @@
package eu.dnetlib.message;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.LinkedBlockingQueue;
import com.rabbitmq.client.AMQP;
import com.rabbitmq.client.Channel;
import com.rabbitmq.client.DefaultConsumer;
import com.rabbitmq.client.Envelope;
public class MessageConsumer extends DefaultConsumer {
final LinkedBlockingQueue<Message> queueMessages;
/**
* Constructs a new instance and records its association to the passed-in channel.
*
* @param channel the channel to which this consumer is attached
* @param queueMessages
*/
public MessageConsumer(Channel channel, LinkedBlockingQueue<Message> queueMessages) {
super(channel);
this.queueMessages = queueMessages;
}
@Override
public void handleDelivery(
String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body)
throws IOException {
final String json = new String(body, StandardCharsets.UTF_8);
Message message = Message.fromJson(json);
try {
this.queueMessages.put(message);
System.out.println("Receiving Message " + message);
} catch (InterruptedException e) {
if (message.getType() == MessageType.REPORT)
throw new RuntimeException("Error on sending message");
else {
// TODO LOGGING EXCEPTION
}
} finally {
getChannel().basicAck(envelope.getDeliveryTag(), false);
}
}
}

@ -1,136 +0,0 @@
package eu.dnetlib.message;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeoutException;
import com.rabbitmq.client.Channel;
import com.rabbitmq.client.Connection;
import com.rabbitmq.client.ConnectionFactory;
public class MessageManager {
private final String messageHost;
private final String username;
private final String password;
private Connection connection;
private final Map<String, Channel> channels = new HashMap<>();
private boolean durable;
private boolean autodelete;
private final LinkedBlockingQueue<Message> queueMessages;
public MessageManager(
String messageHost,
String username,
String password,
final LinkedBlockingQueue<Message> queueMessages) {
this.queueMessages = queueMessages;
this.messageHost = messageHost;
this.username = username;
this.password = password;
}
public MessageManager(
String messageHost,
String username,
String password,
boolean durable,
boolean autodelete,
final LinkedBlockingQueue<Message> queueMessages) {
this.queueMessages = queueMessages;
this.messageHost = messageHost;
this.username = username;
this.password = password;
this.durable = durable;
this.autodelete = autodelete;
}
private Connection createConnection() throws IOException, TimeoutException {
ConnectionFactory factory = new ConnectionFactory();
factory.setHost(this.messageHost);
factory.setUsername(this.username);
factory.setPassword(this.password);
return factory.newConnection();
}
private Channel createChannel(
final Connection connection,
final String queueName,
final boolean durable,
final boolean autodelete)
throws Exception {
Map<String, Object> args = new HashMap<>();
args.put("x-message-ttl", 10000);
Channel channel = connection.createChannel();
channel.queueDeclare(queueName, durable, false, this.autodelete, args);
return channel;
}
private Channel getOrCreateChannel(final String queueName, boolean durable, boolean autodelete)
throws Exception {
if (channels.containsKey(queueName)) {
return channels.get(queueName);
}
if (this.connection == null) {
this.connection = createConnection();
}
channels.put(queueName, createChannel(this.connection, queueName, durable, autodelete));
return channels.get(queueName);
}
public void close() throws IOException {
channels
.values()
.forEach(
ch -> {
try {
ch.close();
} catch (Exception e) {
// TODO LOG
}
});
this.connection.close();
}
public boolean sendMessage(final Message message, String queueName) throws Exception {
try {
Channel channel = getOrCreateChannel(queueName, this.durable, this.autodelete);
channel.basicPublish("", queueName, null, message.toString().getBytes());
return true;
} catch (Throwable e) {
throw new RuntimeException(e);
}
}
public boolean sendMessage(
final Message message, String queueName, boolean durable_var, boolean autodelete_var)
throws Exception {
try {
Channel channel = getOrCreateChannel(queueName, durable_var, autodelete_var);
channel.basicPublish("", queueName, null, message.toString().getBytes());
return true;
} catch (Throwable e) {
throw new RuntimeException(e);
}
}
public void startConsumingMessage(
final String queueName, final boolean durable, final boolean autodelete) throws Exception {
Channel channel = createChannel(createConnection(), queueName, durable, autodelete);
channel.basicConsume(queueName, false, new MessageConsumer(channel, queueMessages));
}
}

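The three removed classes above (Message, MessageConsumer, MessageManager) formed a small RabbitMQ-based status channel: a Message is serialised to JSON and published on a named queue, MessageConsumer acknowledges deliveries and pushes the parsed messages onto a shared LinkedBlockingQueue, and MessageManager owns the connection and channels. A minimal sketch of how they were wired together is shown below; the host, credentials and queue name are placeholders, not values from this repository.

import java.util.concurrent.LinkedBlockingQueue

import scala.collection.JavaConverters._

import eu.dnetlib.message.{Message, MessageManager, MessageType}

object MessagingSketch {

  def main(args: Array[String]): Unit = {
    // shared queue filled by MessageConsumer.handleDelivery
    val inbox = new LinkedBlockingQueue[Message]()

    // host, user and password are placeholders
    val manager = new MessageManager("rabbit.example.org", "user", "secret", false, false, inbox)

    // publish an ONGOING status message on a (made-up) queue name
    val body = Map("parsedItem" -> "300", "ExecutionTime" -> "30s").asJava
    manager.sendMessage(new Message("wfId", "Collection", MessageType.ONGOING, body), "dhp.status")

    // start consuming from the same queue and block until the first message arrives
    manager.startConsumingMessage("dhp.status", false, false)
    println("received: " + inbox.take())

    manager.close()
  }
}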
@ -1,6 +0,0 @@
package eu.dnetlib.message;
public enum MessageType {
ONGOING, REPORT
}

@ -0,0 +1,30 @@
[
{
"paramName":"s",
"paramLongName":"sourcePath",
"paramDescription": "the path of the sequencial file to read",
"paramRequired": true
},
{
"paramName": "hdp",
"paramLongName": "hdfsPath",
"paramDescription": "the path used to store the output archive",
"paramRequired": true
},
{
"paramName":"nn",
"paramLongName":"nameNode",
"paramDescription": "the name node",
"paramRequired": true
},
{
"paramName":"ss",
"paramLongName":"splitSize",
"paramDescription": "the maximum size of the archive",
"paramRequired": false
}
]

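Descriptors like the one above drive the command-line parsing of each job: ArgumentApplicationParser is constructed from the JSON and then fed the actual arguments, which are passed as --<paramLongName> value pairs and read back with get(). A minimal sketch, assuming an illustrative classpath location for the descriptor and made-up argument values:

import scala.io.Source

import eu.dnetlib.dhp.application.ArgumentApplicationParser

object MakeTarArgsSketch {

  def main(args: Array[String]): Unit = {
    // the resource path is an assumption for this sketch, not the actual location in the module
    val json = Source
      .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/common/input_maketar_parameters.json"))
      .mkString

    val parser = new ArgumentApplicationParser(json)
    parser
      .parseArgument(
        Array(
          "--sourcePath", "/tmp/sequence_file",
          "--hdfsPath", "/tmp/output.tar",
          "--nameNode", "hdfs://nameservice1"
        )
      )

    // splitSize is optional (paramRequired = false), the others are mandatory
    val sourcePath = parser.get("sourcePath")
    val hdfsPath = parser.get("hdfsPath")
    println(s"reading $sourcePath, writing the archive to $hdfsPath")
  }
}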
@ -0,0 +1,26 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "i",
"paramLongName": "inputPath",
"paramDescription": "the source path",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "outputPath",
"paramDescription": "path of the output graph",
"paramRequired": true
},
{
"paramName": "c",
"paramLongName": "graphTableClassName",
"paramDescription": "the graph entity class name",
"paramRequired": true
}
]

@ -0,0 +1,20 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "gin",
"paramLongName": "graphInputPath",
"paramDescription": "the graph root path",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the output merged graph root path",
"paramRequired": true
}
]

File diff suppressed because one or more lines are too long

@ -0,0 +1,73 @@
package eu.dnetlib.dhp.application
import scala.io.Source
/** This is the main interface SparkApplication
* from which all Spark Scala classes should inherit
*/
trait SparkScalaApplication {
/** This is the classpath location of the JSON
* that describes all the arguments needed to run
*/
val propertyPath: String
/** Utility to parse the arguments using the
* property JSON in the classpath identified by
* the variable propertyPath
*
* @param args the list of arguments
*/
def parseArguments(args: Array[String]): ArgumentApplicationParser = {
val parser = new ArgumentApplicationParser(
Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString
)
parser.parseArgument(args)
parser
}
/** All Spark applications run this method,
* where the whole logic of the Spark job is defined
*/
def run(): Unit
}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.slf4j.Logger
abstract class AbstractScalaApplication(
val propertyPath: String,
val args: Array[String],
log: Logger
) extends SparkScalaApplication {
var parser: ArgumentApplicationParser = null
var spark: SparkSession = null
def initialize(): SparkScalaApplication = {
parser = parseArguments(args)
spark = createSparkSession()
this
}
/** Utility for creating a Spark session starting from the parser
*
* @return a Spark session
*/
private def createSparkSession(): SparkSession = {
require(parser != null)
val conf: SparkConf = new SparkConf()
val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master")
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(master)
.getOrCreate()
}
}

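Concrete Spark jobs extend AbstractScalaApplication, pointing propertyPath at their JSON descriptor (which must also declare the master parameter read by createSparkSession) and putting the job logic in run(); a companion object then calls initialize().run(). The sketch below follows that pattern with a hypothetical job, a placeholder descriptor path and made-up parameter names.

import org.slf4j.{Logger, LoggerFactory}

import eu.dnetlib.dhp.application.AbstractScalaApplication

/** Hypothetical job copying a parquet dataset from inputPath to outputPath. */
class SparkCopyDataset(propertyPath: String, args: Array[String], log: Logger)
    extends AbstractScalaApplication(propertyPath, args, log) {

  /** The whole logic of the job lives here; parser and spark are set by initialize(). */
  override def run(): Unit = {
    val inputPath = parser.get("inputPath")
    val outputPath = parser.get("outputPath")
    log.info(s"copying $inputPath -> $outputPath")
    spark.read.parquet(inputPath).write.mode("overwrite").parquet(outputPath)
  }
}

object SparkCopyDataset {

  val log: Logger = LoggerFactory.getLogger(getClass)

  def main(args: Array[String]): Unit = {
    // the descriptor path is a placeholder for this sketch
    new SparkCopyDataset("/eu/dnetlib/dhp/copy_dataset_parameters.json", args, log)
      .initialize()
      .run()
  }
}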
@ -0,0 +1,442 @@
package eu.dnetlib.dhp.sx.graph.scholix
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.schema.sx.scholix._
import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import scala.collection.JavaConverters._
import scala.io.Source
object ScholixUtils extends Serializable {
val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"
val DATE_RELATION_KEY: String = "RelationDate"
case class RelationVocabulary(original: String, inverse: String) {}
case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}
val relations: Map[String, RelationVocabulary] = {
val input = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")
)
.mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
json.extract[Map[String, RelationVocabulary]]
}
def extractRelationDate(relation: Relation): String = {
if (relation.getProperties == null || relation.getProperties.isEmpty)
null
else {
val date = relation.getProperties.asScala
.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey))
.map(p => p.getValue)
if (date.isDefined)
date.get
else
null
}
}
def extractRelationDate(summary: ScholixSummary): String = {
if (summary.getDate == null || summary.getDate.isEmpty)
null
else {
summary.getDate.get(0)
}
}
def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = {
new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName)
}
def generateScholixResourceFromResult(r: Result): ScholixResource = {
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
}
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
override def zero: RelatedEntities = null
override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
val relatedDataset = if ("dataset".equalsIgnoreCase(a._2)) a._3 else 0
val relatedPublication = if ("publication".equalsIgnoreCase(a._2)) a._3 else 0
if (b == null)
RelatedEntities(a._1, relatedDataset, relatedPublication)
else
RelatedEntities(
a._1,
b.relatedDataset + relatedDataset,
b.relatedPublication + relatedPublication
)
}
override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
if (b1 != null && b2 != null)
RelatedEntities(
b1.id,
b1.relatedDataset + b2.relatedDataset,
b1.relatedPublication + b2.relatedPublication
)
else if (b1 != null)
b1
else
b2
}
override def finish(reduction: RelatedEntities): RelatedEntities = reduction
override def bufferEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
}
val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] =
new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
override def zero: Scholix = null
def scholix_complete(s: Scholix): Boolean = {
if (s == null || s.getIdentifier == null) {
false
} else if (s.getSource == null || s.getTarget == null) {
false
} else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
false
else
true
}
override def reduce(b: Scholix, a: (String, Scholix)): Scholix = {
if (scholix_complete(b)) b else a._2
}
override def merge(b1: Scholix, b2: Scholix): Scholix = {
if (scholix_complete(b1)) b1 else b2
}
override def finish(reduction: Scholix): Scholix = reduction
override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
}
def createInverseScholixRelation(scholix: Scholix): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
s.setPublisher(scholix.getPublisher)
s.setLinkprovider(scholix.getLinkprovider)
s.setRelationship(inverseRelationShip(scholix.getRelationship))
s.setSource(scholix.getTarget)
s.setTarget(scholix.getSource)
s.setIdentifier(
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s
}
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
}(collection.breakOut)
l
} else List()
}
def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = {
if (summary.getDatasources != null && !summary.getDatasources.isEmpty) {
val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { d =>
new ScholixEntityId(
d.getDatasourceName,
List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava
)
}(collection.breakOut)
l
} else List()
}
def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = {
if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) {
val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { c =>
new ScholixEntityId(
c.getValue,
List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava
)
}.toList
l
} else List()
}
def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
s.setPublisher(scholix.getPublisher)
s.setLinkprovider(scholix.getLinkprovider)
s.setRelationship(scholix.getRelationship)
s.setSource(scholix.getSource)
s.setTarget(generateScholixResourceFromSummary(target))
s.setIdentifier(
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s
}
def generateCompleteScholix(scholix: Scholix, target: ScholixResource): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
s.setPublisher(scholix.getPublisher)
s.setLinkprovider(scholix.getLinkprovider)
s.setRelationship(scholix.getRelationship)
s.setSource(scholix.getSource)
s.setTarget(target)
s.setIdentifier(
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s
}
def generateScholixResourceFromSummary(summaryObject: ScholixSummary): ScholixResource = {
val r = new ScholixResource
r.setIdentifier(summaryObject.getLocalIdentifier)
r.setDnetIdentifier(summaryObject.getId)
r.setObjectType(summaryObject.getTypology.toString)
r.setObjectSubType(summaryObject.getSubType)
if (summaryObject.getTitle != null && !summaryObject.getTitle.isEmpty)
r.setTitle(summaryObject.getTitle.get(0))
if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
val l: List[ScholixEntityId] =
summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
if (l.nonEmpty)
r.setCreator(l.asJava)
}
if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty)
r.setPublicationDate(summaryObject.getDate.get(0))
if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) {
val plist: List[ScholixEntityId] =
summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
if (plist.nonEmpty)
r.setPublisher(plist.asJava)
}
if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) {
val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala
.map(c =>
new ScholixCollectedFrom(
new ScholixEntityId(
c.getDatasourceName,
List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava
),
"collected",
"complete"
)
)
.toList
if (l.nonEmpty)
r.setCollectedFrom(l.asJava)
}
r
}
def scholixFromSource(relation: Relation, source: ScholixResource): Scholix = {
if (relation == null || source == null)
return null
val s = new Scholix
var l: List[ScholixEntityId] = extractCollectedFrom(relation)
if (l.isEmpty)
l = extractCollectedFrom(source)
if (l.isEmpty)
return null
s.setLinkprovider(l.asJava)
var d = extractRelationDate(relation)
if (d == null)
d = source.getPublicationDate
s.setPublicationDate(d)
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
s.setPublisher(source.getPublisher)
}
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
if (semanticRelation == null)
return null
s.setRelationship(
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
)
s.setSource(source)
s
}
def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = {
if (relation == null || source == null)
return null
val s = new Scholix
var l: List[ScholixEntityId] = extractCollectedFrom(relation)
if (l.isEmpty)
l = extractCollectedFrom(source)
if (l.isEmpty)
return null
s.setLinkprovider(l.asJava)
var d = extractRelationDate(relation)
if (d == null)
d = extractRelationDate(source)
s.setPublicationDate(d)
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
val l: List[ScholixEntityId] = source.getPublisher.asScala
.map { p =>
new ScholixEntityId(p, null)
}(collection.breakOut)
if (l.nonEmpty)
s.setPublisher(l.asJava)
}
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
if (semanticRelation == null)
return null
s.setRelationship(
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
)
s.setSource(generateScholixResourceFromSummary(source))
s
}
def findURLForPID(
pidValue: List[StructuredProperty],
urls: List[String]
): List[(StructuredProperty, String)] = {
pidValue.map { p =>
val pv = p.getValue
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
(p, r.orNull)
}
}
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
if (r.getInstance() == null || r.getInstance().isEmpty)
return List()
r.getInstance()
.asScala
.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
.filter(i => i.getPid != null && i.getUrl != null)
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
.distinct
.toList
}
def resultToSummary(r: Result): ScholixSummary = {
val s = new ScholixSummary
s.setId(r.getId)
if (r.getPid == null || r.getPid.isEmpty)
return null
val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(r)
if (persistentIdentifiers.isEmpty)
return null
s.setLocalIdentifier(persistentIdentifiers.asJava)
if (r.isInstanceOf[Publication])
s.setTypology(Typology.publication)
else
s.setTypology(Typology.dataset)
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
if (r.getTitle != null && r.getTitle.asScala.nonEmpty) {
val titles: List[String] = r.getTitle.asScala.map(t => t.getValue).toList
if (titles.nonEmpty)
s.setTitle(titles.asJava)
else
return null
}
if (r.getAuthor != null && !r.getAuthor.isEmpty) {
val authors: List[String] = r.getAuthor.asScala.map(a => a.getFullname).toList
if (authors.nonEmpty)
s.setAuthor(authors.asJava)
}
if (r.getInstance() != null) {
val dt: List[String] = r
.getInstance()
.asScala
.filter(i => i.getDateofacceptance != null)
.map(i => i.getDateofacceptance.getValue)
.toList
if (dt.nonEmpty)
s.setDate(dt.distinct.asJava)
}
if (r.getDescription != null && !r.getDescription.isEmpty) {
val d = r.getDescription.asScala.find(f => f != null && f.getValue != null)
if (d.isDefined)
s.setDescription(d.get.getValue)
}
if (r.getSubject != null && !r.getSubject.isEmpty) {
val subjects: List[SchemeValue] = r.getSubject.asScala
.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))
.toList
if (subjects.nonEmpty)
s.setSubject(subjects.asJava)
}
if (r.getPublisher != null)
s.setPublisher(List(r.getPublisher.getValue).asJava)
if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) {
val cf: List[CollectedFromType] = r.getCollectedfrom.asScala
.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))
.toList
if (cf.nonEmpty)
s.setDatasources(cf.distinct.asJava)
}
s.setRelatedDatasets(0)
s.setRelatedPublications(0)
s.setRelatedUnknown(0)
s
}
}

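Besides the plain mapping helpers (resultToSummary, scholixFromSource, generateCompleteScholix), ScholixUtils exposes two Spark Aggregators meant to be applied after a groupByKey, collapsing the partial records that share the same key into a single one; scholixAggregator, for instance, keeps a "complete" Scholix (source, target and link provider all set) when one is available. A minimal sketch of that usage, assuming the keyed input Dataset already exists:

import org.apache.spark.sql.{Dataset, Encoder, Encoders}

import eu.dnetlib.dhp.schema.sx.scholix.Scholix
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils

object ScholixAggregationSketch {

  /** Reduces the (identifier, Scholix) pairs sharing the same identifier to one record each. */
  def deduplicate(keyed: Dataset[(String, Scholix)]): Dataset[Scholix] = {
    implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]

    keyed
      .groupByKey(_._1)(Encoders.STRING)
      .agg(ScholixUtils.scholixAggregator.toColumn)
      .map(_._2)
  }
}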
@ -7,10 +7,10 @@ import static org.junit.jupiter.api.Assertions.assertNotNull;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
public class ArgumentApplicationParserTest {
class ArgumentApplicationParserTest {
@Test
public void testParseParameter() throws Exception {
void testParseParameter() throws Exception {
final String jsonConfiguration = IOUtils
.toString(
this.getClass().getResourceAsStream("/eu/dnetlib/application/parameters.json"));

@ -21,13 +21,13 @@ public class HdfsSupportTest {
class Remove {
@Test
public void shouldThrowARuntimeExceptionOnError() {
void shouldThrowARuntimeExceptionOnError() {
// when
assertThrows(RuntimeException.class, () -> HdfsSupport.remove(null, new Configuration()));
}
@Test
public void shouldRemoveADirFromHDFS(@TempDir Path tempDir) {
void shouldRemoveADirFromHDFS(@TempDir Path tempDir) {
// when
HdfsSupport.remove(tempDir.toString(), new Configuration());
@ -36,7 +36,7 @@ public class HdfsSupportTest {
}
@Test
public void shouldRemoveAFileFromHDFS(@TempDir Path tempDir) throws IOException {
void shouldRemoveAFileFromHDFS(@TempDir Path tempDir) throws IOException {
// given
Path file = Files.createTempFile(tempDir, "p", "s");
@ -52,13 +52,13 @@ public class HdfsSupportTest {
class ListFiles {
@Test
public void shouldThrowARuntimeExceptionOnError() {
void shouldThrowARuntimeExceptionOnError() {
// when
assertThrows(RuntimeException.class, () -> HdfsSupport.listFiles(null, new Configuration()));
}
@Test
public void shouldListFilesLocatedInPath(@TempDir Path tempDir) throws IOException {
void shouldListFilesLocatedInPath(@TempDir Path tempDir) throws IOException {
Path subDir1 = Files.createTempDirectory(tempDir, "list_me");
Path subDir2 = Files.createTempDirectory(tempDir, "list_me");

@ -5,10 +5,10 @@ import static org.junit.jupiter.api.Assertions.*;
import org.junit.jupiter.api.Test;
public class PacePersonTest {
class PacePersonTest {
@Test
public void pacePersonTest1() {
void pacePersonTest1() {
PacePerson p = new PacePerson("Artini, Michele", false);
assertEquals("Artini", p.getSurnameString());
@ -17,7 +17,7 @@ public class PacePersonTest {
}
@Test
public void pacePersonTest2() {
void pacePersonTest2() {
PacePerson p = new PacePerson("Michele G. Artini", false);
assertEquals("Artini, Michele G.", p.getNormalisedFullname());
assertEquals("Michele G", p.getNameString());

@ -18,7 +18,8 @@ public class SparkSessionSupportTest {
class RunWithSparkSession {
@Test
public void shouldExecuteFunctionAndNotStopSparkSessionWhenSparkSessionIsNotManaged()
@SuppressWarnings("unchecked")
void shouldExecuteFunctionAndNotStopSparkSessionWhenSparkSessionIsNotManaged()
throws Exception {
// given
SparkSession spark = mock(SparkSession.class);
@ -37,7 +38,8 @@ public class SparkSessionSupportTest {
}
@Test
public void shouldExecuteFunctionAndStopSparkSessionWhenSparkSessionIsManaged()
@SuppressWarnings("unchecked")
void shouldExecuteFunctionAndStopSparkSessionWhenSparkSessionIsManaged()
throws Exception {
// given
SparkSession spark = mock(SparkSession.class);

@ -12,15 +12,39 @@ import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
@Disabled
public class ZenodoAPIClientTest {
class ZenodoAPIClientTest {
private final String URL_STRING = "https://sandbox.zenodo.org/api/deposit/depositions";
private final String ACCESS_TOKEN = "";
private final String CONCEPT_REC_ID = "657113";
private final String depositionId = "674915";
@Test
void testUploadOldDeposition() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(200, client.uploadOpenDeposition(depositionId));
File file = new File(getClass()
.getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz")
.getPath());
InputStream is = new FileInputStream(file);
Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz", file.length()));
String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json"));
Assertions.assertEquals(200, client.sendMretadata(metadata));
Assertions.assertEquals(202, client.publish());
}
@Test
public void testNewDeposition() throws IOException {
void testNewDeposition() throws IOException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
@ -43,7 +67,7 @@ public class ZenodoAPIClientTest {
}
@Test
public void testNewVersionNewName() throws IOException, MissingConceptDoiException {
void testNewVersionNewName() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
@ -63,7 +87,7 @@ public class ZenodoAPIClientTest {
}
@Test
public void testNewVersionOldName() throws IOException, MissingConceptDoiException {
void testNewVersionOldName() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);

@ -1,16 +0,0 @@
package eu.dnetlib.dhp.model.mdstore;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.Test;
public class MetadataRecordTest {
@Test
public void getTimestamp() {
MetadataRecord r = new MetadataRecord();
assertTrue(r.getDateOfCollection() > 0);
}
}

@ -0,0 +1,100 @@
package eu.dnetlib.dhp.oa.merge;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;
class AuthorMergerTest {
private String publicationsBasePath;
private List<List<Author>> authors;
@BeforeEach
public void setUp() throws Exception {
publicationsBasePath = Paths
.get(AuthorMergerTest.class.getResource("/eu/dnetlib/dhp/oa/merge").toURI())
.toFile()
.getAbsolutePath();
authors = readSample(publicationsBasePath + "/publications_with_authors.json", Publication.class)
.stream()
.map(p -> p._2().getAuthor())
.collect(Collectors.toList());
}
@Test
void mergeTest() { // used in the dedup: threshold set to 0.95
for (List<Author> authors1 : authors) {
System.out.println("List " + (authors.indexOf(authors1) + 1));
for (Author author : authors1) {
System.out.println(authorToString(author));
}
}
List<Author> merge = AuthorMerger.merge(authors);
System.out.println("Merge ");
for (Author author : merge) {
System.out.println(authorToString(author));
}
Assertions.assertEquals(7, merge.size());
}
public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
List<Tuple2<String, T>> res = new ArrayList<>();
BufferedReader reader;
try {
reader = new BufferedReader(new FileReader(path));
String line = reader.readLine();
while (line != null) {
res
.add(
new Tuple2<>(
MapDocumentUtil.getJPathString("$.id", line),
new ObjectMapper().readValue(line, clazz)));
// read next line
line = reader.readLine();
}
reader.close();
} catch (IOException e) {
e.printStackTrace();
}
return res;
}
public String authorToString(Author a) {
String print = "Fullname = ";
print += a.getFullname() + " pid = [";
if (a.getPid() != null)
for (StructuredProperty sp : a.getPid()) {
print += sp.toComparableString() + " ";
}
print += "]";
return print;
}
}

@ -1,47 +0,0 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.utils.DHPUtils;
public class IdentifierFactoryTest {
private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
@Test
public void testCreateIdentifierForPublication() throws IOException {
verifyIdentifier("publication_doi.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"));
verifyIdentifier("publication_pmc.json", "50|pmc_________::" + DHPUtils.md5("21459329"));
verifyIdentifier(
"publication_urn.json",
"50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"));
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
verifyIdentifier("publication_3.json", defaultID);
verifyIdentifier("publication_4.json", defaultID);
verifyIdentifier("publication_5.json", defaultID);
}
protected void verifyIdentifier(String filename, String expectedID) throws IOException {
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class);
String id = IdentifierFactory.createIdentifier(pub);
assertNotNull(id);
assertEquals(expectedID, id);
}
}

@ -0,0 +1,212 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
import me.xuender.unidecode.Unidecode;
class OafMapperUtilsTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
@Test
public void testUnidecode() {
assertEquals("Liu Ben Mu hiruzuSen tawa", Unidecode.decode("六本木ヒルズ森タワ"));
assertEquals("Nan Wu A Mi Tuo Fo", Unidecode.decode("南无阿弥陀佛"));
assertEquals("Yi Tiao Hui Zou Lu De Yu", Unidecode.decode("一条会走路的鱼"));
assertEquals("amidaniyorai", Unidecode.decode("あみだにょらい"));
assertEquals("T`owrk`iayi", Unidecode.decode("Թուրքիայի"));
assertEquals("Obzor tematiki", Unidecode.decode("Обзор тематики"));
assertEquals("GERMANSKIE IaZYKI", Unidecode.decode("ГЕРМАНСКИЕ ЯЗЫКИ"));
assertEquals("Diereunese tes ikanopoieses", Unidecode.decode("Διερεύνηση της ικανοποίησης"));
assertEquals("lqDy l'wly@", Unidecode.decode("القضايا الأولية"));
assertEquals("abc def ghi", Unidecode.decode("abc def ghi"));
}
@Test
void testDateValidation() {
assertNotNull(GraphCleaningFunctions.cleanDate("2016-05-07T12:41:19.202Z "));
assertNotNull(GraphCleaningFunctions.cleanDate("2020-09-10 11:08:52 "));
assertNotNull(GraphCleaningFunctions.cleanDate(" 2016-04-05"));
assertEquals("2016-04-05", GraphCleaningFunctions.cleanDate("2016 Apr 05"));
assertEquals("2009-05-08", GraphCleaningFunctions.cleanDate("May 8, 2009 5:57:51 PM"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct 7, 1970"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct 7, '70"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct. 7, 1970"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct. 7, 70"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 2 15:04:05 2006"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 2 15:04:05 MST 2006"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 02 15:04:05 -0700 2006"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Monday, 02-Jan-06 15:04:05 MST"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon, 02 Jan 2006 15:04:05 MST"));
assertEquals("2017-07-11", GraphCleaningFunctions.cleanDate("Tue, 11 Jul 2017 16:28:13 +0200 (CEST)"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon, 02 Jan 2006 15:04:05 -0700"));
assertEquals("2018-01-04", GraphCleaningFunctions.cleanDate("Thu, 4 Jan 2018 17:53:36 +0000"));
assertEquals("2015-08-10", GraphCleaningFunctions.cleanDate("Mon Aug 10 15:44:11 UTC+0100 2015"));
assertEquals(
"2015-07-03",
GraphCleaningFunctions.cleanDate("Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)"));
assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012 10:09am"));
assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012 at 10:09am PST-08"));
assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012, 10:10:09"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("October 7, 1970"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("October 7th, 1970"));
assertEquals("2006-02-12", GraphCleaningFunctions.cleanDate("12 Feb 2006, 19:17"));
assertEquals("2006-02-12", GraphCleaningFunctions.cleanDate("12 Feb 2006 19:17"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("7 oct 70"));
assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("7 oct 1970"));
assertEquals("2013-02-03", GraphCleaningFunctions.cleanDate("03 February 2013"));
assertEquals("2013-07-01", GraphCleaningFunctions.cleanDate("1 July 2013"));
assertEquals("2013-02-03", GraphCleaningFunctions.cleanDate("2013-Feb-03"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("3/31/2014"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("03/31/2014"));
assertEquals("1971-08-21", GraphCleaningFunctions.cleanDate("08/21/71"));
assertEquals("1971-01-08", GraphCleaningFunctions.cleanDate("8/1/71"));
assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("4/8/2014 22:05"));
assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("04/08/2014 22:05"));
assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("4/8/14 22:05"));
assertEquals("2014-02-04", GraphCleaningFunctions.cleanDate("04/2/2014 03:00:51"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 12:00:00 AM"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 01:00:01 PM"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 01:00 PM"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 1:00 PM"));
assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 12:00 AM"));
assertEquals("2014-02-04", GraphCleaningFunctions.cleanDate("4/02/2014 03:00:51"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("03/19/2012 10:11:59"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("03/19/2012 10:11:59.3186369"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("2014/3/31"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("2014/03/31"));
assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014/4/8 22:05"));
assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014/04/08 22:05"));
assertEquals("2014-04-02", GraphCleaningFunctions.cleanDate("2014/04/2 03:00:51"));
assertEquals("2014-04-02", GraphCleaningFunctions.cleanDate("2014/4/02 03:00:51"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("2012/03/19 10:11:59"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("2012/03/19 10:11:59.3186369"));
assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014年04月08日"));
assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("2006-01-02T15:04:05+0000"));
assertEquals("2009-08-13", GraphCleaningFunctions.cleanDate("2009-08-12T22:15:09-07:00"));
assertEquals("2009-08-12", GraphCleaningFunctions.cleanDate("2009-08-12T22:15:09"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 17:24:37.3186369"));
assertEquals("2012-08-03", GraphCleaningFunctions.cleanDate("2012-08-03 18:31:59.257000000"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 17:24:37.123"));
assertEquals("2013-04-01", GraphCleaningFunctions.cleanDate("2013-04-01 22:43"));
assertEquals("2013-04-01", GraphCleaningFunctions.cleanDate("2013-04-01 22:43:22"));
assertEquals("2014-12-16", GraphCleaningFunctions.cleanDate("2014-12-16 06:20:00 UTC"));
assertEquals("2014-12-16", GraphCleaningFunctions.cleanDate("2014-12-16 06:20:00 GMT"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 05:24:37 PM"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:43 +0800"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:43 +0800 +08"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:44 +09:00"));
assertEquals("2012-08-03", GraphCleaningFunctions.cleanDate("2012-08-03 18:31:59.257000000 +0000 UTC"));
assertEquals("2015-09-30", GraphCleaningFunctions.cleanDate("2015-09-30 18:48:56.35272715 +0000 UTC"));
assertEquals("2015-02-18", GraphCleaningFunctions.cleanDate("2015-02-18 00:12:00 +0000 GMT"));
assertEquals("2015-02-18", GraphCleaningFunctions.cleanDate("2015-02-18 00:12:00 +0000 UTC"));
assertEquals(
"2015-02-08", GraphCleaningFunctions.cleanDate("2015-02-08 03:02:00 +0300 MSK m=+0.000000001"));
assertEquals(
"2015-02-08", GraphCleaningFunctions.cleanDate("2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001"));
assertEquals("2017-07-19", GraphCleaningFunctions.cleanDate("2017-07-19 03:21:51+00:00"));
assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26"));
assertEquals("2014-04-01", GraphCleaningFunctions.cleanDate("2014-04"));
assertEquals("2014-01-01", GraphCleaningFunctions.cleanDate("2014"));
assertEquals("2014-05-11", GraphCleaningFunctions.cleanDate("2014-05-11 08:20:13,787"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("3.31.2014"));
assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("03.31.2014"));
assertEquals("1971-08-21", GraphCleaningFunctions.cleanDate("08.21.71"));
assertEquals("2014-03-01", GraphCleaningFunctions.cleanDate("2014.03"));
assertEquals("2014-03-30", GraphCleaningFunctions.cleanDate("2014.03.30"));
assertEquals("2014-06-01", GraphCleaningFunctions.cleanDate("20140601"));
assertEquals("2014-07-22", GraphCleaningFunctions.cleanDate("20140722105203"));
assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("1332151919"));
assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367189"));
assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367111222"));
assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367111222333"));
}
@Test
void testDate() {
final String date = GraphCleaningFunctions.cleanDate("23-FEB-1998");
assertNotNull(date);
System.out.println(date);
}
@Test
void testMergePubs() throws IOException {
Publication p1 = read("publication_1.json", Publication.class);
Publication p2 = read("publication_2.json", Publication.class);
Dataset d1 = read("dataset_1.json", Dataset.class);
Dataset d2 = read("dataset_2.json", Dataset.class);
assertEquals(1, p1.getCollectedfrom().size());
assertEquals(ModelConstants.CROSSREF_ID, p1.getCollectedfrom().get(0).getKey());
assertEquals(1, d2.getCollectedfrom().size());
assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals(
ModelConstants.PUBLICATION_RESULTTYPE_CLASSID,
OafMapperUtils
.mergeResults(p1, d2)
.getResulttype()
.getClassid());
assertEquals(1, p2.getCollectedfrom().size());
assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals(1, d1.getCollectedfrom().size());
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals(
ModelConstants.DATASET_RESULTTYPE_CLASSID,
OafMapperUtils
.mergeResults(p2, d1)
.getResulttype()
.getClassid());
}
@Test
void testDelegatedAuthority() throws IOException {
Dataset d1 = read("dataset_2.json", Dataset.class);
Dataset d2 = read("dataset_delegated.json", Dataset.class);
assertEquals(1, d2.getCollectedfrom().size());
assertTrue(cfId(d2.getCollectedfrom()).contains(ModelConstants.ZENODO_OD_ID));
Result res = OafMapperUtils.mergeResults(d1, d2);
assertEquals(d2, res);
System.out.println(OBJECT_MAPPER.writeValueAsString(res));
}
protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
return collectedfrom.stream().map(KeyValue::getKey).collect(Collectors.toCollection(HashSet::new));
}
protected <T extends Result> T read(String filename, Class<T> clazz) throws IOException {
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
return OBJECT_MAPPER.readValue(json, clazz);
}
}

@ -1,51 +0,0 @@
package eu.dnetlib.message;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.junit.jupiter.api.Test;
public class MessageTest {
@Test
public void fromJsonTest() throws IOException {
Message m = new Message();
m.setWorkflowId("wId");
m.setType(MessageType.ONGOING);
m.setJobName("Collection");
Map<String, String> body = new HashMap<>();
body.put("parsedItem", "300");
body.put("ExecutionTime", "30s");
m.setBody(body);
System.out.println("m = " + m);
Message m1 = Message.fromJson(m.toString());
assertEquals(m1.getWorkflowId(), m.getWorkflowId());
assertEquals(m1.getType(), m.getType());
assertEquals(m1.getJobName(), m.getJobName());
assertNotNull(m1.getBody());
m1.getBody().keySet().forEach(it -> assertEquals(m1.getBody().get(it), m.getBody().get(it)));
assertEquals(m1.getJobName(), m.getJobName());
}
@Test
public void toStringTest() {
final String expectedJson = "{\"workflowId\":\"wId\",\"jobName\":\"Collection\",\"type\":\"ONGOING\",\"body\":{\"ExecutionTime\":\"30s\",\"parsedItem\":\"300\"}}";
Message m = new Message();
m.setWorkflowId("wId");
m.setType(MessageType.ONGOING);
m.setJobName("Collection");
Map<String, String> body = new HashMap<>();
body.put("parsedItem", "300");
body.put("ExecutionTime", "30s");
m.setBody(body);
assertEquals(expectedJson, m.toString());
}
}

@ -3,10 +3,10 @@ package eu.dnetlib.scholexplorer.relation;
import org.junit.jupiter.api.Test;
public class RelationMapperTest {
class RelationMapperTest {
@Test
public void testLoadRels() throws Exception {
void testLoadRels() throws Exception {
RelationMapper relationMapper = RelationMapper.load();
relationMapper.keySet().forEach(System.out::println);

File diff suppressed because one or more lines are too long

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}

@ -0,0 +1,140 @@
{
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g",
"resuttype": {"classid": "dataset"},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2011.03.013"
},
{
"qualifier": {"classid": "urn"},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {"classid": "scp-number"},
"value": "79953761260"
},
{
"qualifier": {"classid": "pmc"},
"value": "21459329"
}
],
"collectedfrom": [
{
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3",
"value": "Repository B"
}
],
"instance": [
{
"refereed": {
"classid": "0000",
"classname": "UNKNOWN",
"schemeid": "dnet:review_levels",
"schemename": "dnet:review_levels"
},
"hostedby": {
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
},
"accessright": {
"classid": "OPEN",
"classname": "Open Access",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"processingchargecurrency": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "EUR"
},
"pid": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "doi",
"classname": "Digital Object Identifier",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1371/journal.pone.0085605"
}
],
"distributionlocation": "",
"url": ["https://doi.org/10.1371/journal.pone.0085605"],
"alternateIdentifier": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "pmid",
"classname": "PubMed ID",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "24454899.0"
}
],
"collectedfrom": {
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3",
"value": "Repository B"
},
"processingchargeamount": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "1022.02"
},
"instancetype": {
"classid": "0004",
"classname": "Conference object",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
}
}
]
}

@ -0,0 +1,140 @@
{
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g",
"resuttype": {"classid": "dataset"},
"pid": [
{
"qualifier": {"classid": "doi"},
"value": "10.1016/j.cmet.2011.03.013"
},
{
"qualifier": {"classid": "urn"},
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
},
{
"qualifier": {"classid": "scp-number"},
"value": "79953761260"
},
{
"qualifier": {"classid": "pmc"},
"value": "21459329"
}
],
"collectedfrom": [
{
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
}
],
"instance": [
{
"refereed": {
"classid": "0000",
"classname": "UNKNOWN",
"schemeid": "dnet:review_levels",
"schemename": "dnet:review_levels"
},
"hostedby": {
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
},
"accessright": {
"classid": "OPEN",
"classname": "Open Access",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"processingchargecurrency": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "EUR"
},
"pid": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "doi",
"classname": "Digital Object Identifier",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1371/journal.pone.0085605"
}
],
"distributionlocation": "",
"url": ["https://doi.org/10.1371/journal.pone.0085605"],
"alternateIdentifier": [
{
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"qualifier": {
"classid": "pmid",
"classname": "PubMed ID",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "24454899.0"
}
],
"collectedfrom": {
"key": "10|opendoar____::358aee4cc897452c00244351e4d91f69",
"value": "Zenodo"
},
"processingchargeamount": {
"dataInfo": {
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "Harvested",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"deletedbyinference": false,
"inferred": false,
"inferenceprovenance": "",
"invisible": true,
"trust": "0.9"
},
"value": "1022.02"
},
"instancetype": {
"classid": "0004",
"classname": "Conference object",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
}
}
]
}

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository A"} ]}

@ -1 +0,0 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"scp-number"},"value":"79953761260"}]}

Some files were not shown because too many files have changed in this diff
