diff --git a/.gitignore b/.gitignore index 2d7730711..73d9179fa 100644 --- a/.gitignore +++ b/.gitignore @@ -3,10 +3,10 @@ *.iws *.ipr *.iml -*.ipr -*.iws *~ .vscode +.metals +.bloop .classpath /*/.classpath /*/*/.classpath @@ -24,4 +24,5 @@ spark-warehouse /**/job-override.properties /**/*.log - +/**/.factorypath +/**/.scalafmt.conf diff --git a/.scalafmt.conf b/.scalafmt.conf new file mode 100644 index 000000000..0b5dbe0b4 --- /dev/null +++ b/.scalafmt.conf @@ -0,0 +1,15 @@ +style = defaultWithAlign + +align.openParenCallSite = false +align.openParenDefnSite = false +align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}] +continuationIndent.callSite = 2 +continuationIndent.defnSite = 2 +danglingParentheses = true +indentOperator = spray +maxColumn = 120 +newlines.alwaysBeforeTopLevelStatements = true +project.excludeFilters = [".*\\.sbt"] +rewrite.rules = [AvoidInfix, ExpandImportSelectors, RedundantBraces, RedundantParens, SortImports, SortModifiers, PreferCurlyFors] +spaces.inImportCurlyBraces = false +unindentTopLevelOperators = true \ No newline at end of file diff --git a/README.md b/README.md index 39d4d98e4..0a0bd82ab 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,2 @@ # dnet-hadoop -Dnet-hadoop is a tool for \ No newline at end of file +Dnet-hadoop is the project that defines all the Oozie workflows for the OpenAIRE Graph construction, processing, and provisioning. \ No newline at end of file diff --git a/dhp-build/dhp-build-assembly-resources/pom.xml b/dhp-build/dhp-build-assembly-resources/pom.xml index 012ff89a3..44165995d 100644 --- a/dhp-build/dhp-build-assembly-resources/pom.xml +++ b/dhp-build/dhp-build-assembly-resources/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.2.4-SNAPSHOT + 1.2.5-SNAPSHOT dhp-build-assembly-resources diff --git a/dhp-build/dhp-build-properties-maven-plugin/pom.xml b/dhp-build/dhp-build-properties-maven-plugin/pom.xml index 256017e2c..7579bdf45 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/pom.xml +++ b/dhp-build/dhp-build-properties-maven-plugin/pom.xml @@ -6,7 +6,7 @@ eu.dnetlib.dhp dhp-build - 1.2.4-SNAPSHOT + 1.2.5-SNAPSHOT dhp-build-properties-maven-plugin diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java index 10a25fdc3..a642dab70 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojo.java @@ -8,8 +8,6 @@ import java.util.List; import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang.StringUtils; import org.apache.maven.plugin.AbstractMojo; -import org.apache.maven.plugin.MojoExecutionException; -import org.apache.maven.plugin.MojoFailureException; /** * Generates oozie properties which were not provided from commandline. 
@@ -27,7 +25,7 @@ public class GenerateOoziePropertiesMojo extends AbstractMojo { }; @Override - public void execute() throws MojoExecutionException, MojoFailureException { + public void execute() { if (System.getProperties().containsKey(PROPERTY_NAME_WF_SOURCE_DIR) && !System.getProperties().containsKey(PROPERTY_NAME_SANDBOX_NAME)) { String generatedSandboxName = generateSandboxName( @@ -46,24 +44,24 @@ public class GenerateOoziePropertiesMojo extends AbstractMojo { /** * Generates sandbox name from workflow source directory. * - * @param wfSourceDir + * @param wfSourceDir workflow source directory * @return generated sandbox name */ private String generateSandboxName(String wfSourceDir) { // utilize all dir names until finding one of the limiters - List sandboxNameParts = new ArrayList(); + List sandboxNameParts = new ArrayList<>(); String[] tokens = StringUtils.split(wfSourceDir, File.separatorChar); ArrayUtils.reverse(tokens); if (tokens.length > 0) { for (String token : tokens) { for (String limiter : limiters) { if (limiter.equals(token)) { - return sandboxNameParts.size() > 0 + return !sandboxNameParts.isEmpty() ? StringUtils.join(sandboxNameParts.toArray()) : null; } } - if (sandboxNameParts.size() > 0) { + if (!sandboxNameParts.isEmpty()) { sandboxNameParts.add(0, File.separator); } sandboxNameParts.add(0, token); diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java index d195ca86e..e3cdf5a22 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/main/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectProperties.java @@ -16,6 +16,7 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -289,7 +290,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo { */ protected List getEscapeChars(String escapeChars) { List tokens = getListFromCSV(escapeChars); - List realTokens = new ArrayList(); + List realTokens = new ArrayList<>(); for (String token : tokens) { String realToken = getRealToken(token); realTokens.add(realToken); @@ -324,7 +325,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo { * @return content */ protected String getContent(String comment, Properties properties, List escapeTokens) { - List names = new ArrayList(properties.stringPropertyNames()); + List names = new ArrayList<>(properties.stringPropertyNames()); Collections.sort(names); StringBuilder sb = new StringBuilder(); if (!StringUtils.isBlank(comment)) { @@ -352,7 +353,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo { throws MojoExecutionException { try { String content = getContent(comment, properties, escapeTokens); - FileUtils.writeStringToFile(file, content, ENCODING_UTF8); + FileUtils.writeStringToFile(file, content, StandardCharsets.UTF_8); } catch (IOException e) { throw new MojoExecutionException("Error creating properties file", e); } @@ -399,9 +400,9 @@ public class WritePredefinedProjectProperties extends AbstractMojo { */ protected static final List getListFromCSV(String csv) { if 
(StringUtils.isBlank(csv)) { - return new ArrayList(); + return new ArrayList<>(); } - List list = new ArrayList(); + List list = new ArrayList<>(); String[] tokens = StringUtils.split(csv, ","); for (String token : tokens) { list.add(token.trim()); diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java index 4bfcd3b33..2ff6bea30 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java +++ b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/GenerateOoziePropertiesMojoTest.java @@ -9,18 +9,18 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; /** @author mhorst, claudio.atzori */ -public class GenerateOoziePropertiesMojoTest { +class GenerateOoziePropertiesMojoTest { private final GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo(); @BeforeEach - public void clearSystemProperties() { + void clearSystemProperties() { System.clearProperty(PROPERTY_NAME_SANDBOX_NAME); System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR); } @Test - public void testExecuteEmpty() throws Exception { + void testExecuteEmpty() throws Exception { // execute mojo.execute(); @@ -29,7 +29,7 @@ public class GenerateOoziePropertiesMojoTest { } @Test - public void testExecuteSandboxNameAlreadySet() throws Exception { + void testExecuteSandboxNameAlreadySet() throws Exception { // given String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers"; String sandboxName = "originalSandboxName"; @@ -44,7 +44,7 @@ public class GenerateOoziePropertiesMojoTest { } @Test - public void testExecuteEmptyWorkflowSourceDir() throws Exception { + void testExecuteEmptyWorkflowSourceDir() throws Exception { // given String workflowSourceDir = ""; System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); @@ -57,7 +57,7 @@ public class GenerateOoziePropertiesMojoTest { } @Test - public void testExecuteNullSandboxNameGenerated() throws Exception { + void testExecuteNullSandboxNameGenerated() throws Exception { // given String workflowSourceDir = "eu/dnetlib/dhp/"; System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); @@ -70,7 +70,7 @@ public class GenerateOoziePropertiesMojoTest { } @Test - public void testExecute() throws Exception { + void testExecute() throws Exception { // given String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers"; System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); @@ -83,7 +83,7 @@ public class GenerateOoziePropertiesMojoTest { } @Test - public void testExecuteWithoutRoot() throws Exception { + void testExecuteWithoutRoot() throws Exception { // given String workflowSourceDir = "wf/transformers"; System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); diff --git a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java index 0b3ea9653..84b962b4b 100644 --- a/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java +++ 
b/dhp-build/dhp-build-properties-maven-plugin/src/test/java/eu/dnetlib/maven/plugin/properties/WritePredefinedProjectPropertiesTest.java @@ -20,7 +20,7 @@ import org.mockito.junit.jupiter.MockitoExtension; /** @author mhorst, claudio.atzori */ @ExtendWith(MockitoExtension.class) -public class WritePredefinedProjectPropertiesTest { +class WritePredefinedProjectPropertiesTest { @Mock private MavenProject mavenProject; @@ -39,7 +39,7 @@ public class WritePredefinedProjectPropertiesTest { // ----------------------------------- TESTS --------------------------------------------- @Test - public void testExecuteEmpty() throws Exception { + void testExecuteEmpty() throws Exception { // execute mojo.execute(); @@ -50,7 +50,7 @@ public class WritePredefinedProjectPropertiesTest { } @Test - public void testExecuteWithProjectProperties() throws Exception { + void testExecuteWithProjectProperties() throws Exception { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -70,7 +70,7 @@ public class WritePredefinedProjectPropertiesTest { } @Test() - public void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) { + void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -84,7 +84,7 @@ public class WritePredefinedProjectPropertiesTest { } @Test - public void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception { + void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -108,7 +108,7 @@ public class WritePredefinedProjectPropertiesTest { } @Test - public void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception { + void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -132,7 +132,7 @@ public class WritePredefinedProjectPropertiesTest { } @Test - public void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception { + void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -164,7 +164,7 @@ public class WritePredefinedProjectPropertiesTest { } @Test - public void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder) + void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder) throws Exception { // given String key = "projectPropertyKey"; @@ -194,7 +194,7 @@ public class WritePredefinedProjectPropertiesTest { } @Test - public void testExecuteIncludingPropertyKeysFromBlankLocation() { + void testExecuteIncludingPropertyKeysFromBlankLocation() { // given String key = "projectPropertyKey"; String value = "projectPropertyValue"; @@ -214,7 +214,7 @@ public class WritePredefinedProjectPropertiesTest { } @Test - public void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder) + void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder) throws Exception { // given String key = "projectPropertyKey"; @@ -247,7 +247,7 @@ public class WritePredefinedProjectPropertiesTest { } @Test - public void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder) + void 
testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder) throws Exception { // given String key = "projectPropertyKey"; @@ -273,7 +273,7 @@ public class WritePredefinedProjectPropertiesTest { } @Test - public void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception { + void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception { // given mojo.setQuiet(true); mojo.setIncludePropertyKeysFromFiles(new String[] { @@ -290,7 +290,7 @@ } @Test - public void testExecuteIncludingPropertyKeysFromInvalidFile() { + void testExecuteIncludingPropertyKeysFromInvalidFile() { // given mojo.setIncludePropertyKeysFromFiles(new String[] { "invalid location" @@ -301,7 +301,7 @@ } @Test - public void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception { + void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception { // given mojo.setIncludeEnvironmentVariables(true); @@ -318,7 +318,7 @@ } @Test - public void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception { + void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception { // given String key = "systemPropertyKey"; String value = "systemPropertyValue"; @@ -337,7 +337,7 @@ } @Test - public void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder) + void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder) throws Exception { // given String key = "systemPropertyKey "; diff --git a/dhp-build/dhp-code-style/pom.xml b/dhp-build/dhp-code-style/pom.xml index e60e8076e..5a86efe17 100644 --- a/dhp-build/dhp-code-style/pom.xml +++ b/dhp-build/dhp-code-style/pom.xml @@ -5,7 +5,7 @@ eu.dnetlib.dhp dhp-code-style - 1.2.4-SNAPSHOT + 1.2.5-SNAPSHOT jar @@ -15,16 +15,27 @@ dnet45-snapshots DNet45 Snapshots - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-snapshots + https://maven.d4science.org/nexus/content/repositories/dnet45-snapshots default dnet45-releases - http://maven.research-infrastructures.eu/nexus/content/repositories/dnet45-releases + https://maven.d4science.org/nexus/content/repositories/dnet45-releases + + DHPSite + ${dhp.site.stage.path}/dhp-build/dhp-code-style + + + + org.apache.maven.wagon + wagon-ssh + 2.10 + + @@ -35,14 +46,19 @@ org.apache.maven.plugins maven-site-plugin - 3.7.1 + 3.9.1 + + true + + UTF-8 + sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop \ No newline at end of file diff --git a/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf b/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf new file mode 100644 index 000000000..0b5dbe0b4 --- /dev/null +++ b/dhp-build/dhp-code-style/src/main/resources/scalafmt/scalafmt.conf @@ -0,0 +1,15 @@ +style = defaultWithAlign + +align.openParenCallSite = false +align.openParenDefnSite = false +align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}] +continuationIndent.callSite = 2 +continuationIndent.defnSite = 2 +danglingParentheses = true +indentOperator = spray +maxColumn = 120 +newlines.alwaysBeforeTopLevelStatements = true +project.excludeFilters = [".*\\.sbt"] +rewrite.rules = [AvoidInfix, ExpandImportSelectors, RedundantBraces, RedundantParens, SortImports, SortModifiers, PreferCurlyFors] +spaces.inImportCurlyBraces = false +unindentTopLevelOperators = true \ No newline at end of file diff --git a/dhp-build/dhp-code-style/src/site/site.xml b/dhp-build/dhp-code-style/src/site/site.xml new file mode 100644 index 000000000..634a2c154 --- /dev/null +++ b/dhp-build/dhp-code-style/src/site/site.xml @@ -0,0 +1,21 @@ + + + + org.apache.maven.skins + maven-fluido-skin + 1.8 + + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-build/pom.xml b/dhp-build/pom.xml index 12b999b9c..9040ea94e 100644 --- a/dhp-build/pom.xml +++ b/dhp-build/pom.xml @@ -4,12 +4,15 @@ eu.dnetlib.dhp dhp - 1.2.4-SNAPSHOT + 1.2.5-SNAPSHOT dhp-build pom This module is a container for the build tools used in dnet-hadoop + + true + dhp-code-style @@ -17,4 +20,12 @@ dhp-build-properties-maven-plugin + + + + DHPSite + ${dhp.site.stage.path}/dhp-build/ + + + diff --git a/dhp-build/src/site/site.xml b/dhp-build/src/site/site.xml new file mode 100644 index 000000000..2d9d769a2 --- /dev/null +++ b/dhp-build/src/site/site.xml @@ -0,0 +1,22 @@ + + + + org.apache.maven.skins + maven-fluido-skin + 1.8 + + + + + + + + + + + + + \ No newline at end of file diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index 4ee706169..6df11f4ea 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -5,14 +5,59 @@ eu.dnetlib.dhp dhp - 1.2.4-SNAPSHOT - ../ + 1.2.5-SNAPSHOT + ../pom.xml + dhp-common jar + + + DHPSite + ${dhp.site.stage.path}/dhp-common + + + This module contains common utilities meant to be used across the dnet-hadoop submodules + + + + net.alchim31.maven + scala-maven-plugin + ${net.alchim31.maven.version} + + + scala-compile-first + initialize + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + scala-doc + process-resources + + doc + + + + + ${scala.version} + + + + + @@ -20,6 +65,15 @@ org.apache.hadoop hadoop-common + + com.github.sisyphsu + dateparser + + + me.xuender + unidecode + + org.apache.spark spark-core_2.11 @@ -29,12 +83,6 @@ spark-sql_2.11 - - eu.dnetlib.dhp + dhp-schemas - ${project.version} - - commons-cli commons-cli @@ -59,11 +107,6 @@ com.fasterxml.jackson.core jackson-databind - - - com.rabbitmq - amqp-client - net.sf.saxon Saxon-HE @@ -104,10 +147,24 @@ dnet-pace-core + + org.apache.httpcomponents + httpclient + + + + org.mongodb + mongo-java-driver + + eu.dnetlib.dhp dhp-schemas + + + + com.opencsv + opencsv diff --git a/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java b/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java deleted file mode 100644 index bfd70e8c6..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/collector/worker/model/ApiDescriptor.java +++ /dev/null @@ -1,48 +0,0 @@ - -package eu.dnetlib.collector.worker.model; - -import java.util.HashMap; -import java.util.Map; - -public class ApiDescriptor { - - private String id; - - private String baseUrl; - - private String protocol; - - private Map params = new HashMap<>(); - - public String getBaseUrl() { - return baseUrl; - } - - public void setBaseUrl(final String baseUrl) { - this.baseUrl = baseUrl; - } - - public String getId() { - return id; - } - - public void setId(final String id) { - this.id = id; - } - - public Map getParams() { - return params; - } - - public void setParams(final HashMap params) { - this.params = params; - } - - public String getProtocol() { - return protocol; - } - - public void setProtocol(final 
String protocol) { - this.protocol = protocol; - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java deleted file mode 100644 index 68fc024af..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStore.java +++ /dev/null @@ -1,119 +0,0 @@ - -package eu.dnetlib.data.mdstore.manager.common.model; - -import java.io.Serializable; -import java.util.UUID; - -import javax.persistence.Column; -import javax.persistence.Entity; -import javax.persistence.Id; -import javax.persistence.Table; - -@Entity -@Table(name = "mdstores") -public class MDStore implements Serializable { - - /** */ - private static final long serialVersionUID = 3160530489149700055L; - - @Id - @Column(name = "id") - private String id; - - @Column(name = "format") - private String format; - - @Column(name = "layout") - private String layout; - - @Column(name = "interpretation") - private String interpretation; - - @Column(name = "datasource_name") - private String datasourceName; - - @Column(name = "datasource_id") - private String datasourceId; - - @Column(name = "api_id") - private String apiId; - - public String getId() { - return id; - } - - public void setId(final String id) { - this.id = id; - } - - public String getFormat() { - return format; - } - - public void setFormat(final String format) { - this.format = format; - } - - public String getLayout() { - return layout; - } - - public void setLayout(final String layout) { - this.layout = layout; - } - - public String getInterpretation() { - return interpretation; - } - - public void setInterpretation(final String interpretation) { - this.interpretation = interpretation; - } - - public String getDatasourceName() { - return datasourceName; - } - - public void setDatasourceName(final String datasourceName) { - this.datasourceName = datasourceName; - } - - public String getDatasourceId() { - return datasourceId; - } - - public void setDatasourceId(final String datasourceId) { - this.datasourceId = datasourceId; - } - - public String getApiId() { - return apiId; - } - - public void setApiId(final String apiId) { - this.apiId = apiId; - } - - public static MDStore newInstance( - final String format, final String layout, final String interpretation) { - return newInstance(format, layout, interpretation, null, null, null); - } - - public static MDStore newInstance( - final String format, - final String layout, - final String interpretation, - final String dsName, - final String dsId, - final String apiId) { - final MDStore md = new MDStore(); - md.setId("md-" + UUID.randomUUID()); - md.setFormat(format); - md.setLayout(layout); - md.setInterpretation(interpretation); - md.setDatasourceName(dsName); - md.setDatasourceId(dsId); - md.setApiId(apiId); - return md; - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java deleted file mode 100644 index f74ab39be..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreCurrentVersion.java +++ /dev/null @@ -1,51 +0,0 @@ - -package eu.dnetlib.data.mdstore.manager.common.model; - -import java.io.Serializable; - -import javax.persistence.Column; -import javax.persistence.Entity; -import javax.persistence.Id; -import javax.persistence.Table; - -@Entity -@Table(name = 
"mdstore_current_versions") -public class MDStoreCurrentVersion implements Serializable { - - /** */ - private static final long serialVersionUID = -4757725888593745773L; - - @Id - @Column(name = "mdstore") - private String mdstore; - - @Column(name = "current_version") - private String currentVersion; - - public String getMdstore() { - return mdstore; - } - - public void setMdstore(final String mdstore) { - this.mdstore = mdstore; - } - - public String getCurrentVersion() { - return currentVersion; - } - - public void setCurrentVersion(final String currentVersion) { - this.currentVersion = currentVersion; - } - - public static MDStoreCurrentVersion newInstance(final String mdId, final String versionId) { - final MDStoreCurrentVersion cv = new MDStoreCurrentVersion(); - cv.setMdstore(mdId); - cv.setCurrentVersion(versionId); - return cv; - } - - public static MDStoreCurrentVersion newInstance(final MDStoreVersion v) { - return newInstance(v.getMdstore(), v.getId()); - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java deleted file mode 100644 index 7ef24f191..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreVersion.java +++ /dev/null @@ -1,99 +0,0 @@ - -package eu.dnetlib.data.mdstore.manager.common.model; - -import java.io.Serializable; -import java.util.Date; - -import javax.persistence.Column; -import javax.persistence.Entity; -import javax.persistence.Id; -import javax.persistence.Table; -import javax.persistence.Temporal; -import javax.persistence.TemporalType; - -@Entity -@Table(name = "mdstore_versions") -public class MDStoreVersion implements Serializable { - - /** */ - private static final long serialVersionUID = -4763494442274298339L; - - @Id - @Column(name = "id") - private String id; - - @Column(name = "mdstore") - private String mdstore; - - @Column(name = "writing") - private boolean writing; - - @Column(name = "readcount") - private int readCount = 0; - - @Column(name = "lastupdate") - @Temporal(TemporalType.TIMESTAMP) - private Date lastUpdate; - - @Column(name = "size") - private long size = 0; - - public static MDStoreVersion newInstance(final String mdId, final boolean writing) { - final MDStoreVersion t = new MDStoreVersion(); - t.setId(mdId + "-" + new Date().getTime()); - t.setMdstore(mdId); - t.setLastUpdate(null); - t.setWriting(writing); - t.setReadCount(0); - t.setSize(0); - return t; - } - - public String getId() { - return id; - } - - public void setId(final String id) { - this.id = id; - } - - public String getMdstore() { - return mdstore; - } - - public void setMdstore(final String mdstore) { - this.mdstore = mdstore; - } - - public boolean isWriting() { - return writing; - } - - public void setWriting(final boolean writing) { - this.writing = writing; - } - - public int getReadCount() { - return readCount; - } - - public void setReadCount(final int readCount) { - this.readCount = readCount; - } - - public Date getLastUpdate() { - return lastUpdate; - } - - public void setLastUpdate(final Date lastUpdate) { - this.lastUpdate = lastUpdate; - } - - public long getSize() { - return size; - } - - public void setSize(final long size) { - this.size = size; - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java b/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java deleted file mode 
100644 index 438359241..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/data/mdstore/manager/common/model/MDStoreWithInfo.java +++ /dev/null @@ -1,143 +0,0 @@ - -package eu.dnetlib.data.mdstore.manager.common.model; - -import java.io.Serializable; -import java.util.Date; - -import javax.persistence.Column; -import javax.persistence.Entity; -import javax.persistence.Id; -import javax.persistence.Table; -import javax.persistence.Temporal; -import javax.persistence.TemporalType; - -@Entity -@Table(name = "mdstores_with_info") -public class MDStoreWithInfo implements Serializable { - - /** */ - private static final long serialVersionUID = -8445784770687571492L; - - @Id - @Column(name = "id") - private String id; - - @Column(name = "format") - private String format; - - @Column(name = "layout") - private String layout; - - @Column(name = "interpretation") - private String interpretation; - - @Column(name = "datasource_name") - private String datasourceName; - - @Column(name = "datasource_id") - private String datasourceId; - - @Column(name = "api_id") - private String apiId; - - @Column(name = "current_version") - private String currentVersion; - - @Column(name = "lastupdate") - @Temporal(TemporalType.TIMESTAMP) - private Date lastUpdate; - - @Column(name = "size") - private long size = 0; - - @Column(name = "n_versions") - private long numberOfVersions = 0; - - public String getId() { - return id; - } - - public void setId(final String id) { - this.id = id; - } - - public String getFormat() { - return format; - } - - public void setFormat(final String format) { - this.format = format; - } - - public String getLayout() { - return layout; - } - - public void setLayout(final String layout) { - this.layout = layout; - } - - public String getInterpretation() { - return interpretation; - } - - public void setInterpretation(final String interpretation) { - this.interpretation = interpretation; - } - - public String getDatasourceName() { - return datasourceName; - } - - public void setDatasourceName(final String datasourceName) { - this.datasourceName = datasourceName; - } - - public String getDatasourceId() { - return datasourceId; - } - - public void setDatasourceId(final String datasourceId) { - this.datasourceId = datasourceId; - } - - public String getApiId() { - return apiId; - } - - public void setApiId(final String apiId) { - this.apiId = apiId; - } - - public String getCurrentVersion() { - return currentVersion; - } - - public void setCurrentVersion(final String currentVersion) { - this.currentVersion = currentVersion; - } - - public Date getLastUpdate() { - return lastUpdate; - } - - public void setLastUpdate(final Date lastUpdate) { - this.lastUpdate = lastUpdate; - } - - public long getSize() { - return size; - } - - public void setSize(final long size) { - this.size = size; - } - - public long getNumberOfVersions() { - return numberOfVersions; - } - - public void setNumberOfVersions(final long numberOfVersions) { - this.numberOfVersions = numberOfVersions; - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java index e65b4bb0b..72c1f6a5e 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/ArgumentApplicationParser.java @@ -1,10 +1,7 @@ package eu.dnetlib.dhp.application; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import 
java.io.Serializable; -import java.io.StringWriter; +import java.io.*; import java.util.*; import java.util.zip.GZIPInputStream; import java.util.zip.GZIPOutputStream; @@ -12,17 +9,21 @@ import java.util.zip.GZIPOutputStream; import org.apache.commons.cli.*; import org.apache.commons.codec.binary.Base64; import org.apache.commons.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; public class ArgumentApplicationParser implements Serializable { + private static final Logger log = LoggerFactory.getLogger(ArgumentApplicationParser.class); + private final Options options = new Options(); private final Map objectMap = new HashMap<>(); private final List compressedValues = new ArrayList<>(); - public ArgumentApplicationParser(final String json_configuration) throws Exception { + public ArgumentApplicationParser(final String json_configuration) throws IOException { final ObjectMapper mapper = new ObjectMapper(); final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class); createOptionMap(configuration); @@ -33,7 +34,6 @@ public class ArgumentApplicationParser implements Serializable { } private void createOptionMap(final OptionsParameter[] configuration) { - Arrays .stream(configuration) .map( @@ -47,10 +47,6 @@ public class ArgumentApplicationParser implements Serializable { return o; }) .forEach(options::addOption); - - // HelpFormatter formatter = new HelpFormatter(); - // formatter.printHelp("myapp", null, options, null, true); - } public static String decompressValue(final String abstractCompressed) { @@ -60,13 +56,13 @@ public class ArgumentApplicationParser implements Serializable { final StringWriter stringWriter = new StringWriter(); IOUtils.copy(gis, stringWriter); return stringWriter.toString(); - } catch (Throwable e) { - System.out.println("Wrong value to decompress:" + abstractCompressed); - throw new RuntimeException(e); + } catch (IOException e) { + log.error("Wrong value to decompress: {}", abstractCompressed); + throw new IllegalArgumentException(e); } } - public static String compressArgument(final String value) throws Exception { + public static String compressArgument(final String value) throws IOException { ByteArrayOutputStream out = new ByteArrayOutputStream(); GZIPOutputStream gzip = new GZIPOutputStream(out); gzip.write(value.getBytes()); @@ -74,7 +70,7 @@ public class ArgumentApplicationParser implements Serializable { return java.util.Base64.getEncoder().encodeToString(out.toByteArray()); } - public void parseArgument(final String[] args) throws Exception { + public void parseArgument(final String[] args) throws ParseException { CommandLineParser parser = new BasicParser(); CommandLine cmd = parser.parse(options, args); Arrays diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java b/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java index 7004112e4..f34326d67 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/application/OptionsParameter.java @@ -9,9 +9,6 @@ public class OptionsParameter { private boolean paramRequired; private boolean compressed; - public OptionsParameter() { - } - public String getParamName() { return paramName; } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/collection/ApiDescriptor.java b/dhp-common/src/main/java/eu/dnetlib/dhp/collection/ApiDescriptor.java new file mode 100644 index 
000000000..fbbbffcbb --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/collection/ApiDescriptor.java @@ -0,0 +1,48 @@ + +package eu.dnetlib.dhp.collection; + +import java.util.HashMap; +import java.util.Map; + +public class ApiDescriptor { + + private String id; + + private String baseUrl; + + private String protocol; + + private Map params = new HashMap<>(); + + public String getBaseUrl() { + return baseUrl; + } + + public void setBaseUrl(final String baseUrl) { + this.baseUrl = baseUrl; + } + + public String getId() { + return id; + } + + public void setId(final String id) { + this.id = id; + } + + public Map getParams() { + return params; + } + + public void setParams(final Map params) { + this.params = params; + } + + public String getProtocol() { + return protocol; + } + + public void setProtocol(final String protocol) { + this.protocol = protocol; + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java new file mode 100644 index 000000000..a62a0ac79 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java @@ -0,0 +1,61 @@ + +package eu.dnetlib.dhp.common; + +import java.util.Map; + +import com.google.common.collect.Maps; + +public class Constants { + + public static final Map accessRightsCoarMap = Maps.newHashMap(); + public static final Map coarCodeLabelMap = Maps.newHashMap(); + + public static String COAR_ACCESS_RIGHT_SCHEMA = "http://vocabularies.coar-repositories.org/documentation/access_rights/"; + + private Constants() { + } + + static { + accessRightsCoarMap.put("OPEN", "c_abf2"); + accessRightsCoarMap.put("RESTRICTED", "c_16ec"); + accessRightsCoarMap.put("OPEN SOURCE", "c_abf2"); + accessRightsCoarMap.put("CLOSED", "c_14cb"); + accessRightsCoarMap.put("EMBARGO", "c_f1cf"); + } + + static { + coarCodeLabelMap.put("c_abf2", "OPEN"); + coarCodeLabelMap.put("c_16ec", "RESTRICTED"); + coarCodeLabelMap.put("c_14cb", "CLOSED"); + coarCodeLabelMap.put("c_f1cf", "EMBARGO"); + } + + public static final String SEQUENCE_FILE_NAME = "/sequence_file"; + public static final String REPORT_FILE_NAME = "/report"; + public static final String MDSTORE_DATA_PATH = "/store"; + public static final String MDSTORE_SIZE_PATH = "/size"; + + public static final String COLLECTION_MODE = "collectionMode"; + public static final String METADATA_ENCODING = "metadataEncoding"; + public static final String OOZIE_WF_PATH = "oozieWfPath"; + public static final String DNET_MESSAGE_MGR_URL = "dnetMessageManagerURL"; + + public static final String MAX_NUMBER_OF_RETRY = "maxNumberOfRetry"; + public static final String REQUEST_DELAY = "requestDelay"; + public static final String RETRY_DELAY = "retryDelay"; + public static final String CONNECT_TIMEOUT = "connectTimeOut"; + public static final String READ_TIMEOUT = "readTimeOut"; + public static final String FROM_DATE_OVERRIDE = "fromDateOverride"; + public static final String UNTIL_DATE_OVERRIDE = "untilDateOverride"; + + public static final String CONTENT_TOTALITEMS = "TotalItems"; + public static final String CONTENT_INVALIDRECORDS = "InvalidRecords"; + public static final String CONTENT_TRANSFORMEDRECORDS = "transformedItems"; + + // IETF Draft and used by Repositories like ZENODO , not included in APACHE HTTP java packages + // see https://ietf-wg-httpapi.github.io/ratelimit-headers/draft-ietf-httpapi-ratelimit-headers.html + public static final String HTTPHEADER_IETF_DRAFT_RATELIMIT_LIMIT = "X-RateLimit-Limit"; + public static final 
String HTTPHEADER_IETF_DRAFT_RATELIMIT_REMAINING = "X-RateLimit-Remaining"; + public static final String HTTPHEADER_IETF_DRAFT_RATELIMIT_RESET = "X-RateLimit-Reset"; + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java index cedc9bd4d..fabb25f16 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/DbClient.java @@ -14,7 +14,7 @@ public class DbClient implements Closeable { private static final Log log = LogFactory.getLog(DbClient.class); - private Connection connection; + private final Connection connection; public DbClient(final String address, final String login, final String password) { diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java index 0b2cd571f..654fdd5ac 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/HdfsSupport.java @@ -28,7 +28,7 @@ public class HdfsSupport { * @param configuration Configuration of hadoop env */ public static boolean exists(String path, Configuration configuration) { - logger.info("Removing path: {}", path); + logger.info("Checking existence for path: {}", path); return rethrowAsRuntimeException( () -> { Path f = new Path(path); diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java new file mode 100644 index 000000000..eca433e9e --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MakeTarArchive.java @@ -0,0 +1,172 @@ + +package eu.dnetlib.dhp.common; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; +import java.util.Optional; + +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +public class MakeTarArchive implements Serializable { + + private static final Logger log = LoggerFactory.getLogger(MakeTarArchive.class); + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + MakeTarArchive.class + .getResourceAsStream( + "/eu/dnetlib/dhp/common/input_maketar_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + final String outputPath = parser.get("hdfsPath"); + log.info("hdfsPath: {}", outputPath); + + final String hdfsNameNode = parser.get("nameNode"); + log.info("nameNode: {}", hdfsNameNode); + + final String inputPath = parser.get("sourcePath"); + log.info("input path : {}", inputPath); + + final int gBperSplit = Optional + .ofNullable(parser.get("splitSize")) + .map(Integer::valueOf) + .orElse(10); + + Configuration conf = new Configuration(); + conf.set("fs.defaultFS", hdfsNameNode); + + FileSystem fileSystem = FileSystem.get(conf); + + makeTArArchive(fileSystem, inputPath, outputPath, gBperSplit); + + } + + public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath, int gBperSplit) + throws IOException { + + RemoteIterator 
dirIterator = fileSystem.listLocatedStatus(new Path(inputPath)); + + while (dirIterator.hasNext()) { + LocatedFileStatus fileStatus = dirIterator.next(); + + Path p = fileStatus.getPath(); + String pathString = p.toString(); + String entity = pathString.substring(pathString.lastIndexOf("/") + 1); + + MakeTarArchive.tarMaxSize(fileSystem, pathString, outputPath + "/" + entity, entity, gBperSplit); + } + } + + private static TarArchiveOutputStream getTar(FileSystem fileSystem, String outputPath) throws IOException { + Path hdfsWritePath = new Path(outputPath); + if (fileSystem.exists(hdfsWritePath)) { + fileSystem.delete(hdfsWritePath, true); + + } + return new TarArchiveOutputStream(fileSystem.create(hdfsWritePath).getWrappedStream()); + } + + private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dirName) + throws IOException { + + Path hdfsWritePath = new Path(outputPath); + if (fileSystem.exists(hdfsWritePath)) { + fileSystem.delete(hdfsWritePath, true); + + } + try (TarArchiveOutputStream ar = new TarArchiveOutputStream( + fileSystem.create(hdfsWritePath).getWrappedStream())) { + + RemoteIterator iterator = fileSystem + .listFiles( + new Path(inputPath), true); + + while (iterator.hasNext()) { + writeCurrentFile(fileSystem, dirName, iterator, ar, 0); + } + + } + } + + public static void tarMaxSize(FileSystem fileSystem, String inputPath, String outputPath, String dir_name, + int gBperSplit) throws IOException { + final long bytesPerSplit = 1024L * 1024L * 1024L * gBperSplit; + + long sourceSize = fileSystem.getContentSummary(new Path(inputPath)).getSpaceConsumed(); + + if (sourceSize < bytesPerSplit) { + write(fileSystem, inputPath, outputPath + ".tar", dir_name); + } else { + int partNum = 0; + + RemoteIterator fileStatusListIterator = fileSystem + .listFiles( + new Path(inputPath), true); + boolean next = fileStatusListIterator.hasNext(); + while (next) { + try (TarArchiveOutputStream ar = getTar(fileSystem, outputPath + "_" + (partNum + 1) + ".tar")) { + + long currentSize = 0; + while (next && currentSize < bytesPerSplit) { + currentSize = writeCurrentFile(fileSystem, dir_name, fileStatusListIterator, ar, currentSize); + next = fileStatusListIterator.hasNext(); + + } + + partNum += 1; + } + } + } + } + + private static long writeCurrentFile(FileSystem fileSystem, String dirName, + RemoteIterator fileStatusListIterator, + TarArchiveOutputStream ar, long currentSize) throws IOException { + LocatedFileStatus fileStatus = fileStatusListIterator.next(); + + Path p = fileStatus.getPath(); + String pString = p.toString(); + if (!pString.endsWith("_SUCCESS")) { + String name = pString.substring(pString.lastIndexOf("/") + 1); + if (name.startsWith("part-") & name.length() > 10) { + String tmp = name.substring(0, 10); + if (name.contains(".")) { + tmp += name.substring(name.indexOf(".")); + } + name = tmp; + } + TarArchiveEntry entry = new TarArchiveEntry(dirName + "/" + name); + entry.setSize(fileStatus.getLen()); + currentSize += fileStatus.getLen(); + ar.putArchiveEntry(entry); + + InputStream is = fileSystem.open(fileStatus.getPath()); + + BufferedInputStream bis = new BufferedInputStream(is); + + int count; + byte[] data = new byte[1024]; + while ((count = bis.read(data, 0, data.length)) != -1) { + ar.write(data, 0, count); + } + bis.close(); + ar.closeArchiveEntry(); + + } + return currentSize; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java 
new file mode 100644 index 000000000..d06544ae1 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/MdstoreClient.java @@ -0,0 +1,122 @@ + +package eu.dnetlib.dhp.common; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; +import java.util.stream.StreamSupport; + +import org.apache.commons.lang3.StringUtils; +import org.bson.Document; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.collect.Iterables; +import com.mongodb.BasicDBObject; +import com.mongodb.MongoClient; +import com.mongodb.MongoClientURI; +import com.mongodb.QueryBuilder; +import com.mongodb.client.FindIterable; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.MongoDatabase; + +public class MdstoreClient implements Closeable { + + private static final Logger log = LoggerFactory.getLogger(MdstoreClient.class); + + private final MongoClient client; + private final MongoDatabase db; + + private static final String COLL_METADATA = "metadata"; + private static final String COLL_METADATA_MANAGER = "metadataManager"; + + public MdstoreClient(final String baseUrl, final String dbName) { + this.client = new MongoClient(new MongoClientURI(baseUrl)); + this.db = getDb(client, dbName); + } + + public MongoCollection mdStore(final String mdId) { + BasicDBObject query = (BasicDBObject) QueryBuilder.start("mdId").is(mdId).get(); + + log.info("querying current mdId: {}", query.toJson()); + + final String currentId = Optional + .ofNullable(getColl(db, COLL_METADATA_MANAGER, true).find(query)) + .map(FindIterable::first) + .map(d -> d.getString("currentId")) + .orElseThrow(() -> new IllegalArgumentException("cannot find current mdstore id for: " + mdId)); + + log.info("currentId: {}", currentId); + + return getColl(db, currentId, true); + } + + public Map validCollections( + final String mdFormat, final String mdLayout, final String mdInterpretation) { + + final Map transactions = new HashMap<>(); + for (final Document entry : getColl(db, COLL_METADATA_MANAGER, true).find()) { + final String mdId = entry.getString("mdId"); + final String currentId = entry.getString("currentId"); + if (StringUtils.isNoneBlank(mdId, currentId)) { + transactions.put(mdId, currentId); + } + } + + final Map res = new HashMap<>(); + for (final Document entry : getColl(db, COLL_METADATA, true).find()) { + if (entry.getString("format").equals(mdFormat) + && entry.getString("layout").equals(mdLayout) + && entry.getString("interpretation").equals(mdInterpretation) + && transactions.containsKey(entry.getString("mdId"))) { + res.put(entry.getString("mdId"), transactions.get(entry.getString("mdId"))); + } + } + + return res; + } + + private MongoDatabase getDb(final MongoClient client, final String dbName) { + if (!Iterables.contains(client.listDatabaseNames(), dbName)) { + final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress()); + log.warn(err); + throw new IllegalArgumentException(err); + } + return client.getDatabase(dbName); + } + + private MongoCollection getColl( + final MongoDatabase db, final String collName, final boolean abortIfMissing) { + if (!Iterables.contains(db.listCollectionNames(), collName)) { + final String err = String + .format( + String.format("Missing collection '%s' in database '%s'", collName, db.getName())); + log.warn(err); + if (abortIfMissing) { + throw new IllegalArgumentException(err); + } else { + return null; + } + 
} + return db.getCollection(collName); + } + + public Iterable listRecords(final String collName) { + final MongoCollection coll = getColl(db, collName, false); + return coll == null + ? new ArrayList<>() + : () -> StreamSupport + .stream(coll.find().spliterator(), false) + .filter(e -> e.containsKey("body")) + .map(e -> e.getString("body")) + .iterator(); + } + + @Override + public void close() throws IOException { + client.close(); + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java index 6e02ca614..fac9a7565 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/PacePerson.java @@ -1,18 +1,18 @@ package eu.dnetlib.dhp.common; +import java.io.IOException; import java.nio.charset.StandardCharsets; import java.text.Normalizer; -import java.util.HashSet; -import java.util.List; -import java.util.Set; +import java.util.*; +import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.text.WordUtils; +import com.ctc.wstx.dtd.LargePrefixedNameSet; import com.google.common.base.Joiner; import com.google.common.base.Splitter; -import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.hash.Hashing; @@ -24,13 +24,24 @@ import com.google.common.hash.Hashing; */ public class PacePerson { - private static final String UTF8 = "UTF-8"; private List name = Lists.newArrayList(); private List surname = Lists.newArrayList(); private List fullname = Lists.newArrayList(); private final String original; - private static Set particles = null; + private static Set particles; + + static { + try { + particles = new HashSet<>(IOUtils + .readLines( + PacePerson.class + .getResourceAsStream( + "/eu/dnetlib/dhp/common/name_particles.txt"))); + } catch (IOException e) { + throw new RuntimeException(e); + } + } /** * Capitalizes a string @@ -38,29 +49,20 @@ public class PacePerson { * @param s the string to capitalize * @return the input string with capital letter */ - public static final String capitalize(final String s) { + public static String capitalize(final String s) { + if (particles.contains(s)) { + return s; + } return WordUtils.capitalize(s.toLowerCase(), ' ', '-'); } /** * Adds a dot to a string with length equals to 1 */ - public static final String dotAbbreviations(final String s) { + public static String dotAbbreviations(final String s) { return s.length() == 1 ? s + "." : s; } - public static Set loadFromClasspath(final String classpath) { - final Set h = new HashSet<>(); - try { - for (final String s : IOUtils.readLines(PacePerson.class.getResourceAsStream(classpath))) { - h.add(s); - } - } catch (final Throwable e) { - return new HashSet<>(); - } - return h; - } - /** * The constructor of the class. It fills the fields of the class basing on the input fullname. 
* @@ -129,10 +131,6 @@ public class PacePerson { } private List splitTerms(final String s) { - if (particles == null) { - particles = loadFromClasspath("/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt"); - } - final List list = Lists.newArrayList(); for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) { if (!particles.contains(part.toLowerCase())) { @@ -188,17 +186,36 @@ public class PacePerson { } public List getCapitalFirstnames() { - return Lists - .newArrayList( - Iterables.transform(getNameWithAbbreviations(), PacePerson::capitalize)); + return Optional + .ofNullable(getNameWithAbbreviations()) + .map( + name -> name + .stream() + .map(PacePerson::capitalize) + .collect(Collectors.toList())) + .orElse(new ArrayList<>()); } public List getCapitalSurname() { - return Lists.newArrayList(Iterables.transform(surname, PacePerson::capitalize)); + return Optional + .ofNullable(getSurname()) + .map( + surname -> surname + .stream() + .map(PacePerson::capitalize) + .collect(Collectors.toList())) + .orElse(new ArrayList<>()); } public List getNameWithAbbreviations() { - return Lists.newArrayList(Iterables.transform(name, PacePerson::dotAbbreviations)); + return Optional + .ofNullable(getName()) + .map( + name -> name + .stream() + .map(PacePerson::dotAbbreviations) + .collect(Collectors.toList())) + .orElse(new ArrayList<>()); } public boolean isAccurate() { diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/aggregation/AggregatorReport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/aggregation/AggregatorReport.java new file mode 100644 index 000000000..c5926848e --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/aggregation/AggregatorReport.java @@ -0,0 +1,45 @@ + +package eu.dnetlib.dhp.common.aggregation; + +import java.io.Closeable; +import java.io.IOException; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Objects; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.message.MessageSender; +import eu.dnetlib.dhp.utils.DHPUtils; + +public class AggregatorReport extends LinkedHashMap implements Closeable { + + private static final Logger log = LoggerFactory.getLogger(AggregatorReport.class); + + private transient MessageSender messageSender; + + public AggregatorReport() { + } + + public AggregatorReport(MessageSender messageSender) { + this.messageSender = messageSender; + } + + public void ongoing(Long current, Long total) { + messageSender.sendMessage(current, total); + } + + @Override + public void close() throws IOException { + if (Objects.nonNull(messageSender)) { + log.info("closing report: "); + this.forEach((k, v) -> log.info("{} - {}", k, v)); + + Map m = new HashMap<>(); + m.put(getClass().getSimpleName().toLowerCase(), DHPUtils.MAPPER.writeValueAsString(values())); + messageSender.sendReport(m); + } + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/InputStreamRequestBody.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/InputStreamRequestBody.java index c3f393436..c127783e5 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/InputStreamRequestBody.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/InputStreamRequestBody.java @@ -13,9 +13,9 @@ import okio.Source; public class InputStreamRequestBody extends RequestBody { - private InputStream inputStream; - private MediaType mediaType; - private long lenght; + private final InputStream inputStream; + private final MediaType mediaType; + private 
final long lenght; public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) { diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java index f2dd4f0ac..63f890a10 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/ZenodoAPIClient.java @@ -3,6 +3,10 @@ package eu.dnetlib.dhp.common.api; import java.io.*; import java.io.IOException; +import java.util.concurrent.TimeUnit; + +import org.apache.http.HttpHeaders; +import org.apache.http.entity.ContentType; import com.google.gson.Gson; @@ -42,7 +46,7 @@ public class ZenodoAPIClient implements Serializable { this.deposition_id = deposition_id; } - public ZenodoAPIClient(String urlString, String access_token) throws IOException { + public ZenodoAPIClient(String urlString, String access_token) { this.urlString = urlString; this.access_token = access_token; @@ -50,19 +54,20 @@ public class ZenodoAPIClient implements Serializable { /** * Brand new deposition in Zenodo. It sets the deposition_id and the bucket where to store the files to upload + * * @return response code * @throws IOException */ public int newDeposition() throws IOException { String json = "{}"; - OkHttpClient httpClient = new OkHttpClient(); + OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build(); - RequestBody body = RequestBody.create(MEDIA_TYPE_JSON, json); + RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON); Request request = new Request.Builder() .url(urlString) - .addHeader("Content-Type", "application/json") // add request headers - .addHeader("Authorization", "Bearer " + access_token) + .addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers + .addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token) .post(body) .build(); @@ -86,18 +91,23 @@ public class ZenodoAPIClient implements Serializable { /** * Upload files in Zenodo. 
+ * * @param is the inputStream for the file to upload * @param file_name the name of the file as it will appear on Zenodo * @param len the size of the file * @return the response code */ public int uploadIS(InputStream is, String file_name, long len) throws IOException { - OkHttpClient httpClient = new OkHttpClient(); + OkHttpClient httpClient = new OkHttpClient.Builder() + .writeTimeout(600, TimeUnit.SECONDS) + .readTimeout(600, TimeUnit.SECONDS) + .connectTimeout(600, TimeUnit.SECONDS) + .build(); Request request = new Request.Builder() .url(bucket + "/" + file_name) - .addHeader("Content-Type", "application/zip") // add request headers - .addHeader("Authorization", "Bearer " + access_token) + .addHeader(HttpHeaders.CONTENT_TYPE, "application/zip") // add request headers + .addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token) .put(InputStreamRequestBody.create(MEDIA_TYPE_ZIP, is, len)) .build(); @@ -110,20 +120,21 @@ public class ZenodoAPIClient implements Serializable { /** * Associates metadata information to the current deposition + * * @param metadata the metadata * @return response code * @throws IOException */ public int sendMretadata(String metadata) throws IOException { - OkHttpClient httpClient = new OkHttpClient(); + OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build(); - RequestBody body = RequestBody.create(MEDIA_TYPE_JSON, metadata); + RequestBody body = RequestBody.create(metadata, MEDIA_TYPE_JSON); Request request = new Request.Builder() .url(urlString + "/" + deposition_id) - .addHeader("Content-Type", "application/json") // add request headers - .addHeader("Authorization", "Bearer " + access_token) + .addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers + .addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token) .put(body) .build(); @@ -140,6 +151,7 @@ public class ZenodoAPIClient implements Serializable { /** * To publish the current deposition. It works for both new deposition or new version of an old deposition + * * @return response code * @throws IOException */ @@ -147,12 +159,14 @@ public class ZenodoAPIClient implements Serializable { String json = "{}"; - OkHttpClient httpClient = new OkHttpClient(); + OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build(); + + RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON); Request request = new Request.Builder() .url(urlString + "/" + deposition_id + "/actions/publish") .addHeader("Authorization", "Bearer " + access_token) - .post(RequestBody.create(MEDIA_TYPE_JSON, json)) + .post(body) .build(); try (Response response = httpClient.newCall(request).execute()) { @@ -166,25 +180,28 @@ public class ZenodoAPIClient implements Serializable { } /** - * To create a new version of an already published deposition. - * It sets the deposition_id and the bucket to be used for the new version. - * @param concept_rec_id the concept record id of the deposition for which to create a new version. It is - * the last part of the url for the DOI Zenodo suggests to use to cite all versions: - * DOI: 10.xxx/zenodo.656930 concept_rec_id = 656930 + * To create a new version of an already published deposition. It sets the deposition_id and the bucket to be used + * for the new version. + * + * @param concept_rec_id the concept record id of the deposition for which to create a new version. 
It is the last + part of the url for the DOI Zenodo suggests to use to cite all versions: DOI: 10.xxx/zenodo.656930 + * concept_rec_id = 656930 * @return response code * @throws IOException * @throws MissingConceptDoiException */ public int newVersion(String concept_rec_id) throws IOException, MissingConceptDoiException { - setDepositionId(concept_rec_id); + setDepositionId(concept_rec_id, 1); String json = "{}"; - OkHttpClient httpClient = new OkHttpClient(); + OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build(); + + RequestBody body = RequestBody.create(json, MEDIA_TYPE_JSON); Request request = new Request.Builder() .url(urlString + "/" + deposition_id + "/actions/newversion") - .addHeader("Authorization", "Bearer " + access_token) - .post(RequestBody.create(MEDIA_TYPE_JSON, json)) + .addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token) + .post(body) .build(); try (Response response = httpClient.newCall(request).execute()) { @@ -201,9 +218,45 @@ public class ZenodoAPIClient implements Serializable { } } - private void setDepositionId(String concept_rec_id) throws IOException, MissingConceptDoiException { + /** + * To finish uploading a not yet published version or deposition. It sets the deposition_id and the bucket to be + * used. + * + * @param deposition_id the deposition id of the not yet published upload + * @return response code + * @throws IOException + * @throws MissingConceptDoiException + */ + public int uploadOpenDeposition(String deposition_id) throws IOException, MissingConceptDoiException { - ZenodoModelList zenodoModelList = new Gson().fromJson(getPrevDepositions(), ZenodoModelList.class); + this.deposition_id = deposition_id; + + OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build(); + + Request request = new Request.Builder() + .url(urlString + "/" + deposition_id) + .addHeader("Authorization", "Bearer " + access_token) + .build(); + + try (Response response = httpClient.newCall(request).execute()) { + + if (!response.isSuccessful()) + throw new IOException("Unexpected code " + response + response.body().string()); + + ZenodoModel zenodoModel = new Gson().fromJson(response.body().string(), ZenodoModel.class); + bucket = zenodoModel.getLinks().getBucket(); + return response.code(); + + } + + } + + private void setDepositionId(String concept_rec_id, Integer page) throws IOException, MissingConceptDoiException { + + ZenodoModelList zenodoModelList = new Gson() + .fromJson(getPrevDepositions(String.valueOf(page)), ZenodoModelList.class); for (ZenodoModel zm : zenodoModelList) { if (zm.getConceptrecid().equals(concept_rec_id)) { @@ -211,18 +264,25 @@ public class ZenodoAPIClient implements Serializable { return; } } - - throw new MissingConceptDoiException("The concept record id specified was missing in the list of depositions"); + if (zenodoModelList.size() == 0) + throw new MissingConceptDoiException( + "The concept record id specified was missing in the list of depositions"); + setDepositionId(concept_rec_id, page + 1); } - private String getPrevDepositions() throws IOException { - OkHttpClient httpClient = new OkHttpClient(); + private String getPrevDepositions(String page) throws IOException { + + OkHttpClient httpClient = new OkHttpClient.Builder().connectTimeout(600, TimeUnit.SECONDS).build(); + + HttpUrl.Builder urlBuilder = HttpUrl.parse(urlString).newBuilder(); + urlBuilder.addQueryParameter("page", page); + String url =
urlBuilder.build().toString(); Request request = new Request.Builder() - .url(urlString) - .addHeader("Content-Type", "application/json") // add request headers - .addHeader("Authorization", "Bearer " + access_token) + .url(url) + .addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers + .addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token) .get() .build(); @@ -238,12 +298,14 @@ public class ZenodoAPIClient implements Serializable { } private String getBucket(String url) throws IOException { - OkHttpClient httpClient = new OkHttpClient(); + OkHttpClient httpClient = new OkHttpClient.Builder() + .connectTimeout(600, TimeUnit.SECONDS) + .build(); Request request = new Request.Builder() .url(url) - .addHeader("Content-Type", "application/json") // add request headers - .addHeader("Authorization", "Bearer " + access_token) + .addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers + .addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token) .get() .build(); diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/zenodo/Creator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/zenodo/Creator.java index c03762693..c14af55b6 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/zenodo/Creator.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/zenodo/Creator.java @@ -32,13 +32,13 @@ public class Creator { public static Creator newInstance(String name, String affiliation, String orcid) { Creator c = new Creator(); - if (!(name == null)) { + if (name != null) { c.name = name; } - if (!(affiliation == null)) { + if (affiliation != null) { c.affiliation = affiliation; } - if (!(orcid == null)) { + if (orcid != null) { c.orcid = orcid; } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/zenodo/File.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/zenodo/File.java index c7428de7d..509f444b9 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/zenodo/File.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/api/zenodo/File.java @@ -3,17 +3,12 @@ package eu.dnetlib.dhp.common.api.zenodo; import java.io.Serializable; -import net.minidev.json.annotate.JsonIgnore; - public class File implements Serializable { private String checksum; private String filename; private long filesize; private String id; - @JsonIgnore - // private Links links; - public String getChecksum() { return checksum; } @@ -46,13 +41,4 @@ public class File implements Serializable { this.id = id; } -// @JsonIgnore -// public Links getLinks() { -// return links; -// } -// -// @JsonIgnore -// public void setLinks(Links links) { -// this.links = links; -// } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/CollectorException.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/CollectorException.java new file mode 100644 index 000000000..5d94c2f89 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/CollectorException.java @@ -0,0 +1,32 @@ + +package eu.dnetlib.dhp.common.collection; + +public class CollectorException extends Exception { + + /** */ + private static final long serialVersionUID = -290723075076039757L; + + public CollectorException() { + super(); + } + + public CollectorException( + final String message, + final Throwable cause, + final boolean enableSuppression, + final boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } + + public 
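// End-to-end sketch of the deposition flow as reworked above: create a deposition,
// stream a file into its bucket, attach metadata, publish. Endpoint, token, file and
// metadata JSON are placeholders; note the client now applies 600s timeouts throughout.
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;

import eu.dnetlib.dhp.common.api.ZenodoAPIClient;

public class ZenodoUploadExample {

    public static void main(String[] args) throws Exception {
        ZenodoAPIClient client = new ZenodoAPIClient(
            "https://sandbox.zenodo.org/api/deposit/depositions", // assumed sandbox endpoint
            "ACCESS_TOKEN"); // placeholder
        System.out.println("newDeposition: " + client.newDeposition());
        File dump = new File("/tmp/dump.zip"); // placeholder payload
        try (InputStream is = new FileInputStream(dump)) {
            client.uploadIS(is, dump.getName(), dump.length());
        }
        client.sendMretadata("{\"metadata\":{\"title\":\"Graph dump\"}}"); // abbreviated metadata
        System.out.println("publish: " + client.publish());
    }
}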
CollectorException(final String message, final Throwable cause) { + super(message, cause); + } + + public CollectorException(final String message) { + super(message); + } + + public CollectorException(final Throwable cause) { + super(cause); + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/DecompressTarGz.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/DecompressTarGz.java new file mode 100644 index 000000000..8bcf14ba4 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/DecompressTarGz.java @@ -0,0 +1,40 @@ + +package eu.dnetlib.dhp.common.collection; + +import java.io.BufferedOutputStream; +import java.io.IOException; +import java.util.zip.GZIPOutputStream; + +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +public class DecompressTarGz { + + public static void doExtract(FileSystem fs, String outputPath, String tarGzPath) throws IOException { + + FSDataInputStream inputFileStream = fs.open(new Path(tarGzPath)); + try (TarArchiveInputStream tais = new TarArchiveInputStream( + new GzipCompressorInputStream(inputFileStream))) { + TarArchiveEntry entry = null; + while ((entry = tais.getNextTarEntry()) != null) { + if (!entry.isDirectory()) { + try ( + FSDataOutputStream out = fs + .create(new Path(outputPath.concat(entry.getName()).concat(".gz"))); + GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) { + + IOUtils.copy(tais, gzipOs); + + } + + } + } + } + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/GetCSV.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/GetCSV.java new file mode 100644 index 000000000..44e19142c --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/GetCSV.java @@ -0,0 +1,56 @@ + +package eu.dnetlib.dhp.common.collection; + +import java.io.*; +import java.nio.charset.StandardCharsets; +import java.util.List; + +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.opencsv.bean.CsvToBeanBuilder; + +public class GetCSV { + + public static final char DEFAULT_DELIMITER = ','; + + private GetCSV() { + } + + public static void getCsv(FileSystem fileSystem, BufferedReader reader, String hdfsPath, + String modelClass) throws IOException, ClassNotFoundException { + getCsv(fileSystem, reader, hdfsPath, modelClass, DEFAULT_DELIMITER); + } + + public static void getCsv(FileSystem fileSystem, Reader reader, String hdfsPath, + String modelClass, char delimiter) throws IOException, ClassNotFoundException { + + Path hdfsWritePath = new Path(hdfsPath); + FSDataOutputStream fsDataOutputStream = null; + if (fileSystem.exists(hdfsWritePath)) { + fileSystem.delete(hdfsWritePath, false); + } + fsDataOutputStream = fileSystem.create(hdfsWritePath); + + try (BufferedWriter writer = new BufferedWriter( + new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8))) { + + final ObjectMapper mapper = new ObjectMapper(); + + @SuppressWarnings("unchecked") + final List lines = new CsvToBeanBuilder(reader) + 
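// Sketch of the new DecompressTarGz helper: each entry of an HDFS-resident tar.gz is
// re-compressed individually to <outputPath><entryName>.gz. The filesystem URI and the
// paths are assumptions.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import eu.dnetlib.dhp.common.collection.DecompressTarGz;

public class DecompressExample {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://nameservice1"); // assumed cluster URI
        FileSystem fs = FileSystem.get(conf);
        DecompressTarGz.doExtract(fs, "/tmp/extracted/", "/data/dump.tar.gz"); // placeholder paths
    }
}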
.withType(Class.forName(modelClass)) + .withSeparator(delimiter) + .build() + .parse(); + + for (Object line : lines) { + writer.write(mapper.writeValueAsString(line)); + writer.newLine(); + } + } + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpClientParams.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpClientParams.java new file mode 100644 index 000000000..6fcec00dd --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpClientParams.java @@ -0,0 +1,94 @@ + +package eu.dnetlib.dhp.common.collection; + +/** + * Bundles the http connection parameters driving the client behaviour. + */ +public class HttpClientParams { + + // Defaults + public static int _maxNumberOfRetry = 3; + public static int _requestDelay = 0; // milliseconds + public static int _retryDelay = 10; // seconds + public static int _connectTimeOut = 10; // seconds + public static int _readTimeOut = 30; // seconds + + /** + * Maximum number of allowed retries before failing + */ + private int maxNumberOfRetry; + + /** + * Delay between requests (Milliseconds) + */ + private int requestDelay; + + /** + * Time to wait after a failure before retrying (Seconds) + */ + private int retryDelay; + + /** + * Connect timeout (Seconds) + */ + private int connectTimeOut; + + /** + * Read timeout (Seconds) + */ + private int readTimeOut; + + public HttpClientParams() { + this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut); + } + + public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut, + int readTimeOut) { + this.maxNumberOfRetry = maxNumberOfRetry; + this.requestDelay = requestDelay; + this.retryDelay = retryDelay; + this.connectTimeOut = connectTimeOut; + this.readTimeOut = readTimeOut; + } + + public int getMaxNumberOfRetry() { + return maxNumberOfRetry; + } + + public void setMaxNumberOfRetry(int maxNumberOfRetry) { + this.maxNumberOfRetry = maxNumberOfRetry; + } + + public int getRequestDelay() { + return requestDelay; + } + + public void setRequestDelay(int requestDelay) { + this.requestDelay = requestDelay; + } + + public int getRetryDelay() { + return retryDelay; + } + + public void setRetryDelay(int retryDelay) { + this.retryDelay = retryDelay; + } + + public void setConnectTimeOut(int connectTimeOut) { + this.connectTimeOut = connectTimeOut; + } + + public int getConnectTimeOut() { + return connectTimeOut; + } + + public int getReadTimeOut() { + return readTimeOut; + } + + public void setReadTimeOut(int readTimeOut) { + this.readTimeOut = readTimeOut; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpConnector2.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpConnector2.java new file mode 100644 index 000000000..dd46ab1f4 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/collection/HttpConnector2.java @@ -0,0 +1,263 @@ + +package eu.dnetlib.dhp.common.collection; + +import static eu.dnetlib.dhp.utils.DHPUtils.*; + +import java.io.IOException; +import java.io.InputStream; +import java.net.*; +import java.util.List; +import java.util.Map; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.math.NumberUtils; +import org.apache.http.HttpHeaders; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.common.Constants; +import eu.dnetlib.dhp.common.aggregation.AggregatorReport; + +/** + * Migrated from
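// Sketch of the GetCSV helper just added: OpenCSV maps each row onto a bean of the given
// model class, and the beans are written to HDFS as JSON lines. The model class name and
// both paths are hypothetical.
import java.io.BufferedReader;
import java.io.FileReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

import eu.dnetlib.dhp.common.collection.GetCSV;

public class GetCSVExample {

    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        try (BufferedReader reader = new BufferedReader(new FileReader("/tmp/input.csv"))) {
            // default ',' delimiter; an overload accepts a custom separator char
            GetCSV.getCsv(fs, reader, "/data/csv-as-json", "eu.dnetlib.dhp.SomeCsvModel"); // hypothetical model class
        }
    }
}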
https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpConnector.java + * + * @author jochen, michele, andrea, alessia, claudio, andreas + */ +public class HttpConnector2 { + + private static final Logger log = LoggerFactory.getLogger(HttpConnector2.class); + + private static final String REPORT_PREFIX = "http:"; + + private HttpClientParams clientParams; + + private String responseType = null; + + private static final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; + + public HttpConnector2() { + this(new HttpClientParams()); + } + + public HttpConnector2(HttpClientParams clientParams) { + this.clientParams = clientParams; + CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); + } + + /** + * @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport) + */ + public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorException { + return IOUtils.toInputStream(getInputSource(requestUrl)); + } + + /** + * @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport) + */ + public String getInputSource(final String requestUrl) throws CollectorException { + return attemptDownloadAsString(requestUrl, 1, new AggregatorReport()); + } + + /** + * Given the URL returns the content via HTTP GET + * + * @param requestUrl the URL + * @param report the list of errors + * @return the content of the downloaded resource + * @throws CollectorException when retrying more than maxNumberOfRetry times + */ + public String getInputSource(final String requestUrl, AggregatorReport report) + throws CollectorException { + return attemptDownloadAsString(requestUrl, 1, report); + } + + private String attemptDownloadAsString(final String requestUrl, final int retryNumber, + final AggregatorReport report) throws CollectorException { + + try (InputStream s = attemptDownload(requestUrl, retryNumber, report)) { + return IOUtils.toString(s); + } catch (IOException e) { + log.error(e.getMessage(), e); + throw new CollectorException(e); + } + } + + private InputStream attemptDownload(final String requestUrl, final int retryNumber, + final AggregatorReport report) throws CollectorException, IOException { + + if (retryNumber > getClientParams().getMaxNumberOfRetry()) { + final String msg = String + .format( + "Max number of retries (%s/%s) exceeded, failing.", + retryNumber, getClientParams().getMaxNumberOfRetry()); + log.error(msg); + throw new CollectorException(msg); + } + + log.info("Request attempt {} [{}]", retryNumber, requestUrl); + + InputStream input = null; + + try { + if (getClientParams().getRequestDelay() > 0) { + backoffAndSleep(getClientParams().getRequestDelay()); + } + final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); + urlConn.setInstanceFollowRedirects(false); + urlConn.setReadTimeout(getClientParams().getReadTimeOut() * 1000); + urlConn.setConnectTimeout(getClientParams().getConnectTimeOut() * 1000); + urlConn.addRequestProperty(HttpHeaders.USER_AGENT, userAgent); + + if (log.isDebugEnabled()) { + logHeaderFields(urlConn); + } + + int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); + String rateLimit = urlConn.getHeaderField(Constants.HTTPHEADER_IETF_DRAFT_RATELIMIT_LIMIT); + String rateRemaining = urlConn.getHeaderField(Constants.HTTPHEADER_IETF_DRAFT_RATELIMIT_REMAINING); + + if ((rateLimit != null) && (rateRemaining != null) && 
(Integer.parseInt(rateRemaining) < 2)) { + if (retryAfter > 0) { + backoffAndSleep(retryAfter); + } else { + backoffAndSleep(1000); + } + } + + if (is2xx(urlConn.getResponseCode())) { + input = urlConn.getInputStream(); + responseType = urlConn.getContentType(); + return input; + } + if (is3xx(urlConn.getResponseCode())) { + // REDIRECTS + final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); + log.info("The requested url has been moved to {}", newUrl); + report + .put( + REPORT_PREFIX + urlConn.getResponseCode(), + String.format("Moved to: %s", newUrl)); + urlConn.disconnect(); + if (retryAfter > 0) { + backoffAndSleep(retryAfter); + } + return attemptDownload(newUrl, retryNumber + 1, report); + } + if (is4xx(urlConn.getResponseCode()) || is5xx(urlConn.getResponseCode())) { + switch (urlConn.getResponseCode()) { + case HttpURLConnection.HTTP_NOT_FOUND: + case HttpURLConnection.HTTP_BAD_GATEWAY: + case HttpURLConnection.HTTP_UNAVAILABLE: + case HttpURLConnection.HTTP_GATEWAY_TIMEOUT: + if (retryAfter > 0) { + log + .warn( + "{} - waiting and repeating request after suggested retry-after {} sec.", + requestUrl, retryAfter); + backoffAndSleep(retryAfter * 1000); + } else { + log + .warn( + "{} - waiting and repeating request after default delay of {} sec.", + requestUrl, getClientParams().getRetryDelay()); + backoffAndSleep(retryNumber * getClientParams().getRetryDelay() * 1000); + } + report.put(REPORT_PREFIX + urlConn.getResponseCode(), requestUrl); + urlConn.disconnect(); + return attemptDownload(requestUrl, retryNumber + 1, report); + default: + report + .put( + REPORT_PREFIX + urlConn.getResponseCode(), + String + .format( + "%s Error: %s", requestUrl, urlConn.getResponseMessage())); + throw new CollectorException(urlConn.getResponseCode() + " error " + report); + } + } + throw new CollectorException( + String + .format( + "Unexpected status code: %s errors: %s", urlConn.getResponseCode(), + MAPPER.writeValueAsString(report))); + } catch (MalformedURLException | UnknownHostException e) { + log.error(e.getMessage(), e); + report.put(e.getClass().getName(), e.getMessage()); + throw new CollectorException(e.getMessage(), e); + } catch (SocketTimeoutException | SocketException e) { + log.error(e.getMessage(), e); + report.put(e.getClass().getName(), e.getMessage()); + backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000); + return attemptDownload(requestUrl, retryNumber + 1, report); + } + } + + private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { + log.debug("StatusCode: {}", urlConn.getResponseMessage()); + + for (Map.Entry> e : urlConn.getHeaderFields().entrySet()) { + if (e.getKey() != null) { + for (String v : e.getValue()) { + log.debug(" key: {} - value: {}", e.getKey(), v); + } + } + } + } + + private void backoffAndSleep(int sleepTimeMs) throws CollectorException { + log.info("I'm going to sleep for {}ms", sleepTimeMs); + try { + Thread.sleep(sleepTimeMs); + } catch (InterruptedException e) { + log.error(e.getMessage(), e); + throw new CollectorException(e); + } + } + + private int obtainRetryAfter(final Map> headerMap) { + for (String key : headerMap.keySet()) { + if ((key != null) && key.equalsIgnoreCase(HttpHeaders.RETRY_AFTER) && (!headerMap.get(key).isEmpty()) + && NumberUtils.isCreatable(headerMap.get(key).get(0))) { + return Integer.parseInt(headerMap.get(key).get(0)) + 10; + } + } + return -1; + } + + private String obtainNewLocation(final Map> headerMap) throws CollectorException { + for (String key : 
headerMap.keySet()) { + if ((key != null) && key.equalsIgnoreCase(HttpHeaders.LOCATION) && (headerMap.get(key).size() > 0)) { + return headerMap.get(key).get(0); + } + } + throw new CollectorException("The requested url has been MOVED, but 'location' param is MISSING"); + } + + private boolean is2xx(final int statusCode) { + return statusCode >= 200 && statusCode <= 299; + } + + private boolean is4xx(final int statusCode) { + return statusCode >= 400 && statusCode <= 499; + } + + private boolean is3xx(final int statusCode) { + return statusCode >= 300 && statusCode <= 399; + } + + private boolean is5xx(final int statusCode) { + return statusCode >= 500 && statusCode <= 599; + } + + public String getResponseType() { + return responseType; + } + + public HttpClientParams getClientParams() { + return clientParams; + } + + public void setClientParams(HttpClientParams clientParams) { + this.clientParams = clientParams; + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java new file mode 100644 index 000000000..af6926cc7 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/rest/DNetRestClient.java @@ -0,0 +1,75 @@ + +package eu.dnetlib.dhp.common.rest; + +import java.io.IOException; +import java.util.Arrays; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.client.methods.HttpPost; +import org.apache.http.client.methods.HttpUriRequest; +import org.apache.http.entity.StringEntity; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +public class DNetRestClient { + + private static final Logger log = LoggerFactory.getLogger(DNetRestClient.class); + + private static final ObjectMapper mapper = new ObjectMapper(); + + private DNetRestClient() { + } + + public static T doGET(final String url, Class clazz) throws Exception { + final HttpGet httpGet = new HttpGet(url); + return doHTTPRequest(httpGet, clazz); + } + + public static String doGET(final String url) throws IOException { + final HttpGet httpGet = new HttpGet(url); + return doHTTPRequest(httpGet); + } + + public static String doPOST(final String url, V objParam) throws IOException { + final HttpPost httpPost = new HttpPost(url); + + if (objParam != null) { + final StringEntity entity = new StringEntity(mapper.writeValueAsString(objParam)); + httpPost.setEntity(entity); + httpPost.setHeader("Accept", "application/json"); + httpPost.setHeader("Content-type", "application/json"); + } + return doHTTPRequest(httpPost); + } + + public static T doPOST(final String url, V objParam, Class clazz) throws IOException { + return mapper.readValue(doPOST(url, objParam), clazz); + } + + private static String doHTTPRequest(final HttpUriRequest r) throws IOException { + try (CloseableHttpClient client = HttpClients.createDefault()) { + + log.info("performing HTTP request, method {} on URI {}", r.getMethod(), r.getURI().toString()); + log + .info( + "request headers: {}", + Arrays + .asList(r.getAllHeaders()) + .stream() + .map(h -> h.getName() + ":" + h.getValue()) + .collect(Collectors.joining(","))); + + return IOUtils.toString(client.execute(r).getEntity().getContent()); + } + } + + private static T doHTTPRequest(final HttpUriRequest r, Class clazz) throws Exception { + 
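// Usage sketch for HttpConnector2 with tuned HttpClientParams and an AggregatorReport
// collecting per-status-code evidence; the URL is a placeholder.
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.HttpClientParams;
import eu.dnetlib.dhp.common.collection.HttpConnector2;

public class ConnectorExample {

    public static void main(String[] args) throws Exception {
        HttpClientParams params = new HttpClientParams();
        params.setMaxNumberOfRetry(5);
        params.setRequestDelay(200); // ms between consecutive requests
        HttpConnector2 connector = new HttpConnector2(params);
        AggregatorReport report = new AggregatorReport();
        String body = connector.getInputSource("https://api.example.org/records", report); // placeholder URL
        System.out.println(body.length() + " chars, content-type " + connector.getResponseType());
    }
}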
return mapper.readValue(doHTTPRequest(r), clazz); + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java new file mode 100644 index 000000000..3a8df5c9e --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java @@ -0,0 +1,92 @@ + +package eu.dnetlib.dhp.common.vocabulary; + +import java.io.Serializable; +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; + +import org.apache.commons.lang3.StringUtils; + +import com.google.common.collect.Maps; + +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; + +public class Vocabulary implements Serializable { + + private final String id; + private final String name; + + /** + * Code to Term mappings for this Vocabulary. + */ + private final Map terms = new HashMap<>(); + + /** + * Synonym to Code mappings for this Vocabulary. + */ + private final Map synonyms = Maps.newHashMap(); + + public Vocabulary(final String id, final String name) { + this.id = id; + this.name = name; + } + + public String getId() { + return id; + } + + public String getName() { + return name; + } + + protected Map getTerms() { + return terms; + } + + public VocabularyTerm getTerm(final String id) { + return Optional.ofNullable(id).map(String::toLowerCase).map(terms::get).orElse(null); + } + + protected void addTerm(final String id, final String name) { + terms.put(id.toLowerCase(), new VocabularyTerm(id, name)); + } + + protected boolean termExists(final String id) { + return terms.containsKey(id.toLowerCase()); + } + + protected void addSynonym(final String syn, final String termCode) { + synonyms.put(syn, termCode.toLowerCase()); + } + + public VocabularyTerm getTermBySynonym(final String syn) { + return getTerm(synonyms.get(syn.toLowerCase())); + } + + public Qualifier getTermAsQualifier(final String termId) { + if (StringUtils.isBlank(termId)) { + return OafMapperUtils.unknown(getId(), getName()); + } else if (termExists(termId)) { + final VocabularyTerm t = getTerm(termId); + return OafMapperUtils.qualifier(t.getId(), t.getName(), getId(), getName()); + } else { + return OafMapperUtils.qualifier(termId, termId, getId(), getName()); + } + } + + public Qualifier getSynonymAsQualifier(final String syn) { + return Optional + .ofNullable(getTermBySynonym(syn)) + .map(term -> getTermAsQualifier(term.getId())) + .orElse(null); + } + + public Qualifier lookup(String id) { + return Optional + .ofNullable(getSynonymAsQualifier(id)) + .orElse(getTermAsQualifier(id)); + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java new file mode 100644 index 000000000..fc7175270 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java @@ -0,0 +1,189 @@ + +package eu.dnetlib.dhp.common.vocabulary; + +import java.io.Serializable; +import java.util.*; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + +public class VocabularyGroup implements Serializable { + + public static final String VOCABULARIES_XQUERY = "for $x in 
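// Minimal sketch of the DNetRestClient facade; the service URL is a placeholder. The
// typed doGET/doPOST variants additionally deserialize the JSON body via Jackson.
import eu.dnetlib.dhp.common.rest.DNetRestClient;

public class RestClientExample {

    public static void main(String[] args) throws Exception {
        String json = DNetRestClient.doGET("http://localhost:8280/api/info"); // placeholder URL
        System.out.println(json);
    }
}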
collection('/db/DRIVER/VocabularyDSResources/VocabularyDSResourceType') \n" + + + "let $vocid := $x//VOCABULARY_NAME/@code\n" + + "let $vocname := $x//VOCABULARY_NAME/text()\n" + + "for $term in ($x//TERM)\n" + + "return concat($vocid,' @=@ ',$vocname,' @=@ ',$term/@code,' @=@ ',$term/@english_name)"; + + public static final String VOCABULARY_SYNONYMS_XQUERY = "for $x in collection('/db/DRIVER/VocabularyDSResources/VocabularyDSResourceType')\n" + + + "let $vocid := $x//VOCABULARY_NAME/@code\n" + + "let $vocname := $x//VOCABULARY_NAME/text()\n" + + "for $term in ($x//TERM)\n" + + "for $syn in ($term//SYNONYM/@term)\n" + + "return concat($vocid,' @=@ ',$term/@code,' @=@ ', $syn)\n"; + + public static VocabularyGroup loadVocsFromIS(ISLookUpService isLookUpService) throws ISLookUpException { + + final VocabularyGroup vocs = new VocabularyGroup(); + + for (final String s : isLookUpService.quickSearchProfile(VOCABULARIES_XQUERY)) { + final String[] arr = s.split("@=@"); + if (arr.length == 4) { + final String vocId = arr[0].trim(); + final String vocName = arr[1].trim(); + final String termId = arr[2].trim(); + final String termName = arr[3].trim(); + + if (!vocs.vocabularyExists(vocId)) { + vocs.addVocabulary(vocId, vocName); + } + + vocs.addTerm(vocId, termId, termName); + } + } + + for (final String s : isLookUpService.quickSearchProfile(VOCABULARY_SYNONYMS_XQUERY)) { + final String[] arr = s.split("@=@"); + if (arr.length == 3) { + final String vocId = arr[0].trim(); + final String termId = arr[1].trim(); + final String syn = arr[2].trim(); + + vocs.addSynonyms(vocId, termId, syn); + + } + } + + // add the term names as synonyms + vocs.vocs.values().forEach(voc -> { + voc.getTerms().values().forEach(term -> { + voc.addSynonym(term.getName().toLowerCase(), term.getId()); + }); + }); + + return vocs; + } + + private final Map vocs = new HashMap<>(); + + public Set vocabularyNames() { + return vocs.keySet(); + } + + public void addVocabulary(final String id, final String name) { + vocs.put(id.toLowerCase(), new Vocabulary(id, name)); + } + + public Optional find(final String vocId) { + return Optional + .ofNullable(vocId) + .map(String::toLowerCase) + .map(vocs::get); + } + + public void addTerm(final String vocId, final String id, final String name) { + if (vocabularyExists(vocId)) { + vocs.get(vocId.toLowerCase()).addTerm(id, name); + } + } + + public VocabularyTerm getTerm(final String vocId, final String id) { + if (termExists(vocId, id)) { + return vocs.get(vocId.toLowerCase()).getTerm(id); + } else { + return new VocabularyTerm(id, id); + } + } + + public Set getTerms(String vocId) { + if (!vocabularyExists(vocId)) { + return new HashSet<>(); + } + return vocs + .get(vocId.toLowerCase()) + .getTerms() + .values() + .stream() + .map(VocabularyTerm::getId) + .collect(Collectors.toCollection(HashSet::new)); + } + + public Qualifier lookup(String vocId, String id) { + return Optional + .ofNullable(getSynonymAsQualifier(vocId, id)) + .orElse(getTermAsQualifier(vocId, id)); + } + + public Qualifier getTermAsQualifier(final String vocId, final String id) { + if (vocabularyExists(vocId)) { + return vocs.get(vocId.toLowerCase()).getTermAsQualifier(id); + } + return OafMapperUtils.qualifier(id, id, "", ""); + } + + public Qualifier getSynonymAsQualifier(final String vocId, final String syn) { + if (StringUtils.isBlank(vocId)) { + return OafMapperUtils.unknown("", ""); + } + return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn); + } + + /** + * getSynonymAsQualifierCaseSensitive + * + * 
reflects the need to check terms against a case-sensitive vocabulary + */ + public Qualifier getSynonymAsQualifierCaseSensitive(final String vocId, final String syn) { + if (StringUtils.isBlank(vocId)) { + return OafMapperUtils.unknown("", ""); + } + return vocs.get(vocId).getSynonymAsQualifier(syn); + } + + /** + * termExists + * + * two variants: with and without a case-sensitive check + */ + public boolean termExists(final String vocId, final String id) { + return termExists(vocId, id, Boolean.FALSE); + } + + public boolean termExists(final String vocId, final String id, final Boolean caseSensitive) { + if (Boolean.TRUE.equals(caseSensitive)) { + return vocabularyExists(vocId) && vocs.get(vocId).termExists(id); + } + return vocabularyExists(vocId) && vocs.get(vocId.toLowerCase()).termExists(id); + } + + public boolean vocabularyExists(final String vocId) { + return Optional + .ofNullable(vocId) + .map(String::toLowerCase) + .map(vocs::containsKey) + .orElse(false); + } + + private void addSynonyms(final String vocId, final String termId, final String syn) { + String id = Optional + .ofNullable(vocId) + .map(String::toLowerCase) + .orElseThrow( + () -> new IllegalArgumentException( + String + .format( + "empty vocabulary id for [term:%s, synonym:%s]", termId, syn))); + Optional + .ofNullable(vocs.get(id)) + .orElseThrow(() -> new IllegalArgumentException("missing vocabulary id: " + vocId)) + .addSynonym(syn.toLowerCase(), termId); + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyTerm.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyTerm.java new file mode 100644 index 000000000..52eb7ca23 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyTerm.java @@ -0,0 +1,24 @@ + +package eu.dnetlib.dhp.common.vocabulary; + +import java.io.Serializable; + +public class VocabularyTerm implements Serializable { + + private final String id; + private final String name; + + public VocabularyTerm(final String id, final String name) { + this.id = id; + this.name = name; + } + + public String getId() { + return id; + } + + public String getName() { + return name; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java new file mode 100644 index 000000000..c7a0b5f50 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/Message.java @@ -0,0 +1,63 @@ + +package eu.dnetlib.dhp.message; + +import java.io.Serializable; +import java.util.LinkedHashMap; +import java.util.Map; + +public class Message implements Serializable { + + private static final long serialVersionUID = 401753881204524893L; + + public static final String CURRENT_PARAM = "current"; + public static final String TOTAL_PARAM = "total"; + + private MessageType messageType; + + private String workflowId; + + private Map body; + + public Message() { + } + + public Message(final MessageType messageType, final String workflowId) { + this(messageType, workflowId, new LinkedHashMap<>()); + } + + public Message(final MessageType messageType, final String workflowId, final Map body) { + this.messageType = messageType; + this.workflowId = workflowId; + this.body = body; + } + + public MessageType getMessageType() { + return messageType; + } + + public void setMessageType(MessageType messageType) { + this.messageType = messageType; + } + + public String getWorkflowId() { + return workflowId; + } + + public void setWorkflowId(final String workflowId) { + this.workflowId =
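// Sketch of loading and querying the vocabularies defined above; it assumes the
// ISLookupClientFactory helper available elsewhere in dhp-common, and the ISLookUp
// endpoint and vocabulary id are placeholders.
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

public class VocabularyExample {

    public static void main(String[] args) throws Exception {
        ISLookUpService isLookUp = ISLookupClientFactory
            .getLookUpService("http://localhost:8280/is/services/isLookUp"); // placeholder endpoint
        VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookUp);
        // lookup() tries the synonym mapping first, then falls back to a term lookup
        Qualifier q = vocs.lookup("dnet:publication_resource", "article");
        System.out.println(q.getClassid() + " / " + q.getClassname());
    }
}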
workflowId; + } + + public Map getBody() { + return body; + } + + public void setBody(final Map body) { + this.body = body; + } + + @Override + public String toString() { + return String.format("Message [type=%s, workflowId=%s, body=%s]", messageType, workflowId, body); + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java new file mode 100644 index 000000000..deeda9beb --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageSender.java @@ -0,0 +1,94 @@ + +package eu.dnetlib.dhp.message; + +import java.util.Map; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpPut; +import org.apache.http.entity.ContentType; +import org.apache.http.entity.StringEntity; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; + +public class MessageSender { + + private static final Logger log = LoggerFactory.getLogger(MessageSender.class); + + private static final int SOCKET_TIMEOUT_MS = 2000; + + private static final int CONNECTION_REQUEST_TIMEOUT_MS = 2000; + + private static final int CONNTECTION_TIMEOUT_MS = 2000; + + private final ObjectMapper objectMapper = new ObjectMapper(); + + private final String dnetMessageEndpoint; + + private final String workflowId; + + private final ExecutorService executorService = Executors.newCachedThreadPool(); + + public MessageSender(final String dnetMessageEndpoint, final String workflowId) { + this.workflowId = workflowId; + this.dnetMessageEndpoint = dnetMessageEndpoint; + } + + public void sendMessage(final Message message) { + executorService.submit(() -> _sendMessage(message)); + } + + public void sendMessage(final Long current, final Long total) { + sendMessage(createOngoingMessage(current, total)); + } + + public void sendReport(final Map report) { + sendMessage(new Message(MessageType.REPORT, workflowId, report)); + } + + private Message createOngoingMessage(final Long current, final Long total) { + final Message m = new Message(MessageType.ONGOING, workflowId); + m.getBody().put(Message.CURRENT_PARAM, current.toString()); + if (total != null) { + m.getBody().put(Message.TOTAL_PARAM, total.toString()); + } + return m; + } + + private void _sendMessage(final Message message) { + try { + final String json = objectMapper.writeValueAsString(message); + + final HttpPut req = new HttpPut(dnetMessageEndpoint); + req.setEntity(new StringEntity(json, ContentType.APPLICATION_JSON)); + + final RequestConfig requestConfig = RequestConfig + .custom() + .setConnectTimeout(CONNTECTION_TIMEOUT_MS) + .setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MS) + .setSocketTimeout(SOCKET_TIMEOUT_MS) + .build(); + + try (final CloseableHttpClient client = HttpClients + .custom() + .setDefaultRequestConfig(requestConfig) + .build(); + final CloseableHttpResponse response = client.execute(req)) { + log.debug("Sent Message to " + dnetMessageEndpoint); + log.debug("MESSAGE:" + message); + } catch (final Throwable e) { + log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e); + } + } catch (final 
JsonProcessingException e) { + log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e); + } + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageType.java b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageType.java new file mode 100644 index 000000000..75ffb8ef5 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/message/MessageType.java @@ -0,0 +1,21 @@ + +package eu.dnetlib.dhp.message; + +import java.io.Serializable; +import java.util.Optional; + +import org.apache.commons.lang3.StringUtils; + +public enum MessageType implements Serializable { + + ONGOING, REPORT; + + public MessageType from(String value) { + return Optional + .ofNullable(value) + .map(StringUtils::upperCase) + .map(MessageType::valueOf) + .orElseThrow(() -> new IllegalArgumentException("unknown message type: " + value)); + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java deleted file mode 100644 index ce65e710f..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/MetadataRecord.java +++ /dev/null @@ -1,121 +0,0 @@ - -package eu.dnetlib.dhp.model.mdstore; - -import java.io.Serializable; - -import eu.dnetlib.dhp.utils.DHPUtils; - -/** This class models a record inside the new Metadata store collection on HDFS * */ -public class MetadataRecord implements Serializable { - - /** The D-Net Identifier associated to the record */ - private String id; - - /** The original Identifier of the record */ - private String originalId; - - /** The encoding of the record, should be JSON or XML */ - private String encoding; - - /** - * The information about the provenance of the record see @{@link Provenance} for the model of this information - */ - private Provenance provenance; - - /** The content of the metadata */ - private String body; - - /** the date when the record has been stored */ - private long dateOfCollection; - - /** the date when the record has been stored */ - private long dateOfTransformation; - - public MetadataRecord() { - this.dateOfCollection = System.currentTimeMillis(); - } - - public MetadataRecord( - String originalId, - String encoding, - Provenance provenance, - String body, - long dateOfCollection) { - - this.originalId = originalId; - this.encoding = encoding; - this.provenance = provenance; - this.body = body; - this.dateOfCollection = dateOfCollection; - this.id = DHPUtils.generateIdentifier(originalId, this.provenance.getNsPrefix()); - } - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public String getOriginalId() { - return originalId; - } - - public void setOriginalId(String originalId) { - this.originalId = originalId; - } - - public String getEncoding() { - return encoding; - } - - public void setEncoding(String encoding) { - this.encoding = encoding; - } - - public Provenance getProvenance() { - return provenance; - } - - public void setProvenance(Provenance provenance) { - this.provenance = provenance; - } - - public String getBody() { - return body; - } - - public void setBody(String body) { - this.body = body; - } - - public long getDateOfCollection() { - return dateOfCollection; - } - - public void setDateOfCollection(long dateOfCollection) { - this.dateOfCollection = dateOfCollection; - } - - public long getDateOfTransformation() { - return dateOfTransformation; - } - - public void setDateOfTransformation(long 
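// Sketch of the messaging primitives defined above: ONGOING progress updates and a
// final REPORT map, both delivered asynchronously (sendMessage submits to an executor).
// Endpoint and workflow id are placeholders.
import java.util.LinkedHashMap;
import java.util.Map;

import eu.dnetlib.dhp.message.MessageSender;

public class MessagingExample {

    public static void main(String[] args) {
        MessageSender sender = new MessageSender("http://localhost:8888/messages", "wf-0001"); // placeholders
        sender.sendMessage(10L, 100L); // ONGOING message, current/total
        Map<String, String> report = new LinkedHashMap<>();
        report.put("errors", "0");
        sender.sendReport(report); // REPORT message
    }
}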
dateOfTransformation) { - this.dateOfTransformation = dateOfTransformation; - } - - @Override - public boolean equals(Object o) { - if (!(o instanceof MetadataRecord)) { - return false; - } - return ((MetadataRecord) o).getId().equalsIgnoreCase(id); - } - - @Override - public int hashCode() { - return id.hashCode(); - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java b/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java deleted file mode 100644 index 556535022..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/model/mdstore/Provenance.java +++ /dev/null @@ -1,52 +0,0 @@ - -package eu.dnetlib.dhp.model.mdstore; - -import java.io.Serializable; - -/** - * @author Sandro La Bruzzo - *

- * Provenace class models the provenance of the record in the metadataStore It contains the identifier and the - * name of the datasource that gives the record - */ -public class Provenance implements Serializable { - - private String datasourceId; - - private String datasourceName; - - private String nsPrefix; - - public Provenance() { - } - - public Provenance(String datasourceId, String datasourceName, String nsPrefix) { - this.datasourceId = datasourceId; - this.datasourceName = datasourceName; - this.nsPrefix = nsPrefix; - } - - public String getDatasourceId() { - return datasourceId; - } - - public void setDatasourceId(String datasourceId) { - this.datasourceId = datasourceId; - } - - public String getDatasourceName() { - return datasourceName; - } - - public void setDatasourceName(String datasourceName) { - this.datasourceName = datasourceName; - } - - public String getNsPrefix() { - return nsPrefix; - } - - public void setNsPrefix(String nsPrefix) { - this.nsPrefix = nsPrefix; - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java index 17482c019..aea046203 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java @@ -18,6 +18,9 @@ public class AuthorMerger { private static final Double THRESHOLD = 0.95; + private AuthorMerger() { + } + public static List merge(List> authors) { authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2))); @@ -32,44 +35,54 @@ public class AuthorMerger { } - public static List mergeAuthor(final List a, final List b) { + public static List mergeAuthor(final List a, final List b, Double threshold) { int pa = countAuthorsPids(a); int pb = countAuthorsPids(b); - List base, enrich; + List base; + List enrich; int sa = authorsSize(a); int sb = authorsSize(b); - if (pa == pb) { - base = sa > sb ? a : b; - enrich = sa > sb ? b : a; - } else { + if (sa == sb) { base = pa > pb ? a : b; enrich = pa > pb ? b : a; + } else { + base = sa > sb ? a : b; + enrich = sa > sb ? 
b : a; } - enrichPidFromList(base, enrich); + enrichPidFromList(base, enrich, threshold); return base; } - private static void enrichPidFromList(List base, List enrich) { + public static List mergeAuthor(final List a, final List b) { + return mergeAuthor(a, b, THRESHOLD); + } + + private static void enrichPidFromList(List base, List enrich, Double threshold) { if (base == null || enrich == null) return; + + // (if an Author has more than 1 pid, it appears 2 times in the list) final Map basePidAuthorMap = base .stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) .flatMap( a -> a .getPid() .stream() + .filter(Objects::nonNull) .map(p -> new Tuple2<>(pidToComparableString(p), a))) .collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); + // (list of pid that are missing in the other list) final List> pidToEnrich = enrich .stream() - .filter(a -> a.getPid() != null && a.getPid().size() > 0) + .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) .flatMap( a -> a .getPid() .stream() + .filter(Objects::nonNull) .filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p))) .map(p -> new Tuple2<>(p, a))) .collect(Collectors.toList()); @@ -83,10 +96,10 @@ public class AuthorMerger { .max(Comparator.comparing(Tuple2::_1)); if (simAuthor.isPresent()) { - double th = THRESHOLD; + double th = threshold; // increase the threshold if the surname is too short if (simAuthor.get()._2().getSurname() != null - && simAuthor.get()._2().getSurname().length() <= 3) + && simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0) th = 0.99; if (simAuthor.get()._1() > th) { @@ -107,9 +120,9 @@ public class AuthorMerger { } public static String pidToComparableString(StructuredProperty pid) { - return (pid.getQualifier() != null - ? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : "" - : "") + final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() + : ""; + return (pid.getQualifier() != null ? classid : "") + (pid.getValue() != null ? 
pid.getValue().toLowerCase() : ""); } @@ -142,7 +155,7 @@ public class AuthorMerger { } private static boolean hasPid(Author a) { - if (a == null || a.getPid() == null || a.getPid().size() == 0) + if (a == null || a.getPid() == null || a.getPid().isEmpty()) return false; return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); } @@ -151,12 +164,15 @@ public class AuthorMerger { if (StringUtils.isNotBlank(author.getSurname())) { return new Person(author.getSurname() + ", " + author.getName(), false); } else { - return new Person(author.getFullname(), false); + if (StringUtils.isNotBlank(author.getFullname())) + return new Person(author.getFullname(), false); + else + return new Person("", false); } } private static String normalize(final String s) { - return nfd(s) + String[] normalized = nfd(s) .toLowerCase() // do not compact the regexes in a single expression, would cause StackOverflowError // in case @@ -166,7 +182,12 @@ public class AuthorMerger { .replaceAll("(\\p{Punct})+", " ") .replaceAll("(\\d)+", " ") .replaceAll("(\\n)+", " ") - .trim(); + .trim() + .split(" "); + + Arrays.sort(normalized); + + return String.join(" ", normalized); } private static String nfd(final String s) { diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/DispatchEntitiesSparkJob.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/DispatchEntitiesSparkJob.java new file mode 100644 index 000000000..3f65d754f --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/DispatchEntitiesSparkJob.java @@ -0,0 +1,97 @@ + +package eu.dnetlib.dhp.oa.merge; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Objects; +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OafEntity; + +public class DispatchEntitiesSparkJob { + + private static final Logger log = LoggerFactory.getLogger(DispatchEntitiesSparkJob.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + Objects + .requireNonNull( + DispatchEntitiesSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/merge/dispatch_entities_parameters.json"))); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); + + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + String graphTableClassName = parser.get("graphTableClassName"); + log.info("graphTableClassName: {}", graphTableClassName); + + 
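// Sketch of the reworked AuthorMerger entry points: the base list is enriched with pids
// from the other list, and an explicit similarity threshold can now be supplied. The
// authors below are built inline for illustration.
import java.util.ArrayList;
import java.util.List;

import eu.dnetlib.dhp.oa.merge.AuthorMerger;
import eu.dnetlib.dhp.schema.oaf.Author;

public class AuthorMergeExample {

    public static void main(String[] args) {
        List<Author> a = new ArrayList<>();
        Author a1 = new Author();
        a1.setFullname("Doe, John");
        a1.setSurname("Doe");
        a1.setName("John");
        a.add(a1);

        List<Author> b = new ArrayList<>();
        Author b1 = new Author();
        b1.setFullname("John Doe");
        b.add(b1);

        List<Author> merged = AuthorMerger.mergeAuthor(a, b); // default 0.95 threshold
        List<Author> strict = AuthorMerger.mergeAuthor(a, b, 0.99); // explicit threshold overload
        System.out.println(merged.size() + " / " + strict.size());
    }
}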
@SuppressWarnings("unchecked") + Class entityClazz = (Class) Class.forName(graphTableClassName); + + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration()); + dispatchEntities(spark, inputPath, entityClazz, outputPath); + }); + } + + private static void dispatchEntities( + SparkSession spark, + String inputPath, + Class clazz, + String outputPath) { + + spark + .read() + .textFile(inputPath) + .filter((FilterFunction) s -> isEntityType(s, clazz)) + .map((MapFunction) s -> StringUtils.substringAfter(s, "|"), Encoders.STRING()) + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), + Encoders.bean(clazz)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath); + } + + private static boolean isEntityType(final String s, final Class clazz) { + return StringUtils.substringBefore(s, "|").equals(clazz.getName()); + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/GroupEntitiesSparkJob.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/GroupEntitiesSparkJob.java new file mode 100644 index 000000000..e652bd5b6 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/GroupEntitiesSparkJob.java @@ -0,0 +1,203 @@ + +package eu.dnetlib.dhp.oa.merge; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.utils.DHPUtils.toSeq; + +import java.io.IOException; +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; +import org.apache.spark.sql.expressions.Aggregator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.jayway.jsonpath.Configuration; +import com.jayway.jsonpath.DocumentContext; +import com.jayway.jsonpath.JsonPath; +import com.jayway.jsonpath.Option; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import scala.Tuple2; + +/** + * Groups the graph content by entity identifier to ensure ID uniqueness + */ +public class GroupEntitiesSparkJob { + + private static final Logger log = LoggerFactory.getLogger(GroupEntitiesSparkJob.class); + + private static final String ID_JPATH = "$.id"; + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + GroupEntitiesSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/merge/group_graph_entities_parameters.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); 
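// The dispatcher consumes the "<class name>|<json>" text envelope produced by the
// grouping job that follows; a toy illustration of that contract (the id is made up).
// substringBefore/substringAfter split on the first '|' only, so pipes inside the
// OpenAIRE identifier are preserved in the payload.
import org.apache.commons.lang3.StringUtils;

public class EnvelopeExample {

    public static void main(String[] args) {
        String line = "eu.dnetlib.dhp.schema.oaf.Publication|{\"id\":\"50|doi_________::ab12\"}";
        String className = StringUtils.substringBefore(line, "|");
        String payload = StringUtils.substringAfter(line, "|");
        System.out.println(className + " -> " + payload);
    }
}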
+ log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String graphInputPath = parser.get("graphInputPath"); + log.info("graphInputPath: {}", graphInputPath); + + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + SparkConf conf = new SparkConf(); + conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration()); + groupEntities(spark, graphInputPath, outputPath); + }); + } + + private static void groupEntities( + SparkSession spark, + String inputPath, + String outputPath) { + + final TypedColumn aggregator = new GroupingAggregator().toColumn(); + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + spark + .read() + .textFile(toSeq(listEntityPaths(inputPath, sc))) + .map((MapFunction) GroupEntitiesSparkJob::parseOaf, Encoders.kryo(OafEntity.class)) + .filter((FilterFunction) e -> StringUtils.isNotBlank(ModelSupport.idFn().apply(e))) + .groupByKey((MapFunction) oaf -> ModelSupport.idFn().apply(oaf), Encoders.STRING()) + .agg(aggregator) + .map( + (MapFunction, String>) t -> t._2().getClass().getName() + + "|" + OBJECT_MAPPER.writeValueAsString(t._2()), + Encoders.STRING()) + .write() + .option("compression", "gzip") + .mode(SaveMode.Overwrite) + .text(outputPath); + } + + public static class GroupingAggregator extends Aggregator { + + @Override + public OafEntity zero() { + return null; + } + + @Override + public OafEntity reduce(OafEntity b, OafEntity a) { + return mergeAndGet(b, a); + } + + private OafEntity mergeAndGet(OafEntity b, OafEntity a) { + if (Objects.nonNull(a) && Objects.nonNull(b)) { + return OafMapperUtils.mergeEntities(b, a); + } + return Objects.isNull(a) ? 
b : a; + } + + @Override + public OafEntity merge(OafEntity b, OafEntity a) { + return mergeAndGet(b, a); + } + + @Override + public OafEntity finish(OafEntity j) { + return j; + } + + @Override + public Encoder bufferEncoder() { + return Encoders.kryo(OafEntity.class); + } + + @Override + public Encoder outputEncoder() { + return Encoders.kryo(OafEntity.class); + } + + } + + private static OafEntity parseOaf(String s) { + + DocumentContext dc = JsonPath + .parse(s, Configuration.defaultConfiguration().addOptions(Option.SUPPRESS_EXCEPTIONS)); + final String id = dc.read(ID_JPATH); + if (StringUtils.isNotBlank(id)) { + + String prefix = StringUtils.substringBefore(id, "|"); + switch (prefix) { + case "10": + return parse(s, Datasource.class); + case "20": + return parse(s, Organization.class); + case "40": + return parse(s, Project.class); + case "50": + String resultType = dc.read("$.resulttype.classid"); + switch (resultType) { + case "publication": + return parse(s, Publication.class); + case "dataset": + return parse(s, eu.dnetlib.dhp.schema.oaf.Dataset.class); + case "software": + return parse(s, Software.class); + case "other": + return parse(s, OtherResearchProduct.class); + default: + throw new IllegalArgumentException(String.format("invalid resultType: '%s'", resultType)); + } + default: + throw new IllegalArgumentException(String.format("invalid id prefix: '%s'", prefix)); + } + } else { + throw new IllegalArgumentException(String.format("invalid oaf: '%s'", s)); + } + } + + private static OafEntity parse(String s, Class clazz) { + try { + return OBJECT_MAPPER.readValue(s, clazz); + } catch (IOException e) { + throw new IllegalArgumentException(e); + } + } + + private static List listEntityPaths(String inputPath, JavaSparkContext sc) { + return HdfsSupport + .listFiles(inputPath, sc.hadoopConfiguration()) + .stream() + .filter(f -> !f.toLowerCase().contains("relation")) + .collect(Collectors.toList()); + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java b/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java index 9ac0a0bf7..fd4c0191a 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/parser/utility/VtdUtilityParser.java @@ -12,6 +12,9 @@ import com.ximpleware.VTDNav; /** Created by sandro on 9/29/16. 
*/ public class VtdUtilityParser { + private VtdUtilityParser() { + } + public static List getTextValuesWithAttributes( final AutoPilot ap, final VTDNav vn, final String xpath, final List attributes) throws VtdException { diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java deleted file mode 100644 index 12fbcc490..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/CleaningFunctions.java +++ /dev/null @@ -1,238 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.util.LinkedHashMap; -import java.util.Objects; -import java.util.Optional; -import java.util.function.Function; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.StringUtils; - -import com.clearspring.analytics.util.Lists; - -import eu.dnetlib.dhp.schema.common.ModelConstants; - -public class CleaningFunctions { - - public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/"; - public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/"; - public static final String NONE = "none"; - - public static T fixVocabularyNames(T value) { - if (value instanceof Datasource) { - // nothing to clean here - } else if (value instanceof Project) { - // nothing to clean here - } else if (value instanceof Organization) { - Organization o = (Organization) value; - if (Objects.nonNull(o.getCountry())) { - fixVocabName(o.getCountry(), ModelConstants.DNET_COUNTRY_TYPE); - } - } else if (value instanceof Relation) { - // nothing to clean here - } else if (value instanceof Result) { - - Result r = (Result) value; - - fixVocabName(r.getLanguage(), ModelConstants.DNET_LANGUAGES); - fixVocabName(r.getResourcetype(), ModelConstants.DNET_DATA_CITE_RESOURCE); - fixVocabName(r.getBestaccessright(), ModelConstants.DNET_ACCESS_MODES); - - if (Objects.nonNull(r.getSubject())) { - r.getSubject().forEach(s -> fixVocabName(s.getQualifier(), ModelConstants.DNET_SUBJECT_TYPOLOGIES)); - } - if (Objects.nonNull(r.getInstance())) { - for (Instance i : r.getInstance()) { - fixVocabName(i.getAccessright(), ModelConstants.DNET_ACCESS_MODES); - fixVocabName(i.getRefereed(), ModelConstants.DNET_REVIEW_LEVELS); - } - } - if (Objects.nonNull(r.getAuthor())) { - r.getAuthor().forEach(a -> { - if (Objects.nonNull(a.getPid())) { - a.getPid().forEach(p -> { - fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES); - }); - } - }); - } - if (value instanceof Publication) { - - } else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) { - - } else if (value instanceof OtherResearchProduct) { - - } else if (value instanceof Software) { - - } - } - - return value; - } - - public static T fixDefaults(T value) { - if (value instanceof Datasource) { - // nothing to clean here - } else if (value instanceof Project) { - // nothing to clean here - } else if (value instanceof Organization) { - Organization o = (Organization) value; - if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) { - o.setCountry(qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_COUNTRY_TYPE)); - } - } else if (value instanceof Relation) { - // nothing to clean here - } else if (value instanceof Result) { - - Result r = (Result) value; - if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) { - r.setPublisher(null); - } - if (Objects.isNull(r.getLanguage()) || 
StringUtils.isBlank(r.getLanguage().getClassid())) { - r - .setLanguage( - qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES)); - } - if (Objects.nonNull(r.getSubject())) { - r - .setSubject( - r - .getSubject() - .stream() - .filter(Objects::nonNull) - .filter(sp -> StringUtils.isNotBlank(sp.getValue())) - .filter(sp -> Objects.nonNull(sp.getQualifier())) - .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) - .collect(Collectors.toList())); - } - if (Objects.nonNull(r.getPid())) { - r - .setPid( - r - .getPid() - .stream() - .filter(Objects::nonNull) - .filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue()))) - .filter(sp -> NONE.equalsIgnoreCase(sp.getValue())) - .filter(sp -> Objects.nonNull(sp.getQualifier())) - .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) - .map(CleaningFunctions::normalizePidValue) - .collect(Collectors.toList())); - } - if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { - r - .setResourcetype( - qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE)); - } - if (Objects.nonNull(r.getInstance())) { - for (Instance i : r.getInstance()) { - if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) { - i.setAccessright(qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES)); - } - if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) { - i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY); - } - if (Objects.isNull(i.getRefereed())) { - i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS)); - } - } - } - if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) { - Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance()); - if (Objects.isNull(bestaccessrights)) { - r - .setBestaccessright( - qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES)); - } else { - r.setBestaccessright(bestaccessrights); - } - } - if (Objects.nonNull(r.getAuthor())) { - boolean nullRank = r - .getAuthor() - .stream() - .anyMatch(a -> Objects.isNull(a.getRank())); - if (nullRank) { - int i = 1; - for (Author author : r.getAuthor()) { - author.setRank(i++); - } - } - for (Author a : r.getAuthor()) { - if (Objects.isNull(a.getPid())) { - a.setPid(Lists.newArrayList()); - } else { - a - .setPid( - a - .getPid() - .stream() - .filter(p -> Objects.nonNull(p.getQualifier())) - .filter(p -> StringUtils.isNotBlank(p.getValue())) - .map(p -> { - p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, "")); - return p; - }) - .collect( - Collectors - .toMap( - StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1, - LinkedHashMap::new)) - .values() - .stream() - .collect(Collectors.toList())); - } - } - - } - if (value instanceof Publication) { - - } else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) { - - } else if (value instanceof OtherResearchProduct) { - - } else if (value instanceof Software) { - - } - } - - return value; - } - - // HELPERS - - private static void fixVocabName(Qualifier q, String vocabularyName) { - if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) { - q.setSchemeid(vocabularyName); - q.setSchemename(vocabularyName); - } - } - - private static Qualifier qualifier(String classid, String classname, String scheme) { - return OafMapperUtils - .qualifier( - classid, classname, scheme, scheme); - } - - /** - * Utility 
method that normalises PID values on a per-type basis. - * @param pid the PID whose value will be normalised. - * @return the PID containing the normalised value. - */ - public static StructuredProperty normalizePidValue(StructuredProperty pid) { - String value = Optional - .ofNullable(pid.getValue()) - .map(String::trim) - .orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty")); - switch (pid.getQualifier().getClassid()) { - - // TODO add cleaning for more PID types as needed - case "doi": - pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, "")); - break; - } - return pid; - } - -} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ModelHardLimits.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ModelHardLimits.java deleted file mode 100644 index 16fdc3760..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ModelHardLimits.java +++ /dev/null @@ -1,14 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -public class ModelHardLimits { - - public static final int MAX_EXTERNAL_ENTITIES = 50; - public static final int MAX_AUTHORS = 200; - public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000; - public static final int MAX_TITLE_LENGTH = 5000; - public static final int MAX_TITLES = 10; - public static final int MAX_ABSTRACT_LENGTH = 150000; - public static final int MAX_INSTANCES = 10; - -} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtils.java deleted file mode 100644 index f079c55af..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/OafMapperUtils.java +++ /dev/null @@ -1,296 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import static eu.dnetlib.dhp.schema.common.ModelConstants.*; -import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES; - -import java.util.*; -import java.util.concurrent.ConcurrentHashMap; -import java.util.function.Function; -import java.util.function.Predicate; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.StringUtils; - -import eu.dnetlib.dhp.schema.common.LicenseComparator; -import eu.dnetlib.dhp.utils.DHPUtils; - -public class OafMapperUtils { - - public static KeyValue keyValue(final String k, final String v) { - final KeyValue kv = new KeyValue(); - kv.setKey(k); - kv.setValue(v); - return kv; - } - - public static List listKeyValues(final String... s) { - if (s.length % 2 > 0) { - throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)"); - } - - final List list = new ArrayList<>(); - for (int i = 0; i < s.length; i += 2) { - list.add(keyValue(s[i], s[i + 1])); - } - return list; - } - - public static Field field(final T value, final DataInfo info) { - if (value == null || StringUtils.isBlank(value.toString())) { - return null; - } - - final Field field = new Field<>(); - field.setValue(value); - field.setDataInfo(info); - return field; - } - - public static List> listFields(final DataInfo info, final String... 
values) { - return Arrays - .stream(values) - .map(v -> field(v, info)) - .filter(Objects::nonNull) - .filter(distinctByKey(f -> f.getValue())) - .collect(Collectors.toList()); - } - - public static List> listFields(final DataInfo info, final List values) { - return values - .stream() - .map(v -> field(v, info)) - .filter(Objects::nonNull) - .filter(distinctByKey(f -> f.getValue())) - .collect(Collectors.toList()); - } - - public static Qualifier unknown(final String schemeid, final String schemename) { - return qualifier("UNKNOWN", "Unknown", schemeid, schemename); - } - - public static Qualifier qualifier( - final String classid, - final String classname, - final String schemeid, - final String schemename) { - final Qualifier q = new Qualifier(); - q.setClassid(classid); - q.setClassname(classname); - q.setSchemeid(schemeid); - q.setSchemename(schemename); - return q; - } - - public static StructuredProperty structuredProperty( - final String value, - final String classid, - final String classname, - final String schemeid, - final String schemename, - final DataInfo dataInfo) { - - return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo); - } - - public static StructuredProperty structuredProperty( - final String value, - final Qualifier qualifier, - final DataInfo dataInfo) { - if (value == null) { - return null; - } - final StructuredProperty sp = new StructuredProperty(); - sp.setValue(value); - sp.setQualifier(qualifier); - sp.setDataInfo(dataInfo); - return sp; - } - - public static ExtraInfo extraInfo( - final String name, - final String value, - final String typology, - final String provenance, - final String trust) { - final ExtraInfo info = new ExtraInfo(); - info.setName(name); - info.setValue(value); - info.setTypology(typology); - info.setProvenance(provenance); - info.setTrust(trust); - return info; - } - - public static OAIProvenance oaiIProvenance( - final String identifier, - final String baseURL, - final String metadataNamespace, - final Boolean altered, - final String datestamp, - final String harvestDate) { - - final OriginDescription desc = new OriginDescription(); - desc.setIdentifier(identifier); - desc.setBaseURL(baseURL); - desc.setMetadataNamespace(metadataNamespace); - desc.setAltered(altered); - desc.setDatestamp(datestamp); - desc.setHarvestDate(harvestDate); - - final OAIProvenance p = new OAIProvenance(); - p.setOriginDescription(desc); - - return p; - } - - public static Journal journal( - final String name, - final String issnPrinted, - final String issnOnline, - final String issnLinking, - final DataInfo dataInfo) { - return journal( - name, - issnPrinted, - issnOnline, - issnLinking, - null, - null, - null, - null, - null, - null, - null, - dataInfo); - } - - public static Journal journal( - final String name, - final String issnPrinted, - final String issnOnline, - final String issnLinking, - final String ep, - final String iss, - final String sp, - final String vol, - final String edition, - final String conferenceplace, - final String conferencedate, - final DataInfo dataInfo) { - - if (StringUtils.isNotBlank(name) - || StringUtils.isNotBlank(issnPrinted) - || StringUtils.isNotBlank(issnOnline) - || StringUtils.isNotBlank(issnLinking)) { - final Journal j = new Journal(); - j.setName(name); - j.setIssnPrinted(issnPrinted); - j.setIssnOnline(issnOnline); - j.setIssnLinking(issnLinking); - j.setEp(ep); - j.setIss(iss); - j.setSp(sp); - j.setVol(vol); - j.setEdition(edition); - j.setConferenceplace(conferenceplace); 
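// Illustrative sketch (not part of this patch): listFields above deduplicates
// through distinctByKey, a stateful Predicate backed by a ConcurrentHashMap
// that admits only the first element seen per extracted key. The same idiom,
// self-contained, on plain strings:

import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class DistinctByKeySketch {

	static <T> Predicate<T> distinctByKey(Function<? super T, ?> keyExtractor) {
		Map<Object, Boolean> seen = new ConcurrentHashMap<>();
		// putIfAbsent returns null only for the first mapping of a key,
		// so the predicate passes exactly one element per key
		return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
	}

	public static void main(String[] args) {
		List<String> values = Stream
			.of("DOI", "doi", "Handle")
			.filter(distinctByKey(String::toLowerCase))
			.collect(Collectors.toList());
		System.out.println(values); // [DOI, Handle]
	}
}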
- j.setConferencedate(conferencedate); - j.setDataInfo(dataInfo); - return j; - } else { - return null; - } - } - - public static DataInfo dataInfo( - final Boolean deletedbyinference, - final String inferenceprovenance, - final Boolean inferred, - final Boolean invisible, - final Qualifier provenanceaction, - final String trust) { - final DataInfo d = new DataInfo(); - d.setDeletedbyinference(deletedbyinference); - d.setInferenceprovenance(inferenceprovenance); - d.setInferred(inferred); - d.setInvisible(invisible); - d.setProvenanceaction(provenanceaction); - d.setTrust(trust); - return d; - } - - public static String createOpenaireId( - final int prefix, - final String originalId, - final boolean to_md5) { - if (StringUtils.isBlank(originalId)) { - return null; - } else if (to_md5) { - final String nsPrefix = StringUtils.substringBefore(originalId, "::"); - final String rest = StringUtils.substringAfter(originalId, "::"); - return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest)); - } else { - return String.format("%s|%s", prefix, originalId); - } - } - - public static String createOpenaireId( - final String type, - final String originalId, - final boolean to_md5) { - switch (type) { - case "datasource": - return createOpenaireId(10, originalId, to_md5); - case "organization": - return createOpenaireId(20, originalId, to_md5); - case "person": - return createOpenaireId(30, originalId, to_md5); - case "project": - return createOpenaireId(40, originalId, to_md5); - default: - return createOpenaireId(50, originalId, to_md5); - } - } - - public static String asString(final Object o) { - return o == null ? "" : o.toString(); - } - - public static Predicate distinctByKey( - final Function keyExtractor) { - final Map seen = new ConcurrentHashMap<>(); - return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null; - } - - public static Qualifier createBestAccessRights(final List instanceList) { - return getBestAccessRights(instanceList); - } - - protected static Qualifier getBestAccessRights(final List instanceList) { - if (instanceList != null) { - final Optional min = instanceList - .stream() - .map(i -> i.getAccessright()) - .min(new LicenseComparator()); - - final Qualifier rights = min.isPresent() ? 
min.get() : new Qualifier(); - - if (StringUtils.isBlank(rights.getClassid())) { - rights.setClassid(UNKNOWN); - } - if (StringUtils.isBlank(rights.getClassname()) - || UNKNOWN.equalsIgnoreCase(rights.getClassname())) { - rights.setClassname(NOT_AVAILABLE); - } - if (StringUtils.isBlank(rights.getSchemeid())) { - rights.setSchemeid(DNET_ACCESS_MODES); - } - if (StringUtils.isBlank(rights.getSchemename())) { - rights.setSchemename(DNET_ACCESS_MODES); - } - - return rights; - } - return null; - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ResultTypeComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ResultTypeComparator.java deleted file mode 100644 index 6c11d1a85..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/ResultTypeComparator.java +++ /dev/null @@ -1,49 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.util.Comparator; - -import eu.dnetlib.dhp.schema.common.ModelConstants; - -public class ResultTypeComparator implements Comparator<Result> { - - @Override - public int compare(Result left, Result right) { - - if (left == null && right == null) - return 0; - if (left == null) - return 1; - if (right == null) - return -1; - - String lClass = left.getResulttype().getClassid(); - String rClass = right.getResulttype().getClassid(); - - if (lClass.equals(rClass)) - return 0; - - if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID)) - return -1; - if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID)) - return 1; - - if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID)) - return -1; - if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID)) - return 1; - - if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID)) - return -1; - if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID)) - return 1; - - if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID)) - return -1; - if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID)) - return 1; - - // Else (but unlikely), lexicographical ordering will do.
- return lClass.compareTo(rClass); - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java new file mode 100644 index 000000000..363f95423 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -0,0 +1,538 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance; + +import java.time.LocalDate; +import java.time.ZoneId; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeParseException; +import java.util.*; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.commons.lang3.StringUtils; + +import com.github.sisyphsu.dateparser.DateParserUtils; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; + +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; +import me.xuender.unidecode.Unidecode; + +public class GraphCleaningFunctions extends CleaningFunctions { + + public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})"; + public static final int ORCID_LEN = 19; + public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; + public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*"; + + public static final String TITLE_TEST = "test"; + public static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST); + + public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5; + + public static T fixVocabularyNames(T value) { + if (value instanceof Datasource) { + // nothing to clean here + } else if (value instanceof Project) { + // nothing to clean here + } else if (value instanceof Organization) { + Organization o = (Organization) value; + if (Objects.nonNull(o.getCountry())) { + fixVocabName(o.getCountry(), ModelConstants.DNET_COUNTRY_TYPE); + } + } else if (value instanceof Relation) { + // nothing to clean here + } else if (value instanceof Result) { + + Result r = (Result) value; + + fixVocabName(r.getLanguage(), ModelConstants.DNET_LANGUAGES); + fixVocabName(r.getResourcetype(), ModelConstants.DNET_DATA_CITE_RESOURCE); + fixVocabName(r.getBestaccessright(), ModelConstants.DNET_ACCESS_MODES); + + if (Objects.nonNull(r.getSubject())) { + r.getSubject().forEach(s -> fixVocabName(s.getQualifier(), ModelConstants.DNET_SUBJECT_TYPOLOGIES)); + } + if (Objects.nonNull(r.getInstance())) { + for (Instance i : r.getInstance()) { + fixVocabName(i.getAccessright(), ModelConstants.DNET_ACCESS_MODES); + fixVocabName(i.getRefereed(), ModelConstants.DNET_REVIEW_LEVELS); + } + } + if (Objects.nonNull(r.getAuthor())) { + r.getAuthor().stream().filter(Objects::nonNull).forEach(a -> { + if (Objects.nonNull(a.getPid())) { + a.getPid().stream().filter(Objects::nonNull).forEach(p -> { + fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES); + }); + } + }); + } + if (value instanceof Publication) { + + } else if (value instanceof Dataset) { + + } else if (value instanceof OtherResearchProduct) { + + } else if (value instanceof Software) { + + } + } + + return value; + } + + public static boolean filter(T value) { + if (Boolean.TRUE + .equals( + Optional + .ofNullable(value) + .map( + o -> 
Optional + .ofNullable(o.getDataInfo()) + .map( + d -> Optional + .ofNullable(d.getInvisible()) + .orElse(true)) + .orElse(true)) + .orElse(true))) { + return true; + } + + if (value instanceof Datasource) { + // nothing to evaluate here + } else if (value instanceof Project) { + // nothing to evaluate here + } else if (value instanceof Organization) { + // nothing to evaluate here + } else if (value instanceof Relation) { + // nothing to clean here + } else if (value instanceof Result) { + + Result r = (Result) value; + + if (Objects.isNull(r.getTitle()) || r.getTitle().isEmpty()) { + return false; + } + + if (value instanceof Publication) { + + } else if (value instanceof Dataset) { + + } else if (value instanceof OtherResearchProduct) { + + } else if (value instanceof Software) { + + } + } + return true; + } + + public static T cleanup(T value, VocabularyGroup vocs) { + if (value instanceof Datasource) { + // nothing to clean here + } else if (value instanceof Project) { + // nothing to clean here + } else if (value instanceof Organization) { + Organization o = (Organization) value; + if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) { + o.setCountry(ModelConstants.UNKNOWN_COUNTRY); + } + } else if (value instanceof Relation) { + Relation r = (Relation) value; + + Optional validationDate = doCleanDate(r.getValidationDate()); + if (validationDate.isPresent()) { + r.setValidationDate(validationDate.get()); + r.setValidated(true); + } else { + r.setValidationDate(null); + r.setValidated(false); + } + } else if (value instanceof Result) { + + Result r = (Result) value; + + if (Objects.nonNull(r.getDateofacceptance())) { + Optional date = cleanDateField(r.getDateofacceptance()); + if (date.isPresent()) { + r.getDateofacceptance().setValue(date.get()); + } else { + r.setDateofacceptance(null); + } + } + if (Objects.nonNull(r.getRelevantdate())) { + r + .setRelevantdate( + r + .getRelevantdate() + .stream() + .filter(Objects::nonNull) + .filter(sp -> Objects.nonNull(sp.getQualifier())) + .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) + .map(sp -> { + sp.setValue(GraphCleaningFunctions.cleanDate(sp.getValue())); + return sp; + }) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .collect(Collectors.toList())); + } + if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) { + r.setPublisher(null); + } + if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) { + r + .setLanguage( + qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES)); + } + if (Objects.nonNull(r.getSubject())) { + List subjects = Lists + .newArrayList( + r + .getSubject() + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .filter(sp -> Objects.nonNull(sp.getQualifier())) + .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) + .map(GraphCleaningFunctions::cleanValue) + .collect( + Collectors + .toMap( + s -> Optional + .ofNullable(s.getQualifier()) + .map(q -> q.getClassid() + s.getValue()) + .orElse(s.getValue()), + Function.identity(), + (s1, s2) -> Collections + .min(Lists.newArrayList(s1, s2), new SubjectProvenanceComparator()))) + .values()); + r.setSubject(subjects); + } + if (Objects.nonNull(r.getTitle())) { + r + .setTitle( + r + .getTitle() + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .filter( + sp -> { + final String title = sp + .getValue() + .toLowerCase(); 
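// Illustrative sketch (not part of this patch): the subject de-duplication in
// cleanup() above keys each subject on classid + value and resolves key
// collisions by keeping the entry that SubjectProvenanceComparator ranks
// first. The same Collectors.toMap merge pattern, with a hypothetical Subj
// class whose trust field stands in for the provenance ranking:

import java.util.Arrays;
import java.util.Collection;
import java.util.function.Function;
import java.util.stream.Collectors;

public class MergeByKeySketch {

	static class Subj {
		final String key;
		final int trust;

		Subj(String key, int trust) {
			this.key = key;
			this.trust = trust;
		}
	}

	public static void main(String[] args) {
		Collection<Subj> merged = Arrays
			.asList(new Subj("fos:math", 1), new Subj("fos:math", 9), new Subj("fos:cs", 5))
			.stream()
			.collect(
				Collectors
					.toMap(
						s -> s.key, // collision key, like classid + value above
						Function.identity(),
						// on duplicate keys keep the better-ranked element, as
						// cleanup() does via SubjectProvenanceComparator
						(s1, s2) -> s1.trust >= s2.trust ? s1 : s2))
			.values();
		merged.forEach(s -> System.out.println(s.key + " -> " + s.trust));
		// fos:math -> 9, fos:cs -> 5
	}
}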
+ final String decoded = Unidecode.decode(title); + + if (StringUtils.contains(decoded, TITLE_TEST)) { + return decoded + .replaceAll(TITLE_FILTER_REGEX, "") + .length() > TITLE_FILTER_RESIDUAL_LENGTH; + } + return !decoded + .replaceAll("\\W|\\d", "") + .isEmpty(); + }) + .map(GraphCleaningFunctions::cleanValue) + .collect(Collectors.toList())); + } + if (Objects.nonNull(r.getFormat())) { + r + .setFormat( + r + .getFormat() + .stream() + .map(GraphCleaningFunctions::cleanValue) + .collect(Collectors.toList())); + } + if (Objects.nonNull(r.getDescription())) { + r + .setDescription( + r + .getDescription() + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .map(GraphCleaningFunctions::cleanValue) + .collect(Collectors.toList())); + } + if (Objects.nonNull(r.getPid())) { + r.setPid(processPidCleaning(r.getPid())); + } + if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { + r + .setResourcetype( + qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE)); + } + if (Objects.nonNull(r.getInstance())) { + + for (Instance i : r.getInstance()) { + if (!vocs.termExists(ModelConstants.DNET_PUBLICATION_RESOURCE, i.getInstancetype().getClassid())) { + if (r instanceof Publication) { + i + .setInstancetype( + OafMapperUtils + .qualifier( + "0038", "Other literature type", ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE)); + } else if (r instanceof Dataset) { + i + .setInstancetype( + OafMapperUtils + .qualifier( + "0039", "Other dataset type", ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE)); + } else if (r instanceof Software) { + i + .setInstancetype( + OafMapperUtils + .qualifier( + "0040", "Other software type", ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE)); + } else if (r instanceof OtherResearchProduct) { + i + .setInstancetype( + OafMapperUtils + .qualifier( + "0020", "Other ORP type", ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE)); + } + } + + if (Objects.nonNull(i.getPid())) { + i.setPid(processPidCleaning(i.getPid())); + } + if (Objects.nonNull(i.getAlternateIdentifier())) { + i.setAlternateIdentifier(processPidCleaning(i.getAlternateIdentifier())); + } + Optional + .ofNullable(i.getPid()) + .ifPresent(pid -> { + final Set pids = Sets.newHashSet(pid); + Optional + .ofNullable(i.getAlternateIdentifier()) + .ifPresent(altId -> { + final Set altIds = Sets.newHashSet(altId); + i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids))); + }); + }); + + if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) { + i + .setAccessright( + accessRight( + ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, + ModelConstants.DNET_ACCESS_MODES)); + } + if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) { + i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY); + } + if (Objects.isNull(i.getRefereed())) { + i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS)); + } + if (Objects.nonNull(i.getDateofacceptance())) { + Optional date = cleanDateField(i.getDateofacceptance()); + if (date.isPresent()) { + i.getDateofacceptance().setValue(date.get()); + } else { + i.setDateofacceptance(null); + } + } + } + } + if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) { + Qualifier 
bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance()); + if (Objects.isNull(bestaccessrights)) { + r + .setBestaccessright( + qualifier( + ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, + ModelConstants.DNET_ACCESS_MODES)); + } else { + r.setBestaccessright(bestaccessrights); + } + } + if (Objects.nonNull(r.getAuthor())) { + r + .setAuthor( + r + .getAuthor() + .stream() + .filter(Objects::nonNull) + .filter(a -> StringUtils.isNotBlank(a.getFullname())) + .filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", ""))) + .collect(Collectors.toList())); + + boolean nullRank = r + .getAuthor() + .stream() + .anyMatch(a -> Objects.isNull(a.getRank())); + if (nullRank) { + int i = 1; + for (Author author : r.getAuthor()) { + author.setRank(i++); + } + } + + for (Author a : r.getAuthor()) { + if (Objects.isNull(a.getPid())) { + a.setPid(Lists.newArrayList()); + } else { + a + .setPid( + a + .getPid() + .stream() + .filter(Objects::nonNull) + .filter(p -> Objects.nonNull(p.getQualifier())) + .filter(p -> StringUtils.isNotBlank(p.getValue())) + .map(p -> { + // hack to distinguish orcid from orcid_pending + String pidProvenance = getProvenance(p.getDataInfo()); + if (p + .getQualifier() + .getClassid() + .toLowerCase() + .contains(ModelConstants.ORCID)) { + if (pidProvenance + .equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) { + p.getQualifier().setClassid(ModelConstants.ORCID); + } else { + p.getQualifier().setClassid(ModelConstants.ORCID_PENDING); + } + final String orcid = p + .getValue() + .trim() + .toLowerCase() + .replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4"); + if (orcid.length() == ORCID_LEN) { + p.setValue(orcid); + } else { + p.setValue(""); + } + } + return p; + }) + .filter(p -> StringUtils.isNotBlank(p.getValue())) + .collect( + Collectors + .toMap( + p -> p.getQualifier().getClassid() + p.getValue(), + Function.identity(), + (p1, p2) -> p1, + LinkedHashMap::new)) + .values() + .stream() + .collect(Collectors.toList())); + } + } + } + if (value instanceof Publication) { + + } else if (value instanceof Dataset) { + + } else if (value instanceof OtherResearchProduct) { + + } else if (value instanceof Software) { + + } + } + + return value; + } + + private static Optional cleanDateField(Field dateofacceptance) { + return Optional + .ofNullable(dateofacceptance) + .map(Field::getValue) + .map(GraphCleaningFunctions::cleanDate) + .filter(Objects::nonNull); + } + + protected static Optional doCleanDate(String date) { + return Optional.ofNullable(cleanDate(date)); + } + + public static String cleanDate(final String inputDate) { + + if (StringUtils.isBlank(inputDate)) { + return null; + } + + try { + final LocalDate date = DateParserUtils + .parseDate(inputDate.trim()) + .toInstant() + .atZone(ZoneId.systemDefault()) + .toLocalDate(); + return DateTimeFormatter.ofPattern(ModelSupport.DATE_FORMAT).format(date); + } catch (DateTimeParseException e) { + return null; + } + } + + // HELPERS + + private static boolean isValidAuthorName(Author a) { + return !Stream + .of(a.getFullname(), a.getName(), a.getSurname()) + .filter(s -> s != null && !s.isEmpty()) + .collect(Collectors.joining("")) + .toLowerCase() + .matches(INVALID_AUTHOR_REGEX); + } + + private static List processPidCleaning(List pids) { + return pids + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue()))) + .filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase())) + .filter(sp -> 
Objects.nonNull(sp.getQualifier())) + .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) + .map(CleaningFunctions::normalizePidValue) + .filter(CleaningFunctions::pidFilter) + .collect(Collectors.toList()); + } + + private static void fixVocabName(Qualifier q, String vocabularyName) { + if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) { + q.setSchemeid(vocabularyName); + q.setSchemename(vocabularyName); + } + } + + private static AccessRight accessRight(String classid, String classname, String scheme) { + return OafMapperUtils + .accessRight( + classid, classname, scheme, scheme); + } + + private static Qualifier qualifier(String classid, String classname, String scheme) { + return OafMapperUtils + .qualifier( + classid, classname, scheme, scheme); + } + + protected static StructuredProperty cleanValue(StructuredProperty s) { + s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " ")); + return s; + } + + protected static Subject cleanValue(Subject s) { + s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " ")); + return s; + } + + protected static Field cleanValue(Field s) { + s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " ")); + return s; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java deleted file mode 100644 index 319cda0bf..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java +++ /dev/null @@ -1,102 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf.utils; - -import java.io.Serializable; -import java.util.Objects; -import java.util.Optional; -import java.util.regex.Pattern; - -import org.apache.commons.lang.StringUtils; - -import eu.dnetlib.dhp.schema.oaf.CleaningFunctions; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; -import eu.dnetlib.dhp.utils.DHPUtils; - -/** - * Factory class for OpenAIRE identifiers in the Graph - */ -public class IdentifierFactory implements Serializable { - - public static final String ID_SEPARATOR = "::"; - public static final String ID_PREFIX_SEPARATOR = "|"; - public final static String ID_REGEX = "^[0-9][0-9]\\" + ID_PREFIX_SEPARATOR + ".{12}" + ID_SEPARATOR - + "[a-zA-Z0-9]{32}$"; - - public final static String DOI_REGEX = "(^10\\.[0-9]{4,9}\\/[-._;()\\/:a-zA-Z0-9]+$)|" + - "(^10\\.1002\\/[^\\s]+$)|" + - "(^10\\.1021\\/[a-zA-Z0-9_][a-zA-Z0-9_][0-9]++$)|" + - "(^10\\.1207\\/[a-zA-Z0-9_]+\\&[0-9]+_[0-9]+$)"; - - public static final int ID_PREFIX_LEN = 12; - public static final String NONE = "none"; - - /** - * Creates an identifier from the most relevant PID (if available) in the given entity T. Returns entity.id - * when no PID is available - * @param entity the entity providing PIDs and a default ID. - * @param the specific entity type. Currently Organization and Result subclasses are supported. 
- * @return an identifier from the most relevant PID, entity.id otherwise - */ - public static String createIdentifier(T entity) { - - if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) { - return entity.getId(); - } - - return entity - .getPid() - .stream() - .filter(s -> pidFilter(s)) - .min(new PidComparator<>(entity)) - .map(s -> idFromPid(entity, s)) - .map(IdentifierFactory::verifyIdSyntax) - .orElseGet(entity::getId); - } - - protected static boolean pidFilter(StructuredProperty s) { - if (Objects.isNull(s.getQualifier()) || - StringUtils.isBlank(StringUtils.trim(s.getValue()))) { - return false; - } - try { - switch (PidType.valueOf(s.getQualifier().getClassid())) { - case doi: - final String doi = StringUtils.trim(StringUtils.lowerCase(s.getValue())); - return doi.matches(DOI_REGEX); - default: - return true; - } - } catch (IllegalArgumentException e) { - return false; - } - } - - private static String verifyIdSyntax(String s) { - if (StringUtils.isBlank(s) || !s.matches(ID_REGEX)) { - throw new RuntimeException(String.format("malformed id: '%s'", s)); - } else { - return s; - } - } - - private static String idFromPid(T entity, StructuredProperty s) { - return new StringBuilder() - .append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR)) - .append(ID_PREFIX_SEPARATOR) - .append(createPrefix(s.getQualifier().getClassid())) - .append(ID_SEPARATOR) - .append(DHPUtils.md5(CleaningFunctions.normalizePidValue(s).getValue())) - .toString(); - } - - // create the prefix (length = 12) - private static String createPrefix(String pidType) { - StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN)); - while (prefix.length() < ID_PREFIX_LEN) { - prefix.append("_"); - } - return prefix.substring(0, ID_PREFIX_LEN); - } - -} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java new file mode 100644 index 000000000..c58096d35 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java @@ -0,0 +1,517 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static eu.dnetlib.dhp.schema.common.ModelConstants.*; + +import java.sql.Array; +import java.sql.SQLException; +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.dhp.schema.common.AccessRightComparator; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; + +public class OafMapperUtils { + + private OafMapperUtils() { + } + + public static Oaf merge(final Oaf left, final Oaf right) { + if (ModelSupport.isSubClass(left, OafEntity.class)) { + return mergeEntities((OafEntity) left, (OafEntity) right); + } else if (ModelSupport.isSubClass(left, Relation.class)) { + ((Relation) left).mergeFrom((Relation) right); + } else { + throw new IllegalArgumentException("invalid Oaf type:" + left.getClass().getCanonicalName()); + } + return left; + } + + public static OafEntity mergeEntities(OafEntity left, OafEntity right) { + if (ModelSupport.isSubClass(left, Result.class)) { + return mergeResults((Result) left, (Result) right); + } else if (ModelSupport.isSubClass(left, Datasource.class)) { + left.mergeFrom(right); + } else if (ModelSupport.isSubClass(left, 
Organization.class)) { + left.mergeFrom(right); + } else if (ModelSupport.isSubClass(left, Project.class)) { + left.mergeFrom(right); + } else { + throw new IllegalArgumentException("invalid OafEntity subtype:" + left.getClass().getCanonicalName()); + } + return left; + } + + public static Result mergeResults(Result left, Result right) { + + final boolean leftFromDelegatedAuthority = isFromDelegatedAuthority(left); + final boolean rightFromDelegatedAuthority = isFromDelegatedAuthority(right); + + if (leftFromDelegatedAuthority && !rightFromDelegatedAuthority) { + return left; + } + if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) { + return right; + } + + if (new ResultTypeComparator().compare(left, right) < 0) { + left.mergeFrom(right); + return left; + } else { + right.mergeFrom(left); + return right; + } + } + + private static boolean isFromDelegatedAuthority(Result r) { + return Optional + .ofNullable(r.getInstance()) + .map( + instance -> instance + .stream() + .filter(i -> Objects.nonNull(i.getCollectedfrom())) + .map(i -> i.getCollectedfrom().getKey()) + .anyMatch(cfId -> IdentifierFactory.delegatedAuthorityDatasourceIds().contains(cfId))) + .orElse(false); + } + + public static KeyValue keyValue(final String k, final String v) { + final KeyValue kv = new KeyValue(); + kv.setKey(k); + kv.setValue(v); + return kv; + } + + public static List listKeyValues(final String... s) { + if (s.length % 2 > 0) { + throw new IllegalArgumentException("Invalid number of parameters (k,v,k,v,....)"); + } + + final List list = new ArrayList<>(); + for (int i = 0; i < s.length; i += 2) { + list.add(keyValue(s[i], s[i + 1])); + } + return list; + } + + public static Field field(final T value, final DataInfo info) { + if (value == null || StringUtils.isBlank(value.toString())) { + return null; + } + + final Field field = new Field<>(); + field.setValue(value); + field.setDataInfo(info); + return field; + } + + public static List> listFields(final DataInfo info, final String... 
values) { + return Arrays + .stream(values) + .map(v -> field(v, info)) + .filter(Objects::nonNull) + .filter(distinctByKey(Field::getValue)) + .collect(Collectors.toList()); + } + + public static List listValues(Array values) throws SQLException { + if (Objects.isNull(values)) { + return null; + } + return Arrays + .stream((T[]) values.getArray()) + .filter(Objects::nonNull) + .distinct() + .collect(Collectors.toList()); + } + + public static List> listFields(final DataInfo info, final List values) { + return values + .stream() + .map(v -> field(v, info)) + .filter(Objects::nonNull) + .filter(distinctByKey(Field::getValue)) + .collect(Collectors.toList()); + } + + public static Qualifier unknown(final String schemeid, final String schemename) { + return qualifier(UNKNOWN, "Unknown", schemeid, schemename); + } + + public static AccessRight accessRight( + final String classid, + final String classname, + final String schemeid, + final String schemename) { + return accessRight(classid, classname, schemeid, schemename, null); + } + + public static AccessRight accessRight( + final String classid, + final String classname, + final String schemeid, + final String schemename, + final OpenAccessRoute openAccessRoute) { + final AccessRight accessRight = new AccessRight(); + accessRight.setClassid(classid); + accessRight.setClassname(classname); + accessRight.setSchemeid(schemeid); + accessRight.setSchemename(schemename); + accessRight.setOpenAccessRoute(openAccessRoute); + return accessRight; + } + + public static Qualifier qualifier( + final String classid, + final String classname, + final String schemeid, + final String schemename) { + final Qualifier q = new Qualifier(); + q.setClassid(classid); + q.setClassname(classname); + q.setSchemeid(schemeid); + q.setSchemename(schemename); + return q; + } + + public static Qualifier qualifier(final Qualifier qualifier) { + final Qualifier q = new Qualifier(); + q.setClassid(qualifier.getClassid()); + q.setClassname(qualifier.getClassname()); + q.setSchemeid(qualifier.getSchemeid()); + q.setSchemename(qualifier.getSchemename()); + return q; + } + + public static Subject subject( + final String value, + final String classid, + final String classname, + final String schemeid, + final String schemename, + final DataInfo dataInfo) { + + return subject(value, qualifier(classid, classname, schemeid, schemename), dataInfo); + } + + public static StructuredProperty structuredProperty( + final String value, + final String classid, + final String classname, + final String schemeid, + final String schemename, + final DataInfo dataInfo) { + + return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo); + } + + public static Subject subject( + final String value, + final Qualifier qualifier, + final DataInfo dataInfo) { + if (value == null) { + return null; + } + final Subject s = new Subject(); + s.setValue(value); + s.setQualifier(qualifier); + s.setDataInfo(dataInfo); + return s; + } + + public static StructuredProperty structuredProperty( + final String value, + final Qualifier qualifier, + final DataInfo dataInfo) { + if (value == null) { + return null; + } + final StructuredProperty sp = new StructuredProperty(); + sp.setValue(value); + sp.setQualifier(qualifier); + sp.setDataInfo(dataInfo); + return sp; + } + + public static ExtraInfo extraInfo( + final String name, + final String value, + final String typology, + final String provenance, + final String trust) { + final ExtraInfo info = new ExtraInfo(); + info.setName(name); 
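// Illustrative sketch (not part of this patch): a recurring pattern in the
// factory methods above (field, subject, structuredProperty) is to return
// null for blank input and let callers drop the nulls with
// filter(Objects::nonNull), rather than throwing or wrapping empty values.
// A condensed stand-alone illustration with a hypothetical normalized() helper:

import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

public class NullGuardedFactorySketch {

	// mirrors field(...): blank input yields null rather than an empty wrapper
	static String normalized(String value) {
		return value == null || value.trim().isEmpty() ? null : value.trim();
	}

	public static void main(String[] args) {
		List<String> kept = Arrays
			.asList(" a ", "", null, "b")
			.stream()
			.map(NullGuardedFactorySketch::normalized)
			.filter(Objects::nonNull) // the same guard listFields applies to field(...)
			.collect(Collectors.toList());
		System.out.println(kept); // [a, b]
	}
}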
+ info.setValue(value); + info.setTypology(typology); + info.setProvenance(provenance); + info.setTrust(trust); + return info; + } + + public static OAIProvenance oaiIProvenance( + final String identifier, + final String baseURL, + final String metadataNamespace, + final Boolean altered, + final String datestamp, + final String harvestDate) { + + final OriginDescription desc = new OriginDescription(); + desc.setIdentifier(identifier); + desc.setBaseURL(baseURL); + desc.setMetadataNamespace(metadataNamespace); + desc.setAltered(altered); + desc.setDatestamp(datestamp); + desc.setHarvestDate(harvestDate); + + final OAIProvenance p = new OAIProvenance(); + p.setOriginDescription(desc); + + return p; + } + + public static Journal journal( + final String name, + final String issnPrinted, + final String issnOnline, + final String issnLinking, + final DataInfo dataInfo) { + + return hasIssn(issnPrinted, issnOnline, issnLinking) ? journal( + name, + issnPrinted, + issnOnline, + issnLinking, + null, + null, + null, + null, + null, + null, + null, + dataInfo) : null; + } + + public static Journal journal( + final String name, + final String issnPrinted, + final String issnOnline, + final String issnLinking, + final String ep, + final String iss, + final String sp, + final String vol, + final String edition, + final String conferenceplace, + final String conferencedate, + final DataInfo dataInfo) { + + if (StringUtils.isNotBlank(name) || hasIssn(issnPrinted, issnOnline, issnLinking)) { + final Journal j = new Journal(); + j.setName(name); + j.setIssnPrinted(issnPrinted); + j.setIssnOnline(issnOnline); + j.setIssnLinking(issnLinking); + j.setEp(ep); + j.setIss(iss); + j.setSp(sp); + j.setVol(vol); + j.setEdition(edition); + j.setConferenceplace(conferenceplace); + j.setConferencedate(conferencedate); + j.setDataInfo(dataInfo); + return j; + } else { + return null; + } + } + + private static boolean hasIssn(String issnPrinted, String issnOnline, String issnLinking) { + return StringUtils.isNotBlank(issnPrinted) + || StringUtils.isNotBlank(issnOnline) + || StringUtils.isNotBlank(issnLinking); + } + + public static DataInfo dataInfo( + final Boolean deletedbyinference, + final String inferenceprovenance, + final Boolean inferred, + final Boolean invisible, + final Qualifier provenanceaction, + final String trust) { + final DataInfo d = new DataInfo(); + d.setDeletedbyinference(deletedbyinference); + d.setInferenceprovenance(inferenceprovenance); + d.setInferred(inferred); + d.setInvisible(invisible); + d.setProvenanceaction(provenanceaction); + d.setTrust(trust); + return d; + } + + public static String createOpenaireId( + final int prefix, + final String originalId, + final boolean to_md5) { + if (StringUtils.isBlank(originalId)) { + return null; + } else if (to_md5) { + final String nsPrefix = StringUtils.substringBefore(originalId, "::"); + final String rest = StringUtils.substringAfter(originalId, "::"); + return String.format("%s|%s::%s", prefix, nsPrefix, IdentifierFactory.md5(rest)); + } else { + return String.format("%s|%s", prefix, originalId); + } + } + + public static String createOpenaireId( + final String type, + final String originalId, + final boolean to_md5) { + switch (type) { + case "datasource": + return createOpenaireId(10, originalId, to_md5); + case "organization": + return createOpenaireId(20, originalId, to_md5); + case "person": + return createOpenaireId(30, originalId, to_md5); + case "project": + return createOpenaireId(40, originalId, to_md5); + default: + return 
createOpenaireId(50, originalId, to_md5); + } + } + + public static String asString(final Object o) { + return o == null ? "" : o.toString(); + } + + public static Predicate distinctByKey( + final Function keyExtractor) { + final Map seen = new ConcurrentHashMap<>(); + return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null; + } + + public static Qualifier createBestAccessRights(final List instanceList) { + return getBestAccessRights(instanceList); + } + + protected static Qualifier getBestAccessRights(final List instanceList) { + if (instanceList != null) { + final Optional min = instanceList + .stream() + .map(Instance::getAccessright) + .min(new AccessRightComparator<>()); + + final Qualifier rights = min.map(OafMapperUtils::qualifier).orElseGet(Qualifier::new); + + if (StringUtils.isBlank(rights.getClassid())) { + rights.setClassid(UNKNOWN); + } + if (StringUtils.isBlank(rights.getClassname()) + || UNKNOWN.equalsIgnoreCase(rights.getClassname())) { + rights.setClassname(NOT_AVAILABLE); + } + if (StringUtils.isBlank(rights.getSchemeid())) { + rights.setSchemeid(DNET_ACCESS_MODES); + } + if (StringUtils.isBlank(rights.getSchemename())) { + rights.setSchemename(DNET_ACCESS_MODES); + } + + return rights; + } + return null; + } + + public static KeyValue newKeyValueInstance(String key, String value, DataInfo dataInfo) { + KeyValue kv = new KeyValue(); + kv.setDataInfo(dataInfo); + kv.setKey(key); + kv.setValue(value); + return kv; + } + + public static Measure newMeasureInstance(String id, String value, String key, DataInfo dataInfo) { + Measure m = new Measure(); + m.setId(id); + m.setUnit(Arrays.asList(newKeyValueInstance(key, value, dataInfo))); + return m; + } + + public static Relation getRelation(final String source, + final String target, + final String relType, + final String subRelType, + final String relClass, + final OafEntity entity) { + return getRelation(source, target, relType, subRelType, relClass, entity, null); + } + + public static Relation getRelation(final String source, + final String target, + final String relType, + final String subRelType, + final String relClass, + final OafEntity entity, + final String validationDate) { + return getRelation( + source, target, relType, subRelType, relClass, entity.getCollectedfrom(), entity.getDataInfo(), + entity.getLastupdatetimestamp(), validationDate, null); + } + + public static Relation getRelation(final String source, + final String target, + final String relType, + final String subRelType, + final String relClass, + final List collectedfrom, + final DataInfo dataInfo, + final Long lastupdatetimestamp) { + return getRelation( + source, target, relType, subRelType, relClass, collectedfrom, dataInfo, lastupdatetimestamp, null, null); + } + + public static Relation getRelation(final String source, + final String target, + final String relType, + final String subRelType, + final String relClass, + final List collectedfrom, + final DataInfo dataInfo, + final Long lastupdatetimestamp, + final String validationDate, + final List properties) { + final Relation rel = new Relation(); + rel.setRelType(relType); + rel.setSubRelType(subRelType); + rel.setRelClass(relClass); + rel.setSource(source); + rel.setTarget(target); + rel.setCollectedfrom(collectedfrom); + rel.setDataInfo(dataInfo); + rel.setLastupdatetimestamp(lastupdatetimestamp); + rel.setValidated(StringUtils.isNotBlank(validationDate)); + rel.setValidationDate(StringUtils.isNotBlank(validationDate) ? 
validationDate : null); + rel.setProperties(properties); + return rel; + } + + public static String getProvenance(DataInfo dataInfo) { + return Optional + .ofNullable(dataInfo) + .map( + d -> Optional + .ofNullable(d.getProvenanceaction()) + .map(Qualifier::getClassid) + .orElse("")) + .orElse(""); + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OrganizationPidComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OrganizationPidComparator.java deleted file mode 100644 index 733d41bff..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OrganizationPidComparator.java +++ /dev/null @@ -1,27 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf.utils; - -import java.util.Comparator; - -public class OrganizationPidComparator implements Comparator { - - @Override - public int compare(PidType pLeft, PidType pRight) { - if (pLeft.equals(PidType.GRID)) - return -1; - if (pRight.equals(PidType.GRID)) - return 1; - - if (pLeft.equals(PidType.mag_id)) - return -1; - if (pRight.equals(PidType.mag_id)) - return 1; - - if (pLeft.equals(PidType.urn)) - return -1; - if (pRight.equals(PidType.urn)) - return 1; - - return 0; - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java deleted file mode 100644 index 34ce5563f..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java +++ /dev/null @@ -1,54 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf.utils; - -import java.util.Comparator; - -import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.dhp.schema.oaf.Organization; -import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; - -public class PidComparator implements Comparator { - - private T entity; - - public PidComparator(T entity) { - this.entity = entity; - } - - @Override - public int compare(StructuredProperty left, StructuredProperty right) { - - if (left == null && right == null) - return 0; - if (left == null) - return 1; - if (right == null) - return -1; - - PidType lClass = PidType.valueOf(left.getQualifier().getClassid()); - PidType rClass = PidType.valueOf(right.getQualifier().getClassid()); - - if (lClass.equals(rClass)) - return 0; - - if (ModelSupport.isSubClass(entity, Result.class)) { - return compareResultPids(lClass, rClass); - } - if (ModelSupport.isSubClass(entity, Organization.class)) { - return compareOrganizationtPids(lClass, rClass); - } - - // Else (but unlikely), lexicographical ordering will do. 
- return lClass.compareTo(rClass); - } - - private int compareResultPids(PidType lClass, PidType rClass) { - return new ResultPidComparator().compare(lClass, rClass); - } - - private int compareOrganizationtPids(PidType lClass, PidType rClass) { - return new OrganizationPidComparator().compare(lClass, rClass); - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidType.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidType.java deleted file mode 100644 index 9a0196ccc..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidType.java +++ /dev/null @@ -1,29 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf.utils; - -import org.apache.commons.lang3.EnumUtils; - -public enum PidType { - - // Result - doi, pmid, pmc, handle, arXiv, NCID, GBIF, nct, pdb, - - // Organization - GRID, mag_id, urn, - - // Used by dedup - undefined, original; - - public static boolean isValid(String type) { - return EnumUtils.isValidEnum(PidType.class, type); - } - - public static PidType tryValueOf(String s) { - try { - return PidType.valueOf(s); - } catch (Exception e) { - return PidType.original; - } - } - -} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultPidComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultPidComparator.java deleted file mode 100644 index 0f65cca36..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultPidComparator.java +++ /dev/null @@ -1,57 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf.utils; - -import java.util.Comparator; - -public class ResultPidComparator implements Comparator { - - @Override - public int compare(PidType pLeft, PidType pRight) { - if (pLeft.equals(PidType.doi)) - return -1; - if (pRight.equals(PidType.doi)) - return 1; - - if (pLeft.equals(PidType.pmid)) - return -1; - if (pRight.equals(PidType.pmid)) - return 1; - - if (pLeft.equals(PidType.pmc)) - return -1; - if (pRight.equals(PidType.pmc)) - return 1; - - if (pLeft.equals(PidType.handle)) - return -1; - if (pRight.equals(PidType.handle)) - return 1; - - if (pLeft.equals(PidType.arXiv)) - return -1; - if (pRight.equals(PidType.arXiv)) - return 1; - - if (pLeft.equals(PidType.NCID)) - return -1; - if (pRight.equals(PidType.NCID)) - return 1; - - if (pLeft.equals(PidType.GBIF)) - return -1; - if (pRight.equals(PidType.GBIF)) - return 1; - - if (pLeft.equals(PidType.nct)) - return -1; - if (pRight.equals(PidType.nct)) - return 1; - - if (pLeft.equals(PidType.urn)) - return -1; - if (pRight.equals(PidType.urn)) - return 1; - - return 0; - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/SubjectProvenanceComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/SubjectProvenanceComparator.java new file mode 100644 index 000000000..f4e3c8841 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/SubjectProvenanceComparator.java @@ -0,0 +1,46 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance; +import static org.apache.commons.lang3.StringUtils.isBlank; + +import java.util.Comparator; + +import eu.dnetlib.dhp.schema.oaf.Subject; + +public class SubjectProvenanceComparator implements Comparator { + + @Override + public int compare(Subject left, Subject right) { + + String lProv = getProvenance(left.getDataInfo()); + String rProv = getProvenance(right.getDataInfo()); + + if (isBlank(lProv) && isBlank(rProv)) + return 0; + if (isBlank(lProv)) + return 1; 
+ if (isBlank(rProv)) + return -1; + if (lProv.equals(rProv)) + return 0; + if (lProv.toLowerCase().contains("crosswalk")) + return -1; + if (rProv.toLowerCase().contains("crosswalk")) + return 1; + if (lProv.toLowerCase().contains("user")) + return -1; + if (rProv.toLowerCase().contains("user")) + return 1; + if (lProv.toLowerCase().contains("propagation")) + return -1; + if (rProv.toLowerCase().contains("propagation")) + return 1; + if (lProv.toLowerCase().contains("iis")) + return -1; + if (rProv.toLowerCase().contains("iis")) + return 1; + + return 0; + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java index dfbaf3a6c..e10d0c500 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/DHPUtils.java @@ -1,63 +1,112 @@ package eu.dnetlib.dhp.utils; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; +import java.io.*; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; -import java.util.zip.GZIPInputStream; -import java.util.zip.GZIPOutputStream; +import java.util.*; +import java.util.stream.Collectors; -import org.apache.commons.codec.binary.Base64; -import org.apache.commons.codec.binary.Base64OutputStream; import org.apache.commons.codec.binary.Hex; +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.SaveMode; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Maps; import com.jayway.jsonpath.JsonPath; +import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo; +import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import net.minidev.json.JSONArray; +import scala.collection.JavaConverters; +import scala.collection.Seq; public class DHPUtils { + private static final Logger log = LoggerFactory.getLogger(DHPUtils.class); + + private DHPUtils() { + } + + public static Seq toSeq(List list) { + return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq(); + } + public static String md5(final String s) { try { final MessageDigest md = MessageDigest.getInstance("MD5"); md.update(s.getBytes(StandardCharsets.UTF_8)); return new String(Hex.encodeHex(md.digest())); } catch (final Exception e) { - System.err.println("Error creating id"); + log.error("Error creating id from {}", s); return null; } } + /** + * Retrieves from the metadata store manager application the list of paths associated with mdstores characterized + * by the given format, layout, interpretation + * @param mdstoreManagerUrl the URL of the mdstore manager service + * @param format the mdstore format + * @param layout the mdstore layout + * @param interpretation the mdstore interpretation + * @param includeEmpty whether to include empty mdstores + * @return the set of hdfs paths + * @throws IOException in case of HTTP communication issues + */ + public static Set mdstorePaths(final String mdstoreManagerUrl, + final String format, + final String layout, + final String interpretation, + boolean
includeEmpty) throws IOException { + final String url = mdstoreManagerUrl + "/mdstores/"; + final ObjectMapper objectMapper = new ObjectMapper(); + + final HttpGet req = new HttpGet(url); + + log.info("MDStoreManager request: {}", req); + + try (final CloseableHttpClient client = HttpClients.createDefault()) { + try (final CloseableHttpResponse response = client.execute(req)) { + final String json = IOUtils.toString(response.getEntity().getContent()); + + log.info("MDStoreManager response: {}", json); + + final MDStoreWithInfo[] mdstores = objectMapper.readValue(json, MDStoreWithInfo[].class); + return Arrays + .stream(mdstores) + .filter(md -> md.getFormat().equalsIgnoreCase(format)) + .filter(md -> md.getLayout().equalsIgnoreCase(layout)) + .filter(md -> md.getInterpretation().equalsIgnoreCase(interpretation)) + .filter(md -> StringUtils.isNotBlank(md.getHdfsPath())) + .filter(md -> StringUtils.isNotBlank(md.getCurrentVersion())) + .filter(md -> includeEmpty || md.getSize() > 0) + .map(md -> md.getHdfsPath() + "/" + md.getCurrentVersion() + "/store") + .collect(Collectors.toSet()); + } + } + } + public static String generateIdentifier(final String originalId, final String nsPrefix) { return String.format("%s::%s", nsPrefix, DHPUtils.md5(originalId)); } - public static String compressString(final String input) { - try (ByteArrayOutputStream out = new ByteArrayOutputStream(); - Base64OutputStream b64os = new Base64OutputStream(out)) { - GZIPOutputStream gzip = new GZIPOutputStream(b64os); - gzip.write(input.getBytes(StandardCharsets.UTF_8)); - gzip.close(); - return out.toString(); - } catch (Throwable e) { - return null; - } - } + public static String generateUnresolvedIdentifier(final String pid, final String pidType) { - public static String decompressString(final String input) { - byte[] byteArray = Base64.decodeBase64(input.getBytes()); - int len; - try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream((byteArray))); - ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length)) { - byte[] buffer = new byte[1024]; - while ((len = gis.read(buffer)) != -1) { - bos.write(buffer, 0, len); - } - return bos.toString(); - } catch (Exception e) { - return null; - } + final String cleanedPid = CleaningFunctions.normalizePidValue(pidType, pid); + + return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim()); } public static String getJPathString(final String jsonPath, final String json) { @@ -72,4 +121,72 @@ public class DHPUtils { return ""; } } + + public static final ObjectMapper MAPPER = new ObjectMapper(); + + public static void writeHdfsFile(final Configuration conf, final String content, final String path) + throws IOException { + + log.info("writing file {}, size {}", path, content.length()); + try (FileSystem fs = FileSystem.get(conf); + BufferedOutputStream os = new BufferedOutputStream(fs.create(new Path(path)))) { + os.write(content.getBytes(StandardCharsets.UTF_8)); + os.flush(); + } + } + + public static String readHdfsFile(Configuration conf, String path) throws IOException { + log.info("reading file {}", path); + + try (FileSystem fs = FileSystem.get(conf)) { + final Path p = new Path(path); + if (!fs.exists(p)) { + throw new FileNotFoundException(path); + } + return IOUtils.toString(fs.open(p)); + } + } + + public static T readHdfsFileAs(Configuration conf, String path, Class clazz) throws IOException { + return MAPPER.readValue(readHdfsFile(conf, path), clazz); + } + + public static void saveDataset(final Dataset mdstore, 
final String targetPath) { + log.info("saving dataset in: {}", targetPath); + mdstore + .write() + .mode(SaveMode.Overwrite) + .format("parquet") + .save(targetPath); + } + + public static Configuration getHadoopConfiguration(String nameNode) { + // ====== Init HDFS File System Object + Configuration conf = new Configuration(); + // Set FileSystem URI + conf.set("fs.defaultFS", nameNode); + // Bind the FileSystem implementations explicitly; the ServiceLoader registrations in META-INF/services can be lost when Maven builds a shaded jar + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); + + System.setProperty("hadoop.home.dir", "/"); + return conf; + } + + public static void populateOOZIEEnv(final Map report) throws IOException { + File file = new File(System.getProperty("oozie.action.output.properties")); + Properties props = new Properties(); + report.forEach((k, v) -> props.setProperty(k, v)); + + try (OutputStream os = new FileOutputStream(file)) { + props.store(os, ""); + } + } + + public static void populateOOZIEEnv(final String paramName, String value) throws IOException { + Map report = Maps.newHashMap(); + report.put(paramName, value); + + populateOOZIEEnv(report); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java index 9552eb2b3..8ae0bb5c3 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/ISLookupClientFactory.java @@ -1,11 +1,11 @@ package eu.dnetlib.dhp.utils; -import java.util.Map; - -import javax.xml.ws.BindingProvider; - +import org.apache.cxf.endpoint.Client; +import org.apache.cxf.frontend.ClientProxy; import org.apache.cxf.jaxws.JaxWsProxyFactoryBean; +import org.apache.cxf.transport.http.HTTPConduit; +import org.apache.cxf.transports.http.configuration.HTTPClientPolicy; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -15,8 +15,11 @@ public class ISLookupClientFactory { private static final Logger log = LoggerFactory.getLogger(ISLookupClientFactory.class); - private static int requestTimeout = 60000 * 10; - private static int connectTimeout = 60000 * 10; + private static final int requestTimeout = 60000 * 10; + private static final int connectTimeout = 60000 * 10; + + private ISLookupClientFactory() { + } public static ISLookUpService getLookUpService(final String isLookupUrl) { return getServiceStub(ISLookUpService.class, isLookupUrl); @@ -24,27 +27,28 @@ public class ISLookupClientFactory { @SuppressWarnings("unchecked") private static T getServiceStub(final Class clazz, final String endpoint) { - log.info(String.format("creating %s stub from %s", clazz.getName(), endpoint)); + log.info("creating {} stub from {}", clazz.getName(), endpoint); final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean(); jaxWsProxyFactory.setServiceClass(clazz); jaxWsProxyFactory.setAddress(endpoint); final T service = (T) jaxWsProxyFactory.create(); - if (service instanceof BindingProvider) { + Client client = ClientProxy.getClient(service); + if (client != null) { + HTTPConduit conduit = (HTTPConduit) client.getConduit(); + HTTPClientPolicy policy = new HTTPClientPolicy(); + log .info( - "setting timeouts for {} to requestTimeout: {}, connectTimeout: {}", - BindingProvider.class.getName(), requestTimeout, connectTimeout); + "setting connectTimeout to {}, requestTimeout to {} for service {}", + connectTimeout, + requestTimeout, + clazz.getCanonicalName()); - Map
requestContext = ((BindingProvider) service).getRequestContext(); - - requestContext.put("com.sun.xml.internal.ws.request.timeout", requestTimeout); - requestContext.put("com.sun.xml.internal.ws.connect.timeout", connectTimeout); - requestContext.put("com.sun.xml.ws.request.timeout", requestTimeout); - requestContext.put("com.sun.xml.ws.connect.timeout", connectTimeout); - requestContext.put("javax.xml.ws.client.receiveTimeout", requestTimeout); - requestContext.put("javax.xml.ws.client.connectionTimeout", connectTimeout); + policy.setConnectionTimeout(connectTimeout); + policy.setReceiveTimeout(requestTimeout); + conduit.setClient(policy); } return service; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java index 9b00b908c..81f1b5142 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/AbstractExtensionFunction.java @@ -10,7 +10,7 @@ import net.sf.saxon.trans.XPathException; public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinition { - public static String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension"; + public static final String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension"; public abstract String getName(); diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java index c7e311b02..1ea2b9f46 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/ExtractYear.java @@ -26,7 +26,7 @@ public class ExtractYear extends AbstractExtensionFunction { @Override public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { - if (arguments == null | arguments.length == 0) { + if (arguments == null || arguments.length == 0) { return new StringValue(""); } final Item item = arguments[0].head(); @@ -63,8 +63,7 @@ public class ExtractYear extends AbstractExtensionFunction { for (String format : dateFormats) { try { c.setTime(new SimpleDateFormat(format).parse(s)); - String year = String.valueOf(c.get(Calendar.YEAR)); - return year; + return String.valueOf(c.get(Calendar.YEAR)); } catch (ParseException e) { } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java index 1b5f3c40d..3e5def9b5 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/NormalizeDate.java @@ -30,7 +30,7 @@ public class NormalizeDate extends AbstractExtensionFunction { @Override public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { - if (arguments == null | arguments.length == 0) { + if (arguments == null || arguments.length == 0) { return new StringValue(BLANK); } String s = arguments[0].head().getStringValue(); diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java index 46ecafd0a..b46a415d8 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/PickFirst.java @@ -1,6 +1,8 @@ package 
eu.dnetlib.dhp.utils.saxon; +import static org.apache.commons.lang3.StringUtils.isNotBlank; + import org.apache.commons.lang3.StringUtils; import net.sf.saxon.expr.XPathContext; @@ -26,7 +28,8 @@ public class PickFirst extends AbstractExtensionFunction { final String s1 = getValue(arguments[0]); final String s2 = getValue(arguments[1]); - return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : ""); + final String value = isNotBlank(s1) ? s1 : isNotBlank(s2) ? s2 : ""; + return new StringValue(value); } private String getValue(final Sequence arg) throws XPathException { diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java index b85d866f1..61049d2e1 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/utils/saxon/SaxonTransformerFactory.java @@ -12,6 +12,9 @@ import net.sf.saxon.TransformerFactoryImpl; public class SaxonTransformerFactory { + private SaxonTransformerFactory() { + } + /** * Creates the index record transformer from the given XSLT * diff --git a/dhp-common/src/main/java/eu/dnetlib/message/Message.java b/dhp-common/src/main/java/eu/dnetlib/message/Message.java deleted file mode 100644 index fc1c38291..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/message/Message.java +++ /dev/null @@ -1,76 +0,0 @@ - -package eu.dnetlib.message; - -import java.io.IOException; -import java.util.Map; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; - -public class Message { - - private String workflowId; - - private String jobName; - - private MessageType type; - - private Map body; - - public static Message fromJson(final String json) throws IOException { - final ObjectMapper jsonMapper = new ObjectMapper(); - return jsonMapper.readValue(json, Message.class); - } - - public Message() { - } - - public Message(String workflowId, String jobName, MessageType type, Map body) { - this.workflowId = workflowId; - this.jobName = jobName; - this.type = type; - this.body = body; - } - - public String getWorkflowId() { - return workflowId; - } - - public void setWorkflowId(String workflowId) { - this.workflowId = workflowId; - } - - public String getJobName() { - return jobName; - } - - public void setJobName(String jobName) { - this.jobName = jobName; - } - - public MessageType getType() { - return type; - } - - public void setType(MessageType type) { - this.type = type; - } - - public Map getBody() { - return body; - } - - public void setBody(Map body) { - this.body = body; - } - - @Override - public String toString() { - final ObjectMapper jsonMapper = new ObjectMapper(); - try { - return jsonMapper.writeValueAsString(this); - } catch (JsonProcessingException e) { - return null; - } - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java deleted file mode 100644 index fb3f0bd95..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageConsumer.java +++ /dev/null @@ -1,47 +0,0 @@ - -package eu.dnetlib.message; - -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.concurrent.LinkedBlockingQueue; - -import com.rabbitmq.client.AMQP; -import com.rabbitmq.client.Channel; -import com.rabbitmq.client.DefaultConsumer; -import com.rabbitmq.client.Envelope; - 
-public class MessageConsumer extends DefaultConsumer { - - final LinkedBlockingQueue queueMessages; - - /** - * Constructs a new instance and records its association to the passed-in channel. - * - * @param channel the channel to which this consumer is attached - * @param queueMessages - */ - public MessageConsumer(Channel channel, LinkedBlockingQueue queueMessages) { - super(channel); - this.queueMessages = queueMessages; - } - - @Override - public void handleDelivery( - String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body) - throws IOException { - final String json = new String(body, StandardCharsets.UTF_8); - Message message = Message.fromJson(json); - try { - this.queueMessages.put(message); - System.out.println("Receiving Message " + message); - } catch (InterruptedException e) { - if (message.getType() == MessageType.REPORT) - throw new RuntimeException("Error on sending message"); - else { - // TODO LOGGING EXCEPTION - } - } finally { - getChannel().basicAck(envelope.getDeliveryTag(), false); - } - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java deleted file mode 100644 index 5ca79f3cc..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageManager.java +++ /dev/null @@ -1,136 +0,0 @@ - -package eu.dnetlib.message; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.LinkedBlockingQueue; -import java.util.concurrent.TimeoutException; - -import com.rabbitmq.client.Channel; -import com.rabbitmq.client.Connection; -import com.rabbitmq.client.ConnectionFactory; - -public class MessageManager { - - private final String messageHost; - - private final String username; - - private final String password; - - private Connection connection; - - private final Map channels = new HashMap<>(); - - private boolean durable; - - private boolean autodelete; - - private final LinkedBlockingQueue queueMessages; - - public MessageManager( - String messageHost, - String username, - String password, - final LinkedBlockingQueue queueMessages) { - this.queueMessages = queueMessages; - this.messageHost = messageHost; - this.username = username; - this.password = password; - } - - public MessageManager( - String messageHost, - String username, - String password, - boolean durable, - boolean autodelete, - final LinkedBlockingQueue queueMessages) { - this.queueMessages = queueMessages; - this.messageHost = messageHost; - this.username = username; - this.password = password; - - this.durable = durable; - this.autodelete = autodelete; - } - - private Connection createConnection() throws IOException, TimeoutException { - ConnectionFactory factory = new ConnectionFactory(); - factory.setHost(this.messageHost); - factory.setUsername(this.username); - factory.setPassword(this.password); - return factory.newConnection(); - } - - private Channel createChannel( - final Connection connection, - final String queueName, - final boolean durable, - final boolean autodelete) - throws Exception { - Map args = new HashMap<>(); - args.put("x-message-ttl", 10000); - Channel channel = connection.createChannel(); - channel.queueDeclare(queueName, durable, false, this.autodelete, args); - return channel; - } - - private Channel getOrCreateChannel(final String queueName, boolean durable, boolean autodelete) - throws Exception { - if (channels.containsKey(queueName)) { - return channels.get(queueName); - } - - if (this.connection == null) 
{ - this.connection = createConnection(); - } - channels.put(queueName, createChannel(this.connection, queueName, durable, autodelete)); - return channels.get(queueName); - } - - public void close() throws IOException { - channels - .values() - .forEach( - ch -> { - try { - ch.close(); - } catch (Exception e) { - // TODO LOG - } - }); - - this.connection.close(); - } - - public boolean sendMessage(final Message message, String queueName) throws Exception { - try { - Channel channel = getOrCreateChannel(queueName, this.durable, this.autodelete); - channel.basicPublish("", queueName, null, message.toString().getBytes()); - return true; - } catch (Throwable e) { - throw new RuntimeException(e); - } - } - - public boolean sendMessage( - final Message message, String queueName, boolean durable_var, boolean autodelete_var) - throws Exception { - try { - Channel channel = getOrCreateChannel(queueName, durable_var, autodelete_var); - channel.basicPublish("", queueName, null, message.toString().getBytes()); - return true; - } catch (Throwable e) { - throw new RuntimeException(e); - } - } - - public void startConsumingMessage( - final String queueName, final boolean durable, final boolean autodelete) throws Exception { - - Channel channel = createChannel(createConnection(), queueName, durable, autodelete); - channel.basicConsume(queueName, false, new MessageConsumer(channel, queueMessages)); - } -} diff --git a/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java b/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java deleted file mode 100644 index 72cbda252..000000000 --- a/dhp-common/src/main/java/eu/dnetlib/message/MessageType.java +++ /dev/null @@ -1,6 +0,0 @@ - -package eu.dnetlib.message; - -public enum MessageType { - ONGOING, REPORT -} diff --git a/dhp-common/src/main/resources/eu/dnetlib/dhp/common/input_maketar_parameters.json b/dhp-common/src/main/resources/eu/dnetlib/dhp/common/input_maketar_parameters.json new file mode 100644 index 000000000..a15318865 --- /dev/null +++ b/dhp-common/src/main/resources/eu/dnetlib/dhp/common/input_maketar_parameters.json @@ -0,0 +1,30 @@ +[ + + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequential file to read", + "paramRequired": true + }, + { + "paramName": "hdp", + "paramLongName": "hdfsPath", + "paramDescription": "the path used to store the output archive", + "paramRequired": true + }, + { + "paramName":"nn", + "paramLongName":"nameNode", + "paramDescription": "the name node", + "paramRequired": true + }, + { + "paramName":"ss", + "paramLongName":"splitSize", + "paramDescription": "the maximum size of the archive", + "paramRequired": false + } +] + + + diff --git a/dhp-common/src/main/resources/eu/dnetlib/dhp/common/name_particles.txt b/dhp-common/src/main/resources/eu/dnetlib/dhp/common/name_particles.txt new file mode 100644 index 000000000..07cf06a98 --- /dev/null +++ b/dhp-common/src/main/resources/eu/dnetlib/dhp/common/name_particles.txt @@ -0,0 +1,8 @@ +van +von +der +de +dell +sig +mr +mrs \ No newline at end of file diff --git a/dhp-common/src/main/resources/eu/dnetlib/dhp/oa/merge/dispatch_entities_parameters.json b/dhp-common/src/main/resources/eu/dnetlib/dhp/oa/merge/dispatch_entities_parameters.json new file mode 100644 index 000000000..aa8d2a7c2 --- /dev/null +++ b/dhp-common/src/main/resources/eu/dnetlib/dhp/oa/merge/dispatch_entities_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true
will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "i", + "paramLongName": "inputPath", + "paramDescription": "the source path", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "outputPath", + "paramDescription": "path of the output graph", + "paramRequired": true + }, + { + "paramName": "c", + "paramLongName": "graphTableClassName", + "paramDescription": "the graph entity class name", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-common/src/main/resources/eu/dnetlib/dhp/oa/merge/group_graph_entities_parameters.json b/dhp-common/src/main/resources/eu/dnetlib/dhp/oa/merge/group_graph_entities_parameters.json new file mode 100644 index 000000000..e65acb3c4 --- /dev/null +++ b/dhp-common/src/main/resources/eu/dnetlib/dhp/oa/merge/group_graph_entities_parameters.json @@ -0,0 +1,20 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "gin", + "paramLongName": "graphInputPath", + "paramDescription": "the graph root path", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the output merged graph root path", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-common/src/main/resources/eu/dnetlib/dhp/schema/oaf/utils/pid_blacklist.json b/dhp-common/src/main/resources/eu/dnetlib/dhp/schema/oaf/utils/pid_blacklist.json new file mode 100644 index 000000000..05e8cde72 --- /dev/null +++ b/dhp-common/src/main/resources/eu/dnetlib/dhp/schema/oaf/utils/pid_blacklist.json @@ -0,0 +1,5 @@ +{ + "doi" : [ "10.12739/10.12739", "10.11646/zootaxa.4404.1.1", "10.5281/zenodo.3678492", "10.11646/zootaxa.4757.1.1", "10.17176/20170811-142447", "10.6035/asparkia", "10.11646/zootaxa.4754.1.6", "10.11646/zootaxa.4784.1.1", "10.6035/millars", "10.11646/zootaxa.4776.1.1", "10.1590/1982-0224-20170094", "10.11646/zootaxa.4773.1.1", "10.11646/zootaxa.4744.1.1", "10.3897/zookeys.38.383", "10.1371/journal.", "10.5281/zenodo.3727017", "10.5252/zoosystema2019v41a15", "10.6035/dossiersf", "10.11646/zootaxa.4754.1.20", "10.6035/recerca", "10.11646/zootaxa.4428.1.1", "10.7179/psri", "10.11646/zootaxa.4785.1.1", "10.2478/aemnp-2018-0014", "10.17979/spudc.9788497497565", "10.2139/ssrn.2721313", "10.17979/spudc.9788497497749", "10.5281/zenodo.3760976", "10.11646/zootaxa.4381.1.1", "10.6035/tiempos", "10.11646/zootaxa.4754.1.10", "10.5281/zenodo.3776452", "10.11646/zootaxa.4754.1.16", "10.5252/zoosystema2019v41a26", "10.11646/zootaxa.4759.2.1", "10.11646/zootaxa.4741.1.1", "10.5252/zoosystema2019v41a4", "10.1145/nnnnnnn.nnnnnnn", "10.17979/spudc.9788497497169", "10.11646/zootaxa.4780.3.1", "10.11646/zootaxa.4663.1.1", "10.5281/zenodo.3748525", "10.5281/zenodo.3746744", "10.3920/978-90-8686-761-5", "10.14198/eurau18alicante", "10.5252/geodiversitas2019v41a8", "10.4126/38m-0000003", "10.5281/zenodo.3648511", "10.6035/clr", "10.4126/38m-0000004", "10.5281/zenodo.3732535", "10.5281/zenodo.3355776", "10.4126/38m-0000002", "10.11646/zootaxa.4763.3.3", "10.11646/zootaxa.4413.3.1", "10.1163/9789004416208_005", "10.4126/38m-0000001", "10.3897/zookeys.30.308", "10.4126/38m-0000000", "10.5281/zenodo.3739808", "10.5281/zenodo.3674873", "10.3161/00034541anz2020.70.1.003", "10.5281/zenodo.3738648", "10.11646/zootaxa.4765.1.1", "10.11646/zootaxa.4754.1.8", "10.3897/zookeys.36.306", "10.4230/lipics", "10.5281/zenodo.3758345", 
"10.3161/00034541anz2020.70.1.001", "10.3929/ethz-a-005427569", "10.11646/zootaxa.4772.1.1", "10.5281/zenodo.3677235", "10.11646/zootaxa.4766.1.1", "10.17509/jurnal", "10.1145/1235", "10.11646/zootaxa.4754.1.15", "10.2478/aemnp-2018-0018", "10.11646/zootaxa.4538.1.1", "10.11646/zootaxa.4740.1.1", "10.3897/zookeys.32.282", "10.3897/zookeys.2.56", "10.3897/zookeys.39.425", "10.11646/zootaxa.4514.3.3", "10.1007/978-94-007-1966-8", "10.3897/zookeys.26.214", "10.11646/zootaxa.4106.1.1", "10.3897/zookeys.22.219", "10.11646/zootaxa.4748.2.1", "10.5252/zoosystema2019v41a19", "10.3897/zookeys.22.122", "10.1080/00222933.2019.1634225", "10.11646/zootaxa.4632.1.1", "10.1007/s00259-016-3484-4", "10.3897/zookeys.19.221", "10.3897/zookeys.2.7", "10.11646/zootaxa.4777.1.1", "10.14279/depositonce-3753", "10.1111/apha.12712", "10.11646/zootaxa.4759.3.4", "10.11646/zootaxa.4754.1.9", "10.11646/zootaxa.4747.2.8", "10.5281/zenodo.3757451", "10.5281/zenodo.3740269", "10.5252/zoosystema2020v42a4", "10.1140/epje/i2013-13103-3", "10.1177/0301006619863862", "10.5281/zenodo.3726987", "10.12795/hid", "10.24042/jipf", "10.12795/e-rips", "10.1186/s12913-016-1423-5", "10.4126/38m-0000005", "10.3847/2041-8213/aa91c9", "10.1145/1122445.1122456", "10.1103/physrevlett.114.191803", "10.3920/978-90-8686-782-0", "10.11646/zootaxa.4739.1.1", "10.11646/zootaxa.4770.1.1", "10.21009/10.21009/jpd.081", "10.1080/15548627.2015.1100356", "10.12795/ricl", "10.3897/zookeys.34.309", "10.1080/00222933.2019.1692088", "10.4126/frl01-0064002", "10.1371/journal", "10.1175/1520-0485(2002)032", "10.3897/zookeys.22.152", "10.11646/zootaxa.4731.2.1", "10.4126/frl01-0064005", "10.11646/zootaxa.4738.1.1", "10.11646/zootaxa.4780.1.6", "10.4126/frl01-0064004", "10.6018/analesps.31.1.158071", "10.1007/jhep08(2016)045", "10.5281/zenodo.3759519", "10.4126/frl01-0064010", "10.11646/zootaxa.4537.1.1", "10.5281/zenodo.3713533", "10.5281/zenodo.3742020", "10.4126/frl01-0064014", "10.4126/frl01-0064001", "10.1000/isbn", "10.5281/zenodo.3777290", "10.4126/frl01-0064008", "10.1159/000440895", "10.3897/zookeys.31.140", "10.4126/frl01-0064003", "10.1080/00222933.2018.1524032", "10.21686/2500-3925-2014-6", "10.1016/j.bbr.2011.03.031", "10.4126/frl01-0064006", "10.4126/frl01-0064007", "10.4126/frl01-0064020", "10.4126/frl01-0064016", "10.2478/aemnp-2018-0013", "10.4126/frl01-0064021", "10.5281/zenodo.3754300", "10.15330/gal.29-30.", "10.3897/zookeys.2.4", "10.5252/zoosystema2019v41a7", "10.22435/bpk.v17i2", "10.4126/frl01-0063997", "10.3897/zookeys.11.160", "10.11646/zootaxa.4754.1.14", "10.4126/frl01-0064013", "10.1080/20013078.2018.1535750", "10.1016/j.", "10.4126/frl01-0064011", "10.1002/ece3.2579", "10.1088/0264-9381/28/9/094001", "10.3897/zookeys.2.25", "10.4126/frl01-0064019", "10.4126/frl01-0063994", "10.4126/frl01-0064135", "10.4126/frl01-0063998", "10.12795/ppa", "10.4126/frl01-0064009", "10.11646/zootaxa.4769.1.1", "10.11646/zootaxa.4419.1.1", "10.11646/zootaxa.4733.1.1", "10.4126/frl01-0063993", "10.3161/15081109acc2016.18.1.005", "10.11646/zootaxa.4763.1.2", "10.11646/zootaxa.4754.1.19", "10.4126/frl01-0064136", "10.4126/frl01-0064159", "10.4126/frl01-0063999", "10.4126/frl01-0064161", "10.1089/ten.tea.2015.5000.abstracts", "10.1002/(issn)1521-3773", "10.1140/epjc/s10052-015-3325-9", "10.1016/j.physletb.2016.04.050", "10.1007/jhep04(2015)117", "10.1111/gcb.14904", "10.1016/s0140-6736(17)32129-3", "10.11646/zootaxa.4748.1.1", "10.4126/frl01-0064078", "10.1140/epjc/s10052-015-3408-7", "10.1002/(issn)1097-4652", "10.1007/jhep06(2015)121", 
"10.1007/jhep09(2014)103", "10.1016/j.gca.2007.06.021", "10.1007/jhep09(2015)049", "10.3897/zookeys.4.32", "10.6101/azq/0002", "10.11646/zootaxa.4764.1.1", "10.11646/zootaxa.4772.1.5", "10.4126/frl01-0064000", "10.4126/frl01-0064131", "10.1016/j.physletb.2015.08.061", "10.1007/jhep01(2015)069", "10.1016/j.physletb.2016.06.039", "10.1016/j.physletb.2015.07.011", "10.1007/jhep04(2015)116", "10.3920/978-90-8686-797-4", "10.1016/j.physletb.2015.12.020", "10.1016/j.physletb.2015.04.042", "10.1016/j.physletb.2016.06.004", "10.1140/epjc/s10052-015-3261-8", "10.1016/j.physletb.2015.10.067", "10.1016/j.physletb.2015.07.065", "10.1163/1876312x-00002195", "10.1016/j.physletb.2013.12.010", "10.1016/j.physletb.2013.01.024", "10.1007/jhep11(2014)056", "10.1007/jhep12(2017)142", "10.1002/pds.4864", "10.1140/epjc/s10052-015-3262-7", "10.1016/j.physletb.2014.09.054", "10.1140/epjc/s10052-015-3373-1", "10.1007/jhep03(2015)041", "10.1016/j.physletb.2016.02.047", "10.4126/frl01-0064018", "10.1016/j.physletb.2014.01.042", "10.1007/jhep09(2014)037", "10.1007/978-94-017-7285-3", "10.1007/s00424-013-1401-2", "10.1007/s00259-017-3822-1", "10.1177/0301006616671273", "10.1007/jhep09(2014)112", "10.1007/jhep06(2015)116", "10.1140/epjc/s10052-018-6243-9", "10.1140/epjc/s10052-017-4692-1", "10.1007/jhep10(2015)144", "10.1007/jhep07(2017)107", "10.1007/jhep11(2014)088", "10.1016/j.physletb.2014.01.006", "10.1007/jhep01(2018)055", "10.1016/j.physletb.2016.03.060", "10.1140/epjc/s10052-019-6904-3", "10.11646/zootaxa.4737.1.1", "10.3934/xx.xx.xx.xx", "10.11646/zootaxa.4758.2.1", "10.1016/j.physletb.2015.10.004", "10.1016/j.physletb.2015.07.053", "10.5798/diclemedj.0921.2012.04.0184", "10.1007/jhep04(2014)169", "10.4126/frl01-0064160", "10.3989/aem.2001.v31.i2", "10.1039/x0xx00000x", "10.11646/zootaxa.3856.4.1", "10.4126/frl01-0064133", "10.1007/jhep05(2015)078", "10.1016/j.physletb.2012.08.020", "10.1007/jhep07(2015)032", "10.1159/000090218", "10.1016/j.physletb.2014.03.015", "10.1007/jhep09(2015)108", "10.1007/jhep09(2015)050", "10.1007/jhep01(2014)163", "10.1016/j.physletb.2014.11.026", "10.1140/epjc/s10052-016-4580-0", "10.1140/epjc/s10052-014-3109-7", "10.1140/epjc/s10052-014-3231-6", "10.1007/jhep02(2014)088", "10.1016/j.physletb.2016.01.056", "10.1016/j.physletb.2015.08.047", "10.1016/j.physletb.2015.12.039", "10.1007/jhep11(2015)071", "10.1140/epjc/s10052-015-3853-3", "10.1007/jhep04(2015)124", "10.1016/j.physletb.2015.07.010", "10.5281/zenodo.3413524", "10.1007/jhep04(2014)031", "10.1007/jhep07(2015)157", "10.1103/physrevd.90.052008", "10.1007/jhep11(2014)118", "10.3920/978-90-8686-708-0", "10.5281/zenodo.1136235", "10.1103/physrevd.86.032003", "10.1016/j.physletb.2016.01.032", "10.1007/jhep03(2018)174", "10.1007/jhep10(2017)182", "10.1140/epjst/e2019-900045-4", "10.1016/j.physletb.2015.06.070", "10.1140/epjc/s10052-016-4067-z", "10.1016/j.physletb.2015.11.042", "10.1007/jhep04(2018)033", "10.1007/jhep09(2014)145", "10.1016/j.physletb.2016.08.055", "10.1016/j.physletb.2015.04.002", "10.1007/jhep03(2014)032", "10.1140/epjc/s10052-017-5491-4", "10.1016/j.physletb.2015.09.062", "10.1016/j.physletb.2014.12.003", "10.1016/j.physletb.2015.03.017", "10.1140/epjc/s10052-014-3195-6", "10.1140/epjc/s10052-016-4034-8", "10.1140/epjc/s10052-016-4070-4", "10.1140/epjc/s10052-018-5693-4", "10.4126/frl01-0064017", "10.1007/jhep08(2014)173", "10.1016/j.physletb.2014.06.076", "10.1016/j.physletb.2018.11.064", "10.1140/epjc/s10052-017-4988-1", "10.11646/zootaxa.4258.4.3", "10.11646/zootaxa.4766.1.2", "10.11646/zootaxa.4780.1.1", 
"10.5281/zenodo.3693943", "10.4126/frl01-0064129", "10.15330/gal.28.", "10.1007/jhep02(2016)145", "10.1007/jhep04(2014)172", "10.1007/jhep04(2016)005", "10.1007/jhep03(2016)125", "10.1016/j.physletb.2018.02.033", "10.1007/jhep08(2017)052", "10.1007/jhep12(2017)085", "10.1007/jhep09(2014)176", "10.1007/jhep12(2017)024", "10.1140/epjc/s10052-018-5686-3", "10.1016/j.physletb.2016.11.035", "10.1016/j.physletb.2015.12.017", "10.1140/epjc/s10052-015-3542-2", "10.1140/epjc/s10052-014-3071-4", "10.1103/physrevd.97.032009", "10.1140/epjc/s10052-015-3306-z", "10.1016/j.physletb.2017.12.043", "10.1140/epjc/s10052-014-3233-4", "10.1016/j.physletb.2018.09.013", "10.1016/j.gca.2007.06.014", "10.1016/j.physletb.2016.05.005", "10.1038/s41586-019-1171-x", "10.1016/j.physletb.2016.05.087", "10.1007/jhep06(2018)022", "10.1016/j.physletb.2016.01.057", "10.1016/j.physletb.2018.03.023", "10.1140/epjc/s10052-015-3351-7", "10.1126/science.aap8757", "10.1007/jhep09(2015)137", "10.1007/jhep01(2015)063", "10.1007/jhep01(2018)126", "10.1016/j.gca.2007.06.020", "10.1140/epjc/s10052-018-5595-5", "10.1016/j.physletb.2015.02.015", "10.1016/j.physletb.2014.06.077", "10.1007/jhep12(2017)059", "10.1007/jhep10(2017)141", "10.1007/jhep02(2014)107", "10.1140/epjc/s10052-014-2965-5", "10.1016/j.physletb.2015.07.079", "10.1007/jhep10(2017)112", "10.1140/epjc/s10052-014-2982-4", "10.1007/jhep05(2016)160", "10.1016/j.physletb.2016.07.030", "10.1140/epjc/s10052-014-3168-9", "10.1140/epjc/s10052-018-5583-9", "10.1140/epjc/s10052-016-4184-8", "10.1007/jhep08(2015)105", "10.1007/jhep05(2015)061", "10.1103/physrevd.97.032003", "10.1140/epjc/s10052-014-3190-y", "10.1016/j.physletb.2012.10.061", "10.1140/epjc/s10052-014-2941-0", "10.1016/j.physletb.2016.02.002", "10.1016/j.physletb.2016.05.033", "10.1007/jhep01(2014)096", "10.1007/jhep09(2015)201", "10.1016/j.physletb.2016.01.010", "10.1016/j.physletb.2015.07.037", "10.1007/jhep07(2015)042", "10.1016/j.physletb.2016.05.044", "10.1016/j.physletb.2016.05.088", "10.3897/zookeys.2.2", "10.1007/jhep11(2015)018", "10.1007/jhep11(2015)189", "10.1016/j.physletb.2016.10.014", "10.1007/jhep06(2015)080", "10.1016/j.physletb.2014.11.042", "10.1140/epjc/s10052-014-3157-z", "10.1140/epjc/s10052-015-3406-9", "10.1016/j.physletb.2016.02.056", "10.1016/j.physletb.2015.03.054", "10.1140/epjc/s10052-016-4574-y", "10.5252/geodiversitas2019v41a15", "10.1007/jhep09(2014)094", "10.1140/epjc/s10052-017-5486-1", "10.1007/jhep03(2018)095", "10.11646/zootaxa.4736.1.1", "10.11646/zootaxa.4766.2.1", "10.5281/zenodo.3762392", "10.5281/zenodo.3761958", "10.11646/zootaxa.4403.3.2", "10.1553/iswimab", "10.11646/zootaxa.3750.5.1", "10.4126/frl01-0064134", "10.1103/physrevd.87.032002", "10.1140/epjc/s10052-013-2676-3", "10.1007/jhep02(2015)153", "10.1007/jhep08(2017)006", "10.1016/j.physletb.2016.11.005", "10.1007/jhep01(2013)029", "10.1007/jhep10(2017)132", "10.1016/j.physletb.2013.01.034", "10.1016/j.physletb.2016.03.046", "10.1140/epjc/s10052-016-3988-x", "10.1016/j.physletb.2016.07.006", "10.1140/epjc/s10052-018-5752-x", "10.1140/epjc/s10052-015-3454-1", "10.1002/ece3.1303", "10.1007/jhep02(2014)013", "10.1007/jhep06(2016)081", "10.1140/epjc/s10052-014-3117-7", "10.1007/jhep09(2017)084", "10.1016/j.physletb.2017.09.078", "10.1007/jhep08(2016)005", "10.1007/jhep01(2015)020", "10.1140/epjc/s10052-017-4852-3", "10.1016/j.physletb.2018.02.045", "10.7818/sibecolandaeetmeeting.2019", "10.1007/jhep11(2014)104", "10.1007/jhep05(2018)077", "10.1016/j.physletb.2016.11.045", "10.1016/j.physletb.2016.10.042", 
"10.1140/epjc/s10052-016-4203-9", "10.1007/jhep01(2015)068", "10.1007/jhep06(2016)093", "10.1016/j.physletb.2015.09.051", "10.1140/epjc/s10052-015-3534-2", "10.1007/jhep09(2014)087", "10.1016/j.physletb.2014.05.055", "10.1016/j.physletb.2014.02.033", "10.1140/epjc/s10052-017-5225-7", "10.1140/epjc/s10052-017-5442-0", "10.1016/s0140-6736(18)32335-3", "10.1016/j.physletb.2017.11.049", "10.1007/jhep06(2018)166", "10.1016/j.physletb.2016.05.002", "10.1140/epjc/s10052-016-4219-1", "10.1140/epjst/e2019-900087-0", "10.1007/jhep01(2016)166", "10.1007/jhep01(2018)097", "10.1016/j.physletb.2017.11.043", "10.1016/j.physletb.2018.04.036", "10.1140/epjc/s10052-018-5607-5", "10.1007/jhep12(2017)034", "10.1007/jhep11(2016)112", "10.1007/jhep06(2014)008", "10.1140/epjc/s10052-012-2261-1", "10.1016/j.physletb.2014.08.039", "10.1016/s0140-6736(16)31919-5", "10.1140/epjc/s10052-019-7058-z", "10.1016/j.physletb.2014.07.053", "10.1007/jhep01(2015)053", "10.1016/j.physletb.2016.07.042", "10.1007/jhep08(2014)103", "10.1007/jhep06(2015)100", "10.1140/epjc/s10052-015-3363-3", "10.1140/epjc/s10052-017-4915-5", "10.1140/epjc/s10052-014-3023-z", "10.1140/epjc/s10052-017-5315-6", "10.1140/epjc/s10052-016-4050-8", "10.3389/fpsyt.2017.00244", "10.1016/j.physletb.2014.10.002", "10.1007/jhep07(2015)162", "10.1007/jhep08(2014)174", "10.3897/zookeys.2.23", "10.1007/jhep07(2017)014", "10.1007/jhep04(2016)035", "10.1140/epjc/s10052-017-4984-5", "10.1007/jhep02(2016)156", "10.1016/j.physletb.2016.03.039", "10.1007/jhep07(2018)115", "10.3897/zookeys.34.268", "10.1007/jhep02(2016)122", "10.1016/j.physletb.2012.03.022", "10.1016/j.physletb.2018.09.019", "10.1016/j.physletb.2018.09.024", "10.1051/0004-6361/201629272", "10.1103/physrevc.97.024904", "10.1140/epjc/s10052-016-4521-y", "10.1140/epjc/s10052-016-4176-8", "10.1140/epjc/s10052-014-3134-6", "10.1140/epjc/s10052-016-4110-0", "10.1007/jhep07(2017)121", "10.1007/jhep07(2018)153", "10.1007/jhep03(2018)115", "10.1007/jhep04(2018)060", "10.11606/1807-0205/2020.60.06", "10.4126/frl01-0064015", "10.1007/jhep09(2017)020", "10.1016/j.physletb.2014.04.023", "10.1016/j.physletb.2015.02.048", "10.1007/jhep02(2018)032", "10.1016/j.physletb.2018.01.001", "10.1140/epjc/s10052-015-3852-4", "10.1007/jhep10(2014)087", "10.11646/zootaxa.4630.1.1", "10.5281/zenodo.3742118", "10.4126/frl01-0064022", "10.11646/zootaxa.4758.3.1", "10.11646/zootaxa.4772.3.1", "10.11646/zootaxa.4576.3.5", "10.4126/frl01-0064125", "10.1007/jhep12(2017)017", "10.4126/frl01-0064162", "10.4126/frl01-0064138", "10.1007/jhep06(2014)124", "10.1007/jhep06(2016)059", "10.1007/jhep06(2014)035", "10.1103/physrevd.90.052005", "10.1007/jhep11(2017)062", "10.3847/2041-8213/aa9aed", "10.1016/j.physletb.2016.06.080", "10.1007/jhep10(2017)073", "10.1007/jhep03(2018)167", "10.1016/j.physletb.2018.11.065", "10.1140/epjc/s10052-017-5081-5", "10.1140/epjc/s10052-015-3500-z", "10.1140/epjc/s10052-017-5445-x", "10.1016/j.physletb.2014.01.049", "10.1007/jhep03(2018)172", "10.1016/j.physletb.2015.03.048", "10.1016/j.physletb.2018.11.032", "10.1007/jhep05(2018)025", "10.1016/j.physletb.2016.08.052", "10.1016/j.physletb.2014.09.008", "10.1103/physrevlett.120.071802", "10.1016/j.physletb.2018.01.049", "10.1016/j.physletb.2016.06.017", "10.1016/j.physletb.2016.04.005", "10.1007/jhep06(2018)031", "10.1007/jhep01(2016)079", "10.1007/jhep10(2017)006", "10.1140/epjc/s10052-018-5740-1", "10.1016/j.physletb.2015.01.034", "10.1007/jhep10(2017)005", "10.1016/j.physletb.2018.04.007", "10.1007/jhep04(2015)164", "10.1140/epjc/s10052-018-5691-6", 
"10.1007/jhep05(2018)148", "10.1007/jhep03(2018)003", "10.1140/epjc/s10052-014-3076-z", "10.1016/j.physletb.2016.02.015", "10.1103/physrevd.97.072003", "10.1016/j.physletb.2017.11.054", "10.1140/epjc/s10052-011-1849-1", "10.1007/jhep09(2016)175", "10.1016/j.physletb.2017.12.011", "10.1007/jhep04(2014)103", "10.1007/jhep12(2014)017", "10.1016/j.physletb.2014.09.048", "10.1140/epjc/s10052-019-7202-9", "10.1007/jhep04(2014)191", "10.1007/jhep07(2013)163", "10.1140/epjc/s10052-014-3130-x", "10.1007/jhep04(2016)023", "10.1016/j.physletb.2015.07.023", "10.1140/epjc/s10052-018-6500-y", "10.1016/j.physletb.2015.04.045", "10.1007/jhep09(2017)053", "10.1007/jhep10(2017)180", "10.1140/epjc/s10052-017-4912-8", "10.1007/jhep10(2016)129", "10.3920/978-90-8686-816-2", "10.1007/jhep01(2017)099", "10.1007/jhep01(2018)045", "10.1007/jhep04(2015)025", "10.1016/j.physletb.2018.02.050", "10.1103/physrevlett.116.032301", "10.1007/jhep08(2017)029", "10.1007/jhep08(2017)073", "10.1016/j.physletb.2014.11.059", "10.1007/jhep01(2013)131", "10.1007/jhep06(2014)112", "10.1016/j.physletb.2017.09.066", "10.1140/epjc/s10052-014-2883-6", "10.1094/mpmi", "10.1007/jhep11(2017)195", "10.1007/jhep06(2018)108", "10.1007/jhep09(2018)139", "10.1016/j.physletb.2016.12.005", "10.1140/epjc/s10052-017-5349-9", "10.1016/j.physletb.2012.08.021", "10.1016/j.physletb.2014.10.032", "10.1007/jhep09(2017)088", "10.1140/epjc/s10052-015-3425-6", "10.1007/jhep01(2018)054", "10.1103/physrevlett.110.182302", "10.1140/epjc/s10052-017-5317-4", "10.1007/jhep01(2017)117", "10.1016/j.physletb.2017.12.006", "10.1016/j.physletb.2018.02.004", "10.1016/j.physletb.2018.02.025", "10.1016/j.physletb.2016.02.055", "10.1016/j.physletb.2016.04.061", "10.1140/epjc/s10052-015-3372-2", "10.1016/j.physletb.2015.02.051", "10.1016/j.physletb.2014.11.049", "10.1007/jhep09(2016)001", "10.1016/j.physletb.2016.03.017", "10.1007/jhep06(2016)067", "10.1140/epjc/s10052-015-3543-1", "10.1140/epjc/s10052-017-4911-9", "10.1007/jhep07(2013)122", "10.1140/epjc/s10052-019-6855-8", "10.1140/epjc/s10052-019-6540-y", "10.1007/jhep06(2014)009", "10.1007/jhep05(2019)043", "10.1016/j.physletb.2016.01.028", "10.1103/physrevlett.120.231801", "10.1140/epjc/s10052-016-4325-0", "10.1007/jhep07(2018)127", "10.1016/j.physletb.2016.05.003", "10.1140/epjc/s10052-017-4644-9", "10.1140/epjc/s10052-017-4700-5", "10.1007/jhep06(2018)107", "10.1016/j.physletb.2018.01.042", "10.1140/epjc/s10052-018-5624-4", "10.1007/jhep08(2016)139", "10.1007/jhep05(2018)195", "10.1103/physrevd.97.052012", "10.1140/epjc/s10052-016-3978-z", "10.1007/jhep05(2019)088", "10.1140/epjc/s10052-017-5079-z", "10.1140/epjc/s10052-016-4205-7", "10.1007/jhep01(2016)006", "10.1140/epjc/s10052-016-4286-3", "10.1016/j.physletb.2017.04.071", "10.1103/physrevd.97.012007", "10.1016/j.physletb.2018.01.077", "10.1007/jhep04(2018)073", "10.1016/j.physletb.2015.09.057", "10.1007/jhep07(2018)032", "10.1140/epjc/s10052-015-3435-4", "10.1007/jhep11(2017)010", "10.1093/isd/ixaa002", "10.1016/j.physletb.2018.03.035", "10.1007/jhep10(2018)031", "10.1016/s0140-6736(18)31891-9", "10.1140/epjc/s10052-018-6148-7", "10.1016/j.physletb.2018.03.057", "10.1140/epjc/s10052-019-6632-8", "10.1016/j.physletb.2015.11.071", "10.1140/epjc/s10052-018-5605-7", "10.1016/j.physletb.2018.10.073", "10.1140/epjc/s10052-019-7387-y", "10.1007/jhep06(2019)143", "10.1140/epjc/s10052-018-5567-9", "10.1140/epjc/s10052-019-6909-y", "10.1002/(sici)1521-3978(199901)47:1/3", "10.5281/zenodo.3758372", "10.4126/frl01-0064041", "10.1140/epjc/s10052-014-3129-3", 
"10.11646/zootaxa.4685.1.1", "10.11646/zootaxa.4756.1.1", "10.6101/azq/0001", "10.14582/duzg", "10.1016/j.physletb.2012.11.039", "10.4126/frl01-0064191", "10.1016/j.physletb.2013.12.029", "10.1007/jhep10(2013)189", "10.1051/0004-6361/201629512", "10.1007/jhep01(2013)116", "10.2312/gfz.lis.2016.001", "10.1016/j.physletb.2013.01.040", "10.1103/physrevd.90.112005", "10.1140/epjc/s10052-015-3726-9", "10.1007/s11682-013-9269-5", "10.1007/jhep02(2017)071", "10.1016/j.physletb.2016.09.040", "10.1007/jhep02(2017)117", "10.1007/jhep08(2016)009", "10.1103/physrevd.97.052010", "10.1007/jhep09(2017)032", "10.1103/physrevd.97.032005", "10.1140/epjc/s10052-017-4965-8", "10.1016/j.physletb.2016.08.042", "10.1016/j.physletb.2017.10.039", "10.1007/jhep03(2016)127", "10.1140/epjc/s10052-014-3034-9", "10.1007/jhep03(2017)113", "10.1007/jhep11(2018)040", "10.1140/epjc/s10052-018-6457-x", "10.1140/epjc/s10052-016-4041-9", "10.1140/epjc/s10052-018-6219-9", "10.1140/epjc/s10052-016-4149-y", "10.1007/jhep10(2017)072", "10.1140/epjc/s10052-016-4083-z", "10.1140/epjc/s10052-016-3956-5", "10.1007/jhep04(2016)073", "10.1007/jhep06(2016)177", "10.1016/j.physletb.2018.03.084", "10.1007/jhep10(2015)128", "10.1007/jhep03(2018)166", "10.1140/epjc/s10052-015-3491-9", "10.1016/j.physletb.2015.04.060", "10.1103/physrevd.92.112001", "10.1140/epjc/s10052-015-3367-z", "10.1007/jhep10(2017)019", "10.1007/jhep10(2017)131", "10.1016/j.physletb.2018.08.057", "10.1007/jhep01(2016)096", "10.1016/j.physletb.2017.09.053", "10.1007/jhep07(2017)013", "10.1007/jhep01(2019)030", "10.1007/jhep11(2016)110", "10.1016/j.physletb.2012.02.044", "10.1140/epjc/s10052-017-5192-z", "10.1007/jhep03(2015)022", "10.1140/epjc/s10052-019-6847-8", "10.1093/database/baz085", "10.1140/epjc/s10052-015-3451-4", "10.1007/jhep11(2017)029", "10.1140/epjc/s10052-015-3517-3", "10.1007/jhep07(2017)001", "10.1007/jhep09(2016)074", "10.1103/physrevd.97.072016", "10.1007/jhep05(2018)006", "10.1103/physrevlett.120.081801", "10.1103/physrevlett.120.161802", "10.1103/physrevlett.120.202005", "10.5281/zenodo.1299123", "10.5281/zenodo.3777294" ], + "pmid" : [], + "pmc" : [] +} \ No newline at end of file diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala new file mode 100644 index 000000000..f8afe9af4 --- /dev/null +++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala @@ -0,0 +1,73 @@ +package eu.dnetlib.dhp.application + +import scala.io.Source + +/** This is the main Interface SparkApplication + * where all the Spark Scala class should inherit + */ +trait SparkScalaApplication { + + /** This is the path in the classpath of the json + * describes all the argument needed to run + */ + val propertyPath: String + + /** Utility to parse the arguments using the + * property json in the classpath identified from + * the variable propertyPath + * + * @param args the list of arguments + */ + def parseArguments(args: Array[String]): ArgumentApplicationParser = { + val parser = new ArgumentApplicationParser( + Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString + ) + parser.parseArgument(args) + parser + } + + /** Here all the spark applications runs this method + * where the whole logic of the spark node is defined + */ + def run(): Unit +} + +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.slf4j.Logger + +abstract class AbstractScalaApplication( + val 
propertyPath: String, + val args: Array[String], + log: Logger +) extends SparkScalaApplication { + + var parser: ArgumentApplicationParser = null + + var spark: SparkSession = null + + def initialize(): SparkScalaApplication = { + parser = parseArguments(args) + spark = createSparkSession() + this + } + + /** Utility for creating a spark session starting from parser + * + * @return a SparkSession + */ + private def createSparkSession(): SparkSession = { + require(parser != null) + + val conf: SparkConf = new SparkConf() + val master = parser.get("master") + log.info(s"Creating Spark session: Master: $master") + SparkSession + .builder() + .config(conf) + .appName(getClass.getSimpleName) + .master(master) + .getOrCreate() + } + +} diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala new file mode 100644 index 000000000..a995016a8 --- /dev/null +++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala @@ -0,0 +1,442 @@ +package eu.dnetlib.dhp.sx.graph.scholix + +import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty} +import eu.dnetlib.dhp.schema.sx.scholix._ +import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology} +import eu.dnetlib.dhp.utils.DHPUtils +import org.apache.spark.sql.expressions.Aggregator +import org.apache.spark.sql.{Encoder, Encoders} +import org.json4s +import org.json4s.DefaultFormats +import org.json4s.jackson.JsonMethods.parse +import scala.collection.JavaConverters._ +import scala.io.Source + +object ScholixUtils extends Serializable { + + val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier" + + val DATE_RELATION_KEY: String = "RelationDate" + + case class RelationVocabulary(original: String, inverse: String) {} + + case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {} + + val relations: Map[String, RelationVocabulary] = { + val input = Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json") + ) + .mkString + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + + lazy val json: json4s.JValue = parse(input) + + json.extract[Map[String, RelationVocabulary]] + } + + def extractRelationDate(relation: Relation): String = { + + if (relation.getProperties == null || relation.getProperties.isEmpty) + null + else { + val date = relation.getProperties.asScala + .find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey)) + .map(p => p.getValue) + if (date.isDefined) + date.get + else + null + } + } + + def extractRelationDate(summary: ScholixSummary): String = { + + if (summary.getDate == null || summary.getDate.isEmpty) + null + else { + summary.getDate.get(0) + } + } + + def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = { + new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName) + + } + + def generateScholixResourceFromResult(r: Result): ScholixResource = { + generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r)) + } + + val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] = + new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable { + override def zero: RelatedEntities = null + + override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = { + val relatedDataset = if ("dataset".equalsIgnoreCase(a._2)) a._3 else
0 + val relatedPublication = if ("publication".equalsIgnoreCase(a._2)) a._3 else 0 + + if (b == null) + RelatedEntities(a._1, relatedDataset, relatedPublication) + else + RelatedEntities( + a._1, + b.relatedDataset + relatedDataset, + b.relatedPublication + relatedPublication + ) + } + + override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = { + if (b1 != null && b2 != null) + RelatedEntities( + b1.id, + b1.relatedDataset + b2.relatedDataset, + b1.relatedPublication + b2.relatedPublication + ) + else if (b1 != null) + b1 + else + b2 + } + + override def finish(reduction: RelatedEntities): RelatedEntities = reduction + + override def bufferEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities]) + + override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities]) + } + + val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] = + new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable { + override def zero: Scholix = null + + def scholix_complete(s: Scholix): Boolean = { + if (s == null || s.getIdentifier == null) { + false + } else if (s.getSource == null || s.getTarget == null) { + false + } else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty) + false + else + true + } + + override def reduce(b: Scholix, a: (String, Scholix)): Scholix = { + if (scholix_complete(b)) b else a._2 + } + + override def merge(b1: Scholix, b2: Scholix): Scholix = { + if (scholix_complete(b1)) b1 else b2 + } + + override def finish(reduction: Scholix): Scholix = reduction + + override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] + + override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] + } + + def createInverseScholixRelation(scholix: Scholix): Scholix = { + val s = new Scholix + s.setPublicationDate(scholix.getPublicationDate) + s.setPublisher(scholix.getPublisher) + s.setLinkprovider(scholix.getLinkprovider) + s.setRelationship(inverseRelationShip(scholix.getRelationship)) + s.setSource(scholix.getTarget) + s.setTarget(scholix.getSource) + s.setIdentifier( + DHPUtils.md5( + s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}" + ) + ) + s + + } + + def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = { + if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) { + val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d => + new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers) + }(collection.breakOut) + l + } else List() + } + + def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = { + if (summary.getDatasources != null && !summary.getDatasources.isEmpty) { + val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { d => + new ScholixEntityId( + d.getDatasourceName, + List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava + ) + }(collection.breakOut) + l + } else List() + } + + def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = { + if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) { + + val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { c => + new ScholixEntityId( + c.getValue, + List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava + ) + }.toList + l + } else List() + } + + def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = { + val s = new Scholix + 
s.setPublicationDate(scholix.getPublicationDate) + s.setPublisher(scholix.getPublisher) + s.setLinkprovider(scholix.getLinkprovider) + s.setRelationship(scholix.getRelationship) + s.setSource(scholix.getSource) + s.setTarget(generateScholixResourceFromSummary(target)) + s.setIdentifier( + DHPUtils.md5( + s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}" + ) + ) + s + } + + def generateCompleteScholix(scholix: Scholix, target: ScholixResource): Scholix = { + val s = new Scholix + s.setPublicationDate(scholix.getPublicationDate) + s.setPublisher(scholix.getPublisher) + s.setLinkprovider(scholix.getLinkprovider) + s.setRelationship(scholix.getRelationship) + s.setSource(scholix.getSource) + s.setTarget(target) + s.setIdentifier( + DHPUtils.md5( + s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}" + ) + ) + s + } + + def generateScholixResourceFromSummary(summaryObject: ScholixSummary): ScholixResource = { + val r = new ScholixResource + r.setIdentifier(summaryObject.getLocalIdentifier) + r.setDnetIdentifier(summaryObject.getId) + + r.setObjectType(summaryObject.getTypology.toString) + r.setObjectSubType(summaryObject.getSubType) + + if (summaryObject.getTitle != null && !summaryObject.getTitle.isEmpty) + r.setTitle(summaryObject.getTitle.get(0)) + + if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) { + val l: List[ScholixEntityId] = + summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList + if (l.nonEmpty) + r.setCreator(l.asJava) + } + + if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty) + r.setPublicationDate(summaryObject.getDate.get(0)) + if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) { + val plist: List[ScholixEntityId] = + summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList + + if (plist.nonEmpty) + r.setPublisher(plist.asJava) + } + + if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) { + + val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala + .map(c => + new ScholixCollectedFrom( + new ScholixEntityId( + c.getDatasourceName, + List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava + ), + "collected", + "complete" + ) + ) + .toList + + if (l.nonEmpty) + r.setCollectedFrom(l.asJava) + + } + r + } + + def scholixFromSource(relation: Relation, source: ScholixResource): Scholix = { + if (relation == null || source == null) + return null + val s = new Scholix + var l: List[ScholixEntityId] = extractCollectedFrom(relation) + if (l.isEmpty) + l = extractCollectedFrom(source) + if (l.isEmpty) + return null + s.setLinkprovider(l.asJava) + var d = extractRelationDate(relation) + if (d == null) + d = source.getPublicationDate + + s.setPublicationDate(d) + + if (source.getPublisher != null && !source.getPublisher.isEmpty) { + s.setPublisher(source.getPublisher) + } + + val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null) + if (semanticRelation == null) + return null + s.setRelationship( + new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse) + ) + s.setSource(source) + + s + } + + def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = { + + if (relation == null || source == null) + return null + + val s = new Scholix + + var l: List[ScholixEntityId] = extractCollectedFrom(relation) + if (l.isEmpty) + l = 
extractCollectedFrom(source) + if (l.isEmpty) + return null + + s.setLinkprovider(l.asJava) + + var d = extractRelationDate(relation) + if (d == null) + d = extractRelationDate(source) + + s.setPublicationDate(d) + + if (source.getPublisher != null && !source.getPublisher.isEmpty) { + val l: List[ScholixEntityId] = source.getPublisher.asScala + .map { p => + new ScholixEntityId(p, null) + }(collection.breakOut) + + if (l.nonEmpty) + s.setPublisher(l.asJava) + } + + val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null) + if (semanticRelation == null) + return null + s.setRelationship( + new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse) + ) + s.setSource(generateScholixResourceFromSummary(source)) + + s + } + + def findURLForPID( + pidValue: List[StructuredProperty], + urls: List[String] + ): List[(StructuredProperty, String)] = { + pidValue.map { p => + val pv = p.getValue + + val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase)) + (p, r.orNull) + } + } + + def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = { + if (r.getInstance() == null || r.getInstance().isEmpty) + return List() + r.getInstance() + .asScala + .filter(i => i.getUrl != null && !i.getUrl.isEmpty) + .filter(i => i.getPid != null && i.getUrl != null) + .flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList)) + .map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2)) + .distinct + .toList + } + + def resultToSummary(r: Result): ScholixSummary = { + val s = new ScholixSummary + s.setId(r.getId) + if (r.getPid == null || r.getPid.isEmpty) + return null + + val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(r) + if (persistentIdentifiers.isEmpty) + return null + s.setLocalIdentifier(persistentIdentifiers.asJava) + if (r.isInstanceOf[Publication]) + s.setTypology(Typology.publication) + else + s.setTypology(Typology.dataset) + + s.setSubType(r.getInstance().get(0).getInstancetype.getClassname) + + if (r.getTitle != null && r.getTitle.asScala.nonEmpty) { + val titles: List[String] = r.getTitle.asScala.map(t => t.getValue).toList + if (titles.nonEmpty) + s.setTitle(titles.asJava) + else + return null + } + + if (r.getAuthor != null && !r.getAuthor.isEmpty) { + val authors: List[String] = r.getAuthor.asScala.map(a => a.getFullname).toList + if (authors.nonEmpty) + s.setAuthor(authors.asJava) + } + if (r.getInstance() != null) { + val dt: List[String] = r + .getInstance() + .asScala + .filter(i => i.getDateofacceptance != null) + .map(i => i.getDateofacceptance.getValue) + .toList + if (dt.nonEmpty) + s.setDate(dt.distinct.asJava) + } + if (r.getDescription != null && !r.getDescription.isEmpty) { + val d = r.getDescription.asScala.find(f => f != null && f.getValue != null) + if (d.isDefined) + s.setDescription(d.get.getValue) + } + + if (r.getSubject != null && !r.getSubject.isEmpty) { + val subjects: List[SchemeValue] = r.getSubject.asScala + .map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue)) + .toList + if (subjects.nonEmpty) + s.setSubject(subjects.asJava) + } + + if (r.getPublisher != null) + s.setPublisher(List(r.getPublisher.getValue).asJava) + + if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) { + val cf: List[CollectedFromType] = r.getCollectedfrom.asScala + .map(c => new CollectedFromType(c.getValue, c.getKey, "complete")) + .toList + if (cf.nonEmpty) + s.setDatasources(cf.distinct.asJava) + } + + 
s.setRelatedDatasets(0) + s.setRelatedPublications(0) + s.setRelatedUnknown(0) + + s + } + +} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java index e14020830..1788239f2 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/application/ArgumentApplicationParserTest.java @@ -7,10 +7,10 @@ import static org.junit.jupiter.api.Assertions.assertNotNull; import org.apache.commons.io.IOUtils; import org.junit.jupiter.api.Test; -public class ArgumentApplicationParserTest { +class ArgumentApplicationParserTest { @Test - public void testParseParameter() throws Exception { + void testParseParameter() throws Exception { final String jsonConfiguration = IOUtils .toString( this.getClass().getResourceAsStream("/eu/dnetlib/application/parameters.json")); diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java index 870943816..fa721d5e5 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/common/HdfsSupportTest.java @@ -21,13 +21,13 @@ public class HdfsSupportTest { class Remove { @Test - public void shouldThrowARuntimeExceptionOnError() { + void shouldThrowARuntimeExceptionOnError() { // when assertThrows(RuntimeException.class, () -> HdfsSupport.remove(null, new Configuration())); } @Test - public void shouldRemoveADirFromHDFS(@TempDir Path tempDir) { + void shouldRemoveADirFromHDFS(@TempDir Path tempDir) { // when HdfsSupport.remove(tempDir.toString(), new Configuration()); @@ -36,7 +36,7 @@ public class HdfsSupportTest { } @Test - public void shouldRemoveAFileFromHDFS(@TempDir Path tempDir) throws IOException { + void shouldRemoveAFileFromHDFS(@TempDir Path tempDir) throws IOException { // given Path file = Files.createTempFile(tempDir, "p", "s"); @@ -52,13 +52,13 @@ public class HdfsSupportTest { class ListFiles { @Test - public void shouldThrowARuntimeExceptionOnError() { + void shouldThrowARuntimeExceptionOnError() { // when assertThrows(RuntimeException.class, () -> HdfsSupport.listFiles(null, new Configuration())); } @Test - public void shouldListFilesLocatedInPath(@TempDir Path tempDir) throws IOException { + void shouldListFilesLocatedInPath(@TempDir Path tempDir) throws IOException { Path subDir1 = Files.createTempDirectory(tempDir, "list_me"); Path subDir2 = Files.createTempDirectory(tempDir, "list_me"); diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/common/PacePersonTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/common/PacePersonTest.java index 5ebd7213e..cb9ae2886 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/common/PacePersonTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/common/PacePersonTest.java @@ -5,10 +5,10 @@ import static org.junit.jupiter.api.Assertions.*; import org.junit.jupiter.api.Test; -public class PacePersonTest { +class PacePersonTest { @Test - public void pacePersonTest1() { + void pacePersonTest1() { PacePerson p = new PacePerson("Artini, Michele", false); assertEquals("Artini", p.getSurnameString()); @@ -17,7 +17,7 @@ public class PacePersonTest { } @Test - public void pacePersonTest2() { + void pacePersonTest2() { PacePerson p = new PacePerson("Michele G. 
Artini", false); assertEquals("Artini, Michele G.", p.getNormalisedFullname()); assertEquals("Michele G", p.getNameString()); diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java index 2f01c0863..8fa966c2f 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/common/SparkSessionSupportTest.java @@ -18,7 +18,8 @@ public class SparkSessionSupportTest { class RunWithSparkSession { @Test - public void shouldExecuteFunctionAndNotStopSparkSessionWhenSparkSessionIsNotManaged() + @SuppressWarnings("unchecked") + void shouldExecuteFunctionAndNotStopSparkSessionWhenSparkSessionIsNotManaged() throws Exception { // given SparkSession spark = mock(SparkSession.class); @@ -37,7 +38,8 @@ public class SparkSessionSupportTest { } @Test - public void shouldExecuteFunctionAndStopSparkSessionWhenSparkSessionIsManaged() + @SuppressWarnings("unchecked") + void shouldExecuteFunctionAndStopSparkSessionWhenSparkSessionIsManaged() throws Exception { // given SparkSession spark = mock(SparkSession.class); diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/common/api/ZenodoAPIClientTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/common/api/ZenodoAPIClientTest.java index f961d6748..2ccaed3e4 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/common/api/ZenodoAPIClientTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/common/api/ZenodoAPIClientTest.java @@ -12,15 +12,39 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; @Disabled -public class ZenodoAPIClientTest { +class ZenodoAPIClientTest { private final String URL_STRING = "https://sandbox.zenodo.org/api/deposit/depositions"; private final String ACCESS_TOKEN = ""; private final String CONCEPT_REC_ID = "657113"; + private final String depositionId = "674915"; + @Test - public void testNewDeposition() throws IOException { + void testUploadOldDeposition() throws IOException, MissingConceptDoiException { + ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING, + ACCESS_TOKEN); + Assertions.assertEquals(200, client.uploadOpenDeposition(depositionId)); + + File file = new File(getClass() + .getResource("/eu/dnetlib/dhp/common/api/COVID-19.json.gz") + .getPath()); + + InputStream is = new FileInputStream(file); + + Assertions.assertEquals(200, client.uploadIS(is, "COVID-19.json.gz", file.length())); + + String metadata = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/common/api/metadata.json")); + + Assertions.assertEquals(200, client.sendMretadata(metadata)); + + Assertions.assertEquals(202, client.publish()); + + } + + @Test + void testNewDeposition() throws IOException { ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING, ACCESS_TOKEN); @@ -43,7 +67,7 @@ public class ZenodoAPIClientTest { } @Test - public void testNewVersionNewName() throws IOException, MissingConceptDoiException { + void testNewVersionNewName() throws IOException, MissingConceptDoiException { ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING, ACCESS_TOKEN); @@ -63,7 +87,7 @@ public class ZenodoAPIClientTest { } @Test - public void testNewVersionOldName() throws IOException, MissingConceptDoiException { + void testNewVersionOldName() throws IOException, MissingConceptDoiException { ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING, ACCESS_TOKEN); diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java 
b/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java deleted file mode 100644 index cb4d0ab50..000000000 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/model/mdstore/MetadataRecordTest.java +++ /dev/null @@ -1,16 +0,0 @@ - -package eu.dnetlib.dhp.model.mdstore; - -import static org.junit.jupiter.api.Assertions.assertTrue; - -import org.junit.jupiter.api.Test; - -public class MetadataRecordTest { - - @Test - public void getTimestamp() { - - MetadataRecord r = new MetadataRecord(); - assertTrue(r.getDateOfCollection() > 0); - } -} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/oa/merge/AuthorMergerTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/oa/merge/AuthorMergerTest.java new file mode 100644 index 000000000..3a7a41a1b --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/oa/merge/AuthorMergerTest.java @@ -0,0 +1,100 @@ + +package eu.dnetlib.dhp.oa.merge; + +import java.io.BufferedReader; +import java.io.FileReader; +import java.io.IOException; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.pace.util.MapDocumentUtil; +import scala.Tuple2; + +class AuthorMergerTest { + + private String publicationsBasePath; + + private List<List<Author>> authors; + + @BeforeEach + public void setUp() throws Exception { + + publicationsBasePath = Paths + .get(AuthorMergerTest.class.getResource("/eu/dnetlib/dhp/oa/merge").toURI()) + .toFile() + .getAbsolutePath(); + + authors = readSample(publicationsBasePath + "/publications_with_authors.json", Publication.class) + .stream() + .map(p -> p._2().getAuthor()) + .collect(Collectors.toList()); + + } + + @Test + void mergeTest() { // used in the dedup: threshold set to 0.95 + + for (List<Author> authors1 : authors) { + System.out.println("List " + (authors.indexOf(authors1) + 1)); + for (Author author : authors1) { + System.out.println(authorToString(author)); + } + } + + List<Author> merge = AuthorMerger.merge(authors); + + System.out.println("Merge "); + for (Author author : merge) { + System.out.println(authorToString(author)); + } + + Assertions.assertEquals(7, merge.size()); + + } + + public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) { + List<Tuple2<String, T>> res = new ArrayList<>(); + BufferedReader reader; + try { + reader = new BufferedReader(new FileReader(path)); + String line = reader.readLine(); + while (line != null) { + res + .add( + new Tuple2<>( + MapDocumentUtil.getJPathString("$.id", line), + new ObjectMapper().readValue(line, clazz))); + // read next line + line = reader.readLine(); + } + reader.close(); + } catch (IOException e) { + e.printStackTrace(); + } + + return res; + } + + public String authorToString(Author a) { + + String print = "Fullname = "; + print += a.getFullname() + " pid = ["; + if (a.getPid() != null) + for (StructuredProperty sp : a.getPid()) { + print += sp.toComparableString() + " "; + } + print += "]"; + return print; + } +} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactoryTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactoryTest.java deleted file mode 100644 index d458c613e..000000000 ---
a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactoryTest.java +++ /dev/null @@ -1,47 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf.utils; - -import static org.junit.jupiter.api.Assertions.*; - -import java.io.IOException; - -import org.apache.commons.io.IOUtils; -import org.junit.jupiter.api.Test; - -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.utils.DHPUtils; - -public class IdentifierFactoryTest { - - private static ObjectMapper OBJECT_MAPPER = new ObjectMapper() - .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - - @Test - public void testCreateIdentifierForPublication() throws IOException { - - verifyIdentifier("publication_doi.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013")); - verifyIdentifier("publication_pmc.json", "50|pmc_________::" + DHPUtils.md5("21459329")); - verifyIdentifier( - "publication_urn.json", - "50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2")); - - final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"; - verifyIdentifier("publication_3.json", defaultID); - verifyIdentifier("publication_4.json", defaultID); - verifyIdentifier("publication_5.json", defaultID); - } - - protected void verifyIdentifier(String filename, String expectedID) throws IOException { - final String json = IOUtils.toString(getClass().getResourceAsStream(filename)); - final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class); - - String id = IdentifierFactory.createIdentifier(pub); - - assertNotNull(id); - assertEquals(expectedID, id); - } - -} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java new file mode 100644 index 000000000..9111ac2df --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java @@ -0,0 +1,212 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static org.junit.jupiter.api.Assertions.*; + +import java.io.IOException; +import java.util.HashSet; +import java.util.List; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.Test; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Dataset; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Publication; +import eu.dnetlib.dhp.schema.oaf.Result; +import me.xuender.unidecode.Unidecode; + +class OafMapperUtilsTest { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + + @Test + public void testUnidecode() { + + assertEquals("Liu Ben Mu hiruzuSen tawa", Unidecode.decode("六本木ヒルズ森タワ")); + assertEquals("Nan Wu A Mi Tuo Fo", Unidecode.decode("南无阿弥陀佛")); + assertEquals("Yi Tiao Hui Zou Lu De Yu", Unidecode.decode("一条会走路的鱼")); + assertEquals("amidaniyorai", Unidecode.decode("あみだにょらい")); + assertEquals("T`owrk`iayi", Unidecode.decode("Թուրքիայի")); + assertEquals("Obzor tematiki", Unidecode.decode("Обзор тематики")); + assertEquals("GERMANSKIE IaZYKI", Unidecode.decode("ГЕРМАНСКИЕ ЯЗЫКИ")); + assertEquals("Diereunese tes ikanopoieses", Unidecode.decode("Διερεύνηση 
της ικανοποίησης")); + assertEquals("lqDy l'wly@", Unidecode.decode("القضايا الأولية")); + assertEquals("abc def ghi", Unidecode.decode("abc def ghi")); + } + + @Test + void testDateValidation() { + + assertNotNull(GraphCleaningFunctions.cleanDate("2016-05-07T12:41:19.202Z ")); + assertNotNull(GraphCleaningFunctions.cleanDate("2020-09-10 11:08:52 ")); + assertNotNull(GraphCleaningFunctions.cleanDate(" 2016-04-05")); + + assertEquals("2016-04-05", GraphCleaningFunctions.cleanDate("2016 Apr 05")); + + assertEquals("2009-05-08", GraphCleaningFunctions.cleanDate("May 8, 2009 5:57:51 PM")); + assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct 7, 1970")); + assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct 7, '70")); + assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct. 7, 1970")); + assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("oct. 7, 70")); + assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 2 15:04:05 2006")); + assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 2 15:04:05 MST 2006")); + assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon Jan 02 15:04:05 -0700 2006")); + assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Monday, 02-Jan-06 15:04:05 MST")); + assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon, 02 Jan 2006 15:04:05 MST")); + assertEquals("2017-07-11", GraphCleaningFunctions.cleanDate("Tue, 11 Jul 2017 16:28:13 +0200 (CEST)")); + assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("Mon, 02 Jan 2006 15:04:05 -0700")); + assertEquals("2018-01-04", GraphCleaningFunctions.cleanDate("Thu, 4 Jan 2018 17:53:36 +0000")); + assertEquals("2015-08-10", GraphCleaningFunctions.cleanDate("Mon Aug 10 15:44:11 UTC+0100 2015")); + assertEquals( + "2015-07-03", + GraphCleaningFunctions.cleanDate("Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)")); + assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012 10:09am")); + assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012 at 10:09am PST-08")); + assertEquals("2012-09-17", GraphCleaningFunctions.cleanDate("September 17, 2012, 10:10:09")); + assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("October 7, 1970")); + assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("October 7th, 1970")); + assertEquals("2006-02-12", GraphCleaningFunctions.cleanDate("12 Feb 2006, 19:17")); + assertEquals("2006-02-12", GraphCleaningFunctions.cleanDate("12 Feb 2006 19:17")); + assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("7 oct 70")); + assertEquals("1970-10-07", GraphCleaningFunctions.cleanDate("7 oct 1970")); + assertEquals("2013-02-03", GraphCleaningFunctions.cleanDate("03 February 2013")); + assertEquals("2013-07-01", GraphCleaningFunctions.cleanDate("1 July 2013")); + assertEquals("2013-02-03", GraphCleaningFunctions.cleanDate("2013-Feb-03")); + assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("3/31/2014")); + assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("03/31/2014")); + assertEquals("1971-08-21", GraphCleaningFunctions.cleanDate("08/21/71")); + assertEquals("1971-01-08", GraphCleaningFunctions.cleanDate("8/1/71")); + assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("4/8/2014 22:05")); + assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("04/08/2014 22:05")); + assertEquals("2014-08-04", GraphCleaningFunctions.cleanDate("4/8/14 22:05")); + assertEquals("2014-02-04", 
GraphCleaningFunctions.cleanDate("04/2/2014 03:00:51")); + assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 12:00:00 AM")); + assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 01:00:01 PM")); + assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 01:00 PM")); + assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 1:00 PM")); + assertEquals("1965-08-08", GraphCleaningFunctions.cleanDate("8/8/1965 12:00 AM")); + assertEquals("2014-02-04", GraphCleaningFunctions.cleanDate("4/02/2014 03:00:51")); + assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("03/19/2012 10:11:59")); + assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("03/19/2012 10:11:59.3186369")); + assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("2014/3/31")); + assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("2014/03/31")); + assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014/4/8 22:05")); + assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014/04/08 22:05")); + assertEquals("2014-04-02", GraphCleaningFunctions.cleanDate("2014/04/2 03:00:51")); + assertEquals("2014-04-02", GraphCleaningFunctions.cleanDate("2014/4/02 03:00:51")); + assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("2012/03/19 10:11:59")); + assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("2012/03/19 10:11:59.3186369")); + assertEquals("2014-04-08", GraphCleaningFunctions.cleanDate("2014年04月08日")); + assertEquals("2006-01-02", GraphCleaningFunctions.cleanDate("2006-01-02T15:04:05+0000")); + assertEquals("2009-08-13", GraphCleaningFunctions.cleanDate("2009-08-12T22:15:09-07:00")); + assertEquals("2009-08-12", GraphCleaningFunctions.cleanDate("2009-08-12T22:15:09")); + assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 17:24:37.3186369")); + assertEquals("2012-08-03", GraphCleaningFunctions.cleanDate("2012-08-03 18:31:59.257000000")); + assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 17:24:37.123")); + assertEquals("2013-04-01", GraphCleaningFunctions.cleanDate("2013-04-01 22:43")); + assertEquals("2013-04-01", GraphCleaningFunctions.cleanDate("2013-04-01 22:43:22")); + assertEquals("2014-12-16", GraphCleaningFunctions.cleanDate("2014-12-16 06:20:00 UTC")); + assertEquals("2014-12-16", GraphCleaningFunctions.cleanDate("2014-12-16 06:20:00 GMT")); + assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 05:24:37 PM")); + assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:43 +0800")); + assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:43 +0800 +08")); + assertEquals("2014-04-26", GraphCleaningFunctions.cleanDate("2014-04-26 13:13:44 +09:00")); + assertEquals("2012-08-03", GraphCleaningFunctions.cleanDate("2012-08-03 18:31:59.257000000 +0000 UTC")); + assertEquals("2015-09-30", GraphCleaningFunctions.cleanDate("2015-09-30 18:48:56.35272715 +0000 UTC")); + assertEquals("2015-02-18", GraphCleaningFunctions.cleanDate("2015-02-18 00:12:00 +0000 GMT")); + assertEquals("2015-02-18", GraphCleaningFunctions.cleanDate("2015-02-18 00:12:00 +0000 UTC")); + assertEquals( + "2015-02-08", GraphCleaningFunctions.cleanDate("2015-02-08 03:02:00 +0300 MSK m=+0.000000001")); + assertEquals( + "2015-02-08", GraphCleaningFunctions.cleanDate("2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001")); + assertEquals("2017-07-19", GraphCleaningFunctions.cleanDate("2017-07-19 03:21:51+00:00")); + assertEquals("2014-04-26", 
GraphCleaningFunctions.cleanDate("2014-04-26")); + assertEquals("2014-04-01", GraphCleaningFunctions.cleanDate("2014-04")); + assertEquals("2014-01-01", GraphCleaningFunctions.cleanDate("2014")); + assertEquals("2014-05-11", GraphCleaningFunctions.cleanDate("2014-05-11 08:20:13,787")); + assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("3.31.2014")); + assertEquals("2014-03-31", GraphCleaningFunctions.cleanDate("03.31.2014")); + assertEquals("1971-08-21", GraphCleaningFunctions.cleanDate("08.21.71")); + assertEquals("2014-03-01", GraphCleaningFunctions.cleanDate("2014.03")); + assertEquals("2014-03-30", GraphCleaningFunctions.cleanDate("2014.03.30")); + assertEquals("2014-06-01", GraphCleaningFunctions.cleanDate("20140601")); + assertEquals("2014-07-22", GraphCleaningFunctions.cleanDate("20140722105203")); + assertEquals("2012-03-19", GraphCleaningFunctions.cleanDate("1332151919")); + assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367189")); + assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367111222")); + assertEquals("2013-11-12", GraphCleaningFunctions.cleanDate("1384216367111222333")); + + } + + @Test + void testDate() { + final String date = GraphCleaningFunctions.cleanDate("23-FEB-1998"); + assertNotNull(date); + System.out.println(date); + } + + @Test + void testMergePubs() throws IOException { + Publication p1 = read("publication_1.json", Publication.class); + Publication p2 = read("publication_2.json", Publication.class); + Dataset d1 = read("dataset_1.json", Dataset.class); + Dataset d2 = read("dataset_2.json", Dataset.class); + + assertEquals(1, p1.getCollectedfrom().size()); + assertEquals(ModelConstants.CROSSREF_ID, p1.getCollectedfrom().get(0).getKey()); + assertEquals(1, d2.getCollectedfrom().size()); + assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); + + assertEquals( + ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, + OafMapperUtils + .mergeResults(p1, d2) + .getResulttype() + .getClassid()); + + assertEquals(1, p2.getCollectedfrom().size()); + assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); + assertEquals(1, d1.getCollectedfrom().size()); + assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); + + assertEquals( + ModelConstants.DATASET_RESULTTYPE_CLASSID, + OafMapperUtils + .mergeResults(p2, d1) + .getResulttype() + .getClassid()); + } + + @Test + void testDelegatedAuthority() throws IOException { + Dataset d1 = read("dataset_2.json", Dataset.class); + Dataset d2 = read("dataset_delegated.json", Dataset.class); + + assertEquals(1, d2.getCollectedfrom().size()); + assertTrue(cfId(d2.getCollectedfrom()).contains(ModelConstants.ZENODO_OD_ID)); + + Result res = OafMapperUtils.mergeResults(d1, d2); + + assertEquals(d2, res); + + System.out.println(OBJECT_MAPPER.writeValueAsString(res)); + + } + + protected HashSet<String> cfId(List<KeyValue> collectedfrom) { + return collectedfrom.stream().map(KeyValue::getKey).collect(Collectors.toCollection(HashSet::new)); + } + + protected <T> T read(String filename, Class<T> clazz) throws IOException { + final String json = IOUtils.toString(getClass().getResourceAsStream(filename)); + return OBJECT_MAPPER.readValue(json, clazz); + } + +} diff --git a/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java b/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java deleted file mode 100644 index 442f7b5c2..000000000 --- a/dhp-common/src/test/java/eu/dnetlib/message/MessageTest.java +++ /dev/null @@ -1,51 +0,0 @@ - -package
eu.dnetlib.message; - -import static org.junit.jupiter.api.Assertions.*; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; - -import org.junit.jupiter.api.Test; - -public class MessageTest { - - @Test - public void fromJsonTest() throws IOException { - Message m = new Message(); - m.setWorkflowId("wId"); - m.setType(MessageType.ONGOING); - m.setJobName("Collection"); - Map<String, String> body = new HashMap<>(); - body.put("parsedItem", "300"); - body.put("ExecutionTime", "30s"); - - m.setBody(body); - System.out.println("m = " + m); - Message m1 = Message.fromJson(m.toString()); - assertEquals(m1.getWorkflowId(), m.getWorkflowId()); - assertEquals(m1.getType(), m.getType()); - assertEquals(m1.getJobName(), m.getJobName()); - - assertNotNull(m1.getBody()); - m1.getBody().keySet().forEach(it -> assertEquals(m1.getBody().get(it), m.getBody().get(it))); - assertEquals(m1.getJobName(), m.getJobName()); - } - - @Test - public void toStringTest() { - final String expectedJson = "{\"workflowId\":\"wId\",\"jobName\":\"Collection\",\"type\":\"ONGOING\",\"body\":{\"ExecutionTime\":\"30s\",\"parsedItem\":\"300\"}}"; - Message m = new Message(); - m.setWorkflowId("wId"); - m.setType(MessageType.ONGOING); - m.setJobName("Collection"); - Map<String, String> body = new HashMap<>(); - body.put("parsedItem", "300"); - body.put("ExecutionTime", "30s"); - - m.setBody(body); - - assertEquals(expectedJson, m.toString()); - } -} diff --git a/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java b/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java index d1d1ada71..5743b0831 100644 --- a/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/scholexplorer/relation/RelationMapperTest.java @@ -3,10 +3,10 @@ package eu.dnetlib.scholexplorer.relation; import org.junit.jupiter.api.Test; -public class RelationMapperTest { +class RelationMapperTest { @Test - public void testLoadRels() throws Exception { + void testLoadRels() throws Exception { RelationMapper relationMapper = RelationMapper.load(); relationMapper.keySet().forEach(System.out::println); diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/publications_with_authors.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/publications_with_authors.json new file mode 100644 index 000000000..600181ba5 --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/oa/merge/publications_with_authors.json @@ -0,0 +1,3 @@ +{ "journal":{ "dataInfo":null, "conferenceplace":null, "issnPrinted":"0009-9260", "issnOnline":null, "issnLinking":null, "ep":"636", "iss":null, "sp":"632", "vol":"55", "edition":null, "conferencedate":null, "name":"Clinical Radiology" }, "measures":null, "author":[ { "rank":null, "fullname":"KARL TURETSCHEK", "affiliation":null, "pid":null, "surname":"TURETSCHEK", "name":"KARL" }, { "rank":null, "fullname":"WOLFGANG EBNER", "affiliation":null, "pid":null, "surname":"EBNER", "name":"WOLFGANG" }, { "rank":null, "fullname":"DOMINIK FLEISCHMANN", "affiliation":null, "pid":null, "surname":"FLEISCHMANN", "name":"DOMINIK" }, { "rank":null, "fullname":"PATRICK WUNDERBALDINGER", "affiliation":null, "pid":null, "surname":"WUNDERBALDINGER", "name":"PATRICK" }, { "rank":null, "fullname":"LUDWIG ERLACHER", "affiliation":null, "pid":null, "surname":"ERLACHER", "name":"LUDWIG" }, { "rank":null, "fullname":"THOMAS ZONTSICH", "affiliation":null, "pid":null, "surname":"ZONTSICH", "name":"THOMAS" }, { "rank":null,
"fullname":"ALEXANDER A. BANKIER", "affiliation":null, "pid":null, "surname":"BANKIER", "name":"ALEXANDER A." } ], "resulttype":{ "classid":"publication", "schemeid":"dnet:result_typologies", "schemename":"dnet:result_typologies", "classname":"publication"}, "title":[ { "qualifier":{ "classid":"main title", "schemeid":"dnet:dataCite_title", "schemename":"dnet:dataCite_title", "classname":"main title" }, "dataInfo":null, "value":"Early Pulmonary Involvement in Ankylosing Spondylitis: Assessment With Thin-section CT" } ], "relevantdate":[ { "qualifier":{ "classid":"created", "schemeid":"dnet:dataCite_date", "schemename":"dnet:dataCite_date", "classname":"created" }, "dataInfo":null, "value":"2002-09-19T13:54:50Z" } ], "dateofacceptance":{ "dataInfo":null, "value":"2002-09-19T13:54:50Z" }, "publisher":{ "dataInfo":null, "value":"Elsevier BV" }, "embargoenddate":null, "fulltext":null, "contributor":null, "resourcetype":{ "classid":"0001", "schemeid":"dnet:dataCite_resource", "schemename":"dnet:dataCite_resource", "classname":"0001"}, "coverage":null, "bestaccessright":null, "externalReference":null, "format":null, "description":[ ], "source":[ { "dataInfo":null, "value":"Crossref" } ], "subject":[ { "qualifier":{ "classid":"keywords", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"keywords" }, "dataInfo":null, "value":"Radiology Nuclear Medicine and imaging" }, { "qualifier":{ "classid":"keywords", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"keywords" }, "dataInfo":null, "value":"General Medicine" } ], "language":null, "instance":[ { "processingchargecurrency":null, "refereed":null, "instancetype":{ "classid":"0001", "schemeid":"dnet:publication_resource", "schemename":"dnet:publication_resource", "classname":"Article" }, "hostedby":null, "distributionlocation":null, "processingchargeamount":null, "license":{ "dataInfo":null, "value":"https://www.elsevier.com/tdm/userlicense/1.0/" }, "accessright":{ "classid":"RESTRICTED", "schemeid":"dnet:access_modes", "schemename":"dnet:access_modes", "classname":"Restricted" }, "dateofacceptance":{ "dataInfo":null, "value":"2002-09-19T13:54:50Z" }, "collectedfrom":{ "dataInfo":null, "value":"Crossref", "key":"10|openaire____::081b82f96300b6a6e3d282bad31cb6e2" }, "url":[ "https://api.elsevier.com/content/article/PII:S0009926000904987?httpAccept=text/xml", "https://api.elsevier.com/content/article/PII:S0009926000904987?httpAccept=text/plain", "http://dx.doi.org/10.1053/crad.2000.0498" ] } ], "context":null, "country":null, "originalId":[ "S0009926000904987", "10.1053/crad.2000.0498" ], "pid":[ { "qualifier":{ "classid":"doi", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"doi" }, "dataInfo":null, "value":"10.1053/crad.2000.0498" } ], "dateofcollection":"2020-02-06T20:40:22Z", "dateoftransformation":null, "oaiprovenance":null, "extraInfo":null, "id":"50|doiboost____::994b7e47b9e225ab6d5e14841cb45a7f", "collectedfrom":[ { "dataInfo":null, "value":"Crossref", "key":"10|openaire____::081b82f96300b6a6e3d282bad31cb6e2" } ], "dataInfo":{ "trust":"0.9", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "lastupdatetimestamp":1581021622595 } +{ "journal":null, "measures":null, 
"author":[ { "rank":null, "fullname":"Dominik Fleischmann", "affiliation":null, "pid":[ { "qualifier":{ "classid":"ORCID", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"ORCID" }, "dataInfo":{ "trust":"0.91", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:crosswalk:entityregistry", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"Harvested"} }, "value":"0000-0003-0715-0952" } ], "surname":"Fleischmann", "name":"Dominik" } ], "resulttype":{ "classid":"publication", "schemeid":"dnet:result_typologies", "schemename":"dnet:result_typologies", "classname":"publication"}, "title":[ ], "relevantdate":[ ], "dateofacceptance":null, "publisher":null, "embargoenddate":null, "fulltext":[ ], "contributor":[ ], "resourcetype":null, "coverage":[ ], "bestaccessright":null, "externalReference":[ ], "format":[ ], "description":null, "source":[ ], "subject":[ ], "language":null, "instance":[ ], "context":[ ], "country":[ ], "originalId":[ ], "pid":[ { "qualifier":{ "classid":"doi", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"doi"}, "dataInfo":null, "value":"10.1053/crad.2000.0498" } ], "dateofcollection":null, "dateoftransformation":null, "oaiprovenance":null, "extraInfo":[ ], "id":"50|doiboost____::994b7e47b9e225ab6d5e14841cb45a7f", "collectedfrom":[ { "dataInfo":null, "value":"ORCID", "key":"10|openaire____::806360c771262b4d6770e7cdf04b5c5a" } ], "dataInfo":{ "trust":"0.9", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "lastupdatetimestamp":null } +{ "journal":{ "dataInfo":null, "conferenceplace":null, "issnPrinted":"0009-9260", "issnOnline":null, "issnLinking":null, "ep":"636", "iss":"8", "sp":"632", "vol":"55", "edition":null, "conferencedate":null, "name":"Clinical Radiology" }, "measures":null, "author":[ { "rank":null, "fullname":"T. Zontsich", "affiliation":[ { "dataInfo":null, "value":"University of Vienna" } ], "pid":[ { "qualifier":{ "classid":"URL", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"URL"}, "dataInfo":null, "value":"https://academic.microsoft.com/#/detail/1966908432" } ], "surname":null, "name":null }, { "rank":null, "fullname":"L Erlacher", "affiliation":[ { "dataInfo":null, "value":"University of Vienna" } ], "pid":[ { "qualifier":{ "classid":"URL", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"URL"}, "dataInfo":null, "value":"https://academic.microsoft.com/#/detail/687931320" } ], "surname":null, "name":null }, { "rank":null, "fullname":"Dominik Fleischmann", "affiliation":[ { "dataInfo":null, "value":"University of Vienna" } ], "pid":[ { "qualifier":{ "classid":"URL", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"URL"}, "dataInfo":null, "value":"https://academic.microsoft.com/#/detail/2156559961" } ], "surname":null, "name":null }, { "rank":null, "fullname":"Alexander A. 
Bankier", "affiliation":[ { "dataInfo":null, "value":"University of Vienna" } ], "pid":[ { "qualifier":{ "classid":"URL", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"URL"}, "dataInfo":null, "value":"https://academic.microsoft.com/#/detail/1107971609" } ], "surname":null, "name":null }, { "rank":null, "fullname":"Patrick Wunderbaldinger", "affiliation":[ { "dataInfo":null, "value":"University of Vienna" } ], "pid":[ { "qualifier":{ "classid":"URL", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"URL" }, "dataInfo":null, "value":"https://academic.microsoft.com/#/detail/2422340537" } ], "surname":null, "name":null }, { "rank":null, "fullname":"Wolfgang Ebner", "affiliation":null, "pid":[ { "qualifier":{ "classid":"URL", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"URL" }, "dataInfo":null, "value":"https://academic.microsoft.com/#/detail/2186462571" } ], "surname":null, "name":null }, { "rank":null, "fullname":"K. Turetschek", "affiliation":[ { "dataInfo":null, "value":"University of Vienna" } ], "pid":[ { "qualifier":{ "classid":"URL", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"URL" }, "dataInfo":null, "value":"https://academic.microsoft.com/#/detail/321765676" } ], "surname":null, "name":null } ], "resulttype":{ "classid":"publication", "schemeid":"dnet:result_typologies", "schemename":"dnet:result_typologies", "classname":"publication" }, "title":[ { "qualifier":{ "classid":"main title", "schemeid":"dnet:dataCite_title", "schemename":"dnet:dataCite_title", "classname":"main title" }, "dataInfo":null, "value":"early pulmonary involvement in ankylosing spondylitis assessment with thin section ct" }, { "qualifier":{ "classid":"alternative title", "schemeid":"dnet:dataCite_title", "schemename":"dnet:dataCite_title", "classname":"alternative title" }, "dataInfo":null, "value":"Early pulmonary involvement in ankylosing spondylitis: assessment with thin-section CT." } ], "relevantdate":null, "dateofacceptance":{ "dataInfo":null, "value":"2000-08-01" }, "publisher":{ "dataInfo":null, "value":"Elsevier" }, "embargoenddate":null, "fulltext":null, "contributor":null, "resourcetype":null, "coverage":null, "bestaccessright":null, "externalReference":null, "format":null, "description":[ { "dataInfo":null, "value":"Abstract AIM: To determine the frequency and the distribution of early pulmonary lesions in patients with ankylosing spondylitis (AS) and a normal chest X-ray on thin-section CT and to correlate the CT findings with the results of pulmonary function tests and clinical data. MATERIALS AND METHODS: Twenty-five patients with clinically proven AS and no history of smoking underwent clinical examinations, pulmonary function tests (PFT), chest radiography, and thin-section CT. Four of 25 patients (16%), who had obvious signs on plain films suggestive of pre-existing disorders unrelated to AS were excluded. RESULTS: Fifteen of 21 patients (71%) had abnormalities on thin-section CT. The most frequent abnormalities were thickening of the interlobular septa in seven of 21 patients (33%), mild bronchial wall thickening in (6/21, 29%), pleural thickening and pleuropulmonary irregularities (both 29%) and linear septal thickening (6/21, 29%). In six patients there were no signs of pleuropulmonary involvement. Eight of 15 patients (53%) with abnormal and four of six patients (67%) with normal CT findings revealed mild restrictive lung function impairment. 
CONCLUSION: Patients with AS but a normal chest radiograph frequently have abnormalities on thin-section CT. As these abnormalities are usually subtle and their extent does not correlate with functional and clinical data, the overall routine impact of thin-section CT in the diagnosis of AS is limited. Turetschek, K , (2000) Clinical Radiology53, 632–636." } ], "source":[ { "dataInfo":null, "value":null } ], "subject":[ { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Complication" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Chest radiograph" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.580897", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"medicine.diagnostic_test" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.580897", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"medicine" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"In patient" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Radiography" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.4582326", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"business.industry" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.4582326", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"business" }, { "qualifier":{ "classid":"MAG", 
"schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Thin section ct" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Respiratory disease" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.49358836", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"medicine.disease" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.49358836", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"medicine" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Ankylosing spondylitis" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.49937168", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"medicine.disease" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.49937168", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"medicine" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Radiology" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.4573571", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, 
"value":"medicine.medical_specialty" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.4573571", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"medicine" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Medicine" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.40295774", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"business.industry" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":{ "trust":"0.40295774", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset" } }, "value":"business" }, { "qualifier":{ "classid":"MAG", "schemeid":"dnet:subject_classification_typologies", "schemename":"dnet:subject_classification_typologies", "classname":"Microsoft Academic Graph classification" }, "dataInfo":null, "value":"Pulmonary function testing" } ], "language":null, "instance":[ { "processingchargecurrency":null, "refereed":null, "instancetype":null, "hostedby":null, "distributionlocation":null, "processingchargeamount":null, "license":null, "accessright":null, "dateofacceptance":null, "collectedfrom":{ "dataInfo":null, "value":"Microsoft Academic Graph", "key":"10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a" }, "url":[ "https://www.ncbi.nlm.nih.gov/pubmed/10964736", "https://www.sciencedirect.com/science/article/pii/S0009926000904987", "https://academic.microsoft.com/#/detail/1990704599" ] } ], "context":null, "country":null, "originalId":[ "1990704599", "10.1053/crad.2000.0498" ], "pid":[ { "qualifier":{ "classid":"doi", "schemeid":"dnet:pid_types", "schemename":"dnet:pid_types", "classname":"doi" }, "dataInfo":null, "value":"10.1053/crad.2000.0498" } ], "dateofcollection":null, "dateoftransformation":null, "oaiprovenance":null, "extraInfo":null, "id":"50|doiboost____::994b7e47b9e225ab6d5e14841cb45a7f", "collectedfrom":[ { "dataInfo":null, "value":"Microsoft Academic Graph", "key":"10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a" } ], "dataInfo":{ "trust":"0.9", "invisible":false, "inferred":false, "deletedbyinference":false, "inferenceprovenance":null, "provenanceaction":{ "classid":"sysimport:actionset", "schemeid":"dnet:provenanceActions", "schemename":"dnet:provenanceActions", "classname":"sysimport:actionset"} }, "lastupdatetimestamp":null } \ No newline at end of file 
diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_1.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_1.json new file mode 100644 index 000000000..e38c4d1cc --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_1.json @@ -0,0 +1 @@ +{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_2.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_2.json new file mode 100644 index 000000000..c880edb7d --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_2.json @@ -0,0 +1,140 @@ +{ + "id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", + "resuttype": {"classid": "dataset"}, + "pid": [ + { + "qualifier": {"classid": "doi"}, + "value": "10.1016/j.cmet.2011.03.013" + }, + { + "qualifier": {"classid": "urn"}, + "value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2" + }, + { + "qualifier": {"classid": "scp-number"}, + "value": "79953761260" + }, + { + "qualifier": {"classid": "pmc"}, + "value": "21459329" + } + ], + "collectedfrom": [ + { + "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", + "value": "Repository B" + } + ], + "instance": [ + { + "refereed": { + "classid": "0000", + "classname": "UNKNOWN", + "schemeid": "dnet:review_levels", + "schemename": "dnet:review_levels" + }, + "hostedby": { + "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69", + "value": "Zenodo" + }, + "accessright": { + "classid": "OPEN", + "classname": "Open Access", + "schemeid": "dnet:access_modes", + "schemename": "dnet:access_modes" + }, + "processingchargecurrency": { + "dataInfo": { + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "Harvested", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "deletedbyinference": false, + "inferred": false, + "inferenceprovenance": "", + "invisible": true, + "trust": "0.9" + }, + "value": "EUR" + }, + "pid": [ + { + "dataInfo": { + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "Harvested", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "deletedbyinference": false, + "inferred": false, + "inferenceprovenance": "", + "invisible": true, + "trust": "0.9" + }, + "qualifier": { + "classid": "doi", + "classname": "Digital Object Identifier", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1371/journal.pone.0085605" + } + ], + "distributionlocation": "", + "url": ["https://doi.org/10.1371/journal.pone.0085605"], + "alternateIdentifier": [ + { + "dataInfo": { + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "Harvested", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "deletedbyinference": false, + "inferred": false, + "inferenceprovenance": "", + "invisible": true, + "trust": "0.9" + }, + 
"qualifier": { + "classid": "pmid", + "classname": "PubMed ID", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "24454899.0" + } + ], + "collectedfrom": { + "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", + "value": "Repository B" + }, + "processingchargeamount": { + "dataInfo": { + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "Harvested", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "deletedbyinference": false, + "inferred": false, + "inferenceprovenance": "", + "invisible": true, + "trust": "0.9" + }, + "value": "1022.02" + }, + "instancetype": { + "classid": "0004", + "classname": "Conference object", + "schemeid": "dnet:publication_resource", + "schemename": "dnet:publication_resource" + } + } + ] +} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_delegated.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_delegated.json new file mode 100644 index 000000000..967c1181b --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_delegated.json @@ -0,0 +1,140 @@ +{ + "id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", + "resuttype": {"classid": "dataset"}, + "pid": [ + { + "qualifier": {"classid": "doi"}, + "value": "10.1016/j.cmet.2011.03.013" + }, + { + "qualifier": {"classid": "urn"}, + "value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2" + }, + { + "qualifier": {"classid": "scp-number"}, + "value": "79953761260" + }, + { + "qualifier": {"classid": "pmc"}, + "value": "21459329" + } + ], + "collectedfrom": [ + { + "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69", + "value": "Zenodo" + } + ], + "instance": [ + { + "refereed": { + "classid": "0000", + "classname": "UNKNOWN", + "schemeid": "dnet:review_levels", + "schemename": "dnet:review_levels" + }, + "hostedby": { + "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69", + "value": "Zenodo" + }, + "accessright": { + "classid": "OPEN", + "classname": "Open Access", + "schemeid": "dnet:access_modes", + "schemename": "dnet:access_modes" + }, + "processingchargecurrency": { + "dataInfo": { + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "Harvested", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "deletedbyinference": false, + "inferred": false, + "inferenceprovenance": "", + "invisible": true, + "trust": "0.9" + }, + "value": "EUR" + }, + "pid": [ + { + "dataInfo": { + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "Harvested", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "deletedbyinference": false, + "inferred": false, + "inferenceprovenance": "", + "invisible": true, + "trust": "0.9" + }, + "qualifier": { + "classid": "doi", + "classname": "Digital Object Identifier", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "10.1371/journal.pone.0085605" + } + ], + "distributionlocation": "", + "url": ["https://doi.org/10.1371/journal.pone.0085605"], + "alternateIdentifier": [ + { + "dataInfo": { + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "Harvested", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "deletedbyinference": false, + "inferred": false, + "inferenceprovenance": "", + 
"invisible": true, + "trust": "0.9" + }, + "qualifier": { + "classid": "pmid", + "classname": "PubMed ID", + "schemeid": "dnet:pid_types", + "schemename": "dnet:pid_types" + }, + "value": "24454899.0" + } + ], + "collectedfrom": { + "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69", + "value": "Zenodo" + }, + "processingchargeamount": { + "dataInfo": { + "provenanceaction": { + "classid": "sysimport:crosswalk:datasetarchive", + "classname": "Harvested", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "deletedbyinference": false, + "inferred": false, + "inferenceprovenance": "", + "invisible": true, + "trust": "0.9" + }, + "value": "1022.02" + }, + "instancetype": { + "classid": "0004", + "classname": "Conference object", + "schemeid": "dnet:publication_resource", + "schemename": "dnet:publication_resource" + } + } + ] +} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_1.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_1.json new file mode 100644 index 000000000..704c5ad4d --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_1.json @@ -0,0 +1 @@ +{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_2.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_2.json new file mode 100644 index 000000000..a1744e84e --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_2.json @@ -0,0 +1 @@ +{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository A"} ]} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_3.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_3.json deleted file mode 100644 index 6d33568f4..000000000 --- a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_3.json +++ /dev/null @@ -1 +0,0 @@ -{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"scp-number"},"value":"79953761260"}]} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_4.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_4.json deleted file mode 100644 index 6617fe15f..000000000 --- a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_4.json +++ /dev/null @@ -1 +0,0 @@ 
-{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[]} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_5.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_5.json deleted file mode 100644 index 700a10046..000000000 --- a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_5.json +++ /dev/null @@ -1 +0,0 @@ -{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi.json deleted file mode 100644 index 6865d0ff6..000000000 --- a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi.json +++ /dev/null @@ -1 +0,0 @@ -{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_pmc.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_pmc.json deleted file mode 100644 index 3e4ba2246..000000000 --- a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_pmc.json +++ /dev/null @@ -1 +0,0 @@ -{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_urn.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_urn.json deleted file mode 100644 index 83e14e48d..000000000 --- a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_urn.json +++ /dev/null @@ -1 +0,0 @@ -{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmcid"},"value":"21459329"}]} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/transform/synonyms.txt b/dhp-common/src/test/resources/eu/dnetlib/dhp/transform/synonyms.txt new file mode 100644 index 000000000..729296522 --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/transform/synonyms.txt @@ -0,0 +1,1234 @@ +dnet:access_modes @=@ CLOSED @=@ http://purl.org/coar/access_right/c_14cb +dnet:access_modes @=@ CLOSED @=@ info:eu-repo/semantics/closedAccess +dnet:access_modes @=@ EMBARGO @=@ http://purl.org/coar/access_right/c_f1cf +dnet:access_modes @=@ EMBARGO @=@ info:eu-repo/semantics/embargoedAccess +dnet:access_modes @=@ OPEN @=@ Creative Commons License [CC BY-NC-ND] http://creativecommons.org/licenses/by-nc-nd/3.0/de/ +dnet:access_modes @=@ OPEN @=@ Creative commons +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by-nc-nd/3.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by-nc/3.0/ +dnet:access_modes @=@ OPEN @=@ 
http://creativecommons.org/licenses/by-sa/3.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by-sa/4.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by/3.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by/3.0/us/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/licenses/by/4.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/publicdomain/zero/1.0/ +dnet:access_modes @=@ OPEN @=@ http://creativecommons.org/publicdomain/zero/1.0/ & http://www.canadensys.net/norms +dnet:access_modes @=@ OPEN @=@ http://purl.org/coar/access_right/c_abf2 +dnet:access_modes @=@ OPEN @=@ https://creativecommons.org/licenses/by-nc/4.0/ +dnet:access_modes @=@ OPEN @=@ info:eu-repo/semantics/openAccess +dnet:access_modes @=@ OPEN @=@ open_access +dnet:access_modes @=@ RESTRICTED @=@ http://purl.org/coar/access_right/c_16ec +dnet:access_modes @=@ RESTRICTED @=@ info:eu-repo/semantics/restrictedAccess +dnet:compatibilityLevel @=@ openaire-pub_4.0 @=@ openaire4.0 +dnet:subject_classification_typologies @=@ jel @=@ jelElement +dnet:publication_resource @=@ 0018 @=@ Comment/debate +dnet:publication_resource @=@ 0018 @=@ http://purl.org/coar/resource_type/c_1162 +dnet:publication_resource @=@ 0018 @=@ info:eu-repo/semantics/annotation +dnet:publication_resource @=@ 0001 @=@ A1 Alkuperäisartikkeli tieteellisessä aikakauslehdessä +dnet:publication_resource @=@ 0001 @=@ Article +dnet:publication_resource @=@ 0001 @=@ Article (author) +dnet:publication_resource @=@ 0001 @=@ Article - letter to the editor +dnet:publication_resource @=@ 0001 @=@ Article / Letter to editor +dnet:publication_resource @=@ 0001 @=@ Article / Letter to the editor +dnet:publication_resource @=@ 0001 @=@ Article / Newspaper +dnet:publication_resource @=@ 0001 @=@ Article in journal +dnet:publication_resource @=@ 0001 @=@ Article in monograph or in proceedings +dnet:publication_resource @=@ 0001 @=@ Article in proceedings +dnet:publication_resource @=@ 0001 @=@ Article-letter to the editor +dnet:publication_resource @=@ 0001 @=@ Article/Letter to editor +dnet:publication_resource @=@ 0001 @=@ Articolo +dnet:publication_resource @=@ 0001 @=@ Artículo +dnet:publication_resource @=@ 0001 @=@ Aufsatz +dnet:publication_resource @=@ 0001 @=@ Clinical Study +dnet:publication_resource @=@ 0001 @=@ Institutional Series +dnet:publication_resource @=@ 0001 @=@ International Journal +dnet:publication_resource @=@ 0001 @=@ International Journal Abstract +dnet:publication_resource @=@ 0001 @=@ International Journal ISI/JCR +dnet:publication_resource @=@ 0001 @=@ Journal (full / special issue) +dnet:publication_resource @=@ 0001 @=@ Journal Article/Review +dnet:publication_resource @=@ 0001 @=@ Journal article +dnet:publication_resource @=@ 0001 @=@ Journal article (on-line or printed) +dnet:publication_resource @=@ 0001 @=@ Journal articles +dnet:publication_resource @=@ 0001 @=@ Journal paper +dnet:publication_resource @=@ 0001 @=@ National Journal +dnet:publication_resource @=@ 0001 @=@ Original article (non peer-reviewed) +dnet:publication_resource @=@ 0001 @=@ Original article (peer-reviewed) +dnet:publication_resource @=@ 0001 @=@ Peer-reviewed Article +dnet:publication_resource @=@ 0001 @=@ Published Journal Article +dnet:publication_resource @=@ 0001 @=@ Research Article +dnet:publication_resource @=@ 0001 @=@ Review article (non peer-reviewed) +dnet:publication_resource @=@ 0001 @=@ Review article (peer-reviewed) +dnet:publication_resource @=@ 0001 @=@ Volumes Edited / 
Special Issues +dnet:publication_resource @=@ 0001 @=@ article in non peer-reviewed journal +dnet:publication_resource @=@ 0001 @=@ article in peer-reviewed journal +dnet:publication_resource @=@ 0001 @=@ article-commentary +dnet:publication_resource @=@ 0001 @=@ article_site_web +dnet:publication_resource @=@ 0001 @=@ doc-type:Journal Article +dnet:publication_resource @=@ 0001 @=@ doc-type:article +dnet:publication_resource @=@ 0001 @=@ http://purl.org/coar/resource_type/c_2df8fbb1 +dnet:publication_resource @=@ 0001 @=@ http://purl.org/coar/resource_type/c_545b +dnet:publication_resource @=@ 0001 @=@ http://purl.org/coar/resource_type/c_6501 +dnet:publication_resource @=@ 0001 @=@ http://purl.org/coar/resource_type/c_7877 +dnet:publication_resource @=@ 0001 @=@ in-brief +dnet:publication_resource @=@ 0001 @=@ info:eu-repo/semantics/article +dnet:publication_resource @=@ 0001 @=@ journal-article +dnet:publication_resource @=@ 0001 @=@ journalArticle +dnet:publication_resource @=@ 0001 @=@ journal_article +dnet:publication_resource @=@ 0001 @=@ letter +dnet:publication_resource @=@ 0001 @=@ non peer-reviewed article +dnet:publication_resource @=@ 0001 @=@ partial-retraction +dnet:publication_resource @=@ 0001 @=@ proceeding with peer review +dnet:publication_resource @=@ 0001 @=@ publication-article +dnet:publication_resource @=@ 0001 @=@ rapid-communication +dnet:publication_resource @=@ 0001 @=@ reply +dnet:publication_resource @=@ 0001 @=@ research-article +dnet:publication_resource @=@ 0001 @=@ retraction +dnet:publication_resource @=@ 0001 @=@ review-article +dnet:publication_resource @=@ 0001 @=@ text (article) +dnet:publication_resource @=@ 0001 @=@ Статья +dnet:publication_resource @=@ 0001 @=@ ArticleArtikel +dnet:publication_resource @=@ 0033 @=@ AUDIOVISUAL_DOCUMENT +dnet:publication_resource @=@ 0033 @=@ Audiovisual/Audiovisual +dnet:publication_resource @=@ 0033 @=@ http://purl.org/coar/resource_type/c_c513 +dnet:publication_resource @=@ 0008 @=@ Bachelor's +dnet:publication_resource @=@ 0008 @=@ Bachelor's Degree +dnet:publication_resource @=@ 0008 @=@ Bachelors Thesis +dnet:publication_resource @=@ 0008 @=@ Proyecto fin de carrera +dnet:publication_resource @=@ 0008 @=@ Undergraduate Thesis +dnet:publication_resource @=@ 0008 @=@ http://purl.org/coar/resource_type/c_7a1f +dnet:publication_resource @=@ 0008 @=@ info:eu-repo/semantics/bachelorThesis +dnet:publication_resource @=@ 0008 @=@ выпускная бакалаврская работа +dnet:publication_resource @=@ 0002 @=@ Book (monograph) +dnet:publication_resource @=@ 0002 @=@ Book (non peer-reviewed) +dnet:publication_resource @=@ 0002 @=@ Book (peer-reviewed) +dnet:publication_resource @=@ 0002 @=@ Book - monograph - editorial book +dnet:publication_resource @=@ 0002 @=@ Book Section +dnet:publication_resource @=@ 0002 @=@ Book as author +dnet:publication_resource @=@ 0002 @=@ Buch +dnet:publication_resource @=@ 0002 @=@ International Book/Monograph +dnet:publication_resource @=@ 0002 @=@ Libro +dnet:publication_resource @=@ 0002 @=@ Monografia +dnet:publication_resource @=@ 0002 @=@ Monograph +dnet:publication_resource @=@ 0002 @=@ National Book/Monograph +dnet:publication_resource @=@ 0002 @=@ atlas +dnet:publication_resource @=@ 0002 @=@ book +dnet:publication_resource @=@ 0002 @=@ book-series +dnet:publication_resource @=@ 0002 @=@ book-set +dnet:publication_resource @=@ 0002 @=@ book-track +dnet:publication_resource @=@ 0002 @=@ book_series +dnet:publication_resource @=@ 0002 @=@ book_title +dnet:publication_resource @=@ 0002 @=@ 
doc-type:book +dnet:publication_resource @=@ 0002 @=@ edited-book +dnet:publication_resource @=@ 0002 @=@ http://purl.org/coar/resource_type/c_2f33 +dnet:publication_resource @=@ 0002 @=@ info:eu-repo/semantics/book +dnet:publication_resource @=@ 0002 @=@ ouvrage +dnet:publication_resource @=@ 0002 @=@ publication-book +dnet:publication_resource @=@ 0002 @=@ reference-book +dnet:publication_resource @=@ 0002 @=@ scientific book +dnet:publication_resource @=@ 0002 @=@ Монография +dnet:publication_resource @=@ 0002 @=@ Учебник +dnet:publication_resource @=@ 0037 @=@ clinicalTrial +dnet:publication_resource @=@ 0037 @=@ http://purl.org/coar/resource_type/c_cb28 +dnet:publication_resource @=@ 0022 @=@ collection +dnet:publication_resource @=@ 0004 @=@ A4 Artikkeli konferenssijulkaisussa +dnet:publication_resource @=@ 0004 @=@ Comunicación de congreso +dnet:publication_resource @=@ 0004 @=@ Conference Paper +dnet:publication_resource @=@ 0004 @=@ Conference Paper/Proceeding/Abstract +dnet:publication_resource @=@ 0004 @=@ Conference Proceedings +dnet:publication_resource @=@ 0004 @=@ Conference article +dnet:publication_resource @=@ 0004 @=@ Conference contribution +dnet:publication_resource @=@ 0004 @=@ Conference lecture +dnet:publication_resource @=@ 0004 @=@ Conference or Workshop Item +dnet:publication_resource @=@ 0004 @=@ Conference paper, poster, etc. +dnet:publication_resource @=@ 0004 @=@ Conference papers +dnet:publication_resource @=@ 0004 @=@ Conference report +dnet:publication_resource @=@ 0004 @=@ International Conference +dnet:publication_resource @=@ 0004 @=@ International Conference Abstract/Poster +dnet:publication_resource @=@ 0004 @=@ International Conference ISI/JCR +dnet:publication_resource @=@ 0004 @=@ International Conference communication/abstract/poster +dnet:publication_resource @=@ 0004 @=@ National Conference +dnet:publication_resource @=@ 0004 @=@ National Conference Abstract/Poster +dnet:publication_resource @=@ 0004 @=@ National Conference communication/abstract/poster +dnet:publication_resource @=@ 0004 @=@ PREFACE_PROCEEDINGS +dnet:publication_resource @=@ 0004 @=@ PROCEEDING_PAPER +dnet:publication_resource @=@ 0004 @=@ Papers in Conference Proceedings +dnet:publication_resource @=@ 0004 @=@ Presentación +dnet:publication_resource @=@ 0004 @=@ Proceedings (peer-reviewed) +dnet:publication_resource @=@ 0004 @=@ Proceedings of a Conference +dnet:publication_resource @=@ 0004 @=@ Proceedings paper +dnet:publication_resource @=@ 0004 @=@ Póster +dnet:publication_resource @=@ 0004 @=@ actes_congres +dnet:publication_resource @=@ 0004 @=@ communication_avec_actes +dnet:publication_resource @=@ 0004 @=@ communication_invitee +dnet:publication_resource @=@ 0004 @=@ communication_par_affiche +dnet:publication_resource @=@ 0004 @=@ communication_sans_actes +dnet:publication_resource @=@ 0004 @=@ conference +dnet:publication_resource @=@ 0004 @=@ conference item +dnet:publication_resource @=@ 0004 @=@ conference proceeding +dnet:publication_resource @=@ 0004 @=@ conferenceObject +dnet:publication_resource @=@ 0004 @=@ conference_paper +dnet:publication_resource @=@ 0004 @=@ doc-type:conferenceObject +dnet:publication_resource @=@ 0004 @=@ http://purl.org/coar/resource_type/c_18co +dnet:publication_resource @=@ 0004 @=@ http://purl.org/coar/resource_type/c_18cp +dnet:publication_resource @=@ 0004 @=@ http://purl.org/coar/resource_type/c_5794 +dnet:publication_resource @=@ 0004 @=@ http://purl.org/coar/resource_type/c_6670 +dnet:publication_resource @=@ 0004 @=@ 
http://purl.org/coar/resource_type/c_c94f +dnet:publication_resource @=@ 0004 @=@ http://purl.org/coar/resource_type/c_f744 +dnet:publication_resource @=@ 0004 @=@ info:eu-repo/semantics/conferenceItem +dnet:publication_resource @=@ 0004 @=@ info:eu-repo/semantics/conferenceObject +dnet:publication_resource @=@ 0004 @=@ invited conference talk +dnet:publication_resource @=@ 0004 @=@ poster +dnet:publication_resource @=@ 0004 @=@ presentation +dnet:publication_resource @=@ 0004 @=@ proceeding, seminar, workshop without peer review +dnet:publication_resource @=@ 0004 @=@ proceedings +dnet:publication_resource @=@ 0004 @=@ proceedings-article +dnet:publication_resource @=@ 0004 @=@ publication-conferencepaper +dnet:publication_resource @=@ 0004 @=@ научный доклад +dnet:publication_resource @=@ 0005 @=@ Newspaper or magazine article +dnet:publication_resource @=@ 0005 @=@ http://purl.org/coar/resource_type/c_998f +dnet:publication_resource @=@ 0005 @=@ info:eu-repo/semantics/contributionToPeriodical +dnet:publication_resource @=@ 0045 @=@ Data Management Plan +dnet:publication_resource @=@ 0045 @=@ Data Management Plan (NSF Generic) +dnet:publication_resource @=@ 0045 @=@ http://purl.org/coar/resource_type/c_ab20 +dnet:publication_resource @=@ 0045 @=@ http://purl.org/spar/fabio/DataManagementPolicy +dnet:publication_resource @=@ 0045 @=@ http://purl.org/spar/fabio/DataManagementPolicyDocument +dnet:publication_resource @=@ 0045 @=@ http://purl.org/spar/fabio/DataMangementPlan +dnet:publication_resource @=@ 0045 @=@ plan de gestión de datos +dnet:publication_resource @=@ 0045 @=@ publication-datamanagementplan +dnet:publication_resource @=@ 0031 @=@ Data Descriptor +dnet:publication_resource @=@ 0031 @=@ DataPaper +dnet:publication_resource @=@ 0031 @=@ data-article +dnet:publication_resource @=@ 0031 @=@ http://purl.org/coar/resource_type/c_beb9 +dnet:publication_resource @=@ 0021 @=@ Dataset/Dataset +dnet:publication_resource @=@ 0021 @=@ Research Data +dnet:publication_resource @=@ 0021 @=@ dataset +dnet:publication_resource @=@ 0021 @=@ http://purl.org/coar/resource_type/c_ddb1 +dnet:publication_resource @=@ 0021 @=@ info:eu-repo/semantics/DDIInstance +dnet:publication_resource @=@ 0021 @=@ info:eu-repo/semantics/datafile +dnet:publication_resource @=@ 0021 @=@ info:eu-repo/semantics/dataset +dnet:publication_resource @=@ 0021 @=@ info:eu-repo/semantics/enhancedObjectFile +dnet:publication_resource @=@ 0006 @=@ Diss +dnet:publication_resource @=@ 0006 @=@ Dissertation +dnet:publication_resource @=@ 0006 @=@ Doctoral +dnet:publication_resource @=@ 0006 @=@ DoctoralThesis +dnet:publication_resource @=@ 0006 @=@ PhD thesis +dnet:publication_resource @=@ 0006 @=@ Tesis +dnet:publication_resource @=@ 0006 @=@ Text.Thesis.Doctoral +dnet:publication_resource @=@ 0006 @=@ Theses +dnet:publication_resource @=@ 0006 @=@ Thesis +dnet:publication_resource @=@ 0006 @=@ Thesis or Dissertation +dnet:publication_resource @=@ 0006 @=@ Thesis.Doctoral +dnet:publication_resource @=@ 0006 @=@ doc-type:doctoralThesis +dnet:publication_resource @=@ 0006 @=@ http://purl.org/coar/resource_type/c_db06 +dnet:publication_resource @=@ 0006 @=@ info:eu-repo/semantics/doctoralThesis +dnet:publication_resource @=@ 0006 @=@ publication-thesis +dnet:publication_resource @=@ 0006 @=@ these +dnet:publication_resource @=@ 0006 @=@ these exercice +dnet:publication_resource @=@ 0023 @=@ Event/Event +dnet:publication_resource @=@ 0023 @=@ event +dnet:publication_resource @=@ 0009 @=@ Departmental Technical Report 
+dnet:publication_resource @=@ 0009 @=@ Informe Técnico +dnet:publication_resource @=@ 0009 @=@ RESEARCH_REPORT +dnet:publication_resource @=@ 0009 @=@ Tech-Report +dnet:publication_resource @=@ 0009 @=@ Technical Report +dnet:publication_resource @=@ 0009 @=@ http://purl.org/coar/resource_type/c_18gh +dnet:publication_resource @=@ 0009 @=@ publication-technicalnote +dnet:publication_resource @=@ 0009 @=@ research report +dnet:publication_resource @=@ 0024 @=@ Video +dnet:publication_resource @=@ 0024 @=@ film +dnet:publication_resource @=@ 0024 @=@ http://purl.org/coar/resource_type/c_12ce +dnet:publication_resource @=@ 0024 @=@ http://purl.org/coar/resource_type/c_8a7e +dnet:publication_resource @=@ 0025 @=@ Diagram +dnet:publication_resource @=@ 0025 @=@ Drawing +dnet:publication_resource @=@ 0025 @=@ Figure +dnet:publication_resource @=@ 0025 @=@ Image/Image +dnet:publication_resource @=@ 0025 @=@ Imagen +dnet:publication_resource @=@ 0025 @=@ Photo +dnet:publication_resource @=@ 0025 @=@ Plot +dnet:publication_resource @=@ 0025 @=@ fotó +dnet:publication_resource @=@ 0025 @=@ grafika +dnet:publication_resource @=@ 0025 @=@ http://purl.org/coar/resource_type/c_ecc8 +dnet:publication_resource @=@ 0025 @=@ image +dnet:publication_resource @=@ 0025 @=@ image-diagram +dnet:publication_resource @=@ 0025 @=@ image-drawing +dnet:publication_resource @=@ 0025 @=@ image-figure +dnet:publication_resource @=@ 0025 @=@ image-other +dnet:publication_resource @=@ 0025 @=@ image-photo +dnet:publication_resource @=@ 0025 @=@ image-plot +dnet:publication_resource @=@ 0026 @=@ http://purl.org/coar/resource_type/c_e9a0 +dnet:publication_resource @=@ 0026 @=@ interactiveResource +dnet:publication_resource @=@ 0011 @=@ Internal note +dnet:publication_resource @=@ 0011 @=@ http://purl.org/coar/resource_type/c_18ww +dnet:publication_resource @=@ 0043 @=@ http://purl.org/coar/resource_type/c_0640 +dnet:publication_resource @=@ 0010 @=@ Inaugural lecture +dnet:publication_resource @=@ 0010 @=@ Material didáctico +dnet:publication_resource @=@ 0010 @=@ Public-Lecture +dnet:publication_resource @=@ 0010 @=@ http://purl.org/coar/resource_type/c_8544 +dnet:publication_resource @=@ 0010 @=@ info:eu-repo/semantics/lecture +dnet:publication_resource @=@ 0010 @=@ lesson +dnet:publication_resource @=@ 0010 @=@ Учебный материал +dnet:publication_resource @=@ 0007 @=@ Diploma Project +dnet:publication_resource @=@ 0007 @=@ MSc Thesis +dnet:publication_resource @=@ 0007 @=@ Master Degree +dnet:publication_resource @=@ 0007 @=@ Master's +dnet:publication_resource @=@ 0007 @=@ Masterarbeit u.a. 
+dnet:publication_resource @=@ 0007 @=@ Masters (Taught) +dnet:publication_resource @=@ 0007 @=@ Masters thesis +dnet:publication_resource @=@ 0007 @=@ Masters-Thesis.Magister +dnet:publication_resource @=@ 0007 @=@ Tesina +dnet:publication_resource @=@ 0007 @=@ Thesis.Master +dnet:publication_resource @=@ 0007 @=@ Trabajo fin de Máster +dnet:publication_resource @=@ 0007 @=@ doc-type:masterThesis +dnet:publication_resource @=@ 0007 @=@ hdr +dnet:publication_resource @=@ 0007 @=@ http://purl.org/coar/resource_type/c_bdcc +dnet:publication_resource @=@ 0007 @=@ info:eu-repo/semantics/masterThesis +dnet:publication_resource @=@ 0007 @=@ masterThesis +dnet:publication_resource @=@ 0007 @=@ memoire +dnet:publication_resource @=@ 0027 @=@ Model/Model +dnet:publication_resource @=@ 0027 @=@ model +dnet:publication_resource @=@ 0020 @=@ Exhibition +dnet:publication_resource @=@ 0020 @=@ Learning Object +dnet:publication_resource @=@ 0020 @=@ Mapa +dnet:publication_resource @=@ 0020 @=@ Modelo de utilidad +dnet:publication_resource @=@ 0020 @=@ PEDAGOGICAL_DOCUMENT +dnet:publication_resource @=@ 0020 @=@ Partitura +dnet:publication_resource @=@ 0020 @=@ Sitio web +dnet:publication_resource @=@ 0020 @=@ Trabajo de divulgación +dnet:publication_resource @=@ 0020 @=@ Web publication/site +dnet:publication_resource @=@ 0020 @=@ application +dnet:publication_resource @=@ 0020 @=@ artefact +dnet:publication_resource @=@ 0020 @=@ carte +dnet:publication_resource @=@ 0020 @=@ composition +dnet:publication_resource @=@ 0020 @=@ document_audiovisuel +dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_12cc +dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_12cd +dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_1843 +dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_18cd +dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_18cw +dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_26e4 +dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_7ad9 +dnet:publication_resource @=@ 0020 @=@ http://purl.org/coar/resource_type/c_e059 +dnet:publication_resource @=@ 0020 @=@ info:eu-repo/semantics/other +dnet:publication_resource @=@ 0020 @=@ learningObject +dnet:publication_resource @=@ 0020 @=@ map +dnet:publication_resource @=@ 0020 @=@ misc +dnet:publication_resource @=@ 0020 @=@ other +dnet:publication_resource @=@ 0020 @=@ revue +dnet:publication_resource @=@ 0038 @=@ Abstract +dnet:publication_resource @=@ 0038 @=@ Blog +dnet:publication_resource @=@ 0038 @=@ Book Prospectus +dnet:publication_resource @=@ 0038 @=@ Dictionary Entry +dnet:publication_resource @=@ 0038 @=@ Disclosure +dnet:publication_resource @=@ 0038 @=@ Editorial +dnet:publication_resource @=@ 0038 @=@ Editorial ISI/JCR +dnet:publication_resource @=@ 0038 @=@ Editors +dnet:publication_resource @=@ 0038 @=@ Editors (non peer-reviewed) +dnet:publication_resource @=@ 0038 @=@ Editors (peer-reviewed) +dnet:publication_resource @=@ 0038 @=@ Encyclopedia Entry +dnet:publication_resource @=@ 0038 @=@ Entrada de blog +dnet:publication_resource @=@ 0038 @=@ Funding Submission +dnet:publication_resource @=@ 0038 @=@ HabilitationThesis +dnet:publication_resource @=@ 0038 @=@ License +dnet:publication_resource @=@ 0038 @=@ Manual +dnet:publication_resource @=@ 0038 @=@ Manuscript +dnet:publication_resource @=@ 0038 @=@ Manuscrito +dnet:publication_resource @=@ 0038 @=@ Other publication 
(non peer-review) +dnet:publication_resource @=@ 0038 @=@ Other publication (peer-review) +dnet:publication_resource @=@ 0038 @=@ Revista +dnet:publication_resource @=@ 0038 @=@ Supervised Student Publication +dnet:publication_resource @=@ 0038 @=@ Tesis/trabajos de grado – Thesis +dnet:publication_resource @=@ 0038 @=@ Text +dnet:publication_resource @=@ 0038 @=@ Text/Text +dnet:publication_resource @=@ 0038 @=@ Trademark +dnet:publication_resource @=@ 0038 @=@ Translation +dnet:publication_resource @=@ 0038 @=@ afterword +dnet:publication_resource @=@ 0038 @=@ avantpropos +dnet:publication_resource @=@ 0038 @=@ bibliography +dnet:publication_resource @=@ 0038 @=@ chronique +dnet:publication_resource @=@ 0038 @=@ compte rendu +dnet:publication_resource @=@ 0038 @=@ correction +dnet:publication_resource @=@ 0038 @=@ foreword +dnet:publication_resource @=@ 0038 @=@ habilitation à diriger des recherches +dnet:publication_resource @=@ 0038 @=@ historicalDocument +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_0040 +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_0857 +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_18cf +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_18wz +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_3e5a +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_46ec +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_6947 +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_7acd +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_86bc +dnet:publication_resource @=@ 0038 @=@ http://purl.org/coar/resource_type/c_b239 +dnet:publication_resource @=@ 0038 @=@ note de lecture +dnet:publication_resource @=@ 0038 @=@ notedelecture +dnet:publication_resource @=@ 0038 @=@ other publication +dnet:publication_resource @=@ 0038 @=@ postface +dnet:publication_resource @=@ 0038 @=@ publication-other +dnet:publication_resource @=@ 0038 @=@ revuedepresse +dnet:publication_resource @=@ 0038 @=@ sa_component +dnet:publication_resource @=@ 0038 @=@ standard +dnet:publication_resource @=@ 0038 @=@ standard-series +dnet:publication_resource @=@ 0013 @=@ A3 Kirjan tai muun kokoomateoksen osa +dnet:publication_resource @=@ 0013 @=@ Book Part (author) +dnet:publication_resource @=@ 0013 @=@ Book Section / Chapter +dnet:publication_resource @=@ 0013 @=@ Book chapter or Essay in book +dnet:publication_resource @=@ 0013 @=@ Book editorial +dnet:publication_resource @=@ 0013 @=@ Book section +dnet:publication_resource @=@ 0013 @=@ Book_Chapter +dnet:publication_resource @=@ 0013 @=@ Buchbeitrag +dnet:publication_resource @=@ 0013 @=@ Capítulo de libro +dnet:publication_resource @=@ 0013 @=@ Contribution to International Book/Monograph +dnet:publication_resource @=@ 0013 @=@ Contribution to International Book/Monograph ISI/JCR +dnet:publication_resource @=@ 0013 @=@ Contribution to National Book/Monograph +dnet:publication_resource @=@ 0013 @=@ Contribution to book (non peer-reviewed) +dnet:publication_resource @=@ 0013 @=@ Contribution to book (peer-reviewed) +dnet:publication_resource @=@ 0013 @=@ Part of book - chapter +dnet:publication_resource @=@ 0013 @=@ book chapter +dnet:publication_resource @=@ 0013 @=@ book-part +dnet:publication_resource @=@ 0013 @=@ bookPart +dnet:publication_resource @=@ 0013 @=@ book_content +dnet:publication_resource @=@ 0013 @=@ 
chapitre_ouvrage +dnet:publication_resource @=@ 0013 @=@ chapter +dnet:publication_resource @=@ 0013 @=@ doc-type:bookPart +dnet:publication_resource @=@ 0013 @=@ http://purl.org/coar/resource_type/c_3248 +dnet:publication_resource @=@ 0013 @=@ info:eu-repo/semantics/bookPart +dnet:publication_resource @=@ 0013 @=@ publication-section +dnet:publication_resource @=@ 0013 @=@ reference-entry +dnet:publication_resource @=@ 0013 @=@ reference_entry +dnet:publication_resource @=@ 0013 @=@ scientific book chapter +dnet:publication_resource @=@ 0013 @=@ Глава монографии +dnet:publication_resource @=@ 0019 @=@ H1 Myönnetty patentti +dnet:publication_resource @=@ 0019 @=@ Patent +dnet:publication_resource @=@ 0019 @=@ Patente +dnet:publication_resource @=@ 0019 @=@ Solicitud de patente +dnet:publication_resource @=@ 0019 @=@ Traducción de patente +dnet:publication_resource @=@ 0019 @=@ brevet +dnet:publication_resource @=@ 0019 @=@ http://purl.org/coar/resource_type/c_15cd +dnet:publication_resource @=@ 0019 @=@ info:eu-repo/semantics/patent +dnet:publication_resource @=@ 0019 @=@ publication-patent +dnet:publication_resource @=@ 0028 @=@ Service +dnet:publication_resource @=@ 0028 @=@ physicalObject +dnet:publication_resource @=@ 0016 @=@ Pre Print +dnet:publication_resource @=@ 0016 @=@ Pre-print +dnet:publication_resource @=@ 0016 @=@ http://purl.org/coar/resource_type/c_816b +dnet:publication_resource @=@ 0016 @=@ info:eu-repo/semantics/preprint +dnet:publication_resource @=@ 0016 @=@ publication-preprint +dnet:publication_resource @=@ 0016 @=@ Препринт +dnet:publication_resource @=@ 0034 @=@ Project deliverable +dnet:publication_resource @=@ 0034 @=@ http://purl.org/coar/resource_type/c_18op +dnet:publication_resource @=@ 0034 @=@ publication-deliverable +dnet:publication_resource @=@ 0035 @=@ Project milestone +dnet:publication_resource @=@ 0035 @=@ publication-milestone +dnet:publication_resource @=@ 0036 @=@ Proposal +dnet:publication_resource @=@ 0036 @=@ http://purl.org/coar/resource_type/c_baaf +dnet:publication_resource @=@ 0036 @=@ research-proposal +dnet:publication_resource @=@ 0017 @=@ ACTIVITY_REPORT +dnet:publication_resource @=@ 0017 @=@ Commissioned report +dnet:publication_resource @=@ 0017 @=@ D4 Julkaistu kehittämis- tai tutkimusraportti tai -selvitys +dnet:publication_resource @=@ 0017 @=@ Deliverable +dnet:publication_resource @=@ 0017 @=@ Documento tecnico +dnet:publication_resource @=@ 0017 @=@ Project Report +dnet:publication_resource @=@ 0017 @=@ Software documentation +dnet:publication_resource @=@ 0017 @=@ brief-report +dnet:publication_resource @=@ 0017 @=@ case-report +dnet:publication_resource @=@ 0017 @=@ chapitre_rapport +dnet:publication_resource @=@ 0017 @=@ doc-type:report +dnet:publication_resource @=@ 0017 @=@ document_institutionnel +dnet:publication_resource @=@ 0017 @=@ document_technique +dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_186u +dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_18hj +dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_18wq +dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_18ws +dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_71bd +dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_93fc +dnet:publication_resource @=@ 0017 @=@ http://purl.org/coar/resource_type/c_ba1f +dnet:publication_resource @=@ 0017 @=@ info:eu-repo/semantics/report +dnet:publication_resource @=@ 0017 
@=@ publication-report +dnet:publication_resource @=@ 0017 @=@ publication-softwaredocumentation +dnet:publication_resource @=@ 0017 @=@ rapport_expertise +dnet:publication_resource @=@ 0017 @=@ rapport_mission +dnet:publication_resource @=@ 0017 @=@ report +dnet:publication_resource @=@ 0017 @=@ report-paper +dnet:publication_resource @=@ 0017 @=@ report-paper_title +dnet:publication_resource @=@ 0017 @=@ report-series +dnet:publication_resource @=@ 0017 @=@ support_cours +dnet:publication_resource @=@ 0014 @=@ Arbeitspapier +dnet:publication_resource @=@ 0014 @=@ Departmental Bulletin Paper +dnet:publication_resource @=@ 0014 @=@ Documento de trabajo +dnet:publication_resource @=@ 0014 @=@ Paper +dnet:publication_resource @=@ 0014 @=@ Project description +dnet:publication_resource @=@ 0014 @=@ Research-Paper +dnet:publication_resource @=@ 0014 @=@ ResearchPaper +dnet:publication_resource @=@ 0014 @=@ Working / discussion paper +dnet:publication_resource @=@ 0014 @=@ Working Paper +dnet:publication_resource @=@ 0014 @=@ Working Paper / Technical Report +dnet:publication_resource @=@ 0014 @=@ doc-type:workingPaper +dnet:publication_resource @=@ 0014 @=@ http://purl.org/coar/resource_type/c_8042 +dnet:publication_resource @=@ 0014 @=@ info:eu-repo/semantics/paper +dnet:publication_resource @=@ 0014 @=@ info:eu-repo/semantics/workingPaper +dnet:publication_resource @=@ 0014 @=@ publication-workingpaper +dnet:publication_resource @=@ 0014 @=@ workingPaper +dnet:publication_resource @=@ 0015 @=@ A2 Katsausartikkeli tieteellisessä aikakauslehdessä +dnet:publication_resource @=@ 0015 @=@ Book Review +dnet:publication_resource @=@ 0015 @=@ Book/Film/Article review +dnet:publication_resource @=@ 0015 @=@ Literature review +dnet:publication_resource @=@ 0015 @=@ Peer review +dnet:publication_resource @=@ 0015 @=@ Reseña bibliográfica +dnet:publication_resource @=@ 0015 @=@ Review Article +dnet:publication_resource @=@ 0015 @=@ RezensionReview +dnet:publication_resource @=@ 0015 @=@ book-review +dnet:publication_resource @=@ 0015 @=@ http://purl.org/coar/resource_type/c_ba08 +dnet:publication_resource @=@ 0015 @=@ http://purl.org/coar/resource_type/c_dcae04bc +dnet:publication_resource @=@ 0015 @=@ http://purl.org/coar/resource_type/c_efa0 +dnet:publication_resource @=@ 0015 @=@ info:eu-repo/semantics/review +dnet:publication_resource @=@ 0015 @=@ peer-review +dnet:publication_resource @=@ 0029 @=@ Software +dnet:publication_resource @=@ 0029 @=@ Software/Software +dnet:publication_resource @=@ 0029 @=@ Workflow +dnet:publication_resource @=@ 0029 @=@ Workflow/Workflow +dnet:publication_resource @=@ 0029 @=@ http://purl.org/coar/resource_type/c_393c +dnet:publication_resource @=@ 0029 @=@ http://purl.org/coar/resource_type/c_5ce6 +dnet:publication_resource @=@ 0029 @=@ http://purl.org/coar/resource_type/c_c950 +dnet:publication_resource @=@ 0032 @=@ http://purl.org/coar/resource_type/c_7bab +dnet:publication_resource @=@ 0030 @=@ http://purl.org/coar/resource_type/c_18cc +dnet:publication_resource @=@ 0030 @=@ sound +dnet:publication_resource @=@ 0044 @=@ Graduate diploma +dnet:publication_resource @=@ 0044 @=@ Undergraduate diploma +dnet:publication_resource @=@ 0000 @=@ UNKNOWN +dnet:publication_resource @=@ 0042 @=@ EGI Virtual Appliance +dnet:languages @=@ abk @=@ ab +dnet:languages @=@ aar @=@ aa +dnet:languages @=@ afr @=@ af +dnet:languages @=@ alb/sqi @=@ sq +dnet:languages @=@ amh @=@ am +dnet:languages @=@ ara @=@ ar +dnet:languages @=@ arm/hye @=@ hy +dnet:languages @=@ asm @=@ as 
+dnet:languages @=@ ina @=@ ia +dnet:languages @=@ aym @=@ ay +dnet:languages @=@ aze @=@ az +dnet:languages @=@ bak @=@ ba +dnet:languages @=@ baq/eus @=@ eu +dnet:languages @=@ bel @=@ be +dnet:languages @=@ ben @=@ bn +dnet:languages @=@ bih @=@ bh +dnet:languages @=@ bis @=@ bi +dnet:languages @=@ bre @=@ br +dnet:languages @=@ bul @=@ bg +dnet:languages @=@ bur/mya @=@ my +dnet:languages @=@ cat @=@ ca +dnet:languages @=@ chi/zho @=@ zh +dnet:languages @=@ cos @=@ co +dnet:languages @=@ hrv @=@ hr +dnet:languages @=@ hrv @=@ hr +dnet:languages @=@ hrv @=@ scr/hrv +dnet:languages @=@ ces/cze @=@ cs +dnet:languages @=@ dan @=@ da +dnet:languages @=@ dut/nld @=@ dut/nla +dnet:languages @=@ dut/nld @=@ dutdut +dnet:languages @=@ dut/nld @=@ nl +dnet:languages @=@ dut/nld @=@ nl_be +dnet:languages @=@ dut/nld @=@ nl_nl +dnet:languages @=@ dut/nld @=@ nld +dnet:languages @=@ dzo @=@ dz +dnet:languages @=@ eng @=@ en +dnet:languages @=@ eng @=@ en_au +dnet:languages @=@ eng @=@ en_en +dnet:languages @=@ eng @=@ en_gb +dnet:languages @=@ eng @=@ en_nz +dnet:languages @=@ eng @=@ en_us +dnet:languages @=@ eng @=@ english +dnet:languages @=@ eng @=@ en-us +dnet:languages @=@ eng @=@ en-US +dnet:languages @=@ eng @=@ English +dnet:languages @=@ eng @=@ EN +dnet:languages @=@ eng @=@ en angielski +dnet:languages @=@ eng @=@ en-GB +dnet:languages @=@ eng @=@ Englisch +dnet:languages @=@ epo @=@ eo +dnet:languages @=@ est @=@ et +dnet:languages @=@ fao @=@ fo +dnet:languages @=@ fij @=@ fj +dnet:languages @=@ fin @=@ fi +dnet:languages @=@ fin @=@ Finnish +dnet:languages @=@ fra/fre @=@ fr +dnet:languages @=@ fra/fre @=@ FR +dnet:languages @=@ fra/fre @=@ fr_be +dnet:languages @=@ fra/fre @=@ fr_fr +dnet:languages @=@ fra/fre @=@ fre/fra +dnet:languages @=@ fra/fre @=@ fra +dnet:languages @=@ fry @=@ fy +dnet:languages @=@ glg @=@ gl +dnet:languages @=@ geo/kat @=@ ka +dnet:languages @=@ deu/ger @=@ de +dnet:languages @=@ deu/ger @=@ ger/deu +dnet:languages @=@ deu/ger @=@ german +dnet:languages @=@ deu/ger @=@ ger +dnet:languages @=@ deu/ger @=@ deu +dnet:languages @=@ deu/ger @=@ DE-de +dnet:languages @=@ ell/gre @=@ el +dnet:languages @=@ ell/gre @=@ gr +dnet:languages @=@ ell/gre @=@ el-GR +dnet:languages @=@ kal @=@ kl +dnet:languages @=@ grn @=@ gn +dnet:languages @=@ guj @=@ gu +dnet:languages @=@ hau @=@ ha +dnet:languages @=@ heb @=@ he +dnet:languages @=@ hin @=@ hi +dnet:languages @=@ hun @=@ hu +dnet:languages @=@ ice/isl @=@ is +dnet:languages @=@ ine @=@ - +dnet:languages @=@ ind @=@ id +dnet:languages @=@ iku @=@ iu +dnet:languages @=@ ipk @=@ ik +dnet:languages @=@ gai/iri @=@ ga +dnet:languages @=@ gai/iri @=@ gle +dnet:languages @=@ ita @=@ it +dnet:languages @=@ jpn @=@ ja +dnet:languages @=@ jav @=@ jv +dnet:languages @=@ jav @=@ jv/jw +dnet:languages @=@ jav @=@ jw +dnet:languages @=@ kan @=@ kn +dnet:languages @=@ kas @=@ ks +dnet:languages @=@ kaz @=@ kk +dnet:languages @=@ khm @=@ km +dnet:languages @=@ kin @=@ rw +dnet:languages @=@ kir @=@ ky +dnet:languages @=@ kor @=@ ko +dnet:languages @=@ kur @=@ ku +dnet:languages @=@ lao @=@ lo +dnet:languages @=@ lat @=@ la +dnet:languages @=@ lav @=@ lv +dnet:languages @=@ lin @=@ ln +dnet:languages @=@ lit @=@ lt +dnet:languages @=@ mac/mak @=@ mk +dnet:languages @=@ mlg @=@ mg +dnet:languages @=@ may/msa @=@ ms +dnet:languages @=@ mlt @=@ ml +dnet:languages @=@ mao/mri @=@ mi +dnet:languages @=@ mar @=@ mr +dnet:languages @=@ mol @=@ mo +dnet:languages @=@ mon @=@ mn +dnet:languages @=@ nau @=@ na +dnet:languages @=@ nep @=@ ne 
+dnet:languages @=@ nor @=@ no +dnet:languages @=@ oci @=@ oc +dnet:languages @=@ ori @=@ or +dnet:languages @=@ orm @=@ om +dnet:languages @=@ pan @=@ pa +dnet:languages @=@ fas/per @=@ fa +dnet:languages @=@ pol @=@ pl +dnet:languages @=@ por @=@ pt +dnet:languages @=@ por @=@ pt_pt +dnet:languages @=@ pus @=@ ps +dnet:languages @=@ que @=@ qu +dnet:languages @=@ roh @=@ rm +dnet:languages @=@ ron/rum @=@ ro +dnet:languages @=@ run @=@ rn +dnet:languages @=@ rus @=@ ru +dnet:languages @=@ smo @=@ sm +dnet:languages @=@ sag @=@ sg +dnet:languages @=@ san @=@ sa +dnet:languages @=@ srp @=@ scc/srp +dnet:languages @=@ srp @=@ sr +dnet:languages @=@ scr @=@ sh +dnet:languages @=@ sna @=@ sn +dnet:languages @=@ snd @=@ sd +dnet:languages @=@ sin @=@ si +dnet:languages @=@ sit @=@ - +dnet:languages @=@ slk/slo @=@ sk +dnet:languages @=@ slv @=@ sl +dnet:languages @=@ som @=@ so +dnet:languages @=@ sot @=@ st +dnet:languages @=@ esl/spa @=@ es +dnet:languages @=@ sun @=@ su +dnet:languages @=@ swa @=@ sw +dnet:languages @=@ ssw @=@ ss +dnet:languages @=@ swe @=@ sv +dnet:languages @=@ swe @=@ sve/swe +dnet:languages @=@ tgl @=@ tl +dnet:languages @=@ tgk @=@ tg +dnet:languages @=@ tam @=@ ta +dnet:languages @=@ tat @=@ tt +dnet:languages @=@ tel @=@ te +dnet:languages @=@ tha @=@ th +dnet:languages @=@ tha @=@ thai +dnet:languages @=@ bod/tib @=@ bo +dnet:languages @=@ tir @=@ ti +dnet:languages @=@ tog @=@ to +dnet:languages @=@ tso @=@ ts +dnet:languages @=@ tsn @=@ tn +dnet:languages @=@ tur @=@ tr +dnet:languages @=@ tuk @=@ tk +dnet:languages @=@ twi @=@ tw +dnet:languages @=@ uig @=@ ug +dnet:languages @=@ ukr @=@ uk +dnet:languages @=@ und @=@ UNKNOWN +dnet:languages @=@ und @=@ none +dnet:languages @=@ urd @=@ ur +dnet:languages @=@ uzb @=@ uz +dnet:languages @=@ vie @=@ vi +dnet:languages @=@ vol @=@ vo +dnet:languages @=@ wln @=@ wa +dnet:languages @=@ cym/wel @=@ cy +dnet:languages @=@ wol @=@ wo +dnet:languages @=@ xho @=@ xh +dnet:languages @=@ yid @=@ yi +dnet:languages @=@ yor @=@ yo +dnet:languages @=@ zha @=@ za +dnet:languages @=@ zul @=@ zu +dnet:result_typologies @=@ dataset @=@ 0021 +dnet:result_typologies @=@ dataset @=@ 0024 +dnet:result_typologies @=@ dataset @=@ 0025 +dnet:result_typologies @=@ dataset @=@ 0030 +dnet:result_typologies @=@ dataset @=@ 0033 +dnet:result_typologies @=@ dataset @=@ 0037 +dnet:result_typologies @=@ dataset @=@ 0039 +dnet:result_typologies @=@ dataset @=@ 0046 +dnet:result_typologies @=@ other @=@ 0000 +dnet:result_typologies @=@ other @=@ 0010 +dnet:result_typologies @=@ other @=@ 0018 +dnet:result_typologies @=@ other @=@ 0020 +dnet:result_typologies @=@ other @=@ 0022 +dnet:result_typologies @=@ other @=@ 0023 +dnet:result_typologies @=@ other @=@ 0026 +dnet:result_typologies @=@ other @=@ 0027 +dnet:result_typologies @=@ other @=@ 0028 +dnet:result_typologies @=@ other @=@ 0042 +dnet:result_typologies @=@ publication @=@ 0001 +dnet:result_typologies @=@ publication @=@ 0002 +dnet:result_typologies @=@ publication @=@ 0004 +dnet:result_typologies @=@ publication @=@ 0005 +dnet:result_typologies @=@ publication @=@ 0006 +dnet:result_typologies @=@ publication @=@ 0007 +dnet:result_typologies @=@ publication @=@ 0008 +dnet:result_typologies @=@ publication @=@ 0009 +dnet:result_typologies @=@ publication @=@ 0011 +dnet:result_typologies @=@ publication @=@ 0012 +dnet:result_typologies @=@ publication @=@ 0013 +dnet:result_typologies @=@ publication @=@ 0014 +dnet:result_typologies @=@ publication @=@ 0015 +dnet:result_typologies @=@ 
publication @=@ 0016 +dnet:result_typologies @=@ publication @=@ 0017 +dnet:result_typologies @=@ publication @=@ 0019 +dnet:result_typologies @=@ publication @=@ 0031 +dnet:result_typologies @=@ publication @=@ 0032 +dnet:result_typologies @=@ publication @=@ 0034 +dnet:result_typologies @=@ publication @=@ 0035 +dnet:result_typologies @=@ publication @=@ 0036 +dnet:result_typologies @=@ publication @=@ 0038 +dnet:result_typologies @=@ publication @=@ 0044 +dnet:result_typologies @=@ publication @=@ 0045 +dnet:result_typologies @=@ software @=@ 0029 +dnet:result_typologies @=@ software @=@ 0040 +dnet:countries @=@ AF @=@ AFG +dnet:countries @=@ AF @=@ Afghanistan +dnet:countries @=@ AD @=@ Andorra +dnet:countries @=@ AO @=@ Angola +dnet:countries @=@ AR @=@ ARG +dnet:countries @=@ AR @=@ Argentina +dnet:countries @=@ AU @=@ AUS +dnet:countries @=@ AU @=@ Australia +dnet:countries @=@ AT @=@ AUT +dnet:countries @=@ AT @=@ Austria +dnet:countries @=@ AZ @=@ AZE +dnet:countries @=@ BD @=@ Bangladesh +dnet:countries @=@ BY @=@ Belarus +dnet:countries @=@ BE @=@ BEL +dnet:countries @=@ BE @=@ Belgium +dnet:countries @=@ BJ @=@ BEN +dnet:countries @=@ BO @=@ Bolivia, Plurinational State of +dnet:countries @=@ BA @=@ BIH +dnet:countries @=@ BA @=@ Bosnia-Hercegovina +dnet:countries @=@ BR @=@ BRA +dnet:countries @=@ BR @=@ Brazil +dnet:countries @=@ BG @=@ Bulgaria +dnet:countries @=@ BF @=@ BFA +dnet:countries @=@ KH @=@ Cambodia +dnet:countries @=@ KH @=@ Cambogia +dnet:countries @=@ KH @=@ Campuchea +dnet:countries @=@ CM @=@ CMR +dnet:countries @=@ CA @=@ CAN +dnet:countries @=@ CA @=@ Canada +dnet:countries @=@ CV @=@ Cape Verde +dnet:countries @=@ CL @=@ CHL +dnet:countries @=@ CL @=@ Chile +dnet:countries @=@ CN @=@ CHN +dnet:countries @=@ CN @=@ China +dnet:countries @=@ CO @=@ COL +dnet:countries @=@ CO @=@ Colombia +dnet:countries @=@ CD @=@ Congo +dnet:countries @=@ CD @=@ Congo Democratic Republic (formerly Zaire) +dnet:countries @=@ CD @=@ Congo, Republic +dnet:countries @=@ CD @=@ Congo, the Democratic Republic of the +dnet:countries @=@ CD @=@ Zaire +dnet:countries @=@ CR @=@ CRI +dnet:countries @=@ CI @=@ CIV +dnet:countries @=@ CI @=@ Ivory Coast +dnet:countries @=@ HR @=@ Croatia +dnet:countries @=@ HR @=@ HRV +dnet:countries @=@ CY @=@ CYP +dnet:countries @=@ CY @=@ Cyprus +dnet:countries @=@ CZ @=@ CZE +dnet:countries @=@ CZ @=@ Czech Republic +dnet:countries @=@ CZ @=@ Czechia +dnet:countries @=@ CZ @=@ Czechoslovakia +dnet:countries @=@ DK @=@ DNK +dnet:countries @=@ DK @=@ Denmark +dnet:countries @=@ EC @=@ Ecuador +dnet:countries @=@ EG @=@ EGY +dnet:countries @=@ EG @=@ Egypt +dnet:countries @=@ SV @=@ SLV +dnet:countries @=@ EE @=@ EST +dnet:countries @=@ EE @=@ Estonia +dnet:countries @=@ ET @=@ ETH +dnet:countries @=@ EU @=@ EEC +dnet:countries @=@ FJ @=@ FJI +dnet:countries @=@ FI @=@ FIN +dnet:countries @=@ FI @=@ Finland +dnet:countries @=@ MK @=@ Macedonia +dnet:countries @=@ MK @=@ Macedonia, the Former Yugoslav Republic Of +dnet:countries @=@ MK @=@ North Macedonia +dnet:countries @=@ FR @=@ FRA +dnet:countries @=@ FR @=@ France +dnet:countries @=@ PF @=@ French Polynesia +dnet:countries @=@ PF @=@ PYF +dnet:countries @=@ TF @=@ French Southern Territories +dnet:countries @=@ GE @=@ Georgia +dnet:countries @=@ DE @=@ DEU +dnet:countries @=@ DE @=@ Germany +dnet:countries @=@ DE @=@ Germany, Berlin +dnet:countries @=@ GH @=@ GHA +dnet:countries @=@ GR @=@ EL +dnet:countries @=@ GR @=@ GRC +dnet:countries @=@ GL @=@ GRL +dnet:countries @=@ GN @=@ Guinea 
+dnet:countries @=@ GW @=@ Guinea-Bissau +dnet:countries @=@ VA @=@ Vatican State +dnet:countries @=@ HK @=@ HKG +dnet:countries @=@ HK @=@ Hong Kong +dnet:countries @=@ HK @=@ Hongkong +dnet:countries @=@ HU @=@ HUN +dnet:countries @=@ HU @=@ Hungary +dnet:countries @=@ IS @=@ ISL +dnet:countries @=@ IN @=@ IND +dnet:countries @=@ IN @=@ India +dnet:countries @=@ ID @=@ IDN +dnet:countries @=@ ID @=@ Indonesia +dnet:countries @=@ IR @=@ Iran +dnet:countries @=@ IR @=@ Iran, Islamic Republic of +dnet:countries @=@ IE @=@ IRL +dnet:countries @=@ IE @=@ Ireland +dnet:countries @=@ IL @=@ ISR +dnet:countries @=@ IL @=@ Israel +dnet:countries @=@ IT @=@ ITA +dnet:countries @=@ IT @=@ Italy +dnet:countries @=@ JM @=@ Jamaica +dnet:countries @=@ JP @=@ JPN +dnet:countries @=@ JP @=@ Japan +dnet:countries @=@ KZ @=@ KAZ +dnet:countries @=@ KZ @=@ Kazakistan +dnet:countries @=@ KZ @=@ Kazakstan +dnet:countries @=@ KE @=@ KEN +dnet:countries @=@ KE @=@ Kenya +dnet:countries @=@ KR @=@ KOR +dnet:countries @=@ KR @=@ Korea, Republic of +dnet:countries @=@ KR @=@ Korean Republic (South Korea) +dnet:countries @=@ KP @=@ PRK +dnet:countries @=@ LV @=@ LVA +dnet:countries @=@ LY @=@ Libya +dnet:countries @=@ LT @=@ LTU +dnet:countries @=@ LU @=@ LUX +dnet:countries @=@ LU @=@ Luxembourg +dnet:countries @=@ MO @=@ Macao +dnet:countries @=@ MG @=@ Madagascar +dnet:countries @=@ MY @=@ Malaysia +dnet:countries @=@ ML @=@ Mali +dnet:countries @=@ MT @=@ Malta +dnet:countries @=@ MU @=@ Mauritius +dnet:countries @=@ MX @=@ MEX +dnet:countries @=@ MX @=@ Mexico +dnet:countries @=@ FM @=@ Micronesia +dnet:countries @=@ MD @=@ Moldova +dnet:countries @=@ MD @=@ Moldova, Republic of +dnet:countries @=@ MN @=@ Mongolia +dnet:countries @=@ MA @=@ Morocco +dnet:countries @=@ MZ @=@ Mozambique +dnet:countries @=@ NA @=@ NAM +dnet:countries @=@ NL @=@ NLD +dnet:countries @=@ NL @=@ Netherlands +dnet:countries @=@ AN @=@ Netherlands Antilles +dnet:countries @=@ NC @=@ NCL +dnet:countries @=@ NZ @=@ NZL +dnet:countries @=@ NZ @=@ New Zealand +dnet:countries @=@ NO @=@ NOR +dnet:countries @=@ NO @=@ Norway +dnet:countries @=@ OC @=@ Australasia +dnet:countries @=@ OM @=@ Oman +dnet:countries @=@ PK @=@ PAK +dnet:countries @=@ PK @=@ Pakistan +dnet:countries @=@ PS @=@ Palestin, State of +dnet:countries @=@ PS @=@ Palestine, State of +dnet:countries @=@ PS @=@ Palestinian Territory, Occupied +dnet:countries @=@ PA @=@ PAN +dnet:countries @=@ PA @=@ Panama +dnet:countries @=@ PG @=@ PapuaNew Guinea +dnet:countries @=@ PE @=@ PER +dnet:countries @=@ PH @=@ PHL +dnet:countries @=@ PH @=@ Philippines +dnet:countries @=@ PL @=@ POL +dnet:countries @=@ PL @=@ Poland +dnet:countries @=@ PT @=@ PRT +dnet:countries @=@ PT @=@ Portugal +dnet:countries @=@ PR @=@ Puerto Rico +dnet:countries @=@ RO @=@ ROU +dnet:countries @=@ RO @=@ Romania +dnet:countries @=@ RU @=@ RUS +dnet:countries @=@ RU @=@ Russia +dnet:countries @=@ RU @=@ Russian Federation +dnet:countries @=@ RE @=@ Réunion +dnet:countries @=@ KN @=@ Saint Kitts And Nevis +dnet:countries @=@ SA @=@ Saudi Arabia +dnet:countries @=@ SN @=@ SEN +dnet:countries @=@ RS @=@ SRB +dnet:countries @=@ CS @=@ Serbia and Montenegro +dnet:countries @=@ SG @=@ SGP +dnet:countries @=@ SG @=@ Singapore +dnet:countries @=@ SK @=@ SVK +dnet:countries @=@ SI @=@ SVN +dnet:countries @=@ SI @=@ Slovenia +dnet:countries @=@ ZA @=@ South Africa +dnet:countries @=@ ZA @=@ ZAF +dnet:countries @=@ ES @=@ ESP +dnet:countries @=@ ES @=@ Spain +dnet:countries @=@ LK @=@ LKA +dnet:countries @=@ LK @=@ 
Sri Lanka +dnet:countries @=@ SD @=@ SDN +dnet:countries @=@ SR @=@ Suriname +dnet:countries @=@ SE @=@ SWE +dnet:countries @=@ SE @=@ Sweden +dnet:countries @=@ CH @=@ CHE +dnet:countries @=@ CH @=@ Switzerland +dnet:countries @=@ SY @=@ Syria +dnet:countries @=@ ST @=@ Sao Tome and Principe +dnet:countries @=@ TW @=@ TWN +dnet:countries @=@ TW @=@ Taiwan +dnet:countries @=@ TW @=@ Taiwan, Province of China +dnet:countries @=@ TZ @=@ Tanzania +dnet:countries @=@ TZ @=@ Tanzania, United Republic of +dnet:countries @=@ TH @=@ THA +dnet:countries @=@ TH @=@ Thailand +dnet:countries @=@ TL @=@ East Timor +dnet:countries @=@ TN @=@ TUN +dnet:countries @=@ TN @=@ Tunisia +dnet:countries @=@ TR @=@ TUR +dnet:countries @=@ TR @=@ Turkey +dnet:countries @=@ UNKNOWN @=@ AAA +dnet:countries @=@ UNKNOWN @=@ [Unknown] +dnet:countries @=@ UNKNOWN @=@ _? +dnet:countries @=@ UA @=@ UKR +dnet:countries @=@ UA @=@ Ukraine +dnet:countries @=@ AE @=@ United Arab Emirates +dnet:countries @=@ GB @=@ England +dnet:countries @=@ GB @=@ GBR +dnet:countries @=@ GB @=@ Great Britain +dnet:countries @=@ GB @=@ Great Britain and Northern Ireland +dnet:countries @=@ GB @=@ Scotland +dnet:countries @=@ GB @=@ UK +dnet:countries @=@ GB @=@ United Kingdom +dnet:countries @=@ US @=@ USA +dnet:countries @=@ US @=@ United States +dnet:countries @=@ US @=@ United States of America +dnet:countries @=@ UY @=@ Uruguay +dnet:countries @=@ UZ @=@ Uzbekistan +dnet:countries @=@ VE @=@ Venezuela, Bolivarian Republic of +dnet:countries @=@ VN @=@ Vietnam +dnet:countries @=@ VG @=@ British Virgin Islands +dnet:countries @=@ YU @=@ Jugoslavia +dnet:countries @=@ YU @=@ Yugoslavia +dnet:countries @=@ ZW @=@ ABW +dnet:protocols @=@ oai @=@ OAI-PMH +dnet:protocols @=@ oai @=@ OAI_PMH +dnet:pid_types @=@ orcid @=@ ORCID12 +dnet:pid_types @=@ handle @=@ hdl +dnet:review_levels @=@ 0000 @=@ UNKNOWN +dnet:review_levels @=@ 0002 @=@ 80 大阪経大学会「Working Paper」 +dnet:review_levels @=@ 0002 @=@ AO +dnet:review_levels @=@ 0002 @=@ ARTICLE SANS COMITE DE LECTURE (ASCL) +dnet:review_levels @=@ 0002 @=@ Arbeitspapier +dnet:review_levels @=@ 0002 @=@ Arbeitspapier [workingPaper] +dnet:review_levels @=@ 0002 @=@ Article (author) +dnet:review_levels @=@ 0002 @=@ Article type: preprint +dnet:review_levels @=@ 0002 @=@ Article(author version) +dnet:review_levels @=@ 0002 @=@ Article, not peer-reviewed +dnet:review_levels @=@ 0002 @=@ Articulo no evaluado +dnet:review_levels @=@ 0002 @=@ Artigo Solicitado e Não Avaliado por Pares +dnet:review_levels @=@ 0002 @=@ Artigo não avaliado pelos pares +dnet:review_levels @=@ 0002 @=@ Artigo não avaliado por pares +dnet:review_levels @=@ 0002 @=@ Artigo não avaliado por pres +dnet:review_levels @=@ 0002 @=@ Artikkeli|Artikkeli ammattilehdessä. 
Ei vertaisarvioitu +dnet:review_levels @=@ 0002 @=@ Artículo no evaluado +dnet:review_levels @=@ 0002 @=@ Book (non peer-reviewed) +dnet:review_levels @=@ 0002 @=@ Book Part (author) +dnet:review_levels @=@ 0002 @=@ Book item; Non-peer-reviewed +dnet:review_levels @=@ 0002 @=@ Conference preprint +dnet:review_levels @=@ 0002 @=@ Contribution to book (non peer-reviewed) +dnet:review_levels @=@ 0002 @=@ Discussion Paper +dnet:review_levels @=@ 0002 @=@ Document de travail (Working Paper) +dnet:review_levels @=@ 0002 @=@ Documento de trabajo +dnet:review_levels @=@ 0002 @=@ Documento de trabajo de investigaci??n +dnet:review_levels @=@ 0002 @=@ Draft +dnet:review_levels @=@ 0002 @=@ E-pub ahead of print +dnet:review_levels @=@ 0002 @=@ Editorial de revista, no evaluado por pares +dnet:review_levels @=@ 0002 @=@ Editorial de revista, não avaliado por pares +dnet:review_levels @=@ 0002 @=@ Editorial não avaliado pelos pares +dnet:review_levels @=@ 0002 @=@ Editors (non peer-reviewed) +dnet:review_levels @=@ 0002 @=@ Epub ahead of print +dnet:review_levels @=@ 0002 @=@ Hakemlik Sürecinden Geçmiş Makale +dnet:review_levels @=@ 0002 @=@ Hakemlik sürecindeki makale +dnet:review_levels @=@ 0002 @=@ Hakemlik sürecinden geçmemiş kitap değerlendirmesi +dnet:review_levels @=@ 0002 @=@ Journal Article (author version) +dnet:review_levels @=@ 0002 @=@ Journal Article Preprint +dnet:review_levels @=@ 0002 @=@ Journal Editorial, not peer-reviewed +dnet:review_levels @=@ 0002 @=@ Journal article; Non-peer-reviewed +dnet:review_levels @=@ 0002 @=@ Journal:WorkingPaper +dnet:review_levels @=@ 0002 @=@ Manuscript (preprint) +dnet:review_levels @=@ 0002 @=@ Monográfico (Informes, Documentos de trabajo, etc.) +dnet:review_levels @=@ 0002 @=@ NOTE INTERNE OU DE TRAVAIL +dnet:review_levels @=@ 0002 @=@ Nicht begutachteter Beitrag +dnet:review_levels @=@ 0002 @=@ No evaluado por pares +dnet:review_levels @=@ 0002 @=@ Non-Refereed +dnet:review_levels @=@ 0002 @=@ Non-refeered article +dnet:review_levels @=@ 0002 @=@ Non-refereed Article +dnet:review_levels @=@ 0002 @=@ Non-refereed Book Review +dnet:review_levels @=@ 0002 @=@ Non-refereed Review +dnet:review_levels @=@ 0002 @=@ Non-refereed Text +dnet:review_levels @=@ 0002 @=@ NonPeerReviewed +dnet:review_levels @=@ 0002 @=@ Not Peer reviewed +dnet:review_levels @=@ 0002 @=@ Not Reviewed +dnet:review_levels @=@ 0002 @=@ Not peer-reviewed +dnet:review_levels @=@ 0002 @=@ Não Avaliado por Pares +dnet:review_levels @=@ 0002 @=@ Não avaliada pelos pares +dnet:review_levels @=@ 0002 @=@ Não avaliado pelos pares +dnet:review_levels @=@ 0002 @=@ Original article (non peer-reviewed) +dnet:review_levels @=@ 0002 @=@ Other publication (non peer-review) +dnet:review_levels @=@ 0002 @=@ Pre Print +dnet:review_levels @=@ 0002 @=@ Pre-print +dnet:review_levels @=@ 0002 @=@ Preprint Article +dnet:review_levels @=@ 0002 @=@ Preprints +dnet:review_levels @=@ 0002 @=@ Preprints, Working Papers, ... +dnet:review_levels @=@ 0002 @=@ Rapporto tecnico / Working Paper / Rapporto di progetto +dnet:review_levels @=@ 0002 @=@ Resumo Não Avaliado por Pares +dnet:review_levels @=@ 0002 @=@ Review article (non peer-reviewed) +dnet:review_levels @=@ 0002 @=@ SMUR +dnet:review_levels @=@ 0002 @=@ Submissão dos artigos +dnet:review_levels @=@ 0002 @=@ Submitted version +dnet:review_levels @=@ 0002 @=@ Vertaisarvioimaton kirjan tai muun kokoomateoksen osa +dnet:review_levels @=@ 0002 @=@ Vorabdruck +dnet:review_levels @=@ 0002 @=@ Wetensch. publ. 
non-refereed +dnet:review_levels @=@ 0002 @=@ Working / discussion paper +dnet:review_levels @=@ 0002 @=@ Working Document +dnet:review_levels @=@ 0002 @=@ Working Notes +dnet:review_levels @=@ 0002 @=@ Working Paper +dnet:review_levels @=@ 0002 @=@ Working Paper / Technical Report +dnet:review_levels @=@ 0002 @=@ Working Papers +dnet:review_levels @=@ 0002 @=@ WorkingPaper +dnet:review_levels @=@ 0002 @=@ article in non peer-reviewed journal +dnet:review_levels @=@ 0002 @=@ articolo preliminare +dnet:review_levels @=@ 0002 @=@ articulo preliminar +dnet:review_levels @=@ 0002 @=@ articulo sin revision por pares +dnet:review_levels @=@ 0002 @=@ artigo preliminar +dnet:review_levels @=@ 0002 @=@ artigo sem revisão +dnet:review_levels @=@ 0002 @=@ artículo preliminar +dnet:review_levels @=@ 0002 @=@ artículo sin revisión por pares +dnet:review_levels @=@ 0002 @=@ bookchapter (author version) +dnet:review_levels @=@ 0002 @=@ borrador +dnet:review_levels @=@ 0002 @=@ column (author version) +dnet:review_levels @=@ 0002 @=@ communication_invitee +dnet:review_levels @=@ 0002 @=@ doc-type:preprint +dnet:review_levels @=@ 0002 @=@ doc-type:workingPaper +dnet:review_levels @=@ 0002 @=@ draf +dnet:review_levels @=@ 0002 @=@ eu-repo/semantics/submittedVersion +dnet:review_levels @=@ 0002 @=@ http://purl.org/coar/resource_type/c_8042 +dnet:review_levels @=@ 0002 @=@ http://purl.org/coar/resource_type/c_816b +dnet:review_levels @=@ 0002 @=@ http://purl.org/coar/version/c_71e4c1898caa6e32 +dnet:review_levels @=@ 0002 @=@ http://purl.org/coar/version/c_b1a7d7d4d402bcce +dnet:review_levels @=@ 0002 @=@ http://purl.org/eprint/type/SubmittedBookItem +dnet:review_levels @=@ 0002 @=@ http://purl.org/eprint/type/SubmittedJournalArticle +dnet:review_levels @=@ 0002 @=@ http://purl.org/info:eu-repo/semantics/authorVersion +dnet:review_levels @=@ 0002 @=@ http://purl.org/info:eu-repo/semantics/submittedVersion +dnet:review_levels @=@ 0002 @=@ http://purl.org/spar/fabio/Preprint +dnet:review_levels @=@ 0002 @=@ http://purl.org/spar/fabio/WorkingPaper +dnet:review_levels @=@ 0002 @=@ https://dictionary.casrai.org/Preprint +dnet:review_levels @=@ 0002 @=@ info:ar-repo/semantics/documento de trabajo +dnet:review_levels @=@ 0002 @=@ info:ar-repo/semantics/documentoDeTrabajo +dnet:review_levels @=@ 0002 @=@ info:eu repo/semantics/draft +dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/authorVersion +dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/draft +dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/preprint +dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/submitedVersion +dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/submittedVersion +dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/unReviewed +dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/updatedVersion +dnet:review_levels @=@ 0002 @=@ info:eu-repo/semantics/workingPaper +dnet:review_levels @=@ 0002 @=@ info:eu-repo/submittedVersion +dnet:review_levels @=@ 0002 @=@ info:ulb-repo/semantics/articleNonPeerReview +dnet:review_levels @=@ 0002 @=@ info:ulb-repo/semantics/openurl/vlink-workingpaper +dnet:review_levels @=@ 0002 @=@ info:ulb-repo/semantics/workingPaper +dnet:review_levels @=@ 0002 @=@ non peer-reviewed article +dnet:review_levels @=@ 0002 @=@ non-refereed review article +dnet:review_levels @=@ 0002 @=@ não avaliado +dnet:review_levels @=@ 0002 @=@ preprint +dnet:review_levels @=@ 0002 @=@ prepublicación +dnet:review_levels @=@ 0002 @=@ proceeding, seminar, workshop without peer review +dnet:review_levels @=@ 
0002 @=@ proceedings (author version) +dnet:review_levels @=@ 0002 @=@ pré-print +dnet:review_levels @=@ 0002 @=@ pré-publication +dnet:review_levels @=@ 0002 @=@ préprint +dnet:review_levels @=@ 0002 @=@ prépublication +dnet:review_levels @=@ 0002 @=@ publicació preliminar +dnet:review_levels @=@ 0002 @=@ publication-preprint +dnet:review_levels @=@ 0002 @=@ publication-workingpaper +dnet:review_levels @=@ 0002 @=@ submitedVersion +dnet:review_levels @=@ 0002 @=@ submittedVersion +dnet:review_levels @=@ 0002 @=@ voordruk +dnet:review_levels @=@ 0002 @=@ workingPaper +dnet:review_levels @=@ 0002 @=@ ön baskı +dnet:review_levels @=@ 0002 @=@ Препринт +dnet:review_levels @=@ 0002 @=@ предпечатная версия публикации +dnet:review_levels @=@ 0002 @=@ препринт статьи +dnet:review_levels @=@ 0002 @=@ ディスカッション/ワーキング・ペーパー DP/WP +dnet:review_levels @=@ 0002 @=@ プレプリント +dnet:review_levels @=@ 0002 @=@ プレプリント Preprint +dnet:review_levels @=@ 0002 @=@ プレプリント(Preprint) +dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-その他(査読無し) +dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-テクニカルレポート類(査読無し) +dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-会議発表論文(査読無し) +dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-図書(査読無し) +dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-学術雑誌論文(査読無し) +dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-紀要論文(査読無し) +dnet:review_levels @=@ 0002 @=@ 印刷物/電子媒体-雑誌記事(査読無し) +dnet:review_levels @=@ 0002 @=@ 预印本 +dnet:review_levels @=@ 0001 @=@ ##rt.metadata.pkp.peerReviewed## +dnet:review_levels @=@ 0001 @=@ A1 Alkuperäisartikkeli tieteellisessä aikakauslehdessä +dnet:review_levels @=@ 0001 @=@ Art?culo revisado por pares +dnet:review_levels @=@ 0001 @=@ Article revisat per persones expertes +dnet:review_levels @=@ 0001 @=@ Article type: peer review +dnet:review_levels @=@ 0001 @=@ Article évalué par les pairs +dnet:review_levels @=@ 0001 @=@ Article évalué par des pairs +dnet:review_levels @=@ 0001 @=@ Articolo valutato secondo i criteri della peer review +dnet:review_levels @=@ 0001 @=@ Articulo evaluado por dos pares +dnet:review_levels @=@ 0001 @=@ Articulo revisado por pares +dnet:review_levels @=@ 0001 @=@ Artigo Avaliado pelos Pares +dnet:review_levels @=@ 0001 @=@ Artigo Revisto por Pares +dnet:review_levels @=@ 0001 @=@ Artigo avaliado por blind peer review +dnet:review_levels @=@ 0001 @=@ Artigo avaliado por pares +dnet:review_levels @=@ 0001 @=@ Artigo de convidado.
Avaliado pelos pares +dnet:review_levels @=@ 0001 @=@ Artigos; Avaliado pelos pares +dnet:review_levels @=@ 0001 @=@ Artículo de investigación, Investigaciones originales, Artículo evaluado por pares, Investigaciones empíricas +dnet:review_levels @=@ 0001 @=@ Artículo evaluado por pares +dnet:review_levels @=@ 0001 @=@ Artículo evaluado por pares, Ensayos de investigación +dnet:review_levels @=@ 0001 @=@ Artículo evaluado por pares, Investigaciones empíricas, Artículos de investigación +dnet:review_levels @=@ 0001 @=@ Artículo revisado +dnet:review_levels @=@ 0001 @=@ Artículo revisado por pares +dnet:review_levels @=@ 0001 @=@ Artículos de estudiantes, Artículo evaluado por pares, Artículos de investigación +dnet:review_levels @=@ 0001 @=@ Artículos de investigación evaluados por doble ciego +dnet:review_levels @=@ 0001 @=@ Artículos evaluadores por doble ciego +dnet:review_levels @=@ 0001 @=@ Artículos evaluados por pares +dnet:review_levels @=@ 0001 @=@ Artículos evaluados por pares académicos +dnet:review_levels @=@ 0001 @=@ Artículos revisados por pares +dnet:review_levels @=@ 0001 @=@ Avaliadas pelos pares +dnet:review_levels @=@ 0001 @=@ Avaliado anonimamente por pares +dnet:review_levels @=@ 0001 @=@ Avaliado em duplo cego por pares +dnet:review_levels @=@ 0001 @=@ Avaliado pela Editoria +dnet:review_levels @=@ 0001 @=@ Avaliado pela Editoria. Avaliado pelos pares. +dnet:review_levels @=@ 0001 @=@ Avaliado pelo Editoria +dnet:review_levels @=@ 0001 @=@ Avaliado pelo pares +dnet:review_levels @=@ 0001 @=@ Avaliado pelos Editores +dnet:review_levels @=@ 0001 @=@ Avaliado pelos pares +dnet:review_levels @=@ 0001 @=@ Avaliado pelos pares, Artigo de convidado +dnet:review_levels @=@ 0001 @=@ Avaliado pelos pares, Artigos Originais +dnet:review_levels @=@ 0001 @=@ Avaliado pelos pares, Artigos Originais, Artigos de Revisão +dnet:review_levels @=@ 0001 @=@ Avaliado pelos pares. 
Avaliado pelo Editoria +dnet:review_levels @=@ 0001 @=@ Avaliado po Pares +dnet:review_levels @=@ 0001 @=@ Avaliado por Editor +dnet:review_levels @=@ 0001 @=@ Avaliado por pares +dnet:review_levels @=@ 0001 @=@ Avaliados pelos pares +dnet:review_levels @=@ 0001 @=@ Avaliados por Pares +dnet:review_levels @=@ 0001 @=@ Blind Peer-reviewed Article +dnet:review_levels @=@ 0001 @=@ Book (peer-reviewed) +dnet:review_levels @=@ 0001 @=@ Comentario de libros, Comentario de revistas, Comentario de conferencias, Artículo evaluado por pares, Artículo de investigación +dnet:review_levels @=@ 0001 @=@ Conference paper; Peer-reviewed +dnet:review_levels @=@ 0001 @=@ Contribution to book (peer-reviewed) +dnet:review_levels @=@ 0001 @=@ Documento Avaliado por Pares +dnet:review_levels @=@ 0001 @=@ Double blind evaluation articles +dnet:review_levels @=@ 0001 @=@ Double blind peer review +dnet:review_levels @=@ 0001 @=@ Editors (peer-reviewed) +dnet:review_levels @=@ 0001 @=@ Evaluación por pares +dnet:review_levels @=@ 0001 @=@ Evaluado por pares +dnet:review_levels @=@ 0001 @=@ Evaluados por los pares +dnet:review_levels @=@ 0001 @=@ Hakem sürecinden geçmiş makale +dnet:review_levels @=@ 0001 @=@ Hakemli makale +dnet:review_levels @=@ 0001 @=@ Hakemlik Sürecinden Geçmiş +dnet:review_levels @=@ 0001 @=@ Invited Peer-Reviewed Article +dnet:review_levels @=@ 0001 @=@ Journal article; Peer-reviewed +dnet:review_levels @=@ 0001 @=@ Original article (peer-reviewed) +dnet:review_levels @=@ 0001 @=@ Other publication (peer-review) +dnet:review_levels @=@ 0001 @=@ Paper peer-reviewed +dnet:review_levels @=@ 0001 @=@ Papers evaluated by academic peers +dnet:review_levels @=@ 0001 @=@ Peer reviewed +dnet:review_levels @=@ 0001 @=@ Peer reviewed article +dnet:review_levels @=@ 0001 @=@ Peer reviewed invited commentry +dnet:review_levels @=@ 0001 @=@ Peer-Reviewed Protocol +dnet:review_levels @=@ 0001 @=@ Peer-reviewd Article +dnet:review_levels @=@ 0001 @=@ Peer-reviewed +dnet:review_levels @=@ 0001 @=@ Peer-reviewed Article +dnet:review_levels @=@ 0001 @=@ Peer-reviewed Paper +dnet:review_levels @=@ 0001 @=@ Peer-reviewed Review +dnet:review_levels @=@ 0001 @=@ Peer-reviewed Review Article +dnet:review_levels @=@ 0001 @=@ Peer-reviewed Text +dnet:review_levels @=@ 0001 @=@ Peer-reviewed communication +dnet:review_levels @=@ 0001 @=@ Peer-reviewed conference proceedings +dnet:review_levels @=@ 0001 @=@ Peer-reviewed research article +dnet:review_levels @=@ 0001 @=@ Peer-reviewed short communication +dnet:review_levels @=@ 0001 @=@ PeerReviewed +dnet:review_levels @=@ 0001 @=@ Proceedings (peer-reviewed) +dnet:review_levels @=@ 0001 @=@ Refereed +dnet:review_levels @=@ 0001 @=@ Refereed Article +dnet:review_levels @=@ 0001 @=@ Research articles evaluated by double blind +dnet:review_levels @=@ 0001 @=@ Resenha avaliada pelos pares +dnet:review_levels @=@ 0001 @=@ Review article (peer-reviewed) +dnet:review_levels @=@ 0001 @=@ Reviewed by peers +dnet:review_levels @=@ 0001 @=@ Revisión por Expertos +dnet:review_levels @=@ 0001 @=@ Revisto por Pares +dnet:review_levels @=@ 0001 @=@ SBBq abstracts / peer-reviewed +dnet:review_levels @=@ 0001 @=@ SBBq resúmenes - revisada por pares +dnet:review_levels @=@ 0001 @=@ Scholarly publ. 
Refereed +dnet:review_levels @=@ 0001 @=@ Scientific Publ (refereed) +dnet:review_levels @=@ 0001 @=@ Vertaisarvioimaton kirjoitus tieteellisessä aikakauslehdessä +dnet:review_levels @=@ 0001 @=@ Vertaisarvioitu alkuperäisartikkeli tieteellisessä aikakauslehdessä +dnet:review_levels @=@ 0001 @=@ Vertaisarvioitu artikkeli konferenssijulkaisussa +dnet:review_levels @=@ 0001 @=@ Vertaisarvioitu artikkeli tieteellisessä aikakauslehdessä +dnet:review_levels @=@ 0001 @=@ Vertaisarvioitu kirjan tai muun kokoomateoksen osa +dnet:review_levels @=@ 0001 @=@ Wetensch. publ. Refereed +dnet:review_levels @=@ 0001 @=@ article in peer-reviewed journal +dnet:review_levels @=@ 0001 @=@ articles validés +dnet:review_levels @=@ 0001 @=@ avaliado por pares, temas livres +dnet:review_levels @=@ 0001 @=@ info:eu-repo/semantics/peerReviewed +dnet:review_levels @=@ 0001 @=@ info:ulb-repo/semantics/articlePeerReview +dnet:review_levels @=@ 0001 @=@ proceeding with peer review +dnet:review_levels @=@ 0001 @=@ refereed_publications +dnet:review_levels @=@ 0001 @=@ ul_published_reviewed +dnet:review_levels @=@ 0001 @=@ Άρθρο που έχει αξιολογηθεί από ομότιμους ειδικούς +dnet:review_levels @=@ 0001 @=@ Άρθρο το οποίο έχει περάσει από ομότιμη αξιολόγηση +dnet:review_levels @=@ 0001 @=@ レフェリー付き論文 +dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-テクニカルレポート類(査読有り) +dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-会議発表論文(査読有り) +dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-図書(査読有り) +dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-学術雑誌論文(査読有り) +dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-紀要論文(査読有り) +dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-雑誌記事(査読有り) +dnet:review_levels @=@ 0001 @=@ 原著論文(査読有り) +dnet:review_levels @=@ 0001 @=@ 査読論文
\ No newline at end of file
diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/transform/terms.txt b/dhp-common/src/test/resources/eu/dnetlib/dhp/transform/terms.txt
new file mode 100644
index 000000000..93cc00eca
--- /dev/null
+++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/transform/terms.txt
@@ -0,0 +1,1080 @@
+ModularUiLabels @=@ ModularUiLabels @=@ PendingRepositoryResources @=@ Pending datasource +ModularUiLabels @=@ ModularUiLabels @=@ RepositoryServiceResources @=@ Valid datasource +dnet:content_description_typologies @=@ D-Net Content Description Typologies @=@ file::EuropePMC @=@ file::EuropePMC +dnet:content_description_typologies @=@ D-Net Content Description Typologies @=@ file::PDF @=@ file::PDF +dnet:content_description_typologies @=@ D-Net Content Description Typologies @=@ file::WoS @=@ file::WoS +dnet:content_description_typologies @=@ D-Net Content Description Typologies @=@ metadata @=@ metadata +dnet:content_description_typologies @=@ D-Net Content Description Typologies @=@ file::hybrid @=@ file::hybrid +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk:cris @=@ Harvested +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:actionset:orcidworks-no-doi @=@ Harvested +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk:infospace @=@ Harvested +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk @=@ Harvested +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk:aggregator @=@ Harvested +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk:datasetarchive @=@ Harvested +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:actionset @=@ Harvested +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:crosswalk:entityregistry @=@ Harvested +dnet:provenanceActions
@=@ dnet:provenanceActions @=@ sysimport:crosswalk:repository @=@ Harvested +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:aggregator @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ community:subject @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ community:zenodocommunity @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ iis @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:entityregistry @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ community:organization @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:infospace @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:dedup @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ community:datasource @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ propagation:project:semrel @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:cris @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:repository @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ sysimport:mining:datasetarchive @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ community:semrel @=@ Inferred by OpenAIRE +dnet:provenanceActions @=@ dnet:provenanceActions @=@ user:claim @=@ Linked by user +dnet:provenanceActions @=@ dnet:provenanceActions @=@ user:claim:pid @=@ Linked by user +dnet:provenanceActions @=@ dnet:provenanceActions @=@ user:insert @=@ Linked by user +dnet:provenanceActions @=@ dnet:provenanceActions @=@ user:claim:search @=@ Linked by user +dnet:provenanceActions @=@ dnet:provenanceActions @=@ UNKNOWN @=@ UNKNOWN +dnet:provenanceActions @=@ dnet:provenanceActions @=@ country:instrepos @=@ Inferred by OpenAIRE +dnet:access_modes @=@ dnet:access_modes @=@ 12MONTHS @=@ 12 Months Embargo +dnet:access_modes @=@ dnet:access_modes @=@ 6MONTHS @=@ 6 Months Embargo +dnet:access_modes @=@ dnet:access_modes @=@ CLOSED @=@ Closed Access +dnet:access_modes @=@ dnet:access_modes @=@ EMBARGO @=@ Embargo +dnet:access_modes @=@ dnet:access_modes @=@ OPEN @=@ Open Access +dnet:access_modes @=@ dnet:access_modes @=@ OPEN SOURCE @=@ Open Source +dnet:access_modes @=@ dnet:access_modes @=@ OTHER @=@ Other +dnet:access_modes @=@ dnet:access_modes @=@ RESTRICTED @=@ Restricted +dnet:access_modes @=@ dnet:access_modes @=@ UNKNOWN @=@ not available +fct:funding_typologies @=@ fct:funding_typologies @=@ fct:program @=@ fct:program +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ openaire2.0 @=@ OpenAIRE 2.0 (EC funding) +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ openaire3.0 @=@ OpenAIRE 3.0 (OA, funding) +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ driver @=@ OpenAIRE Basic (DRIVER OA) +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ openaire-cris_1.1 @=@ OpenAIRE CRIS v1.1 +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ openaire2.0_data @=@ OpenAIRE Data (funded, referenced datasets) +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ openaire-pub_4.0 @=@ OpenAIRE PubRepos v4.0 +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ hostedBy @=@ collected from a compatible aggregator +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ files @=@ files 
+dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ native @=@ native +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ UNKNOWN @=@ not available +dnet:compatibilityLevel @=@ dnet:compatibilityLevel @=@ notCompatible @=@ under validation +dnet:dataCite_date @=@ dnet:dataCite_date @=@ UNKNOWN @=@ UNKNOWN +dnet:dataCite_date @=@ dnet:dataCite_date @=@ available @=@ available +dnet:dataCite_date @=@ dnet:dataCite_date @=@ copyrighted @=@ copyrighted +dnet:dataCite_date @=@ dnet:dataCite_date @=@ created @=@ created +dnet:dataCite_date @=@ dnet:dataCite_date @=@ endDate @=@ endDate +dnet:dataCite_date @=@ dnet:dataCite_date @=@ issued @=@ issued +dnet:dataCite_date @=@ dnet:dataCite_date @=@ startDate @=@ startDate +dnet:dataCite_date @=@ dnet:dataCite_date @=@ submitted @=@ submitted +dnet:dataCite_date @=@ dnet:dataCite_date @=@ updated @=@ updated +dnet:dataCite_date @=@ dnet:dataCite_date @=@ valid @=@ valid +dnet:dataCite_date @=@ dnet:dataCite_date @=@ published-print @=@ published-print +dnet:dataCite_date @=@ dnet:dataCite_date @=@ published-online @=@ published-online +dnet:dataCite_date @=@ dnet:dataCite_date @=@ accepted @=@ accepted +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ crissystem @=@ CRIS System +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ datarepository::unknown @=@ Data Repository +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ aggregator::datarepository @=@ Data Repository Aggregator +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry::projects @=@ Funder database +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ infospace @=@ Information Space +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubsrepository::institutional @=@ Institutional Repository +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ aggregator::pubsrepository::institutional @=@ Institutional Repository Aggregator +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubsrepository::journal @=@ Journal +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ aggregator::pubsrepository::journals @=@ Journal Aggregator/Publisher +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubsrepository::mock @=@ Other +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubscatalogue::unknown @=@ Publication Catalogue +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubsrepository::unknown @=@ Publication Repository +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ aggregator::pubsrepository::unknown @=@ Publication Repository Aggregator +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry @=@ Registry +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry::repositories @=@ Registry of repositories +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry::products @=@ Registry of research products +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry::researchers @=@ Registry of researchers +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ entityregistry::organizations @=@ Registry of organizations +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ scholarcomminfra @=@ Scholarly Comm. 
Infrastructure +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ softwarerepository @=@ Software Repository +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ pubsrepository::thematic @=@ Thematic Repository +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ websource @=@ Web Source +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ aggregator::softwarerepository @=@ Software Repository Aggregator +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ orprepository @=@ Repository +dnet:datasource_typologies @=@ dnet:datasource_typologies @=@ researchgraph @=@ Research Graph +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ ACM @=@ ACM Computing Classification System +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ agrovoc @=@ AGROVOC +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ bicssc @=@ BIC standard subject categories +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ DFG @=@ DFG Classification +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ ddc @=@ Dewey Decimal Classification +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ nsf:fieldOfApplication @=@ Field of Application (NSF) +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ gok @=@ Göttingen Online Classification +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ ec:h2020topics @=@ Horizon 2020 Topics +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ IPC @=@ International Patent Classification +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ jel @=@ JEL Classification +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ lcsh @=@ Library of Congress Subject Headings +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ msc @=@ Mathematics Subject Classification +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ mesheuropmc @=@ Medical Subject Headings +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ mesh @=@ Medical Subject Headings +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ bk @=@ Nederlandse basisclassificatie +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ dnet:od_subjects @=@ OpenDOAR subjects +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ ocis @=@ Optics Classification and Indexing Scheme +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ pacs @=@ Physics and Astronomy Classification Scheme +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ rvk @=@ Regensburger Verbundklassifikation +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ UNKNOWN @=@ UNKNOWN +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ udc @=@ Universal Decimal Classification +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ wos @=@ Web of Science Subject Areas +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ arxiv @=@ arXiv 
+dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ keyword @=@ keyword +dnet:subject_classification_typologies @=@ dnet:subject_classification_typologies @=@ MAG @=@ Microsoft Academic Graph classification +fct:contractTypes @=@ fct:contractTypes @=@ UNKNOWN @=@ UNKNOWN +dnet:publication_resource @=@ dnet:publication_resource @=@ 0018 @=@ Annotation +dnet:publication_resource @=@ dnet:publication_resource @=@ 0001 @=@ Article +dnet:publication_resource @=@ dnet:publication_resource @=@ 0033 @=@ Audiovisual +dnet:publication_resource @=@ dnet:publication_resource @=@ 0008 @=@ Bachelor thesis +dnet:publication_resource @=@ dnet:publication_resource @=@ 0046 @=@ Bioentity +dnet:publication_resource @=@ dnet:publication_resource @=@ 0002 @=@ Book +dnet:publication_resource @=@ dnet:publication_resource @=@ 0037 @=@ Clinical Trial +dnet:publication_resource @=@ dnet:publication_resource @=@ 0022 @=@ Collection +dnet:publication_resource @=@ dnet:publication_resource @=@ 0004 @=@ Conference object +dnet:publication_resource @=@ dnet:publication_resource @=@ 0005 @=@ Contribution for newspaper or weekly magazine +dnet:publication_resource @=@ dnet:publication_resource @=@ 0045 @=@ Data Management Plan +dnet:publication_resource @=@ dnet:publication_resource @=@ 0031 @=@ Data Paper +dnet:publication_resource @=@ dnet:publication_resource @=@ 0021 @=@ Dataset +dnet:publication_resource @=@ dnet:publication_resource @=@ 0006 @=@ Doctoral thesis +dnet:publication_resource @=@ dnet:publication_resource @=@ 0023 @=@ Event +dnet:publication_resource @=@ dnet:publication_resource @=@ 0009 @=@ External research report +dnet:publication_resource @=@ dnet:publication_resource @=@ 0024 @=@ Film +dnet:publication_resource @=@ dnet:publication_resource @=@ 0025 @=@ Image +dnet:publication_resource @=@ dnet:publication_resource @=@ 0026 @=@ InteractiveResource +dnet:publication_resource @=@ dnet:publication_resource @=@ 0011 @=@ Internal report +dnet:publication_resource @=@ dnet:publication_resource @=@ 0043 @=@ Journal +dnet:publication_resource @=@ dnet:publication_resource @=@ 0010 @=@ Lecture +dnet:publication_resource @=@ dnet:publication_resource @=@ 0007 @=@ Master thesis +dnet:publication_resource @=@ dnet:publication_resource @=@ 0027 @=@ Model +dnet:publication_resource @=@ dnet:publication_resource @=@ 0012 @=@ Newsletter +dnet:publication_resource @=@ dnet:publication_resource @=@ 0020 @=@ Other ORP type +dnet:publication_resource @=@ dnet:publication_resource @=@ 0039 @=@ Other dataset type +dnet:publication_resource @=@ dnet:publication_resource @=@ 0038 @=@ Other literature type +dnet:publication_resource @=@ dnet:publication_resource @=@ 0040 @=@ Other software type +dnet:publication_resource @=@ dnet:publication_resource @=@ 0013 @=@ Part of book or chapter of book +dnet:publication_resource @=@ dnet:publication_resource @=@ 0019 @=@ Patent +dnet:publication_resource @=@ dnet:publication_resource @=@ 0028 @=@ PhysicalObject +dnet:publication_resource @=@ dnet:publication_resource @=@ 0016 @=@ Preprint +dnet:publication_resource @=@ dnet:publication_resource @=@ 0034 @=@ Project deliverable +dnet:publication_resource @=@ dnet:publication_resource @=@ 0035 @=@ Project milestone +dnet:publication_resource @=@ dnet:publication_resource @=@ 0036 @=@ Project proposal +dnet:publication_resource @=@ dnet:publication_resource @=@ 0017 @=@ Report +dnet:publication_resource @=@ dnet:publication_resource @=@ 0014 @=@ Research +dnet:publication_resource @=@ 
dnet:publication_resource @=@ 0015 @=@ Review +dnet:publication_resource @=@ dnet:publication_resource @=@ 0029 @=@ Software +dnet:publication_resource @=@ dnet:publication_resource @=@ 0032 @=@ Software Paper +dnet:publication_resource @=@ dnet:publication_resource @=@ 0030 @=@ Sound +dnet:publication_resource @=@ dnet:publication_resource @=@ 0044 @=@ Thesis +dnet:publication_resource @=@ dnet:publication_resource @=@ 0000 @=@ Unknown +dnet:publication_resource @=@ dnet:publication_resource @=@ 0042 @=@ Virtual Appliance +ec:funding_typologies @=@ ec:funding_typologies @=@ ec:frameworkprogram @=@ frameworkprogram +ec:funding_typologies @=@ ec:funding_typologies @=@ ec:program @=@ program +ec:funding_typologies @=@ ec:funding_typologies @=@ ec:specificprogram @=@ specificprogram +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ 171 @=@ Article 171 of the Treaty +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ BSG @=@ Research for the benefit of specific groups +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ CIP-EIP-TN @=@ CIP-Eco-Innovation - CIP-Thematic Network +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ CP @=@ Collaborative project +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ CP-CSA @=@ Combination of CP & CSA +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ CSA @=@ Coordination and support action +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ ERC @=@ Support for frontier research (ERC) +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ MC @=@ Support for training and career development of researchers (Marie Curie) +ec:FP7contractTypes @=@ ec:FP7contractTypes @=@ NoE @=@ Network of Excellence +wt:funding_relations @=@ wt:funding_relations @=@ wt:hasParentFunding @=@ wt:hasParentFunding +dnet:languages @=@ dnet:languages @=@ abk @=@ Abkhazian +dnet:languages @=@ dnet:languages @=@ ace @=@ Achinese +dnet:languages @=@ dnet:languages @=@ ach @=@ Acoli +dnet:languages @=@ dnet:languages @=@ ada @=@ Adangme +dnet:languages @=@ dnet:languages @=@ aar @=@ Afar +dnet:languages @=@ dnet:languages @=@ afh @=@ Afrihili +dnet:languages @=@ dnet:languages @=@ afr @=@ Afrikaans +dnet:languages @=@ dnet:languages @=@ afa @=@ Afro-Asiatic +dnet:languages @=@ dnet:languages @=@ aka @=@ Akan +dnet:languages @=@ dnet:languages @=@ akk @=@ Akkadian +dnet:languages @=@ dnet:languages @=@ alb/sqi @=@ Albanian +dnet:languages @=@ dnet:languages @=@ ale @=@ Aleut +dnet:languages @=@ dnet:languages @=@ alg @=@ Algonquian languages +dnet:languages @=@ dnet:languages @=@ tut @=@ Altaic +dnet:languages @=@ dnet:languages @=@ amh @=@ Amharic +dnet:languages @=@ dnet:languages @=@ egy @=@ Ancient Egyptian +dnet:languages @=@ dnet:languages @=@ grc @=@ Ancient Greek +dnet:languages @=@ dnet:languages @=@ apa @=@ Apache +dnet:languages @=@ dnet:languages @=@ ara @=@ Arabic +dnet:languages @=@ dnet:languages @=@ arg @=@ Aragonese +dnet:languages @=@ dnet:languages @=@ arc @=@ Aramaic +dnet:languages @=@ dnet:languages @=@ arp @=@ Arapaho +dnet:languages @=@ dnet:languages @=@ arn @=@ Araucanian +dnet:languages @=@ dnet:languages @=@ arw @=@ Arawak +dnet:languages @=@ dnet:languages @=@ arm/hye @=@ Armenian +dnet:languages @=@ dnet:languages @=@ art @=@ Artificial +dnet:languages @=@ dnet:languages @=@ asm @=@ Assamese +dnet:languages @=@ dnet:languages @=@ ath @=@ Athapascan +dnet:languages @=@ dnet:languages @=@ map @=@ Austronesian +dnet:languages @=@ dnet:languages @=@ ina @=@ Auxiliary Language Association) +dnet:languages @=@ dnet:languages @=@ ava @=@ Avaric +dnet:languages @=@ dnet:languages @=@ ave 
@=@ Avestan +dnet:languages @=@ dnet:languages @=@ awa @=@ Awadhi +dnet:languages @=@ dnet:languages @=@ aym @=@ Aymara +dnet:languages @=@ dnet:languages @=@ aze @=@ Azerbaijani +dnet:languages @=@ dnet:languages @=@ nah @=@ Aztec +dnet:languages @=@ dnet:languages @=@ ban @=@ Balinese +dnet:languages @=@ dnet:languages @=@ bat @=@ Baltic +dnet:languages @=@ dnet:languages @=@ bal @=@ Baluchi +dnet:languages @=@ dnet:languages @=@ bam @=@ Bambara +dnet:languages @=@ dnet:languages @=@ bai @=@ Bamileke +dnet:languages @=@ dnet:languages @=@ bad @=@ Banda +dnet:languages @=@ dnet:languages @=@ bnt @=@ Bantu +dnet:languages @=@ dnet:languages @=@ bas @=@ Basa +dnet:languages @=@ dnet:languages @=@ bak @=@ Bashkir +dnet:languages @=@ dnet:languages @=@ baq/eus @=@ Basque +dnet:languages @=@ dnet:languages @=@ bej @=@ Beja +dnet:languages @=@ dnet:languages @=@ bel @=@ Belarusian +dnet:languages @=@ dnet:languages @=@ bem @=@ Bemba +dnet:languages @=@ dnet:languages @=@ ben @=@ Bengali +dnet:languages @=@ dnet:languages @=@ ber @=@ Berber +dnet:languages @=@ dnet:languages @=@ bho @=@ Bhojpuri +dnet:languages @=@ dnet:languages @=@ bih @=@ Bihari +dnet:languages @=@ dnet:languages @=@ bik @=@ Bikol +dnet:languages @=@ dnet:languages @=@ bin @=@ Bini +dnet:languages @=@ dnet:languages @=@ bis @=@ Bislama +dnet:languages @=@ dnet:languages @=@ nob @=@ Bokmål, Norwegian; Norwegian Bokmål +dnet:languages @=@ dnet:languages @=@ bos @=@ Bosnian +dnet:languages @=@ dnet:languages @=@ bra @=@ Braj +dnet:languages @=@ dnet:languages @=@ bre @=@ Breton +dnet:languages @=@ dnet:languages @=@ bug @=@ Buginese +dnet:languages @=@ dnet:languages @=@ bul @=@ Bulgarian +dnet:languages @=@ dnet:languages @=@ bua @=@ Buriat +dnet:languages @=@ dnet:languages @=@ bur/mya @=@ Burmese +dnet:languages @=@ dnet:languages @=@ cad @=@ Caddo +dnet:languages @=@ dnet:languages @=@ car @=@ Carib +dnet:languages @=@ dnet:languages @=@ cat @=@ Catalan; Valencian +dnet:languages @=@ dnet:languages @=@ cau @=@ Caucasian +dnet:languages @=@ dnet:languages @=@ ceb @=@ Cebuano +dnet:languages @=@ dnet:languages @=@ cel @=@ Celtic +dnet:languages @=@ dnet:languages @=@ cai @=@ Central American Indian +dnet:languages @=@ dnet:languages @=@ chg @=@ Chagatai +dnet:languages @=@ dnet:languages @=@ cha @=@ Chamorro +dnet:languages @=@ dnet:languages @=@ che @=@ Chechen +dnet:languages @=@ dnet:languages @=@ chr @=@ Cherokee +dnet:languages @=@ dnet:languages @=@ nya @=@ Chewa; Chichewa; Nyanja +dnet:languages @=@ dnet:languages @=@ chy @=@ Cheyenne +dnet:languages @=@ dnet:languages @=@ chb @=@ Chibcha +dnet:languages @=@ dnet:languages @=@ chi/zho @=@ Chinese +dnet:languages @=@ dnet:languages @=@ chn @=@ Chinook jargon +dnet:languages @=@ dnet:languages @=@ cho @=@ Choctaw +dnet:languages @=@ dnet:languages @=@ chu @=@ Church Slavic; Slavonic; Church Slavonic; Old Bulgarian; Old Church Slavonic +dnet:languages @=@ dnet:languages @=@ chv @=@ Chuvash +dnet:languages @=@ dnet:languages @=@ cop @=@ Coptic +dnet:languages @=@ dnet:languages @=@ cor @=@ Cornish +dnet:languages @=@ dnet:languages @=@ cos @=@ Corsican +dnet:languages @=@ dnet:languages @=@ cre @=@ Cree +dnet:languages @=@ dnet:languages @=@ mus @=@ Creek +dnet:languages @=@ dnet:languages @=@ crp @=@ Creoles and Pidgins +dnet:languages @=@ dnet:languages @=@ hrv @=@ Croatian +dnet:languages @=@ dnet:languages @=@ cus @=@ Cushitic +dnet:languages @=@ dnet:languages @=@ ces/cze @=@ Czech +dnet:languages @=@ dnet:languages @=@ dak @=@ Dakota +dnet:languages @=@ dnet:languages 
@=@ dan @=@ Danish +dnet:languages @=@ dnet:languages @=@ del @=@ Delaware +dnet:languages @=@ dnet:languages @=@ din @=@ Dinka +dnet:languages @=@ dnet:languages @=@ div @=@ Divehi +dnet:languages @=@ dnet:languages @=@ doi @=@ Dogri +dnet:languages @=@ dnet:languages @=@ dra @=@ Dravidian +dnet:languages @=@ dnet:languages @=@ dua @=@ Duala +dnet:languages @=@ dnet:languages @=@ dut/nld @=@ Dutch; Flemish +dnet:languages @=@ dnet:languages @=@ dyu @=@ Dyula +dnet:languages @=@ dnet:languages @=@ dzo @=@ Dzongkha +dnet:languages @=@ dnet:languages @=@ efi @=@ Efik +dnet:languages @=@ dnet:languages @=@ eka @=@ Ekajuk +dnet:languages @=@ dnet:languages @=@ elx @=@ Elamite +dnet:languages @=@ dnet:languages @=@ eng @=@ English +dnet:languages @=@ dnet:languages @=@ cpe @=@ English-based Creoles and Pidgins +dnet:languages @=@ dnet:languages @=@ esk @=@ Eskimo +dnet:languages @=@ dnet:languages @=@ epo @=@ Esperanto +dnet:languages @=@ dnet:languages @=@ est @=@ Estonian +dnet:languages @=@ dnet:languages @=@ ewe @=@ Ewe +dnet:languages @=@ dnet:languages @=@ ewo @=@ Ewondo +dnet:languages @=@ dnet:languages @=@ fan @=@ Fang +dnet:languages @=@ dnet:languages @=@ fat @=@ Fanti +dnet:languages @=@ dnet:languages @=@ fao @=@ Faroese +dnet:languages @=@ dnet:languages @=@ fij @=@ Fijian +dnet:languages @=@ dnet:languages @=@ fin @=@ Finnish +dnet:languages @=@ dnet:languages @=@ fiu @=@ Finno-Ugrian +dnet:languages @=@ dnet:languages @=@ fon @=@ Fon +dnet:languages @=@ dnet:languages @=@ fra/fre @=@ French +dnet:languages @=@ dnet:languages @=@ cpf @=@ French-based Creoles and Pidgins +dnet:languages @=@ dnet:languages @=@ fry @=@ Frisian +dnet:languages @=@ dnet:languages @=@ ful @=@ Fulah +dnet:languages @=@ dnet:languages @=@ gaa @=@ Ga +dnet:languages @=@ dnet:languages @=@ gae/gdh @=@ Gaelic +dnet:languages @=@ dnet:languages @=@ gla @=@ Gaelic; Scottish Gaelic +dnet:languages @=@ dnet:languages @=@ glg @=@ Galician +dnet:languages @=@ dnet:languages @=@ lug @=@ Ganda +dnet:languages @=@ dnet:languages @=@ gay @=@ Gayo +dnet:languages @=@ dnet:languages @=@ gez @=@ Geez +dnet:languages @=@ dnet:languages @=@ geo/kat @=@ Georgian +dnet:languages @=@ dnet:languages @=@ deu/ger @=@ German +dnet:languages @=@ dnet:languages @=@ gem @=@ Germanic +dnet:languages @=@ dnet:languages @=@ kik @=@ Gikuyu; Kikuyu +dnet:languages @=@ dnet:languages @=@ gil @=@ Gilbertese +dnet:languages @=@ dnet:languages @=@ gon @=@ Gondi +dnet:languages @=@ dnet:languages @=@ got @=@ Gothic +dnet:languages @=@ dnet:languages @=@ grb @=@ Grebo +dnet:languages @=@ dnet:languages @=@ ell/gre @=@ Greek +dnet:languages @=@ dnet:languages @=@ gre/ell @=@ Greek, Modern (1453-) +dnet:languages @=@ dnet:languages @=@ kal @=@ Greenlandic; Kalaallisut +dnet:languages @=@ dnet:languages @=@ grn @=@ Guarani +dnet:languages @=@ dnet:languages @=@ guj @=@ Gujarati +dnet:languages @=@ dnet:languages @=@ hai @=@ Haida +dnet:languages @=@ dnet:languages @=@ hat @=@ Haitian; Haitian Creole +dnet:languages @=@ dnet:languages @=@ hau @=@ Hausa +dnet:languages @=@ dnet:languages @=@ haw @=@ Hawaiian +dnet:languages @=@ dnet:languages @=@ heb @=@ Hebrew +dnet:languages @=@ dnet:languages @=@ her @=@ Herero +dnet:languages @=@ dnet:languages @=@ hil @=@ Hiligaynon +dnet:languages @=@ dnet:languages @=@ him @=@ Himachali +dnet:languages @=@ dnet:languages @=@ hin @=@ Hindi +dnet:languages @=@ dnet:languages @=@ hmo @=@ Hiri Motu +dnet:languages @=@ dnet:languages @=@ hun @=@ Hungarian +dnet:languages @=@ dnet:languages @=@ hup @=@ Hupa 
+dnet:languages @=@ dnet:languages @=@ iba @=@ Iban +dnet:languages @=@ dnet:languages @=@ ice/isl @=@ Icelandic +dnet:languages @=@ dnet:languages @=@ ido @=@ Ido +dnet:languages @=@ dnet:languages @=@ ibo @=@ Igbo +dnet:languages @=@ dnet:languages @=@ ijo @=@ Ijo +dnet:languages @=@ dnet:languages @=@ ilo @=@ Iloko +dnet:languages @=@ dnet:languages @=@ inc @=@ Indic +dnet:languages @=@ dnet:languages @=@ ine @=@ Indo-European +dnet:languages @=@ dnet:languages @=@ ind @=@ Indonesian +dnet:languages @=@ dnet:languages @=@ ile @=@ Interlingue +dnet:languages @=@ dnet:languages @=@ iku @=@ Inuktitut +dnet:languages @=@ dnet:languages @=@ ipk @=@ Inupiaq +dnet:languages @=@ dnet:languages @=@ ira @=@ Iranian +dnet:languages @=@ dnet:languages @=@ gai/iri @=@ Irish +dnet:languages @=@ dnet:languages @=@ iro @=@ Iroquoian +dnet:languages @=@ dnet:languages @=@ ita @=@ Italian +dnet:languages @=@ dnet:languages @=@ jpn @=@ Japanese +dnet:languages @=@ dnet:languages @=@ jav @=@ Javanese +dnet:languages @=@ dnet:languages @=@ jrb @=@ Judeo-Arabic +dnet:languages @=@ dnet:languages @=@ jpr @=@ Judeo-Persian +dnet:languages @=@ dnet:languages @=@ kab @=@ Kabyle +dnet:languages @=@ dnet:languages @=@ kac @=@ Kachin +dnet:languages @=@ dnet:languages @=@ kam @=@ Kamba +dnet:languages @=@ dnet:languages @=@ kan @=@ Kannada +dnet:languages @=@ dnet:languages @=@ kau @=@ Kanuri +dnet:languages @=@ dnet:languages @=@ kaa @=@ Kara-Kalpak +dnet:languages @=@ dnet:languages @=@ kar @=@ Karen +dnet:languages @=@ dnet:languages @=@ kas @=@ Kashmiri +dnet:languages @=@ dnet:languages @=@ kaw @=@ Kawi +dnet:languages @=@ dnet:languages @=@ kaz @=@ Kazakh +dnet:languages @=@ dnet:languages @=@ kha @=@ Khasi +dnet:languages @=@ dnet:languages @=@ khm @=@ Khmer +dnet:languages @=@ dnet:languages @=@ khi @=@ Khoisan +dnet:languages @=@ dnet:languages @=@ kho @=@ Khotanese +dnet:languages @=@ dnet:languages @=@ kin @=@ Kinyarwanda +dnet:languages @=@ dnet:languages @=@ kir @=@ Kirghiz +dnet:languages @=@ dnet:languages @=@ kom @=@ Komi +dnet:languages @=@ dnet:languages @=@ kon @=@ Kongo +dnet:languages @=@ dnet:languages @=@ kok @=@ Konkani +dnet:languages @=@ dnet:languages @=@ kor @=@ Korean +dnet:languages @=@ dnet:languages @=@ kpe @=@ Kpelle +dnet:languages @=@ dnet:languages @=@ kro @=@ Kru +dnet:languages @=@ dnet:languages @=@ kua @=@ Kuanyama; Kwanyama +dnet:languages @=@ dnet:languages @=@ kum @=@ Kumyk +dnet:languages @=@ dnet:languages @=@ kur @=@ Kurdish +dnet:languages @=@ dnet:languages @=@ kru @=@ Kurukh +dnet:languages @=@ dnet:languages @=@ kus @=@ Kusaie +dnet:languages @=@ dnet:languages @=@ kut @=@ Kutenai +dnet:languages @=@ dnet:languages @=@ lad @=@ Ladino +dnet:languages @=@ dnet:languages @=@ lah @=@ Lahnda +dnet:languages @=@ dnet:languages @=@ lam @=@ Lamba +dnet:languages @=@ dnet:languages @=@ lao @=@ Lao +dnet:languages @=@ dnet:languages @=@ lat @=@ Latin +dnet:languages @=@ dnet:languages @=@ lav @=@ Latvian +dnet:languages @=@ dnet:languages @=@ ltz @=@ Letzeburgesch; Luxembourgish +dnet:languages @=@ dnet:languages @=@ lez @=@ Lezghian +dnet:languages @=@ dnet:languages @=@ lim @=@ Limburgan; Limburger; Limburgish +dnet:languages @=@ dnet:languages @=@ lin @=@ Lingala +dnet:languages @=@ dnet:languages @=@ lit @=@ Lithuanian +dnet:languages @=@ dnet:languages @=@ loz @=@ Lozi +dnet:languages @=@ dnet:languages @=@ lub @=@ Luba-Katanga +dnet:languages @=@ dnet:languages @=@ lui @=@ Luiseno +dnet:languages @=@ dnet:languages @=@ lun @=@ Lunda +dnet:languages @=@ dnet:languages 
@=@ luo @=@ Luo +dnet:languages @=@ dnet:languages @=@ mac/mak @=@ Macedonian +dnet:languages @=@ dnet:languages @=@ mad @=@ Madurese +dnet:languages @=@ dnet:languages @=@ mag @=@ Magahi +dnet:languages @=@ dnet:languages @=@ mai @=@ Maithili +dnet:languages @=@ dnet:languages @=@ mak @=@ Makasar +dnet:languages @=@ dnet:languages @=@ mlg @=@ Malagasy +dnet:languages @=@ dnet:languages @=@ may/msa @=@ Malay +dnet:languages @=@ dnet:languages @=@ mal @=@ Malayalam +dnet:languages @=@ dnet:languages @=@ mlt @=@ Maltese +dnet:languages @=@ dnet:languages @=@ man @=@ Mandingo +dnet:languages @=@ dnet:languages @=@ mni @=@ Manipuri +dnet:languages @=@ dnet:languages @=@ mno @=@ Manobo +dnet:languages @=@ dnet:languages @=@ glv @=@ Manx +dnet:languages @=@ dnet:languages @=@ mao/mri @=@ Maori +dnet:languages @=@ dnet:languages @=@ mar @=@ Marathi +dnet:languages @=@ dnet:languages @=@ chm @=@ Mari +dnet:languages @=@ dnet:languages @=@ mah @=@ Marshallese +dnet:languages @=@ dnet:languages @=@ mwr @=@ Marwari +dnet:languages @=@ dnet:languages @=@ mas @=@ Masai +dnet:languages @=@ dnet:languages @=@ myn @=@ Mayan +dnet:languages @=@ dnet:languages @=@ men @=@ Mende +dnet:languages @=@ dnet:languages @=@ mic @=@ Micmac +dnet:languages @=@ dnet:languages @=@ dum @=@ Middle Dutch +dnet:languages @=@ dnet:languages @=@ enm @=@ Middle English +dnet:languages @=@ dnet:languages @=@ frm @=@ Middle French +dnet:languages @=@ dnet:languages @=@ gmh @=@ Middle High German +dnet:languages @=@ dnet:languages @=@ mga @=@ Middle Irish +dnet:languages @=@ dnet:languages @=@ min @=@ Minangkabau +dnet:languages @=@ dnet:languages @=@ mis @=@ Miscellaneous +dnet:languages @=@ dnet:languages @=@ moh @=@ Mohawk +dnet:languages @=@ dnet:languages @=@ mol @=@ Moldavian +dnet:languages @=@ dnet:languages @=@ mkh @=@ Mon-Kmer +dnet:languages @=@ dnet:languages @=@ lol @=@ Mongo +dnet:languages @=@ dnet:languages @=@ mon @=@ Mongolian +dnet:languages @=@ dnet:languages @=@ mos @=@ Mossi +dnet:languages @=@ dnet:languages @=@ mul @=@ Multiple languages +dnet:languages @=@ dnet:languages @=@ mun @=@ Munda +dnet:languages @=@ dnet:languages @=@ nau @=@ Nauru +dnet:languages @=@ dnet:languages @=@ nav @=@ Navajo; Navaho +dnet:languages @=@ dnet:languages @=@ nde @=@ Ndebele, North +dnet:languages @=@ dnet:languages @=@ nbl @=@ Ndebele, South +dnet:languages @=@ dnet:languages @=@ ndo @=@ Ndonga +dnet:languages @=@ dnet:languages @=@ nep @=@ Nepali +dnet:languages @=@ dnet:languages @=@ new @=@ Newari +dnet:languages @=@ dnet:languages @=@ nic @=@ Niger-Kordofanian +dnet:languages @=@ dnet:languages @=@ ssa @=@ Nilo-Saharan +dnet:languages @=@ dnet:languages @=@ niu @=@ Niuean +dnet:languages @=@ dnet:languages @=@ non @=@ Norse +dnet:languages @=@ dnet:languages @=@ nai @=@ North American Indian +dnet:languages @=@ dnet:languages @=@ sme @=@ Northern Sami +dnet:languages @=@ dnet:languages @=@ nor @=@ Norwegian +dnet:languages @=@ dnet:languages @=@ nno @=@ Norwegian Nynorsk; Nynorsk, Norwegian +dnet:languages @=@ dnet:languages @=@ nub @=@ Nubian +dnet:languages @=@ dnet:languages @=@ nym @=@ Nyamwezi +dnet:languages @=@ dnet:languages @=@ nyn @=@ Nyankole +dnet:languages @=@ dnet:languages @=@ nyo @=@ Nyoro +dnet:languages @=@ dnet:languages @=@ nzi @=@ Nzima +dnet:languages @=@ dnet:languages @=@ oci @=@ Occitan (post 1500); Provençal +dnet:languages @=@ dnet:languages @=@ oji @=@ Ojibwa +dnet:languages @=@ dnet:languages @=@ ang @=@ Old English +dnet:languages @=@ dnet:languages @=@ fro @=@ Old French +dnet:languages 
@=@ dnet:languages @=@ goh @=@ Old High German +dnet:languages @=@ dnet:languages @=@ ori @=@ Oriya +dnet:languages @=@ dnet:languages @=@ orm @=@ Oromo +dnet:languages @=@ dnet:languages @=@ osa @=@ Osage +dnet:languages @=@ dnet:languages @=@ oss @=@ Ossetian; Ossetic +dnet:languages @=@ dnet:languages @=@ oto @=@ Otomian +dnet:languages @=@ dnet:languages @=@ ota @=@ Ottoman +dnet:languages @=@ dnet:languages @=@ pal @=@ Pahlavi +dnet:languages @=@ dnet:languages @=@ pau @=@ Palauan +dnet:languages @=@ dnet:languages @=@ pli @=@ Pali +dnet:languages @=@ dnet:languages @=@ pam @=@ Pampanga +dnet:languages @=@ dnet:languages @=@ pag @=@ Pangasinan +dnet:languages @=@ dnet:languages @=@ pan @=@ Panjabi; Punjabi +dnet:languages @=@ dnet:languages @=@ pap @=@ Papiamento +dnet:languages @=@ dnet:languages @=@ paa @=@ Papuan-Australian +dnet:languages @=@ dnet:languages @=@ fas/per @=@ Persian +dnet:languages @=@ dnet:languages @=@ peo @=@ Persian, Old (ca 600 - 400 B.C.) +dnet:languages @=@ dnet:languages @=@ phn @=@ Phoenician +dnet:languages @=@ dnet:languages @=@ pol @=@ Polish +dnet:languages @=@ dnet:languages @=@ pon @=@ Ponape +dnet:languages @=@ dnet:languages @=@ por @=@ Portuguese +dnet:languages @=@ dnet:languages @=@ cpp @=@ Portuguese-based Creoles and Pidgins +dnet:languages @=@ dnet:languages @=@ pra @=@ Prakrit +dnet:languages @=@ dnet:languages @=@ pro @=@ Provencal +dnet:languages @=@ dnet:languages @=@ pus @=@ Pushto +dnet:languages @=@ dnet:languages @=@ que @=@ Quechua +dnet:languages @=@ dnet:languages @=@ roh @=@ Raeto-Romance +dnet:languages @=@ dnet:languages @=@ raj @=@ Rajasthani +dnet:languages @=@ dnet:languages @=@ rar @=@ Rarotongan +dnet:languages @=@ dnet:languages @=@ roa @=@ Romance +dnet:languages @=@ dnet:languages @=@ ron/rum @=@ Romanian +dnet:languages @=@ dnet:languages @=@ rom @=@ Romany +dnet:languages @=@ dnet:languages @=@ run @=@ Rundi +dnet:languages @=@ dnet:languages @=@ rus @=@ Russian +dnet:languages @=@ dnet:languages @=@ sal @=@ Salishan +dnet:languages @=@ dnet:languages @=@ sam @=@ Samaritan +dnet:languages @=@ dnet:languages @=@ smi @=@ Sami +dnet:languages @=@ dnet:languages @=@ smo @=@ Samoan +dnet:languages @=@ dnet:languages @=@ sad @=@ Sandawe +dnet:languages @=@ dnet:languages @=@ sag @=@ Sango +dnet:languages @=@ dnet:languages @=@ san @=@ Sanskrit +dnet:languages @=@ dnet:languages @=@ srd @=@ Sardinian +dnet:languages @=@ dnet:languages @=@ sco @=@ Scots +dnet:languages @=@ dnet:languages @=@ sel @=@ Selkup +dnet:languages @=@ dnet:languages @=@ sem @=@ Semitic +dnet:languages @=@ dnet:languages @=@ srp @=@ Serbian +dnet:languages @=@ dnet:languages @=@ scr @=@ Serbo-Croatian +dnet:languages @=@ dnet:languages @=@ srr @=@ Serer +dnet:languages @=@ dnet:languages @=@ shn @=@ Shan +dnet:languages @=@ dnet:languages @=@ sna @=@ Shona +dnet:languages @=@ dnet:languages @=@ iii @=@ Sichuan Yi +dnet:languages @=@ dnet:languages @=@ sid @=@ Sidamo +dnet:languages @=@ dnet:languages @=@ bla @=@ Siksika +dnet:languages @=@ dnet:languages @=@ snd @=@ Sindhi +dnet:languages @=@ dnet:languages @=@ sin @=@ Sinhala; Sinhalese +dnet:languages @=@ dnet:languages @=@ sit @=@ Sino-Tibetan +dnet:languages @=@ dnet:languages @=@ sio @=@ Siouan +dnet:languages @=@ dnet:languages @=@ sla @=@ Slavic +dnet:languages @=@ dnet:languages @=@ slk/slo @=@ Slovak +dnet:languages @=@ dnet:languages @=@ slv @=@ Slovenian +dnet:languages @=@ dnet:languages @=@ sog @=@ Sogdian +dnet:languages @=@ dnet:languages @=@ som @=@ Somali +dnet:languages @=@ 
dnet:languages @=@ son @=@ Songhai +dnet:languages @=@ dnet:languages @=@ wen @=@ Sorbian +dnet:languages @=@ dnet:languages @=@ nso @=@ Sotho +dnet:languages @=@ dnet:languages @=@ sot @=@ Sotho, Southern +dnet:languages @=@ dnet:languages @=@ sai @=@ South American Indian +dnet:languages @=@ dnet:languages @=@ esl/spa @=@ Spanish +dnet:languages @=@ dnet:languages @=@ spa @=@ Spanish; Castilian +dnet:languages @=@ dnet:languages @=@ suk @=@ Sukuma +dnet:languages @=@ dnet:languages @=@ sux @=@ Sumerian +dnet:languages @=@ dnet:languages @=@ sun @=@ Sundanese +dnet:languages @=@ dnet:languages @=@ sus @=@ Susu +dnet:languages @=@ dnet:languages @=@ swa @=@ Swahili +dnet:languages @=@ dnet:languages @=@ ssw @=@ Swati +dnet:languages @=@ dnet:languages @=@ swe @=@ Swedish +dnet:languages @=@ dnet:languages @=@ syr @=@ Syriac +dnet:languages @=@ dnet:languages @=@ tgl @=@ Tagalog +dnet:languages @=@ dnet:languages @=@ tah @=@ Tahitian +dnet:languages @=@ dnet:languages @=@ tgk @=@ Tajik +dnet:languages @=@ dnet:languages @=@ tmh @=@ Tamashek +dnet:languages @=@ dnet:languages @=@ tam @=@ Tamil +dnet:languages @=@ dnet:languages @=@ tat @=@ Tatar +dnet:languages @=@ dnet:languages @=@ tel @=@ Telugu +dnet:languages @=@ dnet:languages @=@ ter @=@ Tereno +dnet:languages @=@ dnet:languages @=@ tha @=@ Thai +dnet:languages @=@ dnet:languages @=@ bod/tib @=@ Tibetan +dnet:languages @=@ dnet:languages @=@ tig @=@ Tigre +dnet:languages @=@ dnet:languages @=@ tir @=@ Tigrinya +dnet:languages @=@ dnet:languages @=@ tem @=@ Timne +dnet:languages @=@ dnet:languages @=@ tiv @=@ Tivi +dnet:languages @=@ dnet:languages @=@ tli @=@ Tlingit +dnet:languages @=@ dnet:languages @=@ ton @=@ Tonga (Tonga Islands) +dnet:languages @=@ dnet:languages @=@ tog @=@ Tonga(Nyasa) +dnet:languages @=@ dnet:languages @=@ tru @=@ Truk +dnet:languages @=@ dnet:languages @=@ tsi @=@ Tsimshian +dnet:languages @=@ dnet:languages @=@ tso @=@ Tsonga +dnet:languages @=@ dnet:languages @=@ tsn @=@ Tswana +dnet:languages @=@ dnet:languages @=@ tum @=@ Tumbuka +dnet:languages @=@ dnet:languages @=@ tur @=@ Turkish +dnet:languages @=@ dnet:languages @=@ tuk @=@ Turkmen +dnet:languages @=@ dnet:languages @=@ tyv @=@ Tuvinian +dnet:languages @=@ dnet:languages @=@ twi @=@ Twi +dnet:languages @=@ dnet:languages @=@ uga @=@ Ugaritic +dnet:languages @=@ dnet:languages @=@ uig @=@ Uighur; Uyghur +dnet:languages @=@ dnet:languages @=@ ukr @=@ Ukrainian +dnet:languages @=@ dnet:languages @=@ umb @=@ Umbundu +dnet:languages @=@ dnet:languages @=@ und @=@ Undetermined +dnet:languages @=@ dnet:languages @=@ urd @=@ Urdu +dnet:languages @=@ dnet:languages @=@ uzb @=@ Uzbek +dnet:languages @=@ dnet:languages @=@ vai @=@ Vai +dnet:languages @=@ dnet:languages @=@ ven @=@ Venda +dnet:languages @=@ dnet:languages @=@ vie @=@ Vietnamese +dnet:languages @=@ dnet:languages @=@ vol @=@ Volapük +dnet:languages @=@ dnet:languages @=@ vot @=@ Votic +dnet:languages @=@ dnet:languages @=@ wak @=@ Wakashan +dnet:languages @=@ dnet:languages @=@ wal @=@ Walamo +dnet:languages @=@ dnet:languages @=@ wln @=@ Walloon +dnet:languages @=@ dnet:languages @=@ war @=@ Waray +dnet:languages @=@ dnet:languages @=@ was @=@ Washo +dnet:languages @=@ dnet:languages @=@ cym/wel @=@ Welsh +dnet:languages @=@ dnet:languages @=@ wol @=@ Wolof +dnet:languages @=@ dnet:languages @=@ xho @=@ Xhosa +dnet:languages @=@ dnet:languages @=@ sah @=@ Yakut +dnet:languages @=@ dnet:languages @=@ yao @=@ Yao +dnet:languages @=@ dnet:languages @=@ yap @=@ Yap +dnet:languages @=@ 
dnet:languages @=@ yid @=@ Yiddish +dnet:languages @=@ dnet:languages @=@ yor @=@ Yoruba +dnet:languages @=@ dnet:languages @=@ zap @=@ Zapotec +dnet:languages @=@ dnet:languages @=@ zen @=@ Zenaga +dnet:languages @=@ dnet:languages @=@ zha @=@ Zhuang; Chuang +dnet:languages @=@ dnet:languages @=@ zul @=@ Zulu +dnet:languages @=@ dnet:languages @=@ zun @=@ Zuni +dnet:languages @=@ dnet:languages @=@ sga @=@ old Irish +nsf:contractTypes @=@ NSF Contract Types @=@ BOA/Task Order @=@ BOA/Task Order +nsf:contractTypes @=@ NSF Contract Types @=@ Continuing grant @=@ Continuing grant +nsf:contractTypes @=@ NSF Contract Types @=@ Contract @=@ Contract +nsf:contractTypes @=@ NSF Contract Types @=@ Contract Interagency Agreement @=@ Contract Interagency Agreement +nsf:contractTypes @=@ NSF Contract Types @=@ Cooperative Agreement @=@ Cooperative Agreement +nsf:contractTypes @=@ NSF Contract Types @=@ Fellowship @=@ Fellowship +nsf:contractTypes @=@ NSF Contract Types @=@ Fixed Price Award @=@ Fixed Price Award +nsf:contractTypes @=@ NSF Contract Types @=@ GAA @=@ GAA +nsf:contractTypes @=@ NSF Contract Types @=@ Interagency Agreement @=@ Interagency Agreement +nsf:contractTypes @=@ NSF Contract Types @=@ Intergovernmental Personnel Award @=@ Intergovernmental Personnel Award +nsf:contractTypes @=@ NSF Contract Types @=@ Personnel Agreement @=@ Personnel Agreement +nsf:contractTypes @=@ NSF Contract Types @=@ Standard Grant @=@ Standard Grant +ec:funding_relations @=@ ec:funding_relations @=@ ec:hasframeworkprogram @=@ hasframeworkprogram +ec:funding_relations @=@ ec:funding_relations @=@ ec:hasprogram @=@ hasprogram +ec:funding_relations @=@ ec:funding_relations @=@ ec:hasspecificprogram @=@ hasspecificprogram +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ UNKNOWN @=@ UNKNOWN +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ collection @=@ collection +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ dataset @=@ dataset +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ event @=@ event +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ film @=@ film +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ image @=@ image +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ interactiveResource @=@ interactiveResource +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ model @=@ model +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ physicalObject @=@ physicalObject +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ service @=@ service +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ software @=@ software +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ sound @=@ sound +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ text @=@ text +dnet:dataCite_resource @=@ dnet:dataCite_resource @=@ clinicalTrial @=@ Clinical trial +dnet:dataCite_title @=@ dnet:dataCite_title @=@ alternative title @=@ alternative title +dnet:dataCite_title @=@ dnet:dataCite_title @=@ main title @=@ main title +dnet:dataCite_title @=@ dnet:dataCite_title @=@ subtitle @=@ subtitle +dnet:dataCite_title @=@ dnet:dataCite_title @=@ translated title @=@ translated title +datacite:relation_typologies @=@ datacite:relation_typologies @=@ IsCitedBy @=@ IsCitedBy +datacite:relation_typologies @=@ datacite:relation_typologies @=@ IsNewVersionOf @=@ IsNewVersionOf +datacite:relation_typologies @=@ datacite:relation_typologies @=@ IsPartOf @=@ IsPartOf +datacite:relation_typologies @=@ datacite:relation_typologies @=@ IsPreviousVersionOf @=@ IsPreviousVersionOf 
+datacite:relation_typologies @=@ datacite:relation_typologies @=@ IsReferencedBy @=@ IsReferencedBy +datacite:relation_typologies @=@ datacite:relation_typologies @=@ References @=@ References +datacite:relation_typologies @=@ datacite:relation_typologies @=@ UNKNOWN @=@ UNKNOWN +dnet:result_typologies @=@ dnet:result_typologies @=@ dataset @=@ dataset +dnet:result_typologies @=@ dnet:result_typologies @=@ other @=@ other +dnet:result_typologies @=@ dnet:result_typologies @=@ publication @=@ publication +dnet:result_typologies @=@ dnet:result_typologies @=@ software @=@ software +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-ADG @=@ Advanced Grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ BBI-CSA @=@ Bio-based Industries Coordination and Support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ BBI-IA-DEMO @=@ Bio-based Industries Innovation action - Demonstration +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ BBI-IA-FLAG @=@ Bio-based Industries Innovation action - Flagship +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ BBI-RIA @=@ Bio-based Industries Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-IF-EF-CAR @=@ CAR – Career Restart panel +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ COFUND-EJP @=@ COFUND (European Joint Programme) +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ COFUND-PCP @=@ COFUND (PCP) +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ COFUND-PPI @=@ COFUND (PPI) +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ CS2-CSA @=@ CS2 Coordination and Support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ CS2-IA @=@ CS2 Innovation Action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ CS2-RIA @=@ CS2 Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ CSA-LS @=@ CSA Lump sum +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-COG @=@ Consolidator Grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ FCH2-CSA @=@ Coordination & support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ CSA @=@ Coordination and support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-COFUND-DP @=@ Doctoral programmes +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ECSEL-CSA @=@ ECSEL Coordination & Support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ECSEL-IA @=@ ECSEL Innovation Action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ECSEL-RIA @=@ ECSEL Research and Innovation Actions +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERA-NET-Cofund @=@ ERA-NET Cofund +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-POC-LS @=@ ERC Proof of Concept Lump Sum Pilot +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-SyG @=@ ERC Synergy Grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-LVG @=@ ERC low value grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ H2020-EEN-SGA @=@ Enterprise Europe Network - Specific Grant Agreement +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-ITN-EID @=@ European Industrial Doctorates +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-ITN-EJD @=@ European Joint Doctorates +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-ITN-ETN @=@ European Training Networks +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ FCH2-IA @=@ FCH2 Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ FCH2-RIA @=@ FCH2 Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ 
MSCA-COFUND-FP @=@ Fellowship programmes +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-IF-GF @=@ Global Fellowships +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ IMI2-CSA @=@ IMI2 Coordination & support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ IMI2-RIA @=@ IMI2 Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ Shift2Rail-IA-LS @=@ Innovation Action Lump-Sum +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ IA-LS @=@ Innovation Action Lump-Sum +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ IA @=@ Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ Shift2Rail-IA @=@ Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ PCP @=@ Pre-Commercial Procurement +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-POC @=@ Proof of Concept Grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ PPI @=@ Public Procurement of Innovative Solutions +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-IF-EF-RI @=@ RI – Reintegration panel +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-RISE @=@ RISE +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ Shift2Rail-RIA-LS @=@ Research and Innovation Action Lump-Sum +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ Shift2Rail-RIA @=@ Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ RIA @=@ Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ RIA-LS @=@ Research and Innovation action Lump Sum +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SESAR-CSA @=@ SESAR: Coordination and Support Action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SESAR-IA @=@ SESAR: Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SESAR-RIA @=@ SESAR: Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SGA-RIA @=@ SGA Research and Innovation action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SME-2b @=@ SME Instrument (grant only and blended finance) +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SME-1 @=@ SME instrument phase 1 +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SME-2 @=@ SME instrument phase 2 +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ Shift2Rail-CSA @=@ Shift2Rail - Coordination and Support action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-IF-EF-SE @=@ Society and Enterprise panel +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ SGA-CSA @=@ Specific Grant agreement and Coordination and Support Action +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-IF-EF-ST @=@ Standard EF +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ ERC-STG @=@ Starting Grant +ec:h2020toas @=@ Horizon 2020 - Type of Actions @=@ MSCA-SNLS @=@ Grant to identified beneficiary - Coordination and support actions (MSCA-Special Needs lump sum) +wt:contractTypes @=@ wt:contractTypes @=@ UNKNOWN @=@ UNKNOWN +dnet:countries @=@ dnet:countries @=@ AF @=@ Afghanistan +dnet:countries @=@ dnet:countries @=@ AL @=@ Albania +dnet:countries @=@ dnet:countries @=@ DZ @=@ Algeria +dnet:countries @=@ dnet:countries @=@ AS @=@ American Samoa +dnet:countries @=@ dnet:countries @=@ AD @=@ Andorra +dnet:countries @=@ dnet:countries @=@ AO @=@ Angola +dnet:countries @=@ dnet:countries @=@ AI @=@ Anguilla +dnet:countries @=@ dnet:countries @=@ AQ @=@ Antarctica +dnet:countries @=@ dnet:countries @=@ AG @=@ Antigua and Barbuda +dnet:countries @=@ dnet:countries @=@ AR @=@ Argentina +dnet:countries 
@=@ dnet:countries @=@ AM @=@ Armenia +dnet:countries @=@ dnet:countries @=@ AW @=@ Aruba +dnet:countries @=@ dnet:countries @=@ AU @=@ Australia +dnet:countries @=@ dnet:countries @=@ AT @=@ Austria +dnet:countries @=@ dnet:countries @=@ AZ @=@ Azerbaijan +dnet:countries @=@ dnet:countries @=@ BS @=@ Bahamas +dnet:countries @=@ dnet:countries @=@ BH @=@ Bahrain +dnet:countries @=@ dnet:countries @=@ BD @=@ Bangladesh +dnet:countries @=@ dnet:countries @=@ BB @=@ Barbados +dnet:countries @=@ dnet:countries @=@ BY @=@ Belarus +dnet:countries @=@ dnet:countries @=@ BE @=@ Belgium +dnet:countries @=@ dnet:countries @=@ BZ @=@ Belize +dnet:countries @=@ dnet:countries @=@ BJ @=@ Benin +dnet:countries @=@ dnet:countries @=@ BM @=@ Bermuda +dnet:countries @=@ dnet:countries @=@ BT @=@ Bhutan +dnet:countries @=@ dnet:countries @=@ BO @=@ Bolivia +dnet:countries @=@ dnet:countries @=@ BQ @=@ Bonaire, Sint Eustatius and Saba +dnet:countries @=@ dnet:countries @=@ BA @=@ Bosnia and Herzegovina +dnet:countries @=@ dnet:countries @=@ BW @=@ Botswana +dnet:countries @=@ dnet:countries @=@ BV @=@ Bouvet Island +dnet:countries @=@ dnet:countries @=@ BR @=@ Brazil +dnet:countries @=@ dnet:countries @=@ IO @=@ British Indian Ocean Territory +dnet:countries @=@ dnet:countries @=@ BN @=@ Brunei Darussalam +dnet:countries @=@ dnet:countries @=@ BG @=@ Bulgaria +dnet:countries @=@ dnet:countries @=@ BF @=@ Burkina Faso +dnet:countries @=@ dnet:countries @=@ BI @=@ Burundi +dnet:countries @=@ dnet:countries @=@ KH @=@ Cambodia +dnet:countries @=@ dnet:countries @=@ CM @=@ Cameroon +dnet:countries @=@ dnet:countries @=@ CA @=@ Canada +dnet:countries @=@ dnet:countries @=@ CV @=@ Cape Verde +dnet:countries @=@ dnet:countries @=@ KY @=@ Cayman Islands +dnet:countries @=@ dnet:countries @=@ CF @=@ Central African Republic +dnet:countries @=@ dnet:countries @=@ TD @=@ Chad +dnet:countries @=@ dnet:countries @=@ CL @=@ Chile +dnet:countries @=@ dnet:countries @=@ CN @=@ China (People's Republic of) +dnet:countries @=@ dnet:countries @=@ CX @=@ Christmas Island +dnet:countries @=@ dnet:countries @=@ CC @=@ Cocos (Keeling) Islands +dnet:countries @=@ dnet:countries @=@ CO @=@ Colombia +dnet:countries @=@ dnet:countries @=@ KM @=@ Comoros +dnet:countries @=@ dnet:countries @=@ CG @=@ Congo +dnet:countries @=@ dnet:countries @=@ CD @=@ Congo (Democratic Republic of) +dnet:countries @=@ dnet:countries @=@ CK @=@ Cook Islands +dnet:countries @=@ dnet:countries @=@ CR @=@ Costa Rica +dnet:countries @=@ dnet:countries @=@ CI @=@ Cote d'Ivoire +dnet:countries @=@ dnet:countries @=@ HR @=@ Croatia +dnet:countries @=@ dnet:countries @=@ CU @=@ Cuba +dnet:countries @=@ dnet:countries @=@ CW @=@ Curaçao +dnet:countries @=@ dnet:countries @=@ CY @=@ Cyprus +dnet:countries @=@ dnet:countries @=@ CZ @=@ Czech Republic +dnet:countries @=@ dnet:countries @=@ DK @=@ Denmark +dnet:countries @=@ dnet:countries @=@ DJ @=@ Djibouti +dnet:countries @=@ dnet:countries @=@ DM @=@ Dominica +dnet:countries @=@ dnet:countries @=@ DO @=@ Dominican Republic +dnet:countries @=@ dnet:countries @=@ EC @=@ Ecuador +dnet:countries @=@ dnet:countries @=@ EG @=@ Egypt +dnet:countries @=@ dnet:countries @=@ SV @=@ El Salvador +dnet:countries @=@ dnet:countries @=@ GQ @=@ Equatorial Guinea +dnet:countries @=@ dnet:countries @=@ ER @=@ Eritrea +dnet:countries @=@ dnet:countries @=@ EE @=@ Estonia +dnet:countries @=@ dnet:countries @=@ ET @=@ Ethiopia +dnet:countries @=@ dnet:countries @=@ EU @=@ European Union +dnet:countries @=@ dnet:countries @=@ FK @=@ 
Falkland Islands (Malvinas) +dnet:countries @=@ dnet:countries @=@ FO @=@ Faroe Islands +dnet:countries @=@ dnet:countries @=@ FJ @=@ Fiji +dnet:countries @=@ dnet:countries @=@ FI @=@ Finland +dnet:countries @=@ dnet:countries @=@ MK @=@ Former Yugoslav Republic of Macedonia +dnet:countries @=@ dnet:countries @=@ FR @=@ France +dnet:countries @=@ dnet:countries @=@ GF @=@ French Guiana +dnet:countries @=@ dnet:countries @=@ PF @=@ French Polynesia +dnet:countries @=@ dnet:countries @=@ TF @=@ French Southern Territories +dnet:countries @=@ dnet:countries @=@ GA @=@ Gabon +dnet:countries @=@ dnet:countries @=@ GM @=@ Gambia +dnet:countries @=@ dnet:countries @=@ GE @=@ Georgia +dnet:countries @=@ dnet:countries @=@ DE @=@ Germany +dnet:countries @=@ dnet:countries @=@ GH @=@ Ghana +dnet:countries @=@ dnet:countries @=@ GI @=@ Gibraltar +dnet:countries @=@ dnet:countries @=@ GR @=@ Greece +dnet:countries @=@ dnet:countries @=@ GL @=@ Greenland +dnet:countries @=@ dnet:countries @=@ GD @=@ Grenada +dnet:countries @=@ dnet:countries @=@ GP @=@ Guadeloupe +dnet:countries @=@ dnet:countries @=@ GU @=@ Guam +dnet:countries @=@ dnet:countries @=@ GT @=@ Guatemala +dnet:countries @=@ dnet:countries @=@ GG @=@ Guernsey +dnet:countries @=@ dnet:countries @=@ GN @=@ Guinea +dnet:countries @=@ dnet:countries @=@ GW @=@ Guinea-Bissau +dnet:countries @=@ dnet:countries @=@ GY @=@ Guyana +dnet:countries @=@ dnet:countries @=@ HT @=@ Haiti +dnet:countries @=@ dnet:countries @=@ HM @=@ Heard Island and McDonald Islands +dnet:countries @=@ dnet:countries @=@ VA @=@ Holy See (Vatican City State) +dnet:countries @=@ dnet:countries @=@ HN @=@ Honduras +dnet:countries @=@ dnet:countries @=@ HK @=@ Hong Kong +dnet:countries @=@ dnet:countries @=@ HU @=@ Hungary +dnet:countries @=@ dnet:countries @=@ IS @=@ Iceland +dnet:countries @=@ dnet:countries @=@ IN @=@ India +dnet:countries @=@ dnet:countries @=@ ID @=@ Indonesia +dnet:countries @=@ dnet:countries @=@ IR @=@ Iran (Islamic Republic of) +dnet:countries @=@ dnet:countries @=@ IQ @=@ Iraq +dnet:countries @=@ dnet:countries @=@ IE @=@ Ireland +dnet:countries @=@ dnet:countries @=@ IM @=@ Isle of Man +dnet:countries @=@ dnet:countries @=@ IL @=@ Israel +dnet:countries @=@ dnet:countries @=@ IT @=@ Italy +dnet:countries @=@ dnet:countries @=@ JM @=@ Jamaica +dnet:countries @=@ dnet:countries @=@ JP @=@ Japan +dnet:countries @=@ dnet:countries @=@ JE @=@ Jersey +dnet:countries @=@ dnet:countries @=@ JO @=@ Jordan +dnet:countries @=@ dnet:countries @=@ KZ @=@ Kazakhstan +dnet:countries @=@ dnet:countries @=@ KE @=@ Kenya +dnet:countries @=@ dnet:countries @=@ KI @=@ Kiribati +dnet:countries @=@ dnet:countries @=@ KR @=@ Korea (Republic of) +dnet:countries @=@ dnet:countries @=@ KP @=@ Korea, Democratic People's Republic of +dnet:countries @=@ dnet:countries @=@ XK @=@ Kosovo * UN resolution +dnet:countries @=@ dnet:countries @=@ KW @=@ Kuwait +dnet:countries @=@ dnet:countries @=@ KG @=@ Kyrgyzstan +dnet:countries @=@ dnet:countries @=@ LA @=@ Lao (People's Democratic Republic) +dnet:countries @=@ dnet:countries @=@ LV @=@ Latvia +dnet:countries @=@ dnet:countries @=@ LB @=@ Lebanon +dnet:countries @=@ dnet:countries @=@ LS @=@ Lesotho +dnet:countries @=@ dnet:countries @=@ LR @=@ Liberia +dnet:countries @=@ dnet:countries @=@ LY @=@ Libyan Arab Jamahiriya +dnet:countries @=@ dnet:countries @=@ LI @=@ Liechtenstein +dnet:countries @=@ dnet:countries @=@ LT @=@ Lithuania +dnet:countries @=@ dnet:countries @=@ LU @=@ Luxembourg +dnet:countries @=@ dnet:countries @=@
MO @=@ Macao +dnet:countries @=@ dnet:countries @=@ MG @=@ Madagascar +dnet:countries @=@ dnet:countries @=@ MW @=@ Malawi +dnet:countries @=@ dnet:countries @=@ MY @=@ Malaysia +dnet:countries @=@ dnet:countries @=@ MV @=@ Maldives +dnet:countries @=@ dnet:countries @=@ ML @=@ Mali +dnet:countries @=@ dnet:countries @=@ MT @=@ Malta +dnet:countries @=@ dnet:countries @=@ MH @=@ Marshall Islands +dnet:countries @=@ dnet:countries @=@ MQ @=@ Martinique +dnet:countries @=@ dnet:countries @=@ MR @=@ Mauritania +dnet:countries @=@ dnet:countries @=@ MU @=@ Mauritius +dnet:countries @=@ dnet:countries @=@ YT @=@ Mayotte +dnet:countries @=@ dnet:countries @=@ MX @=@ Mexico +dnet:countries @=@ dnet:countries @=@ FM @=@ Micronesia, Federated States of +dnet:countries @=@ dnet:countries @=@ MD @=@ Moldova (Republic of) +dnet:countries @=@ dnet:countries @=@ MN @=@ Mongolia +dnet:countries @=@ dnet:countries @=@ ME @=@ Montenegro +dnet:countries @=@ dnet:countries @=@ MS @=@ Montserrat +dnet:countries @=@ dnet:countries @=@ MA @=@ Morocco +dnet:countries @=@ dnet:countries @=@ MZ @=@ Mozambique +dnet:countries @=@ dnet:countries @=@ MM @=@ Myanmar +dnet:countries @=@ dnet:countries @=@ NA @=@ Namibia +dnet:countries @=@ dnet:countries @=@ NR @=@ Nauru +dnet:countries @=@ dnet:countries @=@ NP @=@ Nepal +dnet:countries @=@ dnet:countries @=@ NL @=@ Netherlands +dnet:countries @=@ dnet:countries @=@ AN @=@ Netherlands Antilles +dnet:countries @=@ dnet:countries @=@ NC @=@ New Caledonia +dnet:countries @=@ dnet:countries @=@ NZ @=@ New Zealand +dnet:countries @=@ dnet:countries @=@ NI @=@ Nicaragua +dnet:countries @=@ dnet:countries @=@ NE @=@ Niger +dnet:countries @=@ dnet:countries @=@ NG @=@ Nigeria +dnet:countries @=@ dnet:countries @=@ NU @=@ Niue +dnet:countries @=@ dnet:countries @=@ NF @=@ Norfolk Island +dnet:countries @=@ dnet:countries @=@ MP @=@ Northern Mariana Islands +dnet:countries @=@ dnet:countries @=@ NO @=@ Norway +dnet:countries @=@ dnet:countries @=@ OC @=@ Oceania +dnet:countries @=@ dnet:countries @=@ OM @=@ Oman +dnet:countries @=@ dnet:countries @=@ PK @=@ Pakistan +dnet:countries @=@ dnet:countries @=@ PW @=@ Palau +dnet:countries @=@ dnet:countries @=@ PS @=@ Palestinian-administered areas +dnet:countries @=@ dnet:countries @=@ PA @=@ Panama +dnet:countries @=@ dnet:countries @=@ PG @=@ Papua New Guinea +dnet:countries @=@ dnet:countries @=@ PY @=@ Paraguay +dnet:countries @=@ dnet:countries @=@ PE @=@ Peru +dnet:countries @=@ dnet:countries @=@ PH @=@ Philippines +dnet:countries @=@ dnet:countries @=@ PN @=@ Pitcairn +dnet:countries @=@ dnet:countries @=@ PL @=@ Poland +dnet:countries @=@ dnet:countries @=@ PT @=@ Portugal +dnet:countries @=@ dnet:countries @=@ PR @=@ Puerto Rico +dnet:countries @=@ dnet:countries @=@ QA @=@ Qatar +dnet:countries @=@ dnet:countries @=@ RO @=@ Romania +dnet:countries @=@ dnet:countries @=@ RU @=@ Russian Federation +dnet:countries @=@ dnet:countries @=@ RW @=@ Rwanda +dnet:countries @=@ dnet:countries @=@ RE @=@ Réunion +dnet:countries @=@ dnet:countries @=@ SH @=@ Saint Helena, Ascension and Tristan da Cunha +dnet:countries @=@ dnet:countries @=@ KN @=@ Saint Kitts and Nevis +dnet:countries @=@ dnet:countries @=@ LC @=@ Saint Lucia +dnet:countries @=@ dnet:countries @=@ MF @=@ Saint Martin (French Part) +dnet:countries @=@ dnet:countries @=@ PM @=@ Saint Pierre and Miquelon +dnet:countries @=@ dnet:countries @=@ VC @=@ Saint Vincent and the Grenadines +dnet:countries @=@ dnet:countries @=@ BL @=@ Saint-Barthélemy +dnet:countries @=@ 
dnet:countries @=@ WS @=@ Samoa +dnet:countries @=@ dnet:countries @=@ SM @=@ San Marino +dnet:countries @=@ dnet:countries @=@ SA @=@ Saudi Arabia +dnet:countries @=@ dnet:countries @=@ SN @=@ Senegal +dnet:countries @=@ dnet:countries @=@ RS @=@ Serbia +dnet:countries @=@ dnet:countries @=@ CS @=@ Serbia and Montenegro +dnet:countries @=@ dnet:countries @=@ SC @=@ Seychelles +dnet:countries @=@ dnet:countries @=@ SL @=@ Sierra Leone +dnet:countries @=@ dnet:countries @=@ SG @=@ Singapore +dnet:countries @=@ dnet:countries @=@ SX @=@ Sint Maarten (Dutch Part) +dnet:countries @=@ dnet:countries @=@ SK @=@ Slovakia +dnet:countries @=@ dnet:countries @=@ SI @=@ Slovenia +dnet:countries @=@ dnet:countries @=@ SB @=@ Solomon Islands +dnet:countries @=@ dnet:countries @=@ SO @=@ Somalia +dnet:countries @=@ dnet:countries @=@ ZA @=@ South Africa +dnet:countries @=@ dnet:countries @=@ GS @=@ South Georgia and the South Sandwich Islands +dnet:countries @=@ dnet:countries @=@ SS @=@ South Sudan +dnet:countries @=@ dnet:countries @=@ ES @=@ Spain +dnet:countries @=@ dnet:countries @=@ LK @=@ Sri Lanka +dnet:countries @=@ dnet:countries @=@ SD @=@ Sudan +dnet:countries @=@ dnet:countries @=@ SR @=@ Suriname +dnet:countries @=@ dnet:countries @=@ SJ @=@ Svalbard and Jan Mayen +dnet:countries @=@ dnet:countries @=@ SZ @=@ Swaziland +dnet:countries @=@ dnet:countries @=@ SE @=@ Sweden +dnet:countries @=@ dnet:countries @=@ CH @=@ Switzerland +dnet:countries @=@ dnet:countries @=@ SY @=@ Syrian Arab Republic +dnet:countries @=@ dnet:countries @=@ ST @=@ São Tomé and Príncipe +dnet:countries @=@ dnet:countries @=@ TW @=@ Taiwan +dnet:countries @=@ dnet:countries @=@ TJ @=@ Tajikistan +dnet:countries @=@ dnet:countries @=@ TZ @=@ Tanzania (United Republic of) +dnet:countries @=@ dnet:countries @=@ TH @=@ Thailand +dnet:countries @=@ dnet:countries @=@ TL @=@ Timor-Leste +dnet:countries @=@ dnet:countries @=@ TG @=@ Togo +dnet:countries @=@ dnet:countries @=@ TK @=@ Tokelau +dnet:countries @=@ dnet:countries @=@ TO @=@ Tonga +dnet:countries @=@ dnet:countries @=@ TT @=@ Trinidad and Tobago +dnet:countries @=@ dnet:countries @=@ TN @=@ Tunisia +dnet:countries @=@ dnet:countries @=@ TR @=@ Turkey +dnet:countries @=@ dnet:countries @=@ TM @=@ Turkmenistan +dnet:countries @=@ dnet:countries @=@ TC @=@ Turks and Caicos Islands +dnet:countries @=@ dnet:countries @=@ TV @=@ Tuvalu +dnet:countries @=@ dnet:countries @=@ UNKNOWN @=@ UNKNOWN +dnet:countries @=@ dnet:countries @=@ UG @=@ Uganda +dnet:countries @=@ dnet:countries @=@ UA @=@ Ukraine +dnet:countries @=@ dnet:countries @=@ AE @=@ United Arab Emirates +dnet:countries @=@ dnet:countries @=@ GB @=@ United Kingdom +dnet:countries @=@ dnet:countries @=@ US @=@ United States +dnet:countries @=@ dnet:countries @=@ UM @=@ United States Minor Outlying Islands +dnet:countries @=@ dnet:countries @=@ UY @=@ Uruguay +dnet:countries @=@ dnet:countries @=@ UZ @=@ Uzbekistan +dnet:countries @=@ dnet:countries @=@ VU @=@ Vanuatu +dnet:countries @=@ dnet:countries @=@ VE @=@ Venezuela +dnet:countries @=@ dnet:countries @=@ VN @=@ Viet Nam +dnet:countries @=@ dnet:countries @=@ VG @=@ Virgin Islands (British) +dnet:countries @=@ dnet:countries @=@ VI @=@ Virgin Islands, U.S. 
+dnet:countries @=@ dnet:countries @=@ WF @=@ Wallis and Futuna +dnet:countries @=@ dnet:countries @=@ EH @=@ Western Sahara +dnet:countries @=@ dnet:countries @=@ YE @=@ Yemen +dnet:countries @=@ dnet:countries @=@ YU @=@ Yugoslavia +dnet:countries @=@ dnet:countries @=@ ZM @=@ Zambia +dnet:countries @=@ dnet:countries @=@ ZW @=@ Zimbabwe +dnet:countries @=@ dnet:countries @=@ AX @=@ Åland Islands +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ openaire2.0 @=@ OpenAIRE 2.0 (EC funding) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ driver-openaire2.0 @=@ OpenAIRE 2.0+ (DRIVER OA, EC funding) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ openaire3.0 @=@ OpenAIRE 3.0 (OA, funding) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ openaire4.0 @=@ OpenAIRE 4.0 (inst.&thematic. repo.) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ driver @=@ OpenAIRE Basic (DRIVER OA) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ openaire2.0_data @=@ OpenAIRE Data (funded, referenced datasets) +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ hostedBy @=@ collected from a compatible aggregator +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ UNKNOWN @=@ not available +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ native @=@ proprietary +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ notCompatible @=@ under validation +dnet:datasourceCompatibilityLevel @=@ dnet:datasourceCompatibilityLevel @=@ openaire-cris_1.1 @=@ OpenAIRE CRIS v1.1 +fct:funding_relations @=@ fct:funding_relations @=@ fct:hasParentFunding @=@ fct:hasParentFunding +dnet:protocols @=@ dnet:protocols @=@ HTTPWithFileName @=@ HTTPWithFileName +dnet:protocols @=@ dnet:protocols @=@ NetCDF @=@ NetCDF +dnet:protocols @=@ dnet:protocols @=@ OpenDAP @=@ OpenDAP +dnet:protocols @=@ dnet:protocols @=@ schemaorg @=@ Schema.org +dnet:protocols @=@ dnet:protocols @=@ UNKNOWN @=@ UNKNOWN +dnet:protocols @=@ dnet:protocols @=@ api @=@ api +dnet:protocols @=@ dnet:protocols @=@ dataciteESPlugins @=@ dataciteESPlugins +dnet:protocols @=@ dnet:protocols @=@ datasetsbyjournal @=@ datasetsbyjournal +dnet:protocols @=@ dnet:protocols @=@ datasetsbyproject @=@ datasetsbyproject +dnet:protocols @=@ dnet:protocols @=@ excelFile @=@ excelFile +dnet:protocols @=@ dnet:protocols @=@ file @=@ file +dnet:protocols @=@ dnet:protocols @=@ fileGzip @=@ fileGzip +dnet:protocols @=@ dnet:protocols @=@ files_by_rpc @=@ files_by_rpc +dnet:protocols @=@ dnet:protocols @=@ files_from_mdstore @=@ files_from_mdstore +dnet:protocols @=@ dnet:protocols @=@ files_from_metadata @=@ files_from_metadata +dnet:protocols @=@ dnet:protocols @=@ filesystem @=@ filesystem +dnet:protocols @=@ dnet:protocols @=@ ftp @=@ ftp +dnet:protocols @=@ dnet:protocols @=@ gristProjects @=@ gristProjects +dnet:protocols @=@ dnet:protocols @=@ gtr2Projects @=@ gtr2Projects +dnet:protocols @=@ dnet:protocols @=@ http @=@ http +dnet:protocols @=@ dnet:protocols @=@ httpCSV @=@ httpCSV +dnet:protocols @=@ dnet:protocols @=@ httpList @=@ httpList +dnet:protocols @=@ dnet:protocols @=@ jdbc @=@ jdbc +dnet:protocols @=@ dnet:protocols @=@ oai @=@ oai +dnet:protocols @=@ dnet:protocols @=@ oai_sets @=@ oai_sets +dnet:protocols @=@ dnet:protocols @=@ other @=@ other +dnet:protocols @=@ dnet:protocols @=@ re3data @=@ re3data 
+dnet:protocols @=@ dnet:protocols @=@ rest @=@ rest +dnet:protocols @=@ dnet:protocols @=@ rest_json2xml @=@ rest_json2xml +dnet:protocols @=@ dnet:protocols @=@ sftp @=@ sftp +dnet:protocols @=@ dnet:protocols @=@ soap @=@ soap +dnet:protocols @=@ dnet:protocols @=@ sparql @=@ sparql +dnet:protocols @=@ dnet:protocols @=@ sword @=@ sword +dnet:protocols @=@ dnet:protocols @=@ targz @=@ targz +dnet:protocols @=@ dnet:protocols @=@ remoteMdstore @=@ remoteMdstore +wt:funding_typologies @=@ Wellcome Trust: Funding Typologies @=@ wt:fundingStream @=@ Wellcome Trust: Funding Stream +dnet:externalReference_typologies @=@ dnet:externalReference_typologies @=@ accessionNumber @=@ accessionNumber +dnet:externalReference_typologies @=@ dnet:externalReference_typologies @=@ dataset @=@ dataset +dnet:externalReference_typologies @=@ dnet:externalReference_typologies @=@ software @=@ software +datacite:id_typologies @=@ datacite:id_typologies @=@ ARK @=@ ARK +datacite:id_typologies @=@ datacite:id_typologies @=@ DOI @=@ DOI +datacite:id_typologies @=@ datacite:id_typologies @=@ EAN13 @=@ EAN13 +datacite:id_typologies @=@ datacite:id_typologies @=@ EISSN @=@ EISSN +datacite:id_typologies @=@ datacite:id_typologies @=@ Handle @=@ Handle +datacite:id_typologies @=@ datacite:id_typologies @=@ ISBN @=@ ISBN +datacite:id_typologies @=@ datacite:id_typologies @=@ ISSN @=@ ISSN +datacite:id_typologies @=@ datacite:id_typologies @=@ ISTC @=@ ISTC +datacite:id_typologies @=@ datacite:id_typologies @=@ LISSN @=@ LISSN +datacite:id_typologies @=@ datacite:id_typologies @=@ LSID @=@ LSID +datacite:id_typologies @=@ datacite:id_typologies @=@ PURL @=@ PURL +datacite:id_typologies @=@ datacite:id_typologies @=@ UNKNOWN @=@ UNKNOWN +datacite:id_typologies @=@ datacite:id_typologies @=@ UPC @=@ UPC +datacite:id_typologies @=@ datacite:id_typologies @=@ URL @=@ URL +datacite:id_typologies @=@ datacite:id_typologies @=@ URN @=@ URN +dnet:pid_types @=@ dnet:pid_types @=@ actrn @=@ ACTRN Identifier +dnet:pid_types @=@ dnet:pid_types @=@ nct @=@ ClinicalTrials.gov Identifier +dnet:pid_types @=@ dnet:pid_types @=@ euctr @=@ EU Clinical Trials Register +dnet:pid_types @=@ dnet:pid_types @=@ epo_id @=@ European Patent Office application ID +dnet:pid_types @=@ dnet:pid_types @=@ gsk @=@ GSK Identifier +dnet:pid_types @=@ dnet:pid_types @=@ GeoPass @=@ Geographic Location-Password Scheme +dnet:pid_types @=@ dnet:pid_types @=@ GBIF @=@ Global Biodiversity Information Facility +dnet:pid_types @=@ dnet:pid_types @=@ isrctn @=@ ISRCTN Identifier +dnet:pid_types @=@ dnet:pid_types @=@ ISNI @=@ International Standard Name Identifier +dnet:pid_types @=@ dnet:pid_types @=@ jprn @=@ JPRN Identifier +dnet:pid_types @=@ dnet:pid_types @=@ mag_id @=@ Microsoft Academic Graph Identifier +dnet:pid_types @=@ dnet:pid_types @=@ oai @=@ Open Archives Initiative +dnet:pid_types @=@ dnet:pid_types @=@ orcid @=@ Open Researcher and Contributor ID +dnet:pid_types @=@ dnet:pid_types @=@ PANGAEA @=@ PANGAEA +dnet:pid_types @=@ dnet:pid_types @=@ epo_nr_epodoc @=@ Patent application number in EPODOC format +dnet:pid_types @=@ dnet:pid_types @=@ UNKNOWN @=@ UNKNOWN +dnet:pid_types @=@ dnet:pid_types @=@ VIAF @=@ Virtual International Authority File +dnet:pid_types @=@ dnet:pid_types @=@ arXiv @=@ arXiv +dnet:pid_types @=@ dnet:pid_types @=@ doi @=@ doi +dnet:pid_types @=@ dnet:pid_types @=@ grid @=@ grid +dnet:pid_types @=@ dnet:pid_types @=@ info:eu-repo/dai @=@ info:eu-repo/dai +dnet:pid_types @=@ dnet:pid_types @=@ orcidworkid @=@ orcid workid 
+dnet:pid_types @=@ dnet:pid_types @=@ pmc @=@ pmc +dnet:pid_types @=@ dnet:pid_types @=@ pmid @=@ pmid +dnet:pid_types @=@ dnet:pid_types @=@ urn @=@ urn +dnet:pid_types @=@ dnet:pid_types @=@ who @=@ WHO Identifier +dnet:pid_types @=@ dnet:pid_types @=@ drks @=@ DRKS Identifier +dnet:pid_types @=@ dnet:pid_types @=@ handle @=@ Handle +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ACM @=@ An ACM classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ARXIV @=@ An ARXIV classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/DDC @=@ A Dewey Decimal classification term (DDC) that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/JEL @=@ A Journal of Economic Literature (JEL) classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/OPENACCESS_VERSION @=@ An Open Access version of your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/IS_REFERENCED_BY @=@ A dataset referenced by your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/REFERENCES @=@ A dataset that refers to your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/IS_RELATED_TO @=@ A dataset related to your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_TO @=@ A dataset that supplements your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION/IS_RELATED_TO @=@ A publication related to your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION/REFERENCES @=@ A publication referenced by your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION/IS_REFERENCED_BY @=@ A publication that refers to your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_BY @=@ A publication that is supplemented by your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION/IS_SUPPLEMENTED_TO @=@ A publication that supplements your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SOFTWARE @=@ A software referred to by your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/OPENACCESS_VERSION @=@ Another Open Access version of a publication +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/PID @=@ Another persistent identifier associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/MESHEUROPMC @=@ A classification term from the Medical Subject Headings (MeSH) that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/ABSTRACT @=@ An abstract missing among your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PUBLICATION_DATE @=@ A date of publication missing in your content +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PID @=@ A persistent identifier associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/SUBJECT/ACM @=@ Another ACM classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/SUBJECT/ARXIV @=@ Another ARXIV classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/SUBJECT/DDC @=@ Another Dewey Decimal
classification term (DDC) that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/SUBJECT/JEL @=@ Another Journal of Economic Literature (JEL) classification term that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MORE/SUBJECT/MESHEUROPMC @=@ Another classification term from the Medical Subject Headings (MeSH) that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/PROJECT @=@ A project reference that can be associated to your publications +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/IS_SUPPLEMENTED_BY @=@ A dataset that is supplemented by your records +dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/AUTHOR/ORCID @=@ An Open Researcher and Contributor ID (ORCID) that can be associated to an author of your publications +dnet:review_levels @=@ dnet:review_levels @=@ 0000 @=@ Unknown +dnet:review_levels @=@ dnet:review_levels @=@ 0002 @=@ nonPeerReviewed +dnet:review_levels @=@ dnet:review_levels @=@ 0001 @=@ peerReviewed \ No newline at end of file diff --git a/dhp-schemas/README.md b/dhp-schemas/README.md deleted file mode 100644 index 7431cda42..000000000 --- a/dhp-schemas/README.md +++ /dev/null @@ -1,11 +0,0 @@ -Description of the project --------------------------- -This project defines **object schemas** of the OpenAIRE main entities and the relationships that hold among them. -Namely, it defines the model for - -- **research product (result)**, which is subclassed into publication, dataset, other research product, software -- **data source** object describing the data provider (institutional repository, aggregators, CRIS systems) -- **organization** research bodies managing a data source or participating in a research project -- **project** research project - -The serialization of such objects (data store files) is used to pass data between workflow nodes in the processing pipeline.
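Every vocabulary record in the terms listing above follows the same four-field layout, with the fields separated by the "@=@" marker: vocabulary id, vocabulary name, term code, term label. The following is a minimal sketch of how one such record could be parsed downstream; the VocabularyTerm holder class and its parse helper are illustrative assumptions for this document, not code introduced by the patch.

import java.util.Arrays;

// Illustrative holder for one record of the vocabulary terms listing above.
public class VocabularyTerm {

	private static final String SEPARATOR = " @=@ ";

	private final String vocabularyId;
	private final String vocabularyName;
	private final String code;
	private final String label;

	private VocabularyTerm(String vocabularyId, String vocabularyName, String code, String label) {
		this.vocabularyId = vocabularyId;
		this.vocabularyName = vocabularyName;
		this.code = code;
		this.label = label;
	}

	// Parses a record such as "dnet:countries @=@ dnet:countries @=@ IT @=@ Italy".
	// The split limit of 4 guards against the separator appearing inside the label.
	public static VocabularyTerm parse(String line) {
		String[] fields = line.split(SEPARATOR, 4);
		if (fields.length != 4) {
			throw new IllegalArgumentException("expected 4 fields, got: " + Arrays.toString(fields));
		}
		return new VocabularyTerm(fields[0].trim(), fields[1].trim(), fields[2].trim(), fields[3].trim());
	}

	@Override
	public String toString() {
		return vocabularyId + "/" + code + " -> " + label;
	}

	public static void main(String[] args) {
		// prints "dnet:countries/IT -> Italy"
		System.out.println(parse("dnet:countries @=@ dnet:countries @=@ IT @=@ Italy"));
	}
}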
diff --git a/dhp-schemas/pom.xml b/dhp-schemas/pom.xml deleted file mode 100644 index b04d62dd2..000000000 --- a/dhp-schemas/pom.xml +++ /dev/null @@ -1,73 +0,0 @@ - - - 4.0.0 - - - eu.dnetlib.dhp - dhp - 1.2.4-SNAPSHOT - ../ - - - dhp-schemas - jar - - This module contains common schema classes meant to be used across the dnet-hadoop submodules - - - - - net.alchim31.maven - scala-maven-plugin - 4.0.1 - - - scala-compile-first - initialize - - add-source - compile - - - - scala-test-compile - process-test-resources - - testCompile - - - - - ${scala.version} - - - - - - - - - - commons-io - commons-io - - - - org.apache.commons - commons-lang3 - - - - com.fasterxml.jackson.core - jackson-databind - - - - com.google.guava - guava - - - - - - diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java deleted file mode 100644 index 84b22c81c..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicAction.java +++ /dev/null @@ -1,40 +0,0 @@ - -package eu.dnetlib.dhp.schema.action; - -import java.io.Serializable; - -import com.fasterxml.jackson.databind.annotation.JsonDeserialize; - -import eu.dnetlib.dhp.schema.oaf.Oaf; - -@JsonDeserialize(using = AtomicActionDeserializer.class) -public class AtomicAction implements Serializable { - - private Class clazz; - - private T payload; - - public AtomicAction() { - } - - public AtomicAction(Class clazz, T payload) { - this.clazz = clazz; - this.payload = payload; - } - - public Class getClazz() { - return clazz; - } - - public void setClazz(Class clazz) { - this.clazz = clazz; - } - - public T getPayload() { - return payload; - } - - public void setPayload(T payload) { - this.payload = payload; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java deleted file mode 100644 index 7b88e9c7e..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/action/AtomicActionDeserializer.java +++ /dev/null @@ -1,32 +0,0 @@ - -package eu.dnetlib.dhp.schema.action; - -import java.io.IOException; - -import com.fasterxml.jackson.core.JsonParser; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.DeserializationContext; -import com.fasterxml.jackson.databind.JsonDeserializer; -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.schema.oaf.Oaf; - -public class AtomicActionDeserializer extends JsonDeserializer { - - @Override - public Object deserialize(JsonParser jp, DeserializationContext ctxt) - throws IOException { - JsonNode node = jp.getCodec().readTree(jp); - String classTag = node.get("clazz").asText(); - JsonNode payload = node.get("payload"); - ObjectMapper mapper = new ObjectMapper(); - - try { - final Class clazz = Class.forName(classTag); - return new AtomicAction(clazz, (Oaf) mapper.readValue(payload.toString(), clazz)); - } catch (ClassNotFoundException e) { - throw new IOException(e); - } - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/EntityType.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/EntityType.java deleted file mode 100644 index 54f30cf33..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/EntityType.java +++ /dev/null @@ -1,21 +0,0 @@ - -package eu.dnetlib.dhp.schema.common; - -import 
eu.dnetlib.dhp.schema.oaf.OafEntity; - -/** Actual entity types in the Graph */ -public enum EntityType { - publication, dataset, otherresearchproduct, software, datasource, organization, project; - - /** - * Resolves the EntityType, given the corresponding class name - * - * @param clazz the given class name - * @param actual OafEntity subclass - * @return the EntityType associated to the given class - */ - public static EntityType fromClass(Class clazz) { - - return EntityType.valueOf(clazz.getSimpleName().toLowerCase()); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/LicenseComparator.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/LicenseComparator.java deleted file mode 100644 index db523ad1a..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/LicenseComparator.java +++ /dev/null @@ -1,69 +0,0 @@ - -package eu.dnetlib.dhp.schema.common; - -import java.util.Comparator; - -import eu.dnetlib.dhp.schema.oaf.Qualifier; - -public class LicenseComparator implements Comparator { - - @Override - public int compare(Qualifier left, Qualifier right) { - - if (left == null && right == null) - return 0; - if (left == null) - return 1; - if (right == null) - return -1; - - String lClass = left.getClassid(); - String rClass = right.getClassid(); - - if (lClass.equals(rClass)) - return 0; - - if (lClass.equals("OPEN SOURCE")) - return -1; - if (rClass.equals("OPEN SOURCE")) - return 1; - - if (lClass.equals("OPEN")) - return -1; - if (rClass.equals("OPEN")) - return 1; - - if (lClass.equals("6MONTHS")) - return -1; - if (rClass.equals("6MONTHS")) - return 1; - - if (lClass.equals("12MONTHS")) - return -1; - if (rClass.equals("12MONTHS")) - return 1; - - if (lClass.equals("EMBARGO")) - return -1; - if (rClass.equals("EMBARGO")) - return 1; - - if (lClass.equals("RESTRICTED")) - return -1; - if (rClass.equals("RESTRICTED")) - return 1; - - if (lClass.equals("CLOSED")) - return -1; - if (rClass.equals("CLOSED")) - return 1; - - if (lClass.equals("UNKNOWN")) - return -1; - if (rClass.equals("UNKNOWN")) - return 1; - - // Else (but unlikely), lexicographical ordering will do.
- return lClass.compareTo(rClass); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/MainEntityType.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/MainEntityType.java deleted file mode 100644 index cda8ba484..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/MainEntityType.java +++ /dev/null @@ -1,7 +0,0 @@ - -package eu.dnetlib.dhp.schema.common; - -/** Main entity types in the Graph */ -public enum MainEntityType { - result, datasource, organization, project -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java deleted file mode 100644 index d759f0d55..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelConstants.java +++ /dev/null @@ -1,124 +0,0 @@ - -package eu.dnetlib.dhp.schema.common; - -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.Qualifier; - -public class ModelConstants { - - public static final String DNET_SUBJECT_TYPOLOGIES = "dnet:subject_classification_typologies"; - public static final String DNET_RESULT_TYPOLOGIES = "dnet:result_typologies"; - public static final String DNET_PUBLICATION_RESOURCE = "dnet:publication_resource"; - public static final String DNET_ACCESS_MODES = "dnet:access_modes"; - public static final String DNET_LANGUAGES = "dnet:languages"; - public static final String DNET_PID_TYPES = "dnet:pid_types"; - public static final String DNET_DATA_CITE_DATE = "dnet:dataCite_date"; - public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource"; - public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions"; - public static final String DNET_COUNTRY_TYPE = "dnet:countries"; - public static final String DNET_REVIEW_LEVELS = "dnet:review_levels"; - - public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository"; - public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry"; - public static final String USER_CLAIM = "user:claim"; - - public static final String DATASET_RESULTTYPE_CLASSID = "dataset"; - public static final String PUBLICATION_RESULTTYPE_CLASSID = "publication"; - public static final String SOFTWARE_RESULTTYPE_CLASSID = "software"; - public static final String ORP_RESULTTYPE_CLASSID = "other"; - - public static final String RESULT_RESULT = "resultResult"; - /** - * @deprecated Use {@link ModelConstants#RELATIONSHIP} instead. 
- */ - @Deprecated - public static final String PUBLICATION_DATASET = "publicationDataset"; - public static final String IS_RELATED_TO = "isRelatedTo"; - public static final String SUPPLEMENT = "supplement"; - public static final String IS_SUPPLEMENT_TO = "isSupplementTo"; - public static final String IS_SUPPLEMENTED_BY = "isSupplementedBy"; - public static final String PART = "part"; - public static final String IS_PART_OF = "isPartOf"; - public static final String HAS_PARTS = "hasParts"; - public static final String RELATIONSHIP = "relationship"; - public static final String CITATION = "citation"; - public static final String CITES = "cites"; - public static final String IS_CITED_BY = "isCitedBy"; - public static final String REVIEW = "review"; - public static final String REVIEWS = "reviews"; - public static final String IS_REVIEWED_BY = "isReviewedBy"; - - public static final String RESULT_PROJECT = "resultProject"; - public static final String OUTCOME = "outcome"; - public static final String IS_PRODUCED_BY = "isProducedBy"; - public static final String PRODUCES = "produces"; - - public static final String DATASOURCE_ORGANIZATION = "datasourceOrganization"; - public static final String PROVISION = "provision"; - public static final String IS_PROVIDED_BY = "isProvidedBy"; - public static final String PROVIDES = "provides"; - - public static final String PROJECT_ORGANIZATION = "projectOrganization"; - public static final String PARTICIPATION = "participation"; - public static final String HAS_PARTICIPANT = "hasParticipant"; - public static final String IS_PARTICIPANT = "isParticipant"; - - public static final String RESULT_ORGANIZATION = "resultOrganization"; - public static final String AFFILIATION = "affiliation"; - public static final String IS_AUTHOR_INSTITUTION_OF = "isAuthorInstitutionOf"; - public static final String HAS_AUTHOR_INSTITUTION = "hasAuthorInstitution"; - - public static final String MERGES = "merges"; - - public static final String UNKNOWN = "UNKNOWN"; - public static final String NOT_AVAILABLE = "not available"; - - public static final Qualifier PUBLICATION_DEFAULT_RESULTTYPE = qualifier( - PUBLICATION_RESULTTYPE_CLASSID, PUBLICATION_RESULTTYPE_CLASSID, - DNET_RESULT_TYPOLOGIES, DNET_RESULT_TYPOLOGIES); - - public static final Qualifier DATASET_DEFAULT_RESULTTYPE = qualifier( - DATASET_RESULTTYPE_CLASSID, DATASET_RESULTTYPE_CLASSID, - DNET_RESULT_TYPOLOGIES, DNET_RESULT_TYPOLOGIES); - - public static final Qualifier SOFTWARE_DEFAULT_RESULTTYPE = qualifier( - SOFTWARE_RESULTTYPE_CLASSID, SOFTWARE_RESULTTYPE_CLASSID, - DNET_RESULT_TYPOLOGIES, DNET_RESULT_TYPOLOGIES); - - public static final Qualifier ORP_DEFAULT_RESULTTYPE = qualifier( - ORP_RESULTTYPE_CLASSID, ORP_RESULTTYPE_CLASSID, - DNET_RESULT_TYPOLOGIES, DNET_RESULT_TYPOLOGIES); - - public static final Qualifier REPOSITORY_PROVENANCE_ACTIONS = qualifier( - SYSIMPORT_CROSSWALK_REPOSITORY, SYSIMPORT_CROSSWALK_REPOSITORY, - DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS); - - public static final Qualifier ENTITYREGISTRY_PROVENANCE_ACTION = qualifier( - SYSIMPORT_CROSSWALK_ENTITYREGISTRY, SYSIMPORT_CROSSWALK_ENTITYREGISTRY, - DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS); - - public static final KeyValue UNKNOWN_REPOSITORY = keyValue( - "10|openaire____::55045bd2a65019fd8e6741a755395c8c", "Unknown Repository"); - - private static Qualifier qualifier( - final String classid, - final String classname, - final String schemeid, - final String schemename) { - final Qualifier q = new Qualifier(); - 
q.setClassid(classid); - q.setClassname(classname); - q.setSchemeid(schemeid); - q.setSchemename(schemename); - return q; - } - - private static KeyValue keyValue(String key, String value) { - KeyValue kv = new KeyValue(); - kv.setKey(key); - kv.setValue(value); - kv.setDataInfo(new DataInfo()); - return kv; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java deleted file mode 100644 index b5bca2e93..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/ModelSupport.java +++ /dev/null @@ -1,476 +0,0 @@ - -package eu.dnetlib.dhp.schema.common; - -import static com.google.common.base.Preconditions.checkArgument; - -import java.util.Map; -import java.util.Objects; -import java.util.Optional; -import java.util.function.Function; - -import org.apache.commons.lang3.StringUtils; - -import com.google.common.collect.Maps; - -import eu.dnetlib.dhp.schema.oaf.*; - -/** Oaf model utility methods. */ -public class ModelSupport { - - /** Defines the mapping between the actual entity type and the main entity type */ - private static Map entityMapping = Maps.newHashMap(); - - static { - entityMapping.put(EntityType.publication, MainEntityType.result); - entityMapping.put(EntityType.dataset, MainEntityType.result); - entityMapping.put(EntityType.otherresearchproduct, MainEntityType.result); - entityMapping.put(EntityType.software, MainEntityType.result); - entityMapping.put(EntityType.datasource, MainEntityType.datasource); - entityMapping.put(EntityType.organization, MainEntityType.organization); - entityMapping.put(EntityType.project, MainEntityType.project); - } - - /** - * Defines the mapping between the actual entity types and the relative classes implementing them - */ - public static final Map entityTypes = Maps.newHashMap(); - - static { - entityTypes.put(EntityType.datasource, Datasource.class); - entityTypes.put(EntityType.organization, Organization.class); - entityTypes.put(EntityType.project, Project.class); - entityTypes.put(EntityType.dataset, Dataset.class); - entityTypes.put(EntityType.otherresearchproduct, OtherResearchProduct.class); - entityTypes.put(EntityType.software, Software.class); - entityTypes.put(EntityType.publication, Publication.class); - } - - public static final Map oafTypes = Maps.newHashMap(); - - static { - oafTypes.put("datasource", Datasource.class); - oafTypes.put("organization", Organization.class); - oafTypes.put("project", Project.class); - oafTypes.put("dataset", Dataset.class); - oafTypes.put("otherresearchproduct", OtherResearchProduct.class); - oafTypes.put("software", Software.class); - oafTypes.put("publication", Publication.class); - oafTypes.put("relation", Relation.class); - } - - public static final Map idPrefixMap = Maps.newHashMap(); - - static { - idPrefixMap.put(Datasource.class, "10"); - idPrefixMap.put(Organization.class, "20"); - idPrefixMap.put(Project.class, "40"); - idPrefixMap.put(Dataset.class, "50"); - idPrefixMap.put(OtherResearchProduct.class, "50"); - idPrefixMap.put(Software.class, "50"); - idPrefixMap.put(Publication.class, "50"); - } - - public static final Map entityIdPrefix = Maps.newHashMap(); - - static { - entityIdPrefix.put("datasource", "10"); - entityIdPrefix.put("organization", "20"); - entityIdPrefix.put("project", "40"); - entityIdPrefix.put("result", "50"); - } - - public static final Map idPrefixEntity = Maps.newHashMap(); - - static { - idPrefixEntity.put("10", "datasource"); - 
idPrefixEntity.put("20", "organization"); - idPrefixEntity.put("40", "project"); - idPrefixEntity.put("50", "result"); - } - - public static final Map relationInverseMap = Maps.newHashMap(); - - static { - relationInverseMap - .put( - "personResult_authorship_isAuthorOf", new RelationInverse() - .setRelation("isAuthorOf") - .setInverse("hasAuthor") - .setRelType("personResult") - .setSubReltype("authorship")); - relationInverseMap - .put( - "personResult_authorship_hasAuthor", new RelationInverse() - .setInverse("isAuthorOf") - .setRelation("hasAuthor") - .setRelType("personResult") - .setSubReltype("authorship")); - relationInverseMap - .put( - "projectOrganization_participation_isParticipant", new RelationInverse() - .setRelation("isParticipant") - .setInverse("hasParticipant") - .setRelType("projectOrganization") - .setSubReltype("participation")); - relationInverseMap - .put( - "projectOrganization_participation_hasParticipant", new RelationInverse() - .setInverse("isParticipant") - .setRelation("hasParticipant") - .setRelType("projectOrganization") - .setSubReltype("participation")); - relationInverseMap - .put( - "resultOrganization_affiliation_hasAuthorInstitution", new RelationInverse() - .setRelation("hasAuthorInstitution") - .setInverse("isAuthorInstitutionOf") - .setRelType("resultOrganization") - .setSubReltype("affiliation")); - relationInverseMap - .put( - "resultOrganization_affiliation_isAuthorInstitutionOf", new RelationInverse() - .setInverse("hasAuthorInstitution") - .setRelation("isAuthorInstitutionOf") - .setRelType("resultOrganization") - .setSubReltype("affiliation")); - relationInverseMap - .put( - "organizationOrganization_dedup_merges", new RelationInverse() - .setRelation("merges") - .setInverse("isMergedIn") - .setRelType("organizationOrganization") - .setSubReltype("dedup")); - relationInverseMap - .put( - "organizationOrganization_dedup_isMergedIn", new RelationInverse() - .setInverse("merges") - .setRelation("isMergedIn") - .setRelType("organizationOrganization") - .setSubReltype("dedup")); - relationInverseMap - .put( - "organizationOrganization_dedupSimilarity_isSimilarTo", new RelationInverse() - .setInverse("isSimilarTo") - .setRelation("isSimilarTo") - .setRelType("organizationOrganization") - .setSubReltype("dedupSimilarity")); - - relationInverseMap - .put( - "resultProject_outcome_isProducedBy", new RelationInverse() - .setRelation("isProducedBy") - .setInverse("produces") - .setRelType("resultProject") - .setSubReltype("outcome")); - relationInverseMap - .put( - "resultProject_outcome_produces", new RelationInverse() - .setInverse("isProducedBy") - .setRelation("produces") - .setRelType("resultProject") - .setSubReltype("outcome")); - relationInverseMap - .put( - "projectPerson_contactPerson_isContact", new RelationInverse() - .setRelation("isContact") - .setInverse("hasContact") - .setRelType("projectPerson") - .setSubReltype("contactPerson")); - relationInverseMap - .put( - "projectPerson_contactPerson_hasContact", new RelationInverse() - .setInverse("isContact") - .setRelation("hasContact") - .setRelType("personPerson") - .setSubReltype("coAuthorship")); - relationInverseMap - .put( - "personPerson_coAuthorship_isCoauthorOf", new RelationInverse() - .setInverse("isCoAuthorOf") - .setRelation("isCoAuthorOf") - .setRelType("personPerson") - .setSubReltype("coAuthorship")); - relationInverseMap - .put( - "personPerson_dedup_merges", new RelationInverse() - .setInverse("isMergedIn") - .setRelation("merges") - .setRelType("personPerson") - 
.setSubReltype("dedup")); - relationInverseMap - .put( - "personPerson_dedup_isMergedIn", new RelationInverse() - .setInverse("merges") - .setRelation("isMergedIn") - .setRelType("personPerson") - .setSubReltype("dedup")); - relationInverseMap - .put( - "personPerson_dedupSimilarity_isSimilarTo", new RelationInverse() - .setInverse("isSimilarTo") - .setRelation("isSimilarTo") - .setRelType("personPerson") - .setSubReltype("dedupSimilarity")); - relationInverseMap - .put( - "datasourceOrganization_provision_isProvidedBy", new RelationInverse() - .setInverse("provides") - .setRelation("isProvidedBy") - .setRelType("datasourceOrganization") - .setSubReltype("provision")); - relationInverseMap - .put( - "datasourceOrganization_provision_provides", new RelationInverse() - .setInverse("isProvidedBy") - .setRelation("provides") - .setRelType("datasourceOrganization") - .setSubReltype("provision")); - relationInverseMap - .put( - "resultResult_similarity_hasAmongTopNSimilarDocuments", new RelationInverse() - .setInverse("isAmongTopNSimilarDocuments") - .setRelation("hasAmongTopNSimilarDocuments") - .setRelType("resultResult") - .setSubReltype("similarity")); - relationInverseMap - .put( - "resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse() - .setInverse("hasAmongTopNSimilarDocuments") - .setRelation("isAmongTopNSimilarDocuments") - .setRelType("resultResult") - .setSubReltype("similarity")); - relationInverseMap - .put( - "resultResult_relationship_isRelatedTo", new RelationInverse() - .setInverse("isRelatedTo") - .setRelation("isRelatedTo") - .setRelType("resultResult") - .setSubReltype("relationship")); - relationInverseMap - .put( - "resultResult_similarity_isAmongTopNSimilarDocuments", new RelationInverse() - .setInverse("hasAmongTopNSimilarDocuments") - .setRelation("isAmongTopNSimilarDocuments") - .setRelType("resultResult") - .setSubReltype("similarity")); - relationInverseMap - .put( - "resultResult_supplement_isSupplementTo", new RelationInverse() - .setInverse("isSupplementedBy") - .setRelation("isSupplementTo") - .setRelType("resultResult") - .setSubReltype("supplement")); - relationInverseMap - .put( - "resultResult_supplement_isSupplementedBy", new RelationInverse() - .setInverse("isSupplementTo") - .setRelation("isSupplementedBy") - .setRelType("resultResult") - .setSubReltype("supplement")); - relationInverseMap - .put( - "resultResult_part_isPartOf", new RelationInverse() - .setInverse("hasPart") - .setRelation("isPartOf") - .setRelType("resultResult") - .setSubReltype("part")); - relationInverseMap - .put( - "resultResult_part_hasPart", new RelationInverse() - .setInverse("isPartOf") - .setRelation("hasPart") - .setRelType("resultResult") - .setSubReltype("part")); - relationInverseMap - .put( - "resultResult_dedup_merges", new RelationInverse() - .setInverse("isMergedIn") - .setRelation("merges") - .setRelType("resultResult") - .setSubReltype("dedup")); - relationInverseMap - .put( - "resultResult_dedup_isMergedIn", new RelationInverse() - .setInverse("merges") - .setRelation("isMergedIn") - .setRelType("resultResult") - .setSubReltype("dedup")); - relationInverseMap - .put( - "resultResult_dedupSimilarity_isSimilarTo", new RelationInverse() - .setInverse("isSimilarTo") - .setRelation("isSimilarTo") - .setRelType("resultResult") - .setSubReltype("dedupSimilarity")); - - } - - private static final String schemeTemplate = "dnet:%s_%s_relations"; - - private ModelSupport() { - } - - public static String getIdPrefix(Class clazz) { - return 
idPrefixMap.get(clazz); - } - - /** - * Checks subclass-superclass relationship. - * - * @param subClazzObject Subclass object instance - * @param superClazzObject Superclass object instance - * @param <X> Subclass type - * @param <Y> Superclass type - * @return True if X is a subclass of Y - */ - public static <X extends Oaf, Y extends Oaf> Boolean isSubClass( - X subClazzObject, Y superClazzObject) { - return isSubClass(subClazzObject.getClass(), superClazzObject.getClass()); - } - - /** - * Checks subclass-superclass relationship. - * - * @param subClazzObject Subclass object instance - * @param superClazz Superclass class - * @param <X> Subclass type - * @param <Y> Superclass type - * @return True if X is a subclass of Y - */ - public static <X extends Oaf, Y extends Oaf> Boolean isSubClass( - X subClazzObject, Class<Y> superClazz) { - return isSubClass(subClazzObject.getClass(), superClazz); - } - - /** - * Checks subclass-superclass relationship. - * - * @param subClazz Subclass class - * @param superClazz Superclass class - * @param <X> Subclass type - * @param <Y> Superclass type - * @return True if X is a subclass of Y - */ - public static <X extends Oaf, Y extends Oaf> Boolean isSubClass( - Class<X> subClazz, Class<Y> superClazz) { - return superClazz.isAssignableFrom(subClazz); - } - - /** - * Lists all the OAF model classes - * - * @return the array of OAF model classes - */ - public static Class[] getOafModelClasses() { - return new Class[] { - Author.class, - Context.class, - Country.class, - DataInfo.class, - Dataset.class, - Datasource.class, - ExternalReference.class, - ExtraInfo.class, - Field.class, - GeoLocation.class, - Instance.class, - Journal.class, - KeyValue.class, - Oaf.class, - OafEntity.class, - OAIProvenance.class, - Organization.class, - OriginDescription.class, - OtherResearchProduct.class, - Project.class, - Publication.class, - Qualifier.class, - Relation.class, - Result.class, - Software.class, - StructuredProperty.class - }; - } - - public static String getMainType(final EntityType type) { - return entityMapping.get(type).name(); - } - - public static boolean isResult(EntityType type) { - return MainEntityType.result.name().equals(getMainType(type)); - } - - public static String getScheme(final String sourceType, final String targetType) { - return String - .format( - schemeTemplate, - entityMapping.get(EntityType.valueOf(sourceType)).name(), - entityMapping.get(EntityType.valueOf(targetType)).name()); - } - - public static String tableIdentifier(String dbName, String tableName) { - - checkArgument(StringUtils.isNotBlank(dbName), "DB name cannot be empty"); - checkArgument(StringUtils.isNotBlank(tableName), "table name cannot be empty"); - - return String.format("%s.%s", dbName, tableName); - } - - public static <T extends Oaf> String tableIdentifier(String dbName, Class<T> clazz) { - - checkArgument(Objects.nonNull(clazz), "clazz is needed to derive the table name, thus cannot be null"); - - return tableIdentifier(dbName, clazz.getSimpleName().toLowerCase()); - } - - public static <T extends Oaf> Function<T, String> idFn() { - return x -> { - if (isSubClass(x, Relation.class)) { - return idFnForRelation(x); - } - return idFnForOafEntity(x); - }; - } - - private static <T extends Oaf> String idFnForRelation(T t) { - Relation r = (Relation) t; - return Optional - .ofNullable(r.getSource()) - .map( - source -> Optional - .ofNullable(r.getTarget()) - .map( - target -> Optional - .ofNullable(r.getRelType()) - .map( - relType -> Optional - .ofNullable(r.getSubRelType()) - .map( - subRelType -> Optional - .ofNullable(r.getRelClass()) - .map( - relClass -> String - .join( - source, - target, - relType, - subRelType, - relClass)) - .orElse( - String - .join( - 
source, - target, - relType, - subRelType))) - .orElse(String.join(source, target, relType))) - .orElse(String.join(source, target))) - .orElse(source)) - .orElse(null); - } - - private static <T extends Oaf> String idFnForOafEntity(T t) { - return ((OafEntity) t).getId(); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/RelationInverse.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/RelationInverse.java deleted file mode 100644 index 4757c637e..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/common/RelationInverse.java +++ /dev/null @@ -1,46 +0,0 @@ - -package eu.dnetlib.dhp.schema.common; - -public class RelationInverse { - private String relation; - private String inverse; - private String relType; - private String subReltype; - - public String getRelType() { - return relType; - } - - public RelationInverse setRelType(String relType) { - this.relType = relType; - return this; - } - - public String getSubReltype() { - return subReltype; - } - - public RelationInverse setSubReltype(String subReltype) { - this.subReltype = subReltype; - return this; - } - - public String getRelation() { - return relation; - } - - public RelationInverse setRelation(String relation) { - this.relation = relation; - return this; - } - - public String getInverse() { - return inverse; - } - - public RelationInverse setInverse(String inverse) { - this.inverse = inverse; - return this; - } - -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/APC.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/APC.java deleted file mode 100644 index 7f5dcb397..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/APC.java +++ /dev/null @@ -1,29 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf; - -import java.io.Serializable; - -/** - * Used to refer to the Article Processing Charge information. Not dumped in this release. It contains two parameters: - - * currency of type String to store the currency of the APC - amount of type String to store the charged amount - */ -public class APC implements Serializable { - private String currency; - private String amount; - - public String getCurrency() { - return currency; - } - - public void setCurrency(String currency) { - this.currency = currency; - } - - public String getAmount() { - return amount; - } - - public void setAmount(String amount) { - this.amount = amount; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/AccessRight.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/AccessRight.java deleted file mode 100644 index f28c544f6..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/AccessRight.java +++ /dev/null @@ -1,31 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf; - -/** - * AccessRight. Used to represent the result access rights. It extends the eu.dnetlib.dhp.schema.dump.oaf.Qualifier - * element with a parameter scheme of type String to store the scheme. Values for this element are found against the - * COAR access right scheme. The classid of the element accessright in eu.dnetlib.dhp.schema.oaf.Result is used to get - * the COAR corresponding code whose value will be used to set the code parameter. The COAR label corresponding to the - * COAR code will be used to set the label parameter. 
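For orientation, a minimal sketch of how the relationInverseMap defined above could be used to resolve a relation and its inverse; the map keys follow the relType_subRelType_relClass pattern visible in the entries, and the lookup key below is one of them:

    import eu.dnetlib.dhp.schema.common.ModelSupport;
    import eu.dnetlib.dhp.schema.common.RelationInverse;

    public class RelationInverseLookup {
        public static void main(String[] args) {
            // key pattern: relType_subRelType_relClass
            RelationInverse ri = ModelSupport.relationInverseMap.get("resultProject_outcome_isProducedBy");
            System.out.println(ri.getRelation());   // isProducedBy
            System.out.println(ri.getInverse());    // produces
            System.out.println(ri.getRelType());    // resultProject
            System.out.println(ri.getSubReltype()); // outcome
        }
    }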
The scheme value will always be the one referring to the COAR - * access right scheme - */ -public class AccessRight extends Qualifier { - - private String scheme; - - public String getScheme() { - return scheme; - } - - public void setScheme(String scheme) { - this.scheme = scheme; - } - - public static AccessRight newInstance(String code, String label, String scheme) { - AccessRight ar = new AccessRight(); - ar.setCode(code); - ar.setLabel(label); - ar.setScheme(scheme); - return ar; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Author.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Author.java deleted file mode 100644 index 34920bcf7..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Author.java +++ /dev/null @@ -1,73 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf; - -import java.io.Serializable; -import java.util.List; - -/** - * Used to represent the generic author of the result. It has five parameters: - name of type String to store the given - * name of the author. The value for this parameter corresponds to eu.dnetlib.dhp.schema.oaf.Author name - surname of - * type String to store the family name of the author. The value for this parameter corresponds to - * eu.dnetlib.dhp.schema.oaf.Author surname - fullname of type String to store the fullname of the author. The value for - * this parameter corresponds to eu.dnetlib.dhp.schema.oaf.Author fullname - rank of type Integer to store the rank of - * the author in the result's authors list. The value for this parameter corresponds to eu.dnetlib.dhp.schema.oaf.Author - * rank - pid of type eu.dnetlib.dhp.schema.dump.oaf.Pid to store the persistent identifier for the author. For the - * moment only ORCID identifiers will be dumped. - The id element is instantiated by using the following values in the - * eu.dnetlib.dhp.schema.oaf.Result pid: * Qualifier.classid for scheme * value for value - The provenance element is - * instantiated only if the dataInfo is set for the pid in the result to be dumped. 
The provenance element is - * instantiated by using the following values in the eu.dnetlib.dhp.schema.oaf.Result pid: * - * dataInfo.provenanceaction.classname for provenance * dataInfo.trust for trust - */ -public class Author implements Serializable { - - private String fullname; - - private String name; - - private String surname; - - private Integer rank; - - private Pid pid; - - public String getFullname() { - return fullname; - } - - public void setFullname(String fullname) { - this.fullname = fullname; - } - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public String getSurname() { - return surname; - } - - public void setSurname(String surname) { - this.surname = surname; - } - - public Integer getRank() { - return rank; - } - - public void setRank(Integer rank) { - this.rank = rank; - } - - public Pid getPid() { - return pid; - } - - public void setPid(Pid pid) { - this.pid = pid; - } - -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Container.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Container.java deleted file mode 100644 index 8699528ca..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Container.java +++ /dev/null @@ -1,136 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf; - -import java.io.Serializable; -import java.util.Objects; - -/** - * To store information about the conference or journal where the result has been presented or published. It contains - * eleven parameters: - name of type String to store the name of the journal or conference. It corresponds to the - * parameter name of eu.dnetlib.dhp.schema.oaf.Journal - issnPrinted of type String to store the journal printed issn. - * It corresponds to the parameter issnPrinted of eu.dnetlib.dhp.schema.oaf.Journal - issnOnline of type String to store - * the journal online issn. It corresponds to the parameter issnOnline of eu.dnetlib.dhp.schema.oaf.Journal - - * issnLinking of type String to store the journal linking issn. It corresponds to the parameter issnLinking of - * eu.dnetlib.dhp.schema.oaf.Journal - ep of type String to store the end page. It corresponds to the parameter ep of - * eu.dnetlib.dhp.schema.oaf.Journal - iss of type String to store the journal issue. It corresponds to the parameter - * iss of eu.dnetlib.dhp.schema.oaf.Journal - sp of type String to store the start page. It corresponds to the parameter - * sp of eu.dnetlib.dhp.schema.oaf.Journal - vol of type String to store the Volume. It corresponds to the parameter vol - * of eu.dnetlib.dhp.schema.oaf.Journal - edition of type String to store the edition of the journal or conference - * proceeding. It corresponds to the parameter edition of eu.dnetlib.dhp.schema.oaf.Journal - conferenceplace of type - * String to store the place of the conference. It corresponds to the parameter conferenceplace of - * eu.dnetlib.dhp.schema.oaf.Journal - conferencedate of type String to store the date of the conference. 
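As a sketch of the mapping the Javadoc above describes, this is how an Author in the dump model might be filled in; all field values, the pid scheme and the provenance/trust strings are invented for illustration:

    import eu.dnetlib.dhp.schema.dump.oaf.Author;
    import eu.dnetlib.dhp.schema.dump.oaf.ControlledField;
    import eu.dnetlib.dhp.schema.dump.oaf.Pid;
    import eu.dnetlib.dhp.schema.dump.oaf.Provenance;

    public class AuthorSketch {
        public static void main(String[] args) {
            Author a = new Author();
            a.setName("Jane");
            a.setSurname("Doe");
            a.setFullname("Doe, Jane");
            a.setRank(1); // position in the result's author list
            // only ORCID identifiers are dumped, per the Javadoc above
            a.setPid(
                Pid.newInstance(
                    ControlledField.newInstance("orcid", "0000-0000-0000-0000"),
                    Provenance.newInstance("sysimport:crosswalk", "0.9")));
        }
    }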
It corresponds - * to the parameter conferencedate of eu.dnetlib.dhp.schema.oaf.Journal - */ -public class Container implements Serializable { - - private String name; - - private String issnPrinted; - - private String issnOnline; - - private String issnLinking; - - private String ep; - - private String iss; - - private String sp; - - private String vol; - - private String edition; - - private String conferenceplace; - - private String conferencedate; - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public String getIssnPrinted() { - return issnPrinted; - } - - public void setIssnPrinted(String issnPrinted) { - this.issnPrinted = issnPrinted; - } - - public String getIssnOnline() { - return issnOnline; - } - - public void setIssnOnline(String issnOnline) { - this.issnOnline = issnOnline; - } - - public String getIssnLinking() { - return issnLinking; - } - - public void setIssnLinking(String issnLinking) { - this.issnLinking = issnLinking; - } - - public String getEp() { - return ep; - } - - public void setEp(String ep) { - this.ep = ep; - } - - public String getIss() { - return iss; - } - - public void setIss(String iss) { - this.iss = iss; - } - - public String getSp() { - return sp; - } - - public void setSp(String sp) { - this.sp = sp; - } - - public String getVol() { - return vol; - } - - public void setVol(String vol) { - this.vol = vol; - } - - public String getEdition() { - return edition; - } - - public void setEdition(String edition) { - this.edition = edition; - } - - public String getConferenceplace() { - return conferenceplace; - } - - public void setConferenceplace(String conferenceplace) { - this.conferenceplace = conferenceplace; - } - - public String getConferencedate() { - return conferencedate; - } - - public void setConferencedate(String conferencedate) { - this.conferencedate = conferencedate; - } - -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/ControlledField.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/ControlledField.java deleted file mode 100644 index cad7b8b5c..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/ControlledField.java +++ /dev/null @@ -1,38 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf; - -import java.io.Serializable; - -/** - * To represent the information described by a scheme and a value in that scheme (i.e. pid). It has two parameters: - - * scheme of type String to store the scheme - value of type String to store the value in that scheme - */ -public class ControlledField implements Serializable { - private String scheme; - private String value; - - public String getScheme() { - return scheme; - } - - public void setScheme(String scheme) { - this.scheme = scheme; - } - - public String getValue() { - return value; - } - - public void setValue(String value) { - this.value = value; - } - - public static ControlledField newInstance(String scheme, String value) { - ControlledField cf = new ControlledField(); - - cf.setScheme(scheme); - cf.setValue(value); - - return cf; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Country.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Country.java deleted file mode 100644 index 3ab4d90fe..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Country.java +++ /dev/null @@ -1,37 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf; - -/** - * Represents the country associated to this result. 
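A quick sketch of the ControlledField factory shown above, representing a scheme/value pair such as a persistent identifier (the DOI is illustrative):

    import eu.dnetlib.dhp.schema.dump.oaf.ControlledField;

    public class ControlledFieldSketch {
        public static void main(String[] args) {
            ControlledField doi = ControlledField.newInstance("doi", "10.1000/xyz123");
            System.out.println(doi.getScheme() + " -> " + doi.getValue());
        }
    }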
It extends eu.dnetlib.dhp.schema.dump.oaf.Qualifier with a - * provenance parameter of type eu.dnetlib.dhp.schema.dump.oaf.Provenance. The country is not mapped if its value in the - * result represented in the internal format is Unknown. The value for this element corresponds to: - code corresponds - * to the classid of eu.dnetlib.dhp.schema.oaf.Country - label corresponds to the classname of - * eu.dnetlib.dhp.schema.oaf.Country - provenance set only if the dataInfo associated to the Country of the result to be - * dumped is not null. In this case: - provenance corresponds to dataInfo.provenanceaction.classid (to be modified with - * datainfo.provenanceaction.classname) - trust corresponds to dataInfo.trust - */ -public class Country extends Qualifier { - - private Provenance provenance; - - public Provenance getProvenance() { - return provenance; - } - - public void setProvenance(Provenance provenance) { - this.provenance = provenance; - } - - public static Country newInstance(String code, String label, Provenance provenance) { - Country c = new Country(); - c.setProvenance(provenance); - c.setCode(code); - c.setLabel(label); - return c; - } - - public static Country newInstance(String code, String label, String provenance, String trust) { - return newInstance(code, label, Provenance.newInstance(provenance, trust)); - } - -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Funder.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Funder.java deleted file mode 100644 index 16cab22cc..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Funder.java +++ /dev/null @@ -1,36 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf; - -import java.io.Serializable; - -public class Funder implements Serializable { - private String shortName; - - private String name; - - private String jurisdiction; - - public String getJurisdiction() { - return jurisdiction; - } - - public void setJurisdiction(String jurisdiction) { - this.jurisdiction = jurisdiction; - } - - public String getShortName() { - return shortName; - } - - public void setShortName(String shortName) { - this.shortName = shortName; - } - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/GeoLocation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/GeoLocation.java deleted file mode 100644 index 6bd891bbd..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/GeoLocation.java +++ /dev/null @@ -1,53 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf; - -import java.io.Serializable; - -import org.apache.commons.lang3.StringUtils; - -import com.fasterxml.jackson.annotation.JsonIgnore; - -/** - * Represents the geolocation information. It has three parameters: - point of type String to store the point - * information. It corresponds to eu.dnetlib.dhp.schema.oaf.GeoLocation point - box of type String to store the box - * information. It corresponds to eu.dnetlib.dhp.schema.oaf.GeoLocation box - place of type String to store the place - * information. 
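A minimal sketch of the two Country factories shown above; code/label come from the internal Country classid/classname, and the provenance and trust values here are invented:

    import eu.dnetlib.dhp.schema.dump.oaf.Country;
    import eu.dnetlib.dhp.schema.dump.oaf.Provenance;

    public class CountrySketch {
        public static void main(String[] args) {
            // with an explicit Provenance instance ...
            Country c1 = Country.newInstance("IT", "Italy", Provenance.newInstance("sysimport", "0.9"));
            // ... or with the overload that builds the Provenance internally
            Country c2 = Country.newInstance("FI", "Finland", "sysimport", "0.9");
        }
    }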
It corresponds to eu.dnetlib.dhp.schema.oaf.GeoLocation place - */ -public class GeoLocation implements Serializable { - - private String point; - - private String box; - - private String place; - - public String getPoint() { - return point; - } - - public void setPoint(String point) { - this.point = point; - } - - public String getBox() { - return box; - } - - public void setBox(String box) { - this.box = box; - } - - public String getPlace() { - return place; - } - - public void setPlace(String place) { - this.place = place; - } - - @JsonIgnore - public boolean isBlank() { - return StringUtils.isBlank(point) && StringUtils.isBlank(box) && StringUtils.isBlank(place); - } - -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Instance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Instance.java deleted file mode 100644 index 4a09f5a86..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Instance.java +++ /dev/null @@ -1,107 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf; - -import java.io.Serializable; -import java.util.List; - -/** - * Represents the manifestations (i.e. different versions) of the result. For example: the pre-print and the published - * versions are two manifestations of the same research result. It has the following parameters: - license of type - * String to store the license applied to the instance. It corresponds to the value of the licence in the instance to be - * dumped - accessright of type eu.dnetlib.dhp.schema.dump.oaf.AccessRight to store the accessright of the instance. - - * type of type String to store the type of the instance as defined in the corresponding dnet vocabulary - * (dnet:publication_resource). It corresponds to the instancetype.classname of the instance to be mapped - hostedby of - * type eu.dnetlib.dhp.schema.dump.oaf.KeyValue to store the information about the source from which the instance can be - * viewed or downloaded. It is mapped against the hostedby parameter of the instance to be dumped and - key corresponds - * to hostedby.key - value corresponds to hostedby.value - url of type List<String> list of locations where the instance - * is accessible. It corresponds to url of the instance to be dumped - collectedfrom of type - * eu.dnetlib.dhp.schema.dump.oaf.KeyValue to store the information about the source from which the instance has been - * collected. It is mapped against the collectedfrom parameter of the instance to be dumped and - key corresponds to - * collectedfrom.key - value corresponds to collectedfrom.value - publicationdate of type String to store the - * publication date of the instance ;// dateofacceptance; - refereed of type String to store information about the - * review status of the instance. Possible values are 'Unknown', 'nonPeerReviewed', 'peerReviewed'. 
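A short sketch of the GeoLocation class above and its isBlank helper (coordinates and place are invented):

    import eu.dnetlib.dhp.schema.dump.oaf.GeoLocation;

    public class GeoLocationSketch {
        public static void main(String[] args) {
            GeoLocation g = new GeoLocation();
            System.out.println(g.isBlank()); // true: point, box and place are all blank
            g.setPoint("45.0703 7.6869");
            g.setPlace("Turin");
            System.out.println(g.isBlank()); // false
        }
    }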
It corresponds to - * refereed.classname of the instance to be dumped - */ -public class Instance implements Serializable { - - private String license; - - private AccessRight accessright; - - private String type; - - private KeyValue hostedby; - - private List<String> url; - - private KeyValue collectedfrom; - - private String publicationdate;// dateofacceptance; - - private String refereed; // peer-review status - - public String getLicense() { - return license; - } - - public void setLicense(String license) { - this.license = license; - } - - public AccessRight getAccessright() { - return accessright; - } - - public void setAccessright(AccessRight accessright) { - this.accessright = accessright; - } - - public String getType() { - return type; - } - - public void setType(String type) { - this.type = type; - } - - public KeyValue getHostedby() { - return hostedby; - } - - public void setHostedby(KeyValue hostedby) { - this.hostedby = hostedby; - } - - public List<String> getUrl() { - return url; - } - - public void setUrl(List<String> url) { - this.url = url; - } - - public KeyValue getCollectedfrom() { - return collectedfrom; - } - - public void setCollectedfrom(KeyValue collectedfrom) { - this.collectedfrom = collectedfrom; - } - - public String getPublicationdate() { - return publicationdate; - } - - public void setPublicationdate(String publicationdate) { - this.publicationdate = publicationdate; - } - - public String getRefereed() { - return refereed; - } - - public void setRefereed(String refereed) { - this.refereed = refereed; - } - -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/KeyValue.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/KeyValue.java deleted file mode 100644 index 849aa4d3c..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/KeyValue.java +++ /dev/null @@ -1,48 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf; - -import java.io.Serializable; - -import org.apache.commons.lang3.StringUtils; - -import com.fasterxml.jackson.annotation.JsonIgnore; - -/** - * To represent the information described by a key and a value. It has two parameters: - key to store the key (generally - * the OpenAIRE id for some entity) - value to store the value (generally the OpenAIRE name for the key) - */ -public class KeyValue implements Serializable { - - private String key; - - private String value; - - public String getKey() { - return key; - } - - public void setKey(String key) { - this.key = key; - } - - public String getValue() { - return value; - } - - public void setValue(String value) { - this.value = value; - } - - public static KeyValue newInstance(String key, String value) { - KeyValue inst = new KeyValue(); - inst.key = key; - inst.value = value; - return inst; - } - - @JsonIgnore - public boolean isBlank() { - return StringUtils.isBlank(key) && StringUtils.isBlank(value); - } - -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Pid.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Pid.java deleted file mode 100644 index 786ddb1d7..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Pid.java +++ /dev/null @@ -1,45 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf; - -import java.io.Serializable; - -/** - * To represent the generic persistent identifier. It has two parameters: - id of type - * eu.dnetlib.dhp.schema.dump.oaf.ControlledField to store the scheme and value of the Persistent Identifier. 
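A sketch of KeyValue.newInstance from above, typically pairing an OpenAIRE id with its name (both sample values invented):

    import eu.dnetlib.dhp.schema.dump.oaf.KeyValue;

    public class KeyValueSketch {
        public static void main(String[] args) {
            KeyValue hostedby = KeyValue.newInstance("10|openaire____::abc123", "Example Repository");
            System.out.println(hostedby.isBlank()); // false: both key and value are set
        }
    }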
- - * provenance of type eu.dnetlib.dhp.schema.dump.oaf.Provenance to store the provenance and trust of the information - */ -public class Pid implements Serializable { - private ControlledField id; - private Provenance provenance; - - public ControlledField getId() { - return id; - } - - public void setId(ControlledField pid) { - this.id = pid; - } - - public Provenance getProvenance() { - return provenance; - } - - public void setProvenance(Provenance provenance) { - this.provenance = provenance; - } - - public static Pid newInstance(ControlledField pid, Provenance provenance) { - Pid p = new Pid(); - p.id = pid; - p.provenance = provenance; - - return p; - } - - public static Pid newInstance(ControlledField pid) { - Pid p = new Pid(); - p.id = pid; - - return p; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Project.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Project.java deleted file mode 100644 index f23d5a670..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Project.java +++ /dev/null @@ -1,45 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf; - -import java.io.Serializable; - -public class Project implements Serializable { - protected String id;// OpenAIRE id - protected String code; - - protected String acronym; - - protected String title; - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public String getCode() { - return code; - } - - public void setCode(String code) { - this.code = code; - } - - public String getAcronym() { - return acronym; - } - - public void setAcronym(String acronym) { - this.acronym = acronym; - } - - public String getTitle() { - return title; - } - - public void setTitle(String title) { - this.title = title; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Provenance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Provenance.java deleted file mode 100644 index 28fb3aaa6..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Provenance.java +++ /dev/null @@ -1,41 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf; - -import java.io.Serializable; - -/** - * Indicates the process that produced (or provided) the information, and the trust associated to the information. 
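A sketch combining the Pid factories above with ControlledField and Provenance (identifier and provenance values invented):

    import eu.dnetlib.dhp.schema.dump.oaf.ControlledField;
    import eu.dnetlib.dhp.schema.dump.oaf.Pid;
    import eu.dnetlib.dhp.schema.dump.oaf.Provenance;

    public class PidSketch {
        public static void main(String[] args) {
            ControlledField id = ControlledField.newInstance("doi", "10.1000/xyz123");
            // with provenance, when the dataInfo of the original pid is available ...
            Pid withProvenance = Pid.newInstance(id, Provenance.newInstance("harvested", "0.9"));
            // ... or without it
            Pid bare = Pid.newInstance(id);
        }
    }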
It - * has two parameters: - provenance of type String to store the provenance of the information, - trust of type String to - * store the trust associated to the information - */ -public class Provenance implements Serializable { - private String provenance; - private String trust; - - public String getProvenance() { - return provenance; - } - - public void setProvenance(String provenance) { - this.provenance = provenance; - } - - public String getTrust() { - return trust; - } - - public void setTrust(String trust) { - this.trust = trust; - } - - public static Provenance newInstance(String provenance, String trust) { - Provenance p = new Provenance(); - p.provenance = provenance; - p.trust = trust; - return p; - } - - public String toString() { - return provenance + trust; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Qualifier.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Qualifier.java deleted file mode 100644 index 348c22b31..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Qualifier.java +++ /dev/null @@ -1,42 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf; - -import java.io.Serializable; - -import org.apache.commons.lang3.StringUtils; - -import com.fasterxml.jackson.annotation.JsonIgnore; - -/** - * To represent the information described by a code and a value. It has two parameters: - code to store the code - * (generally the classid of the eu.dnetlib.dhp.schema.oaf.Qualifier element) - label to store the label (generally the - * classname of the eu.dnetlib.dhp.schema.oaf.Qualifier element) - */ -public class Qualifier implements Serializable { - - private String code; // the classid in the Qualifier - private String label; // the classname in the Qualifier - - public String getCode() { - return code; - } - - public void setCode(String code) { - this.code = code; - } - - public String getLabel() { - return label; - } - - public void setLabel(String label) { - this.label = label; - } - - public static Qualifier newInstance(String code, String value) { - Qualifier qualifier = new Qualifier(); - qualifier.setCode(code); - qualifier.setLabel(value); - return qualifier; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Result.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Result.java deleted file mode 100644 index 97ee72259..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Result.java +++ /dev/null @@ -1,391 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf; - -import java.io.Serializable; -import java.util.List; - -import eu.dnetlib.dhp.schema.dump.oaf.community.Project; - -/** - * To represent the dumped result. It will be extended in the dump for Research Communities - Research - * Initiative/Infrastructures. It has the following parameters: - author of type - * List<eu.dnetlib.dhp.schema.dump.oaf.Author> to describe the authors of a result. For each author in the result - * represented in the internal model one author in the external model is produced. - type of type String to represent - * the category of the result. Possible values are publication, dataset, software, other. It corresponds to - * resulttype.classname of the dumped result - language of type eu.dnetlib.dhp.schema.dump.oaf.Qualifier to store - * information about the language of the result. It is dumped as - code corresponds to language.classid - value - * corresponds to language.classname - country of type List<eu.dnetlib.dhp.schema.dump.oaf.Country> to store the country - * list to which the result is associated. 
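A one-line sketch of Qualifier.newInstance from above; the code/label pair mirrors the internal classid/classname (sample values invented):

    import eu.dnetlib.dhp.schema.dump.oaf.Qualifier;

    public class QualifierSketch {
        public static void main(String[] args) {
            Qualifier language = Qualifier.newInstance("eng", "English");
            System.out.println(language.getCode() + " / " + language.getLabel());
        }
    }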
For each country in the result represented in the internal model one country - * in the external model is produced - subjects of type List<eu.dnetlib.dhp.schema.dump.oaf.Subject> to store the subjects for - * the result. For each subject in the result represented in the internal model one subject in the external model is - * produced - maintitle of type String to store the main title of the result. It corresponds to the value of the first - * title in the result to be dumped having classid equal to "main title" - subtitle of type String to store the subtitle - * of the result. It corresponds to the value of the first title in the result to be dumped having classid equal to - * "subtitle" - description of type List<String> to store the description of the result. It corresponds to the list of - * description.value in the result represented in the internal model - publicationdate of type String to store the - * publication date. It corresponds to dateofacceptance.value in the result represented in the internal model - - * publisher of type String to store information about the publisher. It corresponds to publisher.value of the result - * represented in the internal model - embargoenddate of type String to store the embargo end date. It corresponds to - * embargoenddate.value of the result represented in the internal model - source of type List<String>. See definition of - * Dublin Core field dc:source. It corresponds to the list of source.value in the result represented in the internal - * model - format of type List<String>. It corresponds to the list of format.value in the result represented in the - * internal model - contributor of type List<String> to represent contributors for this result. It corresponds to the - * list of contributor.value in the result represented in the internal model - coverage of type List<String>. It corresponds - * to the list of coverage.value in the result represented in the internal model - bestaccessright of type - * eu.dnetlib.dhp.schema.dump.oaf.AccessRight to store information about the most open access right associated to the - * manifestations of this research result. It corresponds to the same parameter in the result represented in the - * internal model - instance of type List<eu.dnetlib.dhp.schema.dump.oaf.Instance> to store all the instances associated - * to the result. It corresponds to the same parameter in the result represented in the internal model - container of - * type eu.dnetlib.dhp.schema.dump.oaf.Container (only for result of type publication). It corresponds to the parameter - * journal of the result represented in the internal model - documentationUrl of type List<String> (only for results of - * type software) to store the URLs to the software documentation. It corresponds to the list of documentationUrl.value - * of the result represented in the internal model - codeRepositoryUrl of type String (only for results of type - * software) to store the URL to the repository with the source code. It corresponds to codeRepositoryUrl.value of the - * result represented in the internal model - programmingLanguage of type String (only for results of type software) to - * store the programming language. It corresponds to programmingLanguage.classid of the result represented in the - * internal model - contactperson of type List<String> (only for results of type other) to store the contact person for - * this result. It corresponds to the list of contactperson.value of the result represented in the internal model - - * contactgroup of type List<String> (only for results of type other) to store the information for the contact group. 
It - * corresponds to the list of contactgroup.value of the result represented in the internal model - tool of type - * List<String> (only for results of type other) to store information about tools useful for the interpretation and/or - * re-use of the research product. It corresponds to the list of tool.value in the result represented in the internal - * model - size of type String (only for results of type dataset) to store the size of the dataset. It corresponds to - * size.value in the result represented in the internal model - version of type String (only for results of type - * dataset) to store the version. It corresponds to version.value of the result represented in the internal model - - * geolocation of type List<eu.dnetlib.dhp.schema.dump.oaf.GeoLocation> (only for results of type dataset) to store - * geolocation information. For each geolocation element in the result represented in the internal model a GeoLocation - * in the external model is produced - id of type String to store the OpenAIRE id of the result. It corresponds to the - * id of the result represented in the internal model - originalId of type List<String> to store the original ids of the - * result. It corresponds to the originalId of the result represented in the internal model - pid of type - * List<eu.dnetlib.dhp.schema.dump.oaf.ControlledField> to store the persistent identifiers for the result. For each pid - * in the results represented in the internal model one pid in the external model is produced. The value correspondence - * is: - scheme corresponds to pid.qualifier.classid of the result represented in the internal model - value corresponds - * to the pid.value of the result represented in the internal model - dateofcollection of type String to store - * information about the time OpenAIRE collected the record. It corresponds to dateofcollection of the result - * represented in the internal model - lastupdatetimestamp of type Long to store the timestamp of the last update of - * the record. 
It corresponds to lastupdatetimestamp of the record represented in the internal model - */ -public class Result implements Serializable { - - private List<Author> author; - - // resulttype allows subclassing results into publications | datasets | software - private String type; // resulttype - - // common fields - private Qualifier language; - - private List<Country> country; - - private List<Subject> subjects; - - private String maintitle; - - private String subtitle; - - private List<String> description; - - private String publicationdate; // dateofacceptance; - - private String publisher; - - private String embargoenddate; - - private List<String> source; - - private List<String> format; - - private List<String> contributor; - - private List<String> coverage; - - private AccessRight bestaccessright; - - private List<Instance> instance; - - private Container container;// Journal - - private List<String> documentationUrl; // software - - private String codeRepositoryUrl; // software - - private String programmingLanguage; // software - - private List<String> contactperson; // orp - - private List<String> contactgroup; // orp - - private List<String> tool; // orp - - private String size; // dataset - - private String version; // dataset - - private List<GeoLocation> geolocation; // dataset - - private String id; - - private List<String> originalId; - - private List<ControlledField> pid; - - private String dateofcollection; - - private Long lastupdatetimestamp; - - public Long getLastupdatetimestamp() { - return lastupdatetimestamp; - } - - public void setLastupdatetimestamp(Long lastupdatetimestamp) { - this.lastupdatetimestamp = lastupdatetimestamp; - } - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public List<String> getOriginalId() { - return originalId; - } - - public void setOriginalId(List<String> originalId) { - this.originalId = originalId; - } - - public List<ControlledField> getPid() { - return pid; - } - - public void setPid(List<ControlledField> pid) { - this.pid = pid; - } - - public String getDateofcollection() { - return dateofcollection; - } - - public void setDateofcollection(String dateofcollection) { - this.dateofcollection = dateofcollection; - } - - public List<Author> getAuthor() { - return author; - } - - public String getType() { - return type; - } - - public void setType(String type) { - this.type = type; - } - - public Container getContainer() { - return container; - } - - public void setContainer(Container container) { - this.container = container; - } - - public void setAuthor(List<Author> author) { - this.author = author; - } - - public Qualifier getLanguage() { - return language; - } - - public void setLanguage(Qualifier language) { - this.language = language; - } - - public List<Country> getCountry() { - return country; - } - - public void setCountry(List<Country> country) { - this.country = country; - } - - public List<Subject> getSubjects() { - return subjects; - } - - public void setSubjects(List<Subject> subjects) { - this.subjects = subjects; - } - - public String getMaintitle() { - return maintitle; - } - - public void setMaintitle(String maintitle) { - this.maintitle = maintitle; - } - - public String getSubtitle() { - return subtitle; - } - - public void setSubtitle(String subtitle) { - this.subtitle = subtitle; - } - - public List<String> getDescription() { - return description; - } - - public void setDescription(List<String> description) { - this.description = description; - } - - public String getPublicationdate() { - return publicationdate; - } - - public void setPublicationdate(String publicationdate) { - this.publicationdate = publicationdate; - } - - public String getPublisher() { - return publisher; - } - - public void setPublisher(String 
publisher) { - this.publisher = publisher; - } - - public String getEmbargoenddate() { - return embargoenddate; - } - - public void setEmbargoenddate(String embargoenddate) { - this.embargoenddate = embargoenddate; - } - - public List<String> getSource() { - return source; - } - - public void setSource(List<String> source) { - this.source = source; - } - - public List<String> getFormat() { - return format; - } - - public void setFormat(List<String> format) { - this.format = format; - } - - public List<String> getContributor() { - return contributor; - } - - public void setContributor(List<String> contributor) { - this.contributor = contributor; - } - - public List<String> getCoverage() { - return coverage; - } - - public void setCoverage(List<String> coverage) { - this.coverage = coverage; - } - - public AccessRight getBestaccessright() { - return bestaccessright; - } - - public void setBestaccessright(AccessRight bestaccessright) { - this.bestaccessright = bestaccessright; - } - - public List<Instance> getInstance() { - return instance; - } - - public void setInstance(List<Instance> instance) { - this.instance = instance; - } - - public List<String> getDocumentationUrl() { - return documentationUrl; - } - - public void setDocumentationUrl(List<String> documentationUrl) { - this.documentationUrl = documentationUrl; - } - - public String getCodeRepositoryUrl() { - return codeRepositoryUrl; - } - - public void setCodeRepositoryUrl(String codeRepositoryUrl) { - this.codeRepositoryUrl = codeRepositoryUrl; - } - - public String getProgrammingLanguage() { - return programmingLanguage; - } - - public void setProgrammingLanguage(String programmingLanguage) { - this.programmingLanguage = programmingLanguage; - } - - public List<String> getContactperson() { - return contactperson; - } - - public void setContactperson(List<String> contactperson) { - this.contactperson = contactperson; - } - - public List<String> getContactgroup() { - return contactgroup; - } - - public void setContactgroup(List<String> contactgroup) { - this.contactgroup = contactgroup; - } - - public List<String> getTool() { - return tool; - } - - public void setTool(List<String> tool) { - this.tool = tool; - } - - public String getSize() { - return size; - } - - public void setSize(String size) { - this.size = size; - } - - public String getVersion() { - return version; - } - - public void setVersion(String version) { - this.version = version; - } - - public List<GeoLocation> getGeolocation() { - return geolocation; - } - - public void setGeolocation(List<GeoLocation> geolocation) { - this.geolocation = geolocation; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Subject.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Subject.java deleted file mode 100644 index 5c4bbef3c..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/Subject.java +++ /dev/null @@ -1,34 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf; - -import java.io.Serializable; - -/** - * To represent keywords associated to the result. It has two parameters: - subject of type - * eu.dnetlib.dhp.schema.dump.oaf.ControlledField to describe the subject. It is mapped as: - schema it corresponds to - * qualifier.classid of the dumped subject - value it corresponds to the subject value - provenance of type - * eu.dnetlib.dhp.schema.dump.oaf.Provenance to represent the provenance of the subject. It is dumped only if dataInfo - * is not null. 
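To make the long mapping above concrete, a hedged sketch populating a minimal Result; every value, including the COAR scheme URI, is illustrative rather than taken from the mapping code:

    import java.util.Arrays;

    import eu.dnetlib.dhp.schema.dump.oaf.AccessRight;
    import eu.dnetlib.dhp.schema.dump.oaf.Qualifier;
    import eu.dnetlib.dhp.schema.dump.oaf.Result;

    public class ResultSketch {
        public static void main(String[] args) {
            Result r = new Result();
            r.setId("50|doi_________::abc"); // OpenAIRE id (invented)
            r.setType("publication");        // resulttype.classname
            r.setMaintitle("A sample title");
            r.setLanguage(Qualifier.newInstance("eng", "English"));
            r.setBestaccessright(
                AccessRight.newInstance("c_abf2", "OPEN", "http://vocabularies.coar-repositories.org/documentation/access_rights/"));
            r.setDescription(Arrays.asList("A short abstract."));
            r.setPublicationdate("2020-01-01");
        }
    }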
In this case: - provenance corresponds to dataInfo.provenanceaction.classname - trust corresponds to - * dataInfo.trust - */ -public class Subject implements Serializable { - private ControlledField subject; - private Provenance provenance; - - public ControlledField getSubject() { - return subject; - } - - public void setSubject(ControlledField subject) { - this.subject = subject; - } - - public Provenance getProvenance() { - return provenance; - } - - public void setProvenance(Provenance provenance) { - this.provenance = provenance; - } - -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/community/CommunityResult.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/community/CommunityResult.java deleted file mode 100644 index 8c748e103..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/community/CommunityResult.java +++ /dev/null @@ -1,51 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.community; - -import java.util.List; - -import eu.dnetlib.dhp.schema.dump.oaf.KeyValue; -import eu.dnetlib.dhp.schema.dump.oaf.Result; - -/** - * extends eu.dnetlib.dhp.schema.dump.oaf.Result with the following parameters: - projects of type - * List<eu.dnetlib.dhp.schema.dump.oaf.community.Project> to store the list of projects related to the result. The - * information is added after the result is mapped to the external model - context of type - * List<eu.dnetlib.dhp.schema.dump.oaf.community.Context> to store information about the RC RI related to the result. - * For each context in the result represented in the internal model one context in the external model is produced - - * collectedfrom of type List<eu.dnetlib.dhp.schema.dump.oaf.KeyValue> to store information about the sources from which - * the record has been collected. For each collectedfrom in the result represented in the internal model one - * collectedfrom in the external model is produced - */ -public class CommunityResult extends Result { - - private List<Project> projects; - - private List<Context> context; - - protected List<KeyValue> collectedfrom; - - public List<KeyValue> getCollectedfrom() { - return collectedfrom; - } - - public void setCollectedfrom(List<KeyValue> collectedfrom) { - this.collectedfrom = collectedfrom; - } - - public List<Project> getProjects() { - return projects; - } - - public void setProjects(List<Project> projects) { - this.projects = projects; - } - - public List<Context> getContext() { - return context; - } - - public void setContext(List<Context> context) { - this.context = context; - } - -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/community/Context.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/community/Context.java deleted file mode 100644 index 3ad692b30..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/community/Context.java +++ /dev/null @@ -1,40 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.community; - -import java.util.List; -import java.util.Objects; - -import eu.dnetlib.dhp.schema.dump.oaf.Provenance; -import eu.dnetlib.dhp.schema.dump.oaf.Qualifier; - -/** - * Reference to a relevant research infrastructure, initiative or community (RI/RC) among those collaborating with - * OpenAIRE. It extends eu.dnetlib.dhp.schema.dump.oaf.Qualifier with a parameter provenance of type - * List<eu.dnetlib.dhp.schema.dump.oaf.Provenance> to store the provenances of the association between the result and - * the RC/RI. The values for this element correspond to: - code: it corresponds to the id of the context in the result - * to be mapped. If the context id refers to a RC/RI and contains '::' only the part of the id before the first "::" - * will be used as value for code - label it corresponds to the label associated to the id. The information is taken 
The information id taken - * from the profile of the RC/RI - provenance it is set only if the dataInfo associated to the contenxt element of the - * result to be dumped is not null. For each dataInfo one instance of type eu.dnetlib.dhp.schema.dump.oaf.Provenance is - * instantiated if the element datainfo.provenanceaction is not null. In this case - provenance corresponds to - * dataInfo.provenanceaction.classname - trust corresponds to dataInfo.trust - */ -public class Context extends Qualifier { - private List provenance; - - public List getProvenance() { - return provenance; - } - - public void setProvenance(List provenance) { - this.provenance = provenance; - } - - @Override - public int hashCode() { - String provenance = new String(); - this.provenance.forEach(p -> provenance.concat(p.toString())); - return Objects.hash(getCode(), getLabel(), provenance); - } - -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/community/Funder.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/community/Funder.java deleted file mode 100644 index b795fd100..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/community/Funder.java +++ /dev/null @@ -1,52 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.community; - -import java.io.Serializable; - -/** - * To store information about the funder funding the project related to the result. It has the following parameters: - - * shortName of type String to store the funder short name (e.c. AKA). - name of type String to store the funder name - * (e.c. Akademy of Finland) - fundingStream of type String to store the funding stream - jurisdiction of type String to - * store the jurisdiction of the funder - */ -public class Funder implements Serializable { - private String shortName; - - private String name; - - private String fundingStream; - - private String jurisdiction; - - public String getJurisdiction() { - return jurisdiction; - } - - public void setJurisdiction(String jurisdiction) { - this.jurisdiction = jurisdiction; - } - - public String getShortName() { - return shortName; - } - - public void setShortName(String shortName) { - this.shortName = shortName; - } - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public String getFundingStream() { - return fundingStream; - } - - public void setFundingStream(String fundingStream) { - this.fundingStream = fundingStream; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/community/Project.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/community/Project.java deleted file mode 100644 index 7e23a1311..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/community/Project.java +++ /dev/null @@ -1,88 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.community; - -import java.io.Serializable; - -import eu.dnetlib.dhp.schema.dump.oaf.Provenance; - -/** - * To store information about the project related to the result. This information is not directly mapped from the result - * represented in the internal model because it is not there. The mapped result will be enriched with project - * information derived by relation between results and projects. 
Project class has the following parameters: - id of - * type String to store the OpenAIRE id for the Project - code of type String to store the grant agreement - acronym of - * type String to store the acronym for the project - title of type String to store the title of the project - funder of - * type eu.dnetlib.dhp.schema.dump.oaf.community.Funder to store information about the funder funding the project - - * provenance of type eu.dnetlib.dhp.schema.dump.oaf.Provenance to store information about the provenance of the - * association between the result and the project - */ -public class Project implements Serializable { - - private String id;// OpenAIRE id - private String code; - - private String acronym; - - private String title; - - private Funder funder; - - private Provenance provenance; - - public Provenance getProvenance() { - return provenance; - } - - public void setProvenance(Provenance provenance) { - this.provenance = provenance; - } - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public String getCode() { - return code; - } - - public void setCode(String code) { - this.code = code; - } - - public String getAcronym() { - return acronym; - } - - public void setAcronym(String acronym) { - this.acronym = acronym; - } - - public String getTitle() { - return title; - } - - public void setTitle(String title) { - this.title = title; - } - - public Funder getFunder() { - return funder; - } - - public void setFunder(Funder funders) { - this.funder = funders; - } - - public static Project newInstance(String id, String code, String acronym, String title, Funder funder) { - Project project = new Project(); - project.setAcronym(acronym); - project.setCode(code); - project.setFunder(funder); - project.setId(id); - project.setTitle(title); - return project; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Constants.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Constants.java deleted file mode 100644 index 35cc60c1c..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Constants.java +++ /dev/null @@ -1,21 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.graph; - -import java.io.Serializable; - -public class Constants implements Serializable { - // collectedFrom goes with isProvidedBy -> taken from ModelSupport - - public static final String HOSTED_BY = "isHostedBy"; - public static final String HOSTS = "hosts"; - - // community result uses isrelatedto - - public static final String RESULT_ENTITY = "result"; - public static final String DATASOURCE_ENTITY = "datasource"; - public static final String CONTEXT_ENTITY = "context"; - - public static final String CONTEXT_ID = "60"; - public static final String CONTEXT_NS_PREFIX = "context____"; - -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Datasource.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Datasource.java deleted file mode 100644 index 6b2b7b1ab..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Datasource.java +++ /dev/null @@ -1,316 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.graph; - -import java.io.Serializable; -import java.util.List; - -import eu.dnetlib.dhp.schema.dump.oaf.Container; -import eu.dnetlib.dhp.schema.dump.oaf.ControlledField; -import eu.dnetlib.dhp.schema.dump.oaf.KeyValue; - -/** - * To store information about the datasource OpenAIRE collects information from. 
It contains the following parameters: - - * id of type String to store the OpenAIRE id for the datasource. It corresponds to the parameter id of the datasource - * represented in the internal model - originalId of type List<String> to store the list of original ids associated to - * the datasource. It corresponds to the parameter originalId of the datasource represented in the internal model. The - * null values are filtered out - pid of type List<eu.dnetlib.dhp.schema.dump.oaf.ControlledField> to store the - * persistent identifiers for the datasource. For each pid in the datasource represented in the internal model one pid - * in the external model is produced as: - schema corresponds to pid.qualifier.classid of the datasource represented in - * the internal model - value corresponds to pid.value of the datasource represented in the internal model - - * datasourcetype of type eu.dnetlib.dhp.schema.dump.oaf.ControlledField to store the datasource type (e.g. - * pubsrepository::institutional, Institutional Repository) as in the dnet vocabulary dnet:datasource_typologies. It - * corresponds to datasourcetype of the datasource represented in the internal model and: - code corresponds to - * datasourcetype.classid - value corresponds to datasourcetype.classname - openairecompatibility of type String to - * store information about the OpenAIRE compatibility of the ingested results (which guidelines they are compliant to). - * It corresponds to openairecompatibility.classname of the datasource represented in the internal model - officialname - * of type String to store the official name of the datasource. It corresponds to officialname.value of the datasource - * represented in the internal model - englishname of type String to store the English name of the datasource. It - * corresponds to englishname.value of the datasource represented in the internal model - websiteurl of type String to - * store the URL of the website of the datasource. It corresponds to websiteurl.value of the datasource represented in - * the internal model - logourl of type String to store the URL of the logo for the datasource. It corresponds to - * logourl.value of the datasource represented in the internal model - dateofvalidation of type String to store the date - * of validation against the guidelines for the datasource records. It corresponds to dateofvalidation.value of the - * datasource represented in the internal model - description of type String to store the description for the - * datasource. 
It corresponds to description.value of the datasource represented in the internal model - */ -public class Datasource implements Serializable { - - private String id; // string - - private List<String> originalId; // list string - - private List<ControlledField> pid; // list - - private ControlledField datasourcetype; // value - - private String openairecompatibility; // value - - private String officialname; // string - - private String englishname; // string - - private String websiteurl; // string - - private String logourl; // string - - private String dateofvalidation; // string - - private String description; // description - - private List<String> subjects; // List<String> - - // opendoar specific fields (od*) - - private List<String> languages; // odlanguages List<String> - - private List<String> contenttypes; // odcontent types List<String> - - // re3data fields - private String releasestartdate; // string - - private String releaseenddate; // string - - private String missionstatementurl; // string - - // {open, restricted or closed} - private String accessrights; // databaseaccesstype string - - // {open, restricted or closed} - private String uploadrights; // datauploadtype string - - // {feeRequired, registration, other} - private String databaseaccessrestriction; // string - - // {feeRequired, registration, other} - private String datauploadrestriction; // string - - private Boolean versioning; // boolean - - private String citationguidelineurl; // string - - // {yes, no, unknown} - - private String pidsystems; // string - - private String certificates; // string - - private List<String> policies; // - - private Container journal; // issn etc of the Journal - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public List<String> getOriginalId() { - return originalId; - } - - public void setOriginalId(List<String> originalId) { - this.originalId = originalId; - } - - public List<ControlledField> getPid() { - return pid; - } - - public void setPid(List<ControlledField> pid) { - this.pid = pid; - } - - public ControlledField getDatasourcetype() { - return datasourcetype; - } - - public void setDatasourcetype(ControlledField datasourcetype) { - this.datasourcetype = datasourcetype; - } - - public String getOpenairecompatibility() { - return openairecompatibility; - } - - public void setOpenairecompatibility(String openairecompatibility) { - this.openairecompatibility = openairecompatibility; - } - - public String getOfficialname() { - return officialname; - } - - public void setOfficialname(String officialname) { - this.officialname = officialname; - } - - public String getEnglishname() { - return englishname; - } - - public void setEnglishname(String englishname) { - this.englishname = englishname; - } - - public String getWebsiteurl() { - return websiteurl; - } - - public void setWebsiteurl(String websiteurl) { - this.websiteurl = websiteurl; - } - - public String getLogourl() { - return logourl; - } - - public void setLogourl(String logourl) { - this.logourl = logourl; - } - - public String getDateofvalidation() { - return dateofvalidation; - } - - public void setDateofvalidation(String dateofvalidation) { - this.dateofvalidation = dateofvalidation; - } - - public String getDescription() { - return description; - } - - public void setDescription(String description) { - this.description = description; - } - - public List<String> getSubjects() { - return subjects; - } - - public void setSubjects(List<String> subjects) { - this.subjects = subjects; - } - - public List<String> getLanguages() { - return languages; - } - - public void setLanguages(List<String> languages) { - this.languages = 
languages; - } - - public List getContenttypes() { - return contenttypes; - } - - public void setContenttypes(List contenttypes) { - this.contenttypes = contenttypes; - } - - public String getReleasestartdate() { - return releasestartdate; - } - - public void setReleasestartdate(String releasestartdate) { - this.releasestartdate = releasestartdate; - } - - public String getReleaseenddate() { - return releaseenddate; - } - - public void setReleaseenddate(String releaseenddate) { - this.releaseenddate = releaseenddate; - } - - public String getMissionstatementurl() { - return missionstatementurl; - } - - public void setMissionstatementurl(String missionstatementurl) { - this.missionstatementurl = missionstatementurl; - } - - public String getAccessrights() { - return accessrights; - } - - public void setAccessrights(String accessrights) { - this.accessrights = accessrights; - } - - public String getUploadrights() { - return uploadrights; - } - - public void setUploadrights(String uploadrights) { - this.uploadrights = uploadrights; - } - - public String getDatabaseaccessrestriction() { - return databaseaccessrestriction; - } - - public void setDatabaseaccessrestriction(String databaseaccessrestriction) { - this.databaseaccessrestriction = databaseaccessrestriction; - } - - public String getDatauploadrestriction() { - return datauploadrestriction; - } - - public void setDatauploadrestriction(String datauploadrestriction) { - this.datauploadrestriction = datauploadrestriction; - } - - public Boolean getVersioning() { - return versioning; - } - - public void setVersioning(Boolean versioning) { - this.versioning = versioning; - } - - public String getCitationguidelineurl() { - return citationguidelineurl; - } - - public void setCitationguidelineurl(String citationguidelineurl) { - this.citationguidelineurl = citationguidelineurl; - } - - public String getPidsystems() { - return pidsystems; - } - - public void setPidsystems(String pidsystems) { - this.pidsystems = pidsystems; - } - - public String getCertificates() { - return certificates; - } - - public void setCertificates(String certificates) { - this.certificates = certificates; - } - - public List getPolicies() { - return policies; - } - - public void setPolicies(List policiesr3) { - this.policies = policiesr3; - } - - public Container getJournal() { - return journal; - } - - public void setJournal(Container journal) { - this.journal = journal; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Funder.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Funder.java deleted file mode 100644 index 57d94f481..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Funder.java +++ /dev/null @@ -1,54 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.graph; - -import java.io.Serializable; - -/** - * To store information about the funder funding the project related to the result. It has the following parameters: - * - private String shortName to store the short name of the funder (e.g. AKA) - * - private String name to store information about the name of the funder (e.g. 
Academy of Finland) - * - private Fundings funding_stream to store the funding stream - * - private String jurisdiction to store information about the jurisdiction of the funder - */ -public class Funder implements Serializable { - - private String shortName; - - private String name; - - private Fundings funding_stream; - - private String jurisdiction; - - public String getShortName() { - return shortName; - } - - public void setShortName(String shortName) { - this.shortName = shortName; - } - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public String getJurisdiction() { - return jurisdiction; - } - - public void setJurisdiction(String jurisdiction) { - this.jurisdiction = jurisdiction; - } - - public Fundings getFunding_stream() { - return funding_stream; - } - - public void setFunding_stream(Fundings funding_stream) { - this.funding_stream = funding_stream; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Fundings.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Fundings.java deleted file mode 100644 index 173878ef0..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Fundings.java +++ /dev/null @@ -1,35 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.graph; - -import java.io.Serializable; - -/** - * To store information about the funding stream. It has two parameters: - * - private String id to store the id of the funding stream. The id is created by appending the shortname of the - * funder to the name of each level in the xml representing the funding stream. For example: if the funder is the - * European Commission, the funding level 0 name is FP7, the funding level 1 name is SP3 and the funding level 2 name is - * PEOPLE then the id will be: EC::FP7::SP3::PEOPLE - * - private String description to describe the funding stream. It is created by concatenating the description of each funding - * level so for the example above the description would be: SEVENTH FRAMEWORK PROGRAMME - SP3-People - Marie-Curie Actions - */ -public class Fundings implements Serializable { - - private String id; - private String description; - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public String getDescription() { - return description; - } - - public void setDescription(String description) { - this.description = description; - } -}
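A minimal sketch of how the Funder and Fundings classes above fit together, using the EC::FP7::SP3::PEOPLE example from the Fundings javadoc (the funder name and jurisdiction values are assumed for illustration):

Fundings stream = new Fundings();
stream.setId("EC::FP7::SP3::PEOPLE"); // funder short name plus one segment per funding level
stream.setDescription("SEVENTH FRAMEWORK PROGRAMME - SP3-People - Marie-Curie Actions");

Funder funder = new Funder();
funder.setShortName("EC");
funder.setName("European Commission"); // assumed funder name
funder.setJurisdiction("EU"); // assumed jurisdiction code
funder.setFunding_stream(stream);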
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Granted.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Granted.java deleted file mode 100644 index 1ac27ddf1..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Granted.java +++ /dev/null @@ -1,56 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.graph; - -import java.io.Serializable; -import java.util.Optional; - -/** - * To describe the funded amount. It has the following parameters: - * - private String currency to store the currency of the fund - * - private float totalcost to store the total cost of the project - * - private float fundedamount to store the amount funded by the funder - */ -public class Granted implements Serializable { - private String currency; - private float totalcost; - private float fundedamount; - - public String getCurrency() { - return currency; - } - - public void setCurrency(String currency) { - this.currency = currency; - } - - public float getTotalcost() { - return totalcost; - } - - public void setTotalcost(float totalcost) { - this.totalcost = totalcost; - } - - public float getFundedamount() { - return fundedamount; - } - - public void setFundedamount(float fundedamount) { - this.fundedamount = fundedamount; - } - - public static Granted newInstance(String currency, float totalcost, float fundedamount) { - Granted granted = new Granted(); - granted.currency = currency; - granted.totalcost = totalcost; - granted.fundedamount = fundedamount; - return granted; - } - - public static Granted newInstance(String currency, float fundedamount) { - Granted granted = new Granted(); - granted.currency = currency; - granted.fundedamount = fundedamount; - return granted; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/H2020Classification.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/H2020Classification.java deleted file mode 100644 index 4a61663b8..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/H2020Classification.java +++ /dev/null @@ -1,82 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.graph; - -import java.io.Serializable; - -/** - * To store information about the classification for the project. The classification depends on the programme. For example - * H2020-EU.3.4.5.3 can be classified as - * H2020-EU.3. => Societal Challenges (level1) - * H2020-EU.3.4. => Transport (level2) - * H2020-EU.3.4.5. => CLEANSKY2 (level3) - * H2020-EU.3.4.5.3. => IADP Fast Rotorcraft (level4) - * - * We decided to explicitly represent up to three levels in the classification.
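A hypothetical instantiation for the example above, using the newInstance factory defined below (the programme description and the rendering of the full classification string are assumptions made for illustration):

H2020Classification cls = H2020Classification
	.newInstance(
		"H2020-EU.3.4.5.3.", // programme code
		"IADP Fast Rotorcraft", // programme description (assumed)
		"Societal Challenges", // level1
		"Transport", // level2
		"CLEANSKY2", // level3
		"Societal Challenges | Transport | CLEANSKY2 | IADP Fast Rotorcraft"); // full classification, assumed format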
- * - * H2020Classification has the following parameters: - * - private Programme programme to store the information about the programme related to this classification - * - private String level1 to store the information about the level 1 of the classification (Priority or Pillar of the EC) - * - private String level2 to store the information about the level2 of the classification (Objectives (?)) - * - private String level3 to store the information about the level3 of the classification - * - private String classification to store the entire classification related to the programme - */ -public class H2020Classification implements Serializable { - private Programme programme; - - private String level1; - private String level2; - private String level3; - - private String classification; - - public Programme getProgramme() { - return programme; - } - - public void setProgramme(Programme programme) { - this.programme = programme; - } - - public String getLevel1() { - return level1; - } - - public void setLevel1(String level1) { - this.level1 = level1; - } - - public String getLevel2() { - return level2; - } - - public void setLevel2(String level2) { - this.level2 = level2; - } - - public String getLevel3() { - return level3; - } - - public void setLevel3(String level3) { - this.level3 = level3; - } - - public String getClassification() { - return classification; - } - - public void setClassification(String classification) { - this.classification = classification; - } - - public static H2020Classification newInstance(String programme_code, String programme_description, String level1, - String level2, String level3, String classification) { - H2020Classification h2020classification = new H2020Classification(); - h2020classification.programme = Programme.newInstance(programme_code, programme_description); - h2020classification.level1 = level1; - h2020classification.level2 = level2; - h2020classification.level3 = level3; - h2020classification.classification = classification; - return h2020classification; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Node.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Node.java deleted file mode 100644 index dac594451..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Node.java +++ /dev/null @@ -1,41 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.graph; - -import java.io.Serializable; - -/** - * To represent the generic node in a relation. It has the following parameters: - * - private String id the openaire id of the entity in the relation - * - private String type the type of the entity in the relation.
- * - * Consider the generic relation between a Result R and a Project P: the node representing R will have - * as id the id of R and as type result, while the node representing the project will have as id the id of the project - * and as type project - */ -public class Node implements Serializable { - private String id; - private String type; - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public String getType() { - return type; - } - - public void setType(String type) { - this.type = type; - } - - public static Node newInstance(String id, String type) { - Node node = new Node(); - node.id = id; - node.type = type; - return node; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Organization.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Organization.java deleted file mode 100644 index 579245c05..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Organization.java +++ /dev/null @@ -1,88 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.graph; - -import java.io.Serializable; -import java.util.List; - -import eu.dnetlib.dhp.schema.dump.oaf.ControlledField; -import eu.dnetlib.dhp.schema.dump.oaf.Country; -import eu.dnetlib.dhp.schema.dump.oaf.KeyValue; -import eu.dnetlib.dhp.schema.dump.oaf.Qualifier; -import eu.dnetlib.dhp.schema.dump.oaf.community.Project; - -/** - * To represent the generic organization. It has the following parameters: - * - private String legalshortname to store the legalshortname of the organization - * - private String legalname to store the legal name of the organization - * - private String websiteurl to store the websiteurl of the organization - * - private List alternativenames to store the alternative names of the organization - * - private Qualifier country to store the country of the organization - * - private String id to store the id of the organization - * - private List pid to store the list of pids for the organization - */ -public class Organization implements Serializable { - private String legalshortname; - private String legalname; - private String websiteurl; - private List alternativenames; - private Qualifier country; - private String id; - private List pid; - - public String getLegalshortname() { - return legalshortname; - } - - public void setLegalshortname(String legalshortname) { - this.legalshortname = legalshortname; - } - - public String getLegalname() { - return legalname; - } - - public void setLegalname(String legalname) { - this.legalname = legalname; - } - - public String getWebsiteurl() { - return websiteurl; - } - - public void setWebsiteurl(String websiteurl) { - this.websiteurl = websiteurl; - } - - public List getAlternativenames() { - return alternativenames; - } - - public void setAlternativenames(List alternativenames) { - this.alternativenames = alternativenames; - } - - public Qualifier getCountry() { - return country; - } - - public void setCountry(Qualifier country) { - this.country = country; - } - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public List getPid() { - return pid; - } - - public void setPid(List pid) { - this.pid = pid; - } - -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Programme.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Programme.java deleted file mode 100644 index 663ca25bc..000000000 ---
a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Programme.java +++ /dev/null @@ -1,37 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.graph; - -import java.io.Serializable; - -/** - * To store information about the EC programme for the project. It has the following parameters: - * - private String code to store the code of the programme - * - private String description to store the description of the programme - */ -public class Programme implements Serializable { - private String code; - private String description; - - public String getCode() { - return code; - } - - public void setCode(String code) { - this.code = code; - } - - public String getDescription() { - return description; - } - - public void setDescription(String description) { - this.description = description; - } - - public static Programme newInstance(String code, String description) { - Programme p = new Programme(); - p.code = code; - p.description = description; - return p; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Project.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Project.java deleted file mode 100644 index 054e4d2df..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Project.java +++ /dev/null @@ -1,192 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.graph; - -import java.io.Serializable; -import java.util.List; - -/** - * This is the class representing the Project in the model used for the dumps of the whole graph. At the moment the dump - * of the Projects differs from the other dumps because we do not create relations between Funders (Organization) and - * Projects but we put the information about the Funder within the Project representation. We also removed the - * collected from element from the Project. No relation between the Project and the Datasource entity from which it is - * collected will be created. We will never create relations between Project and Datasource. In case some relations are - * extracted from the Project, they will refer to the Funder and will be of type ( organization -> funds -> project, - * project -> isFundedBy -> organization ). We also removed the duration parameter because most of the time it is set to - * 0. It has the following parameters: - * - private String id to store the id of the project (OpenAIRE id) - * - private String websiteurl to store the websiteurl of the project - * - private String code to store the grant agreement of the project - * - private String acronym to store the acronym of the project - * - private String title to store the title of the project - * - private String startdate to store the start date - * - private String enddate to store the end date - * - private String callidentifier to store the call identifier - * - private String keywords to store the keywords - * - private boolean openaccessmandateforpublications to store if the project must comply with the open access mandate - * for publications. This value will be set to true if one of the fields in the project represented in the internal model - * is set to true - * - private boolean openaccessmandatefordataset to store if the project must comply with the open access mandate for - * dataset.
It is set to the value in the corresponding field of the project represented in the internal model - * - private List subject to store the list of subjects of the project - * - private List funding to store the list of funders of the project - * - private String summary to store the summary of the project - * - private Granted granted to store the granted amount - * - private List h2020classification to store the list of H2020 classifications the project is related to - */ - -public class Project implements Serializable { - private String id; - - private String websiteurl; - private String code; - private String acronym; - private String title; - private String startdate; - - private String enddate; - - private String callidentifier; - - private String keywords; - - private boolean openaccessmandateforpublications; - - private boolean openaccessmandatefordataset; - private List subject; - - private List funding; - - private String summary; - - private Granted granted; - - private List h2020Classifications; - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public String getWebsiteurl() { - return websiteurl; - } - - public void setWebsiteurl(String websiteurl) { - this.websiteurl = websiteurl; - } - - public String getCode() { - return code; - } - - public void setCode(String code) { - this.code = code; - } - - public String getAcronym() { - return acronym; - } - - public void setAcronym(String acronym) { - this.acronym = acronym; - } - - public String getTitle() { - return title; - } - - public void setTitle(String title) { - this.title = title; - } - - public String getStartdate() { - return startdate; - } - - public void setStartdate(String startdate) { - this.startdate = startdate; - } - - public String getEnddate() { - return enddate; - } - - public void setEnddate(String enddate) { - this.enddate = enddate; - } - - public String getCallidentifier() { - return callidentifier; - } - - public void setCallidentifier(String callidentifier) { - this.callidentifier = callidentifier; - } - - public String getKeywords() { - return keywords; - } - - public void setKeywords(String keywords) { - this.keywords = keywords; - } - - public boolean isOpenaccessmandateforpublications() { - return openaccessmandateforpublications; - } - - public void setOpenaccessmandateforpublications(boolean openaccessmandateforpublications) { - this.openaccessmandateforpublications = openaccessmandateforpublications; - } - - public boolean isOpenaccessmandatefordataset() { - return openaccessmandatefordataset; - } - - public void setOpenaccessmandatefordataset(boolean openaccessmandatefordataset) { - this.openaccessmandatefordataset = openaccessmandatefordataset; - } - - public List getSubject() { - return subject; - } - - public void setSubject(List subject) { - this.subject = subject; - } - - public List getFunding() { - return funding; - } - - public void setFunding(List funding) { - this.funding = funding; - } - - public String getSummary() { - return summary; - } - - public void setSummary(String summary) { - this.summary = summary; - } - - public Granted getGranted() { - return granted; - } - - public void setGranted(Granted granted) { - this.granted = granted; - } - - public List getH2020Classifications() { - return h2020Classifications; - } - - public void setH2020Classifications(List h2020Classifications) { - this.h2020Classifications = h2020Classifications; - } -}
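A minimal population sketch for the Project class above, reusing Granted.newInstance from this diff (all values are invented for illustration):

Project project = new Project();
project.setId("40|corda__h2020::1234567890abcdef"); // hypothetical OpenAIRE project id
project.setCode("101000000"); // hypothetical grant agreement
project.setAcronym("EXMPL");
project.setTitle("An example project");
project.setStartdate("2021-01-01");
project.setEnddate("2024-12-31");
project.setOpenaccessmandateforpublications(true);
project.setGranted(Granted.newInstance("EUR", 3000000f, 2500000f)); // currency, total cost, funded amount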
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/RelType.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/RelType.java deleted file mode 100644 index 83ae2dda6..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/RelType.java +++ /dev/null @@ -1,40 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.graph; - -import java.io.Serializable; - -/** - * To represent the semantics of the generic relation between two entities. It has the following parameters: - * - private String name to store the semantics of the relation (i.e. isAuthorInstitutionOf). It corresponds to the - * relclass parameter in the relation represented in the internal model - * - private String type to store the type of the relation (i.e. affiliation). It corresponds to the subreltype parameter - * of the relation represented in the internal model - */ -public class RelType implements Serializable { - private String name; // relclass - private String type; // subreltype - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public String getType() { - return type; - } - - public void setType(String type) { - this.type = type; - } - - public static RelType newInstance(String name, String type) { - RelType rel = new RelType(); - rel.name = name; - rel.type = type; - return rel; - } -}
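Node (above), RelType, and the Relation class in the next file combine as follows; a hypothetical sketch (the identifiers and relation semantics are invented, and Provenance, imported from eu.dnetlib.dhp.schema.dump.oaf, is left unpopulated):

Node source = Node.newInstance("50|doi_________::a1b2c3", "result"); // hypothetical result id
Node target = Node.newInstance("40|corda__h2020::d4e5f6", "project"); // hypothetical project id
RelType semantics = RelType.newInstance("isProducedBy", "outcome"); // relclass / subreltype, illustrative values
Provenance provenance = new Provenance(); // population omitted; the class is defined elsewhere in the dump schema
Relation relation = Relation.newInstance(source, target, semantics, provenance);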
diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Relation.java deleted file mode 100644 index 4b88eb6c2..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/Relation.java +++ /dev/null @@ -1,68 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.graph; - -import java.io.Serializable; -import java.util.Objects; - -import eu.dnetlib.dhp.schema.dump.oaf.Provenance; - -/** - * To represent the generic relation between two entities. It has the following parameters: - * - private Node source to represent the entity source of the relation - * - private Node target to represent the entity target of the relation - * - private RelType reltype to represent the semantics of the relation - * - private Provenance provenance to represent the provenance of the relation - */ -public class Relation implements Serializable { - private Node source; - private Node target; - private RelType reltype; - private Provenance provenance; - - public Node getSource() { - return source; - } - - public void setSource(Node source) { - this.source = source; - } - - public Node getTarget() { - return target; - } - - public void setTarget(Node target) { - this.target = target; - } - - public RelType getReltype() { - return reltype; - } - - public void setReltype(RelType reltype) { - this.reltype = reltype; - } - - public Provenance getProvenance() { - return provenance; - } - - public void setProvenance(Provenance provenance) { - this.provenance = provenance; - } - - @Override - public int hashCode() { - - return Objects.hash(source.getId(), target.getId(), reltype.getType() + ":" + reltype.getName()); - } - - public static Relation newInstance(Node source, Node target, RelType reltype, Provenance provenance) { - Relation relation = new Relation(); - relation.source = source; - relation.target = target; - relation.reltype = reltype; - relation.provenance = provenance; - return relation; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/ResearchCommunity.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/ResearchCommunity.java deleted file mode 100644 index 026042ce9..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/ResearchCommunity.java +++ /dev/null @@ -1,20 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.graph; - -import java.util.List; - -/** - * To represent RC entities. It extends eu.dnetlib.dhp.schema.dump.oaf.graph.ResearchInitiative by adding the parameter subject - * to store the list of subjects related to the community - */ -public class ResearchCommunity extends ResearchInitiative { - private List subject; - - public List getSubject() { - return subject; - } - - public void setSubject(List subject) { - this.subject = subject; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/ResearchInitiative.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/ResearchInitiative.java deleted file mode 100644 index 6646fd541..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/dump/oaf/graph/ResearchInitiative.java +++ /dev/null @@ -1,71 +0,0 @@ - -package eu.dnetlib.dhp.schema.dump.oaf.graph; - -import java.io.Serializable; - -/** - * To represent an entity of type RC/RI. It has the following parameters, which are mostly derived from the profile - * - private String id to store the openaire id for the entity. It has 00 as code and will be created as - * 00|context_____::md5(originalId) - * private String originalId to store the id of the context as provided in the profile (i.e.
mes) - * private String name to store the name of the context (got from the label attribute in the context definition) - * private String type to store the type of the context (i.e.: research initiative or research community) - * private String description to store the description of the context as given in the profile - * private String zenodo_community to store the zenodo community associated to the context (main zenodo community) - */ -public class ResearchInitiative implements Serializable { - private String id; // openaireId - private String originalId; // context id - private String name; // context name - private String type; // context type: research initiative or research community - private String description; - private String zenodo_community; - - public String getZenodo_community() { - return zenodo_community; - } - - public void setZenodo_community(String zenodo_community) { - this.zenodo_community = zenodo_community; - } - - public String getType() { - return type; - } - - public void setType(String type) { - this.type = type; - } - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public String getName() { - return name; - } - - public void setName(String label) { - this.name = label; - } - - public String getOriginalId() { - return originalId; - } - - public void setOriginalId(String originalId) { - this.originalId = originalId; - } - - public String getDescription() { - return description; - } - - public void setDescription(String description) { - this.description = description; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java deleted file mode 100644 index 231fb1e60..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Author.java +++ /dev/null @@ -1,89 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.*; - -public class Author implements Serializable { - - private String fullname; - - private String name; - - private String surname; - - private Integer rank; - - private List pid; - - private List> affiliation; - - public String getFullname() { - return fullname; - } - - public void setFullname(String fullname) { - this.fullname = fullname; - } - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public String getSurname() { - return surname; - } - - public void setSurname(String surname) { - this.surname = surname; - } - - public Integer getRank() { - return rank; - } - - public void setRank(Integer rank) { - this.rank = rank; - } - - public List getPid() { - return pid; - } - - public void setPid(List pid) { - this.pid = pid; - } - - public List> getAffiliation() { - return affiliation; - } - - public void setAffiliation(List> affiliation) { - this.affiliation = affiliation; - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - Author author = (Author) o; - return Objects.equals(fullname, author.fullname) - && Objects.equals(name, author.name) - && Objects.equals(surname, author.surname) - && Objects.equals(rank, author.rank) - && Objects.equals(pid, author.pid) - && Objects.equals(affiliation, author.affiliation); - } - - @Override - public int hashCode() { - return Objects.hash(fullname, name, surname, rank, pid, affiliation); - } - -} diff --git 
a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java deleted file mode 100644 index 57912c463..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java +++ /dev/null @@ -1,46 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.List; - -public class Context implements Serializable { - private String id; - - private List dataInfo; - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public List getDataInfo() { - return dataInfo; - } - - public void setDataInfo(List dataInfo) { - this.dataInfo = dataInfo; - } - - @Override - public int hashCode() { - return id == null ? 0 : id.hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - - Context other = (Context) obj; - - return id.equals(other.getId()); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java deleted file mode 100644 index e25fdcade..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Country.java +++ /dev/null @@ -1,34 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.util.Objects; - -public class Country extends Qualifier { - - private DataInfo dataInfo; - - public DataInfo getDataInfo() { - return dataInfo; - } - - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - if (!super.equals(o)) - return false; - Country country = (Country) o; - return Objects.equals(dataInfo, country.dataInfo); - } - - @Override - public int hashCode() { - return Objects.hash(super.hashCode(), dataInfo); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/DataInfo.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/DataInfo.java deleted file mode 100644 index 9d572ee30..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/DataInfo.java +++ /dev/null @@ -1,85 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.Objects; - -public class DataInfo implements Serializable { - - private Boolean invisible = false; - private Boolean inferred; - private Boolean deletedbyinference = false; - private String trust; - private String inferenceprovenance; - private Qualifier provenanceaction; - - public Boolean getInvisible() { - return invisible; - } - - public void setInvisible(Boolean invisible) { - this.invisible = invisible; - } - - public Boolean getInferred() { - return inferred; - } - - public void setInferred(Boolean inferred) { - this.inferred = inferred; - } - - public Boolean getDeletedbyinference() { - return deletedbyinference; - } - - public void setDeletedbyinference(Boolean deletedbyinference) { - this.deletedbyinference = deletedbyinference; - } - - public String getTrust() { - return trust; - } - - public void setTrust(String trust) { - this.trust = trust; - } - - public String getInferenceprovenance() { - return inferenceprovenance; - } - - public void setInferenceprovenance(String inferenceprovenance) { - this.inferenceprovenance = inferenceprovenance; - } - - public Qualifier getProvenanceaction() { - return provenanceaction; - } - - public 
void setProvenanceaction(Qualifier provenanceaction) { - this.provenanceaction = provenanceaction; - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - DataInfo dataInfo = (DataInfo) o; - return Objects.equals(invisible, dataInfo.invisible) - && Objects.equals(inferred, dataInfo.inferred) - && Objects.equals(deletedbyinference, dataInfo.deletedbyinference) - && Objects.equals(trust, dataInfo.trust) - && Objects.equals(inferenceprovenance, dataInfo.inferenceprovenance) - && Objects.equals(provenanceaction, dataInfo.provenanceaction); - } - - @Override - public int hashCode() { - return Objects - .hash( - invisible, inferred, deletedbyinference, trust, inferenceprovenance, provenanceaction); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Dataset.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Dataset.java deleted file mode 100644 index b5587c6b7..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Dataset.java +++ /dev/null @@ -1,116 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.List; - -import eu.dnetlib.dhp.schema.common.ModelConstants; - -public class Dataset extends Result implements Serializable { - - private Field storagedate; - - // candidate for removal - private Field device; - - private Field size; - - private Field version; - - private Field lastmetadataupdate; - - private Field metadataversionnumber; - - private List geolocation; - - public Dataset() { - setResulttype(ModelConstants.DATASET_DEFAULT_RESULTTYPE); - } - - public Field getStoragedate() { - return storagedate; - } - - public void setStoragedate(Field storagedate) { - this.storagedate = storagedate; - } - - public Field getDevice() { - return device; - } - - public void setDevice(Field device) { - this.device = device; - } - - public Field getSize() { - return size; - } - - public void setSize(Field size) { - this.size = size; - } - - public Field getVersion() { - return version; - } - - public void setVersion(Field version) { - this.version = version; - } - - public Field getLastmetadataupdate() { - return lastmetadataupdate; - } - - public void setLastmetadataupdate(Field lastmetadataupdate) { - this.lastmetadataupdate = lastmetadataupdate; - } - - public Field getMetadataversionnumber() { - return metadataversionnumber; - } - - public void setMetadataversionnumber(Field metadataversionnumber) { - this.metadataversionnumber = metadataversionnumber; - } - - public List getGeolocation() { - return geolocation; - } - - public void setGeolocation(List geolocation) { - this.geolocation = geolocation; - } - - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - - if (!Dataset.class.isAssignableFrom(e.getClass())) { - return; - } - - final Dataset d = (Dataset) e; - - storagedate = d.getStoragedate() != null && compareTrust(this, e) < 0 ? d.getStoragedate() : storagedate; - - device = d.getDevice() != null && compareTrust(this, e) < 0 ? d.getDevice() : device; - - size = d.getSize() != null && compareTrust(this, e) < 0 ? d.getSize() : size; - - version = d.getVersion() != null && compareTrust(this, e) < 0 ? d.getVersion() : version; - - lastmetadataupdate = d.getLastmetadataupdate() != null && compareTrust(this, e) < 0 - ? d.getLastmetadataupdate() - : lastmetadataupdate; - - metadataversionnumber = d.getMetadataversionnumber() != null && compareTrust(this, e) < 0 - ? 
d.getMetadataversionnumber() - : metadataversionnumber; - - geolocation = mergeLists(geolocation, d.getGeolocation()); - - mergeOAFDataInfo(d); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java deleted file mode 100644 index 721798206..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Datasource.java +++ /dev/null @@ -1,472 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.List; - -public class Datasource extends OafEntity implements Serializable { - - private Qualifier datasourcetype; - - private Qualifier openairecompatibility; - - private Field officialname; - - private Field englishname; - - private Field websiteurl; - - private Field logourl; - - private Field contactemail; - - private Field namespaceprefix; - - private Field latitude; - - private Field longitude; - - private Field dateofvalidation; - - private Field description; - - private List subjects; - - // opendoar specific fields (od*) - private Field odnumberofitems; - - private Field odnumberofitemsdate; - - private Field odpolicies; - - private List> odlanguages; - - private List> odcontenttypes; - - private List> accessinfopackage; - - // re3data fields - private Field releasestartdate; - - private Field releaseenddate; - - private Field missionstatementurl; - - private Field dataprovider; - - private Field serviceprovider; - - // {open, restricted or closed} - private Field databaseaccesstype; - - // {open, restricted or closed} - private Field datauploadtype; - - // {feeRequired, registration, other} - private Field databaseaccessrestriction; - - // {feeRequired, registration, other} - private Field datauploadrestriction; - - private Field versioning; - - private Field citationguidelineurl; - - // {yes, no, unknown} - private Field qualitymanagementkind; - - private Field pidsystems; - - private Field certificates; - - private List policies; - - private Journal journal; - - public Qualifier getDatasourcetype() { - return datasourcetype; - } - - public void setDatasourcetype(Qualifier datasourcetype) { - this.datasourcetype = datasourcetype; - } - - public Qualifier getOpenairecompatibility() { - return openairecompatibility; - } - - public void setOpenairecompatibility(Qualifier openairecompatibility) { - this.openairecompatibility = openairecompatibility; - } - - public Field getOfficialname() { - return officialname; - } - - public void setOfficialname(Field officialname) { - this.officialname = officialname; - } - - public Field getEnglishname() { - return englishname; - } - - public void setEnglishname(Field englishname) { - this.englishname = englishname; - } - - public Field getWebsiteurl() { - return websiteurl; - } - - public void setWebsiteurl(Field websiteurl) { - this.websiteurl = websiteurl; - } - - public Field getLogourl() { - return logourl; - } - - public void setLogourl(Field logourl) { - this.logourl = logourl; - } - - public Field getContactemail() { - return contactemail; - } - - public void setContactemail(Field contactemail) { - this.contactemail = contactemail; - } - - public Field getNamespaceprefix() { - return namespaceprefix; - } - - public void setNamespaceprefix(Field namespaceprefix) { - this.namespaceprefix = namespaceprefix; - } - - public Field getLatitude() { - return latitude; - } - - public void setLatitude(Field latitude) { - this.latitude = latitude; - } - - public Field getLongitude() { - return longitude; - } - - 
public void setLongitude(Field longitude) { - this.longitude = longitude; - } - - public Field getDateofvalidation() { - return dateofvalidation; - } - - public void setDateofvalidation(Field dateofvalidation) { - this.dateofvalidation = dateofvalidation; - } - - public Field getDescription() { - return description; - } - - public void setDescription(Field description) { - this.description = description; - } - - public List getSubjects() { - return subjects; - } - - public void setSubjects(List subjects) { - this.subjects = subjects; - } - - public Field getOdnumberofitems() { - return odnumberofitems; - } - - public void setOdnumberofitems(Field odnumberofitems) { - this.odnumberofitems = odnumberofitems; - } - - public Field getOdnumberofitemsdate() { - return odnumberofitemsdate; - } - - public void setOdnumberofitemsdate(Field odnumberofitemsdate) { - this.odnumberofitemsdate = odnumberofitemsdate; - } - - public Field getOdpolicies() { - return odpolicies; - } - - public void setOdpolicies(Field odpolicies) { - this.odpolicies = odpolicies; - } - - public List> getOdlanguages() { - return odlanguages; - } - - public void setOdlanguages(List> odlanguages) { - this.odlanguages = odlanguages; - } - - public List> getOdcontenttypes() { - return odcontenttypes; - } - - public void setOdcontenttypes(List> odcontenttypes) { - this.odcontenttypes = odcontenttypes; - } - - public List> getAccessinfopackage() { - return accessinfopackage; - } - - public void setAccessinfopackage(List> accessinfopackage) { - this.accessinfopackage = accessinfopackage; - } - - public Field getReleasestartdate() { - return releasestartdate; - } - - public void setReleasestartdate(Field releasestartdate) { - this.releasestartdate = releasestartdate; - } - - public Field getReleaseenddate() { - return releaseenddate; - } - - public void setReleaseenddate(Field releaseenddate) { - this.releaseenddate = releaseenddate; - } - - public Field getMissionstatementurl() { - return missionstatementurl; - } - - public void setMissionstatementurl(Field missionstatementurl) { - this.missionstatementurl = missionstatementurl; - } - - public Field getDataprovider() { - return dataprovider; - } - - public void setDataprovider(Field dataprovider) { - this.dataprovider = dataprovider; - } - - public Field getServiceprovider() { - return serviceprovider; - } - - public void setServiceprovider(Field serviceprovider) { - this.serviceprovider = serviceprovider; - } - - public Field getDatabaseaccesstype() { - return databaseaccesstype; - } - - public void setDatabaseaccesstype(Field databaseaccesstype) { - this.databaseaccesstype = databaseaccesstype; - } - - public Field getDatauploadtype() { - return datauploadtype; - } - - public void setDatauploadtype(Field datauploadtype) { - this.datauploadtype = datauploadtype; - } - - public Field getDatabaseaccessrestriction() { - return databaseaccessrestriction; - } - - public void setDatabaseaccessrestriction(Field databaseaccessrestriction) { - this.databaseaccessrestriction = databaseaccessrestriction; - } - - public Field getDatauploadrestriction() { - return datauploadrestriction; - } - - public void setDatauploadrestriction(Field datauploadrestriction) { - this.datauploadrestriction = datauploadrestriction; - } - - public Field getVersioning() { - return versioning; - } - - public void setVersioning(Field versioning) { - this.versioning = versioning; - } - - public Field getCitationguidelineurl() { - return citationguidelineurl; - } - - public void setCitationguidelineurl(Field 
citationguidelineurl) { - this.citationguidelineurl = citationguidelineurl; - } - - public Field getQualitymanagementkind() { - return qualitymanagementkind; - } - - public void setQualitymanagementkind(Field qualitymanagementkind) { - this.qualitymanagementkind = qualitymanagementkind; - } - - public Field getPidsystems() { - return pidsystems; - } - - public void setPidsystems(Field pidsystems) { - this.pidsystems = pidsystems; - } - - public Field getCertificates() { - return certificates; - } - - public void setCertificates(Field certificates) { - this.certificates = certificates; - } - - public List getPolicies() { - return policies; - } - - public void setPolicies(List policies) { - this.policies = policies; - } - - public Journal getJournal() { - return journal; - } - - public void setJournal(Journal journal) { - this.journal = journal; - } - - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - - if (!Datasource.class.isAssignableFrom(e.getClass())) { - return; - } - - Datasource d = (Datasource) e; - - datasourcetype = d.getDatasourcetype() != null && compareTrust(this, e) < 0 - ? d.getDatasourcetype() - : datasourcetype; - openairecompatibility = d.getOpenairecompatibility() != null && compareTrust(this, e) < 0 - ? d.getOpenairecompatibility() - : openairecompatibility; - officialname = d.getOfficialname() != null && compareTrust(this, e) < 0 - ? d.getOfficialname() - : officialname; - englishname = d.getEnglishname() != null && compareTrust(this, e) < 0 ? d.getEnglishname() : englishname; - websiteurl = d.getWebsiteurl() != null && compareTrust(this, e) < 0 ? d.getWebsiteurl() : websiteurl; - logourl = d.getLogourl() != null && compareTrust(this, e) < 0 ? d.getLogourl() : logourl; - contactemail = d.getContactemail() != null && compareTrust(this, e) < 0 - ? d.getContactemail() - : contactemail; - namespaceprefix = d.getNamespaceprefix() != null && compareTrust(this, e) < 0 - ? d.getNamespaceprefix() - : namespaceprefix; - latitude = d.getLatitude() != null && compareTrust(this, e) < 0 ? d.getLatitude() : latitude; - longitude = d.getLongitude() != null && compareTrust(this, e) < 0 ? d.getLongitude() : longitude; - dateofvalidation = d.getDateofvalidation() != null && compareTrust(this, e) < 0 - ? d.getDateofvalidation() - : dateofvalidation; - description = d.getDescription() != null && compareTrust(this, e) < 0 ? d.getDescription() : description; - subjects = mergeLists(subjects, d.getSubjects()); - - // opendoar specific fields (od*) - odnumberofitems = d.getOdnumberofitems() != null && compareTrust(this, e) < 0 - ? d.getOdnumberofitems() - : odnumberofitems; - odnumberofitemsdate = d.getOdnumberofitemsdate() != null && compareTrust(this, e) < 0 - ? d.getOdnumberofitemsdate() - : odnumberofitemsdate; - odpolicies = d.getOdpolicies() != null && compareTrust(this, e) < 0 ? d.getOdpolicies() : odpolicies; - odlanguages = mergeLists(odlanguages, d.getOdlanguages()); - odcontenttypes = mergeLists(odcontenttypes, d.getOdcontenttypes()); - accessinfopackage = mergeLists(accessinfopackage, d.getAccessinfopackage()); - - // re3data fields - releasestartdate = d.getReleasestartdate() != null && compareTrust(this, e) < 0 - ? d.getReleasestartdate() - : releasestartdate; - releaseenddate = d.getReleaseenddate() != null && compareTrust(this, e) < 0 - ? d.getReleaseenddate() - : releaseenddate; - missionstatementurl = d.getMissionstatementurl() != null && compareTrust(this, e) < 0 - ?
d.getMissionstatementurl() - : missionstatementurl; - dataprovider = d.getDataprovider() != null && compareTrust(this, e) < 0 - ? d.getDataprovider() - : dataprovider; - serviceprovider = d.getServiceprovider() != null && compareTrust(this, e) < 0 - ? d.getServiceprovider() - : serviceprovider; - - // {open, restricted or closed} - databaseaccesstype = d.getDatabaseaccesstype() != null && compareTrust(this, e) < 0 - ? d.getDatabaseaccesstype() - : databaseaccesstype; - - // {open, restricted or closed} - datauploadtype = d.getDatauploadtype() != null && compareTrust(this, e) < 0 - ? d.getDatauploadtype() - : datauploadtype; - - // {feeRequired, registration, other} - databaseaccessrestriction = d.getDatabaseaccessrestriction() != null && compareTrust(this, e) < 0 - ? d.getDatabaseaccessrestriction() - : databaseaccessrestriction; - - // {feeRequired, registration, other} - datauploadrestriction = d.getDatauploadrestriction() != null && compareTrust(this, e) < 0 - ? d.getDatauploadrestriction() - : datauploadrestriction; - - versioning = d.getVersioning() != null && compareTrust(this, e) < 0 ? d.getVersioning() : versioning; - citationguidelineurl = d.getCitationguidelineurl() != null && compareTrust(this, e) < 0 - ? d.getCitationguidelineurl() - : citationguidelineurl; - - // {yes, no, unknown} - qualitymanagementkind = d.getQualitymanagementkind() != null && compareTrust(this, e) < 0 - ? d.getQualitymanagementkind() - : qualitymanagementkind; - pidsystems = d.getPidsystems() != null && compareTrust(this, e) < 0 ? d.getPidsystems() : pidsystems; - - certificates = d.getCertificates() != null && compareTrust(this, e) < 0 - ? d.getCertificates() - : certificates; - - policies = mergeLists(policies, d.getPolicies()); - - journal = d.getJournal() != null && compareTrust(this, e) < 0 ? d.getJournal() : journal; - - mergeOAFDataInfo(e); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExternalReference.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExternalReference.java deleted file mode 100644 index d509b954e..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExternalReference.java +++ /dev/null @@ -1,119 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.Objects; - -public class ExternalReference implements Serializable { - // source - private String sitename; - - // title - private String label; - - // text() - private String url; - - // ?? not mapped yet ?? 
- private String description; - - // type - private Qualifier qualifier; - - // site internal identifier - private String refidentifier; - - // maps the oaf:reference/@query attribute - private String query; - - // ExternalReferences might be also inferred - private DataInfo dataInfo; - - public String getSitename() { - return sitename; - } - - public void setSitename(String sitename) { - this.sitename = sitename; - } - - public String getLabel() { - return label; - } - - public void setLabel(String label) { - this.label = label; - } - - public String getUrl() { - return url; - } - - public void setUrl(String url) { - this.url = url; - } - - public String getDescription() { - return description; - } - - public void setDescription(String description) { - this.description = description; - } - - public Qualifier getQualifier() { - return qualifier; - } - - public void setQualifier(Qualifier qualifier) { - this.qualifier = qualifier; - } - - public String getRefidentifier() { - return refidentifier; - } - - public void setRefidentifier(String refidentifier) { - this.refidentifier = refidentifier; - } - - public String getQuery() { - return query; - } - - public void setQuery(String query) { - this.query = query; - } - - public DataInfo getDataInfo() { - return dataInfo; - } - - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - ExternalReference that = (ExternalReference) o; - return Objects.equals(sitename, that.sitename) - && Objects.equals(label, that.label) - && Objects.equals(url, that.url) - && Objects.equals(description, that.description) - && Objects.equals(qualifier, that.qualifier) - && Objects.equals(refidentifier, that.refidentifier) - && Objects.equals(query, that.query) - && Objects.equals(dataInfo, that.dataInfo); - } - - @Override - public int hashCode() { - return Objects - .hash( - sitename, label, url, description, qualifier, refidentifier, query, dataInfo); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExtraInfo.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExtraInfo.java deleted file mode 100644 index 3682cc2aa..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/ExtraInfo.java +++ /dev/null @@ -1,77 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.Objects; - -public class ExtraInfo implements Serializable { - private String name; - - private String typology; - - private String provenance; - - private String trust; - - // json containing a Citation or Statistics - private String value; - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public String getTypology() { - return typology; - } - - public void setTypology(String typology) { - this.typology = typology; - } - - public String getProvenance() { - return provenance; - } - - public void setProvenance(String provenance) { - this.provenance = provenance; - } - - public String getTrust() { - return trust; - } - - public void setTrust(String trust) { - this.trust = trust; - } - - public String getValue() { - return value; - } - - public void setValue(String value) { - this.value = value; - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - ExtraInfo extraInfo = (ExtraInfo) o; - return 
Objects.equals(name, extraInfo.name) - && Objects.equals(typology, extraInfo.typology) - && Objects.equals(provenance, extraInfo.provenance) - && Objects.equals(trust, extraInfo.trust) - && Objects.equals(value, extraInfo.value); - } - - @Override - public int hashCode() { - return Objects.hash(name, typology, provenance, trust, value); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java deleted file mode 100644 index 8358bc4b3..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java +++ /dev/null @@ -1,45 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.Objects; - -public class Field implements Serializable { - - private T value; - - private DataInfo dataInfo; - - public T getValue() { - return value; - } - - public void setValue(T value) { - this.value = value; - } - - public DataInfo getDataInfo() { - return dataInfo; - } - - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } - - @Override - public int hashCode() { - return getValue() == null ? 0 : getValue().hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - Field other = (Field) obj; - return Objects.equals(getValue(), other.getValue()); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java deleted file mode 100644 index 7ed313a59..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/GeoLocation.java +++ /dev/null @@ -1,76 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; - -import org.apache.commons.lang3.StringUtils; - -import com.fasterxml.jackson.annotation.JsonIgnore; - -public class GeoLocation implements Serializable { - - private String point; - - private String box; - - private String place; - - public String getPoint() { - return point; - } - - public void setPoint(String point) { - this.point = point; - } - - public String getBox() { - return box; - } - - public void setBox(String box) { - this.box = box; - } - - public String getPlace() { - return place; - } - - public void setPlace(String place) { - this.place = place; - } - - @JsonIgnore - public boolean isBlank() { - return StringUtils.isBlank(point) && StringUtils.isBlank(box) && StringUtils.isBlank(place); - } - - public String toComparableString() { - return isBlank() - ? "" - : String - .format( - "%s::%s%s", - point != null ? point.toLowerCase() : "", - box != null ? box.toLowerCase() : "", - place != null ? 
place.toLowerCase() : ""); - } - - @Override - public int hashCode() { - return toComparableString().hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - - GeoLocation other = (GeoLocation) obj; - - return toComparableString().equals(other.toComparableString()); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/H2020Classification.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/H2020Classification.java deleted file mode 100644 index 219bdc00d..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/H2020Classification.java +++ /dev/null @@ -1,88 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.Objects; - -/** - * To store information about the classification for the project. The classification depends on the programme. For example - * H2020-EU.3.4.5.3 can be classified as - * H2020-EU.3. => Societal Challenges (level1) - * H2020-EU.3.4. => Transport (level2) - * H2020-EU.3.4.5. => CLEANSKY2 (level3) - * H2020-EU.3.4.5.3. => IADP Fast Rotorcraft (level4) - * - * We decided to explicitly represent up to three levels in the classification. - * - * H2020Classification has the following parameters: - * - private H2020Programme h2020Programme to store the information about the programme related to this classification - * - private String level1 to store the information about the level 1 of the classification (Priority or Pillar of the EC) - * - private String level2 to store the information about the level2 of the classification (Objectives (?)) - * - private String level3 to store the information about the level3 of the classification - * - private String classification to store the entire classification related to the programme - */ - -public class H2020Classification implements Serializable { - private H2020Programme h2020Programme; - private String level1; - private String level2; - private String level3; - - private String classification; - - public H2020Programme getH2020Programme() { - return h2020Programme; - } - - public void setH2020Programme(H2020Programme h2020Programme) { - this.h2020Programme = h2020Programme; - } - - public String getLevel1() { - return level1; - } - - public void setLevel1(String level1) { - this.level1 = level1; - } - - public String getLevel2() { - return level2; - } - - public void setLevel2(String level2) { - this.level2 = level2; - } - - public String getLevel3() { - return level3; - } - - public void setLevel3(String level3) { - this.level3 = level3; - } - - public String getClassification() { - return classification; - } - - public void setClassification(String classification) { - this.classification = classification; - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - - H2020Classification h2020classification = (H2020Classification) o; - - return Objects.equals(level1, h2020classification.level1) && - Objects.equals(level2, h2020classification.level2) && - Objects.equals(level3, h2020classification.level3) && - Objects.equals(classification, h2020classification.classification) && - h2020Programme.equals(h2020classification.h2020Programme); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/H2020Programme.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/H2020Programme.java deleted file mode 100644 index
101d46d35..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/H2020Programme.java +++ /dev/null @@ -1,44 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.Objects; - -/** - * To store information about the EC programme for the project. It has the following parameters: - * - private String code to store the code of the programme - * - private String description to store the description of the programme - */ - -public class H2020Programme implements Serializable { - private String code; - private String description; - - public String getCode() { - return code; - } - - public void setCode(String code) { - this.code = code; - } - - public String getDescription() { - return description; - } - - public void setDescription(String description) { - this.description = description; - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - - H2020Programme h2020Programme = (H2020Programme) o; - return Objects.equals(code, h2020Programme.code); - } - -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java deleted file mode 100644 index 29d495261..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Instance.java +++ /dev/null @@ -1,152 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.List; - -public class Instance implements Serializable { - - private Field license; - - private Qualifier accessright; - - private Qualifier instancetype; - - private KeyValue hostedby; - - private List url; - - // other research products specific - private String distributionlocation; - - private KeyValue collectedfrom; - - private Field dateofacceptance; - - // ( article | book ) processing charges. Defined here to cope with possible wrongly typed - // results - private Field processingchargeamount; - - // currency - alphabetic code described in ISO-4217.
Defined here to cope with possible wrongly - // typed results - private Field processingchargecurrency; - - private Qualifier refereed; // peer-review status - - public Field getLicense() { - return license; - } - - public void setLicense(Field license) { - this.license = license; - } - - public Qualifier getAccessright() { - return accessright; - } - - public void setAccessright(Qualifier accessright) { - this.accessright = accessright; - } - - public Qualifier getInstancetype() { - return instancetype; - } - - public void setInstancetype(Qualifier instancetype) { - this.instancetype = instancetype; - } - - public KeyValue getHostedby() { - return hostedby; - } - - public void setHostedby(KeyValue hostedby) { - this.hostedby = hostedby; - } - - public List getUrl() { - return url; - } - - public void setUrl(List url) { - this.url = url; - } - - public String getDistributionlocation() { - return distributionlocation; - } - - public void setDistributionlocation(String distributionlocation) { - this.distributionlocation = distributionlocation; - } - - public KeyValue getCollectedfrom() { - return collectedfrom; - } - - public void setCollectedfrom(KeyValue collectedfrom) { - this.collectedfrom = collectedfrom; - } - - public Field getDateofacceptance() { - return dateofacceptance; - } - - public void setDateofacceptance(Field dateofacceptance) { - this.dateofacceptance = dateofacceptance; - } - - public Field getProcessingchargeamount() { - return processingchargeamount; - } - - public void setProcessingchargeamount(Field processingchargeamount) { - this.processingchargeamount = processingchargeamount; - } - - public Field getProcessingchargecurrency() { - return processingchargecurrency; - } - - public void setProcessingchargecurrency(Field processingchargecurrency) { - this.processingchargecurrency = processingchargecurrency; - } - - public Qualifier getRefereed() { - return refereed; - } - - public void setRefereed(Qualifier refereed) { - this.refereed = refereed; - } - - public String toComparableString() { - return String - .format( - "%s::%s::%s::%s", - hostedby != null && hostedby.getKey() != null ? hostedby.getKey().toLowerCase() : "", - accessright != null && accessright.getClassid() != null ? accessright.getClassid() : "", - instancetype != null && instancetype.getClassid() != null ? instancetype.getClassid() : "", - url != null ? 
url : ""); - } - - @Override - public int hashCode() { - return toComparableString().hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - - Instance other = (Instance) obj; - - return toComparableString().equals(other.toComparableString()); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Journal.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Journal.java deleted file mode 100644 index 7a375e28b..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Journal.java +++ /dev/null @@ -1,167 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.Objects; - -public class Journal implements Serializable { - - private String name; - - private String issnPrinted; - - private String issnOnline; - - private String issnLinking; - - private String ep; - - private String iss; - - private String sp; - - private String vol; - - private String edition; - - private String conferenceplace; - - private String conferencedate; - - private DataInfo dataInfo; - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public String getIssnPrinted() { - return issnPrinted; - } - - public void setIssnPrinted(String issnPrinted) { - this.issnPrinted = issnPrinted; - } - - public String getIssnOnline() { - return issnOnline; - } - - public void setIssnOnline(String issnOnline) { - this.issnOnline = issnOnline; - } - - public String getIssnLinking() { - return issnLinking; - } - - public void setIssnLinking(String issnLinking) { - this.issnLinking = issnLinking; - } - - public String getEp() { - return ep; - } - - public void setEp(String ep) { - this.ep = ep; - } - - public String getIss() { - return iss; - } - - public void setIss(String iss) { - this.iss = iss; - } - - public String getSp() { - return sp; - } - - public void setSp(String sp) { - this.sp = sp; - } - - public String getVol() { - return vol; - } - - public void setVol(String vol) { - this.vol = vol; - } - - public String getEdition() { - return edition; - } - - public void setEdition(String edition) { - this.edition = edition; - } - - public String getConferenceplace() { - return conferenceplace; - } - - public void setConferenceplace(String conferenceplace) { - this.conferenceplace = conferenceplace; - } - - public String getConferencedate() { - return conferencedate; - } - - public void setConferencedate(String conferencedate) { - this.conferencedate = conferencedate; - } - - public DataInfo getDataInfo() { - return dataInfo; - } - - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - Journal journal = (Journal) o; - return Objects.equals(name, journal.name) - && Objects.equals(issnPrinted, journal.issnPrinted) - && Objects.equals(issnOnline, journal.issnOnline) - && Objects.equals(issnLinking, journal.issnLinking) - && Objects.equals(ep, journal.ep) - && Objects.equals(iss, journal.iss) - && Objects.equals(sp, journal.sp) - && Objects.equals(vol, journal.vol) - && Objects.equals(edition, journal.edition) - && Objects.equals(conferenceplace, journal.conferenceplace) - && Objects.equals(conferencedate, journal.conferencedate) - && Objects.equals(dataInfo, journal.dataInfo); - } - - @Override - public int 
hashCode() { - return Objects - .hash( - name, - issnPrinted, - issnOnline, - issnLinking, - ep, - iss, - sp, - vol, - edition, - conferenceplace, - conferencedate, - dataInfo); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java deleted file mode 100644 index 4e2d60138..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java +++ /dev/null @@ -1,74 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; - -import org.apache.commons.lang3.StringUtils; - -import com.fasterxml.jackson.annotation.JsonIgnore; - -public class KeyValue implements Serializable { - - private String key; - - private String value; - - private DataInfo dataInfo; - - public String getKey() { - return key; - } - - public void setKey(String key) { - this.key = key; - } - - public String getValue() { - return value; - } - - public void setValue(String value) { - this.value = value; - } - - public DataInfo getDataInfo() { - return dataInfo; - } - - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } - - public String toComparableString() { - return isBlank() - ? "" - : String - .format( - "%s::%s", - key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : ""); - } - - @JsonIgnore - public boolean isBlank() { - return StringUtils.isBlank(key) && StringUtils.isBlank(value); - } - - @Override - public int hashCode() { - return toComparableString().hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - - KeyValue other = (KeyValue) obj; - - return toComparableString().equals(other.toComparableString()); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Measure.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Measure.java deleted file mode 100644 index c0c14d10d..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Measure.java +++ /dev/null @@ -1,59 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.util.List; - -import com.google.common.base.Objects; - -/** - * Represents a measure; it must be further described by a system available resource providing name and descriptions. - */ -public class Measure { - - /** - * Unique measure identifier. - */ - private String id; - - /** - * List of units associated with this measure. KeyValue provides a pair to store the label (key) and the value, plus - * common provenance information.
- */ - private List unit; - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public List getUnit() { - return unit; - } - - public void setUnit(List unit) { - this.unit = unit; - } - - public void mergeFrom(Measure m) { - // TODO - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - Measure measure = (Measure) o; - return Objects.equal(id, measure.id) && - Objects.equal(unit, measure.unit); - } - - @Override - public int hashCode() { - return Objects.hashCode(id, unit); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OAIProvenance.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OAIProvenance.java deleted file mode 100644 index 88d74afbf..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OAIProvenance.java +++ /dev/null @@ -1,33 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.Objects; - -public class OAIProvenance implements Serializable { - - private OriginDescription originDescription; - - public OriginDescription getOriginDescription() { - return originDescription; - } - - public void setOriginDescription(OriginDescription originDescription) { - this.originDescription = originDescription; - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - OAIProvenance that = (OAIProvenance) o; - return Objects.equals(originDescription, that.originDescription); - } - - @Override - public int hashCode() { - return Objects.hash(originDescription); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java deleted file mode 100644 index 3496492e8..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Oaf.java +++ /dev/null @@ -1,73 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.List; -import java.util.Objects; - -public abstract class Oaf implements Serializable { - - /** - * The list of datasource id/name pairs providing this relationship. 
- */ - protected List collectedfrom; - - private DataInfo dataInfo; - - private Long lastupdatetimestamp; - - public List getCollectedfrom() { - return collectedfrom; - } - - public void setCollectedfrom(List collectedfrom) { - this.collectedfrom = collectedfrom; - } - - public DataInfo getDataInfo() { - return dataInfo; - } - - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } - - public Long getLastupdatetimestamp() { - return lastupdatetimestamp; - } - - public void setLastupdatetimestamp(Long lastupdatetimestamp) { - this.lastupdatetimestamp = lastupdatetimestamp; - } - - public void mergeOAFDataInfo(Oaf e) { - if (e.getDataInfo() != null && compareTrust(this, e) < 0) - dataInfo = e.getDataInfo(); - } - - protected String extractTrust(Oaf e) { - if (e == null || e.getDataInfo() == null || e.getDataInfo().getTrust() == null) - return "0.0"; - return e.getDataInfo().getTrust(); - } - - protected int compareTrust(Oaf a, Oaf b) { - return extractTrust(a).compareTo(extractTrust(b)); - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - Oaf oaf = (Oaf) o; - return Objects.equals(dataInfo, oaf.dataInfo) - && Objects.equals(lastupdatetimestamp, oaf.lastupdatetimestamp); - } - - @Override - public int hashCode() { - return Objects.hash(dataInfo, lastupdatetimestamp); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java deleted file mode 100644 index 2823ee49d..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java +++ /dev/null @@ -1,130 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.*; -import java.util.stream.Collectors; - -public abstract class OafEntity extends Oaf implements Serializable { - - private String id; - - private List originalId; - - private List pid; - - private String dateofcollection; - - private String dateoftransformation; - - private List extraInfo; - - private OAIProvenance oaiprovenance; - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public List getOriginalId() { - return originalId; - } - - public void setOriginalId(List originalId) { - this.originalId = originalId; - } - - public List getPid() { - return pid; - } - - public void setPid(List pid) { - this.pid = pid; - } - - public String getDateofcollection() { - return dateofcollection; - } - - public void setDateofcollection(String dateofcollection) { - this.dateofcollection = dateofcollection; - } - - public String getDateoftransformation() { - return dateoftransformation; - } - - public void setDateoftransformation(String dateoftransformation) { - this.dateoftransformation = dateoftransformation; - } - - public List getExtraInfo() { - return extraInfo; - } - - public void setExtraInfo(List extraInfo) { - this.extraInfo = extraInfo; - } - - public OAIProvenance getOaiprovenance() { - return oaiprovenance; - } - - public void setOaiprovenance(OAIProvenance oaiprovenance) { - this.oaiprovenance = oaiprovenance; - } - - public void mergeFrom(OafEntity e) { - - if (e == null) - return; - - originalId = mergeLists(originalId, e.getOriginalId()); - - collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom()); - - pid = mergeLists(pid, e.getPid()); - - if (e.getDateofcollection() != null && compareTrust(this, e) < 0) - dateofcollection = e.getDateofcollection(); - 
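// Note on the pattern used throughout these mergeFrom implementations: a scalar
// field is overwritten only when the incoming entity provides a value and carries
// strictly higher trust. A minimal sketch of the rule as a helper method (the name
// chooseByTrust is hypothetical, not part of the original class):
//
//     private <T> T chooseByTrust(T current, T incoming, Oaf other) {
//         // keep the current value unless the incoming one exists and wins on trust
//         return incoming != null && compareTrust(this, other) < 0 ? incoming : current;
//     }
//
// With it, the assignment above would read:
// dateofcollection = chooseByTrust(dateofcollection, e.getDateofcollection(), e);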
- if (e.getDateoftransformation() != null && compareTrust(this, e) < 0) - dateoftransformation = e.getDateoftransformation(); - - extraInfo = mergeLists(extraInfo, e.getExtraInfo()); - - if (e.getOaiprovenance() != null && compareTrust(this, e) < 0) - oaiprovenance = e.getOaiprovenance(); - } - - protected List mergeLists(final List... lists) { - - return Arrays - .stream(lists) - .filter(Objects::nonNull) - .flatMap(List::stream) - .filter(Objects::nonNull) - .distinct() - .collect(Collectors.toList()); - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - if (!super.equals(o)) - return false; - OafEntity oafEntity = (OafEntity) o; - return Objects.equals(id, oafEntity.id); - } - - @Override - public int hashCode() { - return Objects.hash(super.hashCode(), id); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Organization.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Organization.java deleted file mode 100644 index a5f9bce30..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Organization.java +++ /dev/null @@ -1,214 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.List; - -public class Organization extends OafEntity implements Serializable { - - private Field legalshortname; - - private Field legalname; - - private List> alternativeNames; - - private Field websiteurl; - - private Field logourl; - - private Field eclegalbody; - - private Field eclegalperson; - - private Field ecnonprofit; - - private Field ecresearchorganization; - - private Field echighereducation; - - private Field ecinternationalorganizationeurinterests; - - private Field ecinternationalorganization; - - private Field ecenterprise; - - private Field ecsmevalidated; - - private Field ecnutscode; - - private Qualifier country; - - public Field getLegalshortname() { - return legalshortname; - } - - public void setLegalshortname(Field legalshortname) { - this.legalshortname = legalshortname; - } - - public Field getLegalname() { - return legalname; - } - - public void setLegalname(Field legalname) { - this.legalname = legalname; - } - - public List> getAlternativeNames() { - return alternativeNames; - } - - public void setAlternativeNames(List> alternativeNames) { - this.alternativeNames = alternativeNames; - } - - public Field getWebsiteurl() { - return websiteurl; - } - - public void setWebsiteurl(Field websiteurl) { - this.websiteurl = websiteurl; - } - - public Field getLogourl() { - return logourl; - } - - public void setLogourl(Field logourl) { - this.logourl = logourl; - } - - public Field getEclegalbody() { - return eclegalbody; - } - - public void setEclegalbody(Field eclegalbody) { - this.eclegalbody = eclegalbody; - } - - public Field getEclegalperson() { - return eclegalperson; - } - - public void setEclegalperson(Field eclegalperson) { - this.eclegalperson = eclegalperson; - } - - public Field getEcnonprofit() { - return ecnonprofit; - } - - public void setEcnonprofit(Field ecnonprofit) { - this.ecnonprofit = ecnonprofit; - } - - public Field getEcresearchorganization() { - return ecresearchorganization; - } - - public void setEcresearchorganization(Field ecresearchorganization) { - this.ecresearchorganization = ecresearchorganization; - } - - public Field getEchighereducation() { - return echighereducation; - } - - public void setEchighereducation(Field echighereducation) { - this.echighereducation = 
echighereducation; - } - - public Field getEcinternationalorganizationeurinterests() { - return ecinternationalorganizationeurinterests; - } - - public void setEcinternationalorganizationeurinterests( - Field ecinternationalorganizationeurinterests) { - this.ecinternationalorganizationeurinterests = ecinternationalorganizationeurinterests; - } - - public Field getEcinternationalorganization() { - return ecinternationalorganization; - } - - public void setEcinternationalorganization(Field ecinternationalorganization) { - this.ecinternationalorganization = ecinternationalorganization; - } - - public Field getEcenterprise() { - return ecenterprise; - } - - public void setEcenterprise(Field ecenterprise) { - this.ecenterprise = ecenterprise; - } - - public Field getEcsmevalidated() { - return ecsmevalidated; - } - - public void setEcsmevalidated(Field ecsmevalidated) { - this.ecsmevalidated = ecsmevalidated; - } - - public Field getEcnutscode() { - return ecnutscode; - } - - public void setEcnutscode(Field ecnutscode) { - this.ecnutscode = ecnutscode; - } - - public Qualifier getCountry() { - return country; - } - - public void setCountry(Qualifier country) { - this.country = country; - } - - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - - if (!Organization.class.isAssignableFrom(e.getClass())) { - return; - } - - final Organization o = (Organization) e; - legalshortname = o.getLegalshortname() != null && compareTrust(this, e) < 0 - ? o.getLegalshortname() - : legalshortname; - legalname = o.getLegalname() != null && compareTrust(this, e) < 0 ? o.getLegalname() : legalname; - alternativeNames = mergeLists(o.getAlternativeNames(), alternativeNames); - websiteurl = o.getWebsiteurl() != null && compareTrust(this, e) < 0 ? o.getWebsiteurl() : websiteurl; - logourl = o.getLogourl() != null && compareTrust(this, e) < 0 ? o.getLogourl() : logourl; - eclegalbody = o.getEclegalbody() != null && compareTrust(this, e) < 0 ? o.getEclegalbody() : eclegalbody; - eclegalperson = o.getEclegalperson() != null && compareTrust(this, e) < 0 - ? o.getEclegalperson() - : eclegalperson; - ecnonprofit = o.getEcnonprofit() != null && compareTrust(this, e) < 0 ? o.getEcnonprofit() : ecnonprofit; - ecresearchorganization = o.getEcresearchorganization() != null && compareTrust(this, e) < 0 - ? o.getEcresearchorganization() - : ecresearchorganization; - echighereducation = o.getEchighereducation() != null && compareTrust(this, e) < 0 - ? o.getEchighereducation() - : echighereducation; - ecinternationalorganizationeurinterests = o.getEcinternationalorganizationeurinterests() != null - && compareTrust(this, e) < 0 - ? o.getEcinternationalorganizationeurinterests() - : ecinternationalorganizationeurinterests; - ecinternationalorganization = o.getEcinternationalorganization() != null && compareTrust(this, e) < 0 - ? o.getEcinternationalorganization() - : ecinternationalorganization; - ecenterprise = o.getEcenterprise() != null && compareTrust(this, e) < 0 - ? o.getEcenterprise() - : ecenterprise; - ecsmevalidated = o.getEcsmevalidated() != null && compareTrust(this, e) < 0 - ? o.getEcsmevalidated() - : ecsmevalidated; - ecnutscode = o.getEcnutscode() != null && compareTrust(this, e) < 0 ? o.getEcnutscode() : ecnutscode; - country = o.getCountry() != null && compareTrust(this, e) < 0 ? 
o.getCountry() : country; - mergeOAFDataInfo(o); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OriginDescription.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OriginDescription.java deleted file mode 100644 index a275fc1a9..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OriginDescription.java +++ /dev/null @@ -1,88 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.Objects; - -public class OriginDescription implements Serializable { - - private String harvestDate; - - private Boolean altered = true; - - private String baseURL; - - private String identifier; - - private String datestamp; - - private String metadataNamespace; - - public String getHarvestDate() { - return harvestDate; - } - - public void setHarvestDate(String harvestDate) { - this.harvestDate = harvestDate; - } - - public Boolean getAltered() { - return altered; - } - - public void setAltered(Boolean altered) { - this.altered = altered; - } - - public String getBaseURL() { - return baseURL; - } - - public void setBaseURL(String baseURL) { - this.baseURL = baseURL; - } - - public String getIdentifier() { - return identifier; - } - - public void setIdentifier(String identifier) { - this.identifier = identifier; - } - - public String getDatestamp() { - return datestamp; - } - - public void setDatestamp(String datestamp) { - this.datestamp = datestamp; - } - - public String getMetadataNamespace() { - return metadataNamespace; - } - - public void setMetadataNamespace(String metadataNamespace) { - this.metadataNamespace = metadataNamespace; - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - OriginDescription that = (OriginDescription) o; - return Objects.equals(harvestDate, that.harvestDate) - && Objects.equals(altered, that.altered) - && Objects.equals(baseURL, that.baseURL) - && Objects.equals(identifier, that.identifier) - && Objects.equals(datestamp, that.datestamp) - && Objects.equals(metadataNamespace, that.metadataNamespace); - } - - @Override - public int hashCode() { - return Objects.hash(harvestDate, altered, baseURL, identifier, datestamp, metadataNamespace); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OtherResearchProduct.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OtherResearchProduct.java deleted file mode 100644 index b04934c23..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OtherResearchProduct.java +++ /dev/null @@ -1,60 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.List; - -import eu.dnetlib.dhp.schema.common.ModelConstants; - -public class OtherResearchProduct extends Result implements Serializable { - - private List> contactperson; - - private List> contactgroup; - - private List> tool; - - public OtherResearchProduct() { - setResulttype(ModelConstants.ORP_DEFAULT_RESULTTYPE); - } - - public List> getContactperson() { - return contactperson; - } - - public void setContactperson(List> contactperson) { - this.contactperson = contactperson; - } - - public List> getContactgroup() { - return contactgroup; - } - - public void setContactgroup(List> contactgroup) { - this.contactgroup = contactgroup; - } - - public List> getTool() { - return tool; - } - - public void setTool(List> tool) { - this.tool = tool; - } - - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - - if 
(!OtherResearchProduct.class.isAssignableFrom(e.getClass())) { - return; - } - - OtherResearchProduct o = (OtherResearchProduct) e; - - contactperson = mergeLists(contactperson, o.getContactperson()); - contactgroup = mergeLists(contactgroup, o.getContactgroup()); - tool = mergeLists(tool, o.getTool()); - mergeOAFDataInfo(e); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java deleted file mode 100644 index b698c957d..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Project.java +++ /dev/null @@ -1,360 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.List; - -public class Project extends OafEntity implements Serializable { - - private Field websiteurl; - - private Field code; - - private Field acronym; - - private Field title; - - private Field startdate; - - private Field enddate; - - private Field callidentifier; - - private Field keywords; - - private Field duration; - - private Field ecsc39; - - private Field oamandatepublications; - - private Field ecarticle29_3; - - private List subjects; - - private List> fundingtree; - - private Qualifier contracttype; - - private Field optional1; - - private Field optional2; - - private Field jsonextrainfo; - - private Field contactfullname; - - private Field contactfax; - - private Field contactphone; - - private Field contactemail; - - private Field summary; - - private Field currency; - - private Float totalcost; - - private Float fundedamount; - - private String h2020topiccode; - - private String h2020topicdescription; - - private List h2020classification; - - public String getH2020topicdescription() { - return h2020topicdescription; - } - - public void setH2020topicdescription(String h2020topicdescription) { - this.h2020topicdescription = h2020topicdescription; - } - - public String getH2020topiccode() { - return h2020topiccode; - } - - public void setH2020topiccode(String h2020topiccode) { - this.h2020topiccode = h2020topiccode; - } - - public List getH2020classification() { - return h2020classification; - } - - public void setH2020classification(List h2020classification) { - this.h2020classification = h2020classification; - } - - public Field getWebsiteurl() { - return websiteurl; - } - - public void setWebsiteurl(Field websiteurl) { - this.websiteurl = websiteurl; - } - - public Field getCode() { - return code; - } - - public void setCode(Field code) { - this.code = code; - } - - public Field getAcronym() { - return acronym; - } - - public void setAcronym(Field acronym) { - this.acronym = acronym; - } - - public Field getTitle() { - return title; - } - - public void setTitle(Field title) { - this.title = title; - } - - public Field getStartdate() { - return startdate; - } - - public void setStartdate(Field startdate) { - this.startdate = startdate; - } - - public Field getEnddate() { - return enddate; - } - - public void setEnddate(Field enddate) { - this.enddate = enddate; - } - - public Field getCallidentifier() { - return callidentifier; - } - - public void setCallidentifier(Field callidentifier) { - this.callidentifier = callidentifier; - } - - public Field getKeywords() { - return keywords; - } - - public void setKeywords(Field keywords) { - this.keywords = keywords; - } - - public Field getDuration() { - return duration; - } - - public void setDuration(Field duration) { - this.duration = duration; - } - - public Field getEcsc39() { - return ecsc39; - } - - public void 
setEcsc39(Field ecsc39) { - this.ecsc39 = ecsc39; - } - - public Field getOamandatepublications() { - return oamandatepublications; - } - - public void setOamandatepublications(Field oamandatepublications) { - this.oamandatepublications = oamandatepublications; - } - - public Field getEcarticle29_3() { - return ecarticle29_3; - } - - public void setEcarticle29_3(Field ecarticle29_3) { - this.ecarticle29_3 = ecarticle29_3; - } - - public List getSubjects() { - return subjects; - } - - public void setSubjects(List subjects) { - this.subjects = subjects; - } - - public List> getFundingtree() { - return fundingtree; - } - - public void setFundingtree(List> fundingtree) { - this.fundingtree = fundingtree; - } - - public Qualifier getContracttype() { - return contracttype; - } - - public void setContracttype(Qualifier contracttype) { - this.contracttype = contracttype; - } - - public Field getOptional1() { - return optional1; - } - - public void setOptional1(Field optional1) { - this.optional1 = optional1; - } - - public Field getOptional2() { - return optional2; - } - - public void setOptional2(Field optional2) { - this.optional2 = optional2; - } - - public Field getJsonextrainfo() { - return jsonextrainfo; - } - - public void setJsonextrainfo(Field jsonextrainfo) { - this.jsonextrainfo = jsonextrainfo; - } - - public Field getContactfullname() { - return contactfullname; - } - - public void setContactfullname(Field contactfullname) { - this.contactfullname = contactfullname; - } - - public Field getContactfax() { - return contactfax; - } - - public void setContactfax(Field contactfax) { - this.contactfax = contactfax; - } - - public Field getContactphone() { - return contactphone; - } - - public void setContactphone(Field contactphone) { - this.contactphone = contactphone; - } - - public Field getContactemail() { - return contactemail; - } - - public void setContactemail(Field contactemail) { - this.contactemail = contactemail; - } - - public Field getSummary() { - return summary; - } - - public void setSummary(Field summary) { - this.summary = summary; - } - - public Field getCurrency() { - return currency; - } - - public void setCurrency(Field currency) { - this.currency = currency; - } - - public Float getTotalcost() { - return totalcost; - } - - public void setTotalcost(Float totalcost) { - this.totalcost = totalcost; - } - - public Float getFundedamount() { - return fundedamount; - } - - public void setFundedamount(Float fundedamount) { - this.fundedamount = fundedamount; - } - - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - - if (!Project.class.isAssignableFrom(e.getClass())) { - return; - } - - Project p = (Project) e; - - websiteurl = p.getWebsiteurl() != null && compareTrust(this, e) < 0 ? p.getWebsiteurl() : websiteurl; - code = p.getCode() != null && compareTrust(this, e) < 0 ? p.getCode() : code; - acronym = p.getAcronym() != null && compareTrust(this, e) < 0 ? p.getAcronym() : acronym; - title = p.getTitle() != null && compareTrust(this, e) < 0 ? p.getTitle() : title; - startdate = p.getStartdate() != null && compareTrust(this, e) < 0 ? p.getStartdate() : startdate; - enddate = p.getEnddate() != null && compareTrust(this, e) < 0 ? p.getEnddate() : enddate; - callidentifier = p.getCallidentifier() != null && compareTrust(this, e) < 0 - ? p.getCallidentifier() - : callidentifier; - keywords = p.getKeywords() != null && compareTrust(this, e) < 0 ? p.getKeywords() : keywords; - duration = p.getDuration() != null && compareTrust(this, e) < 0 ? 
p.getDuration() : duration; - ecsc39 = p.getEcsc39() != null && compareTrust(this, e) < 0 ? p.getEcsc39() : ecsc39; - oamandatepublications = p.getOamandatepublications() != null && compareTrust(this, e) < 0 - ? p.getOamandatepublications() - : oamandatepublications; - ecarticle29_3 = p.getEcarticle29_3() != null && compareTrust(this, e) < 0 - ? p.getEcarticle29_3() - : ecarticle29_3; - subjects = mergeLists(subjects, p.getSubjects()); - fundingtree = mergeLists(fundingtree, p.getFundingtree()); - contracttype = p.getContracttype() != null && compareTrust(this, e) < 0 - ? p.getContracttype() - : contracttype; - optional1 = p.getOptional1() != null && compareTrust(this, e) < 0 ? p.getOptional1() : optional1; - optional2 = p.getOptional2() != null && compareTrust(this, e) < 0 ? p.getOptional2() : optional2; - jsonextrainfo = p.getJsonextrainfo() != null && compareTrust(this, e) < 0 - ? p.getJsonextrainfo() - : jsonextrainfo; - contactfullname = p.getContactfullname() != null && compareTrust(this, e) < 0 - ? p.getContactfullname() - : contactfullname; - contactfax = p.getContactfax() != null && compareTrust(this, e) < 0 ? p.getContactfax() : contactfax; - contactphone = p.getContactphone() != null && compareTrust(this, e) < 0 - ? p.getContactphone() - : contactphone; - contactemail = p.getContactemail() != null && compareTrust(this, e) < 0 - ? p.getContactemail() - : contactemail; - summary = p.getSummary() != null && compareTrust(this, e) < 0 ? p.getSummary() : summary; - currency = p.getCurrency() != null && compareTrust(this, e) < 0 ? p.getCurrency() : currency; - totalcost = p.getTotalcost() != null && compareTrust(this, e) < 0 ? p.getTotalcost() : totalcost; - fundedamount = p.getFundedamount() != null && compareTrust(this, e) < 0 - ? p.getFundedamount() - : fundedamount; - - // programme = mergeLists(programme, p.getProgramme()); - - h2020classification = mergeLists(h2020classification, p.getH2020classification()); - - mergeOAFDataInfo(e); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java deleted file mode 100644 index 3058c262b..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java +++ /dev/null @@ -1,39 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; - -import eu.dnetlib.dhp.schema.common.ModelConstants; - -public class Publication extends Result implements Serializable { - - // publication specific - private Journal journal; - - public Publication() { - setResulttype(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE); - } - - public Journal getJournal() { - return journal; - } - - public void setJournal(Journal journal) { - this.journal = journal; - } - - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - - if (!Publication.class.isAssignableFrom(e.getClass())) { - return; - } - - Publication p = (Publication) e; - - if (p.getJournal() != null && compareTrust(this, e) < 0) - journal = p.getJournal(); - mergeOAFDataInfo(e); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java deleted file mode 100644 index 87ecb55f1..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java +++ /dev/null @@ -1,87 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; - -import org.apache.commons.lang3.StringUtils; - -import com.fasterxml.jackson.annotation.JsonIgnore; 
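// Usage sketch for the Qualifier class below (the term values are illustrative;
// the actual ones come from the OpenAIRE vocabularies): equals() and hashCode()
// both delegate to toComparableString(), so two instances holding the same four
// terms behave as the same key in hash-based collections.
//
//     Qualifier title = new Qualifier();
//     title.setClassid("main title");
//     title.setClassname("main title");
//     title.setSchemeid("dnet:dataCite_title");
//     title.setSchemename("dnet:dataCite_title");
//     // toComparableString() yields "main title::main title::dnet:dataCite_title::dnet:dataCite_title"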
- -public class Qualifier implements Serializable { - - private String classid; - private String classname; - private String schemeid; - private String schemename; - - public String getClassid() { - return classid; - } - - public void setClassid(String classid) { - this.classid = classid; - } - - public String getClassname() { - return classname; - } - - public void setClassname(String classname) { - this.classname = classname; - } - - public String getSchemeid() { - return schemeid; - } - - public void setSchemeid(String schemeid) { - this.schemeid = schemeid; - } - - public String getSchemename() { - return schemename; - } - - public void setSchemename(String schemename) { - this.schemename = schemename; - } - - public String toComparableString() { - return isBlank() - ? "" - : String - .format( - "%s::%s::%s::%s", - classid != null ? classid : "", - classname != null ? classname : "", - schemeid != null ? schemeid : "", - schemename != null ? schemename : ""); - } - - @JsonIgnore - public boolean isBlank() { - return StringUtils.isBlank(classid) - && StringUtils.isBlank(classname) - && StringUtils.isBlank(schemeid) - && StringUtils.isBlank(schemename); - } - - @Override - public int hashCode() { - return toComparableString().hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - - Qualifier other = (Qualifier) obj; - - return toComparableString().equals(other.toComparableString()); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java deleted file mode 100644 index 17a50d7ac..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Relation.java +++ /dev/null @@ -1,167 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import static com.google.common.base.Preconditions.checkArgument; - -import java.util.*; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -/** - * Relation models any edge between two nodes in the OpenAIRE graph. It has a source id and a target id pointing to - * graph node identifiers and it is further characterised by the semantics of the link through the fields relType, - * subRelType and relClass. Provenance information is modeled according to the dataInfo element and collectedFrom, while - * individual relationship types can provide extra information via the properties field. - */ -public class Relation extends Oaf { - - /** - * Main relationship classifier, values include 'resultResult', 'resultProject', 'resultOrganization', etc. - */ - private String relType; - - /** - * Further classifies a relationship, values include 'affiliation', 'similarity', 'supplement', etc. - */ - private String subRelType; - - /** - * Indicates the direction of the relationship, values include 'isSupplementTo', 'isSupplementedBy', 'merges', - * 'isMergedIn'. - */ - private String relClass; - - /** - * The source entity id. - */ - private String source; - - /** - * The target entity id. - */ - private String target; - - /** - * Was this relationship authoritatively validated? - */ - private Boolean validated; - - /** - * When was this relationship authoritatively validated. - */ - private String validationDate; - - /** - * List of relation specific properties. Values include 'similarityLevel', indicating the similarity score between a - * pair of publications.
- */ - private List properties = new ArrayList<>(); - - public String getRelType() { - return relType; - } - - public void setRelType(final String relType) { - this.relType = relType; - } - - public String getSubRelType() { - return subRelType; - } - - public void setSubRelType(final String subRelType) { - this.subRelType = subRelType; - } - - public String getRelClass() { - return relClass; - } - - public void setRelClass(final String relClass) { - this.relClass = relClass; - } - - public String getSource() { - return source; - } - - public void setSource(final String source) { - this.source = source; - } - - public String getTarget() { - return target; - } - - public void setTarget(final String target) { - this.target = target; - } - - public List getProperties() { - return properties; - } - - public void setProperties(List properties) { - this.properties = properties; - } - - public Boolean getValidated() { - return validated; - } - - public void setValidated(Boolean validated) { - this.validated = validated; - } - - public String getValidationDate() { - return validationDate; - } - - public void setValidationDate(String validationDate) { - this.validationDate = validationDate; - } - - public void mergeFrom(final Relation r) { - - checkArgument(Objects.equals(getSource(), r.getSource()), "source ids must be equal"); - checkArgument(Objects.equals(getTarget(), r.getTarget()), "target ids must be equal"); - checkArgument(Objects.equals(getRelType(), r.getRelType()), "relType(s) must be equal"); - checkArgument( - Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal"); - checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal"); - - setCollectedfrom( - Stream - .concat( - Optional - .ofNullable(getCollectedfrom()) - .map(Collection::stream) - .orElse(Stream.empty()), - Optional - .ofNullable(r.getCollectedfrom()) - .map(Collection::stream) - .orElse(Stream.empty())) - .distinct() // relies on KeyValue.equals - .collect(Collectors.toList())); - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - Relation relation = (Relation) o; - return relType.equals(relation.relType) - && subRelType.equals(relation.subRelType) - && relClass.equals(relation.relClass) - && source.equals(relation.source) - && target.equals(relation.target); - } - - @Override - public int hashCode() { - return Objects.hash(relType, subRelType, relClass, source, target, collectedfrom); - } - -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java deleted file mode 100644 index 443c18230..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java +++ /dev/null @@ -1,351 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.ArrayList; -import java.util.Comparator; -import java.util.List; -import java.util.stream.Collectors; - -import eu.dnetlib.dhp.schema.common.LicenseComparator; - -public class Result extends OafEntity implements Serializable { - - private List measures; - - private List author; - - // resulttype allows subclassing results into publications | datasets | software - private Qualifier resulttype; - - // common fields - private Qualifier language; - - private List country; - - private List subject; - - private List title; - - private List relevantdate; - - private List> description; - - private Field 
dateofacceptance; - - private Field publisher; - - private Field embargoenddate; - - private List> source; - - private List> fulltext; // remove candidate - - private List> format; - - private List> contributor; - - private Qualifier resourcetype; - - private List> coverage; - - private Qualifier bestaccessright; - - private List context; - - private List externalReference; - - private List instance; - - public List getMeasures() { - return measures; - } - - public void setMeasures(List measures) { - this.measures = measures; - } - - public List getAuthor() { - return author; - } - - public void setAuthor(List author) { - this.author = author; - } - - public Qualifier getResulttype() { - return resulttype; - } - - public void setResulttype(Qualifier resulttype) { - this.resulttype = resulttype; - } - - public Qualifier getLanguage() { - return language; - } - - public void setLanguage(Qualifier language) { - this.language = language; - } - - public List getCountry() { - return country; - } - - public void setCountry(List country) { - this.country = country; - } - - public List getSubject() { - return subject; - } - - public void setSubject(List subject) { - this.subject = subject; - } - - public List getTitle() { - return title; - } - - public void setTitle(List title) { - this.title = title; - } - - public List getRelevantdate() { - return relevantdate; - } - - public void setRelevantdate(List relevantdate) { - this.relevantdate = relevantdate; - } - - public List> getDescription() { - return description; - } - - public void setDescription(List> description) { - this.description = description; - } - - public Field getDateofacceptance() { - return dateofacceptance; - } - - public void setDateofacceptance(Field dateofacceptance) { - this.dateofacceptance = dateofacceptance; - } - - public Field getPublisher() { - return publisher; - } - - public void setPublisher(Field publisher) { - this.publisher = publisher; - } - - public Field getEmbargoenddate() { - return embargoenddate; - } - - public void setEmbargoenddate(Field embargoenddate) { - this.embargoenddate = embargoenddate; - } - - public List> getSource() { - return source; - } - - public void setSource(List> source) { - this.source = source; - } - - public List> getFulltext() { - return fulltext; - } - - public void setFulltext(List> fulltext) { - this.fulltext = fulltext; - } - - public List> getFormat() { - return format; - } - - public void setFormat(List> format) { - this.format = format; - } - - public List> getContributor() { - return contributor; - } - - public void setContributor(List> contributor) { - this.contributor = contributor; - } - - public Qualifier getResourcetype() { - return resourcetype; - } - - public void setResourcetype(Qualifier resourcetype) { - this.resourcetype = resourcetype; - } - - public List> getCoverage() { - return coverage; - } - - public void setCoverage(List> coverage) { - this.coverage = coverage; - } - - public Qualifier getBestaccessright() { - return bestaccessright; - } - - public void setBestaccessright(Qualifier bestaccessright) { - this.bestaccessright = bestaccessright; - } - - public List getContext() { - return context; - } - - public void setContext(List context) { - this.context = context; - } - - public List getExternalReference() { - return externalReference; - } - - public void setExternalReference(List externalReference) { - this.externalReference = externalReference; - } - - public List getInstance() { - return instance; - } - - public void setInstance(List instance) { - 
this.instance = instance; - } - - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - - if (!Result.class.isAssignableFrom(e.getClass())) { - return; - } - - Result r = (Result) e; - - // TODO consider merging also Measures - - instance = mergeLists(instance, r.getInstance()); - - if (r.getBestaccessright() != null - && new LicenseComparator().compare(r.getBestaccessright(), bestaccessright) < 0) - bestaccessright = r.getBestaccessright(); - - if (r.getResulttype() != null && compareTrust(this, r) < 0) - resulttype = r.getResulttype(); - - if (r.getLanguage() != null && compareTrust(this, r) < 0) - language = r.getLanguage(); - - country = mergeLists(country, r.getCountry()); - - subject = mergeLists(subject, r.getSubject()); - - // merge title lists: main title with higher trust and distinct between the others - StructuredProperty baseMainTitle = null; - if (title != null) { - baseMainTitle = getMainTitle(title); - if (baseMainTitle != null) { - final StructuredProperty p = baseMainTitle; - title = title.stream().filter(t -> t != p).collect(Collectors.toList()); - } - } - - StructuredProperty newMainTitle = null; - if (r.getTitle() != null) { - newMainTitle = getMainTitle(r.getTitle()); - if (newMainTitle != null) { - final StructuredProperty p = newMainTitle; - r.setTitle(r.getTitle().stream().filter(t -> t != p).collect(Collectors.toList())); - } - } - - if (newMainTitle != null && compareTrust(this, r) < 0) { - baseMainTitle = newMainTitle; - } - - title = mergeLists(title, r.getTitle()); - if (title != null && baseMainTitle != null) { - title.add(baseMainTitle); - } - - relevantdate = mergeLists(relevantdate, r.getRelevantdate()); - - description = longestLists(description, r.getDescription()); - - if (r.getPublisher() != null && compareTrust(this, r) < 0) - publisher = r.getPublisher(); - - if (r.getEmbargoenddate() != null && compareTrust(this, r) < 0) - embargoenddate = r.getEmbargoenddate(); - - source = mergeLists(source, r.getSource()); - - fulltext = mergeLists(fulltext, r.getFulltext()); - - format = mergeLists(format, r.getFormat()); - - contributor = mergeLists(contributor, r.getContributor()); - - if (r.getResourcetype() != null) - resourcetype = r.getResourcetype(); - - coverage = mergeLists(coverage, r.getCoverage()); - - context = mergeLists(context, r.getContext()); - - externalReference = mergeLists(externalReference, r.getExternalReference()); - } - - private List> longestLists(List> a, List> b) { - if (a == null || b == null) - return a == null ? b : a; - if (a.size() == b.size()) { - int msa = a - .stream() - .filter(i -> i.getValue() != null) - .map(i -> i.getValue().length()) - .max(Comparator.naturalOrder()) - .orElse(0); - int msb = b - .stream() - .filter(i -> i.getValue() != null) - .map(i -> i.getValue().length()) - .max(Comparator.naturalOrder()) - .orElse(0); - return msa > msb ? a : b; - } - return a.size() > b.size() ? a : b; - } - - private StructuredProperty getMainTitle(List titles) { - // need to check if the list of titles contains more than 1 main title? 
(in that case, we should choose which - // main title to select from the list) - for (StructuredProperty title : titles) { - if (title.getQualifier() != null && title.getQualifier().getClassid() != null) - if (title.getQualifier().getClassid().equals("main title")) - return title; - } - return null; - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Software.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Software.java deleted file mode 100644 index d25b5c9ce..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Software.java +++ /dev/null @@ -1,80 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; -import java.util.List; - -import eu.dnetlib.dhp.schema.common.ModelConstants; - -public class Software extends Result implements Serializable { - - private List> documentationUrl; - - // candidate for removal - private List license; - - // candidate for removal - private Field codeRepositoryUrl; - - private Qualifier programmingLanguage; - - public Software() { - setResulttype(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE); - } - - public List> getDocumentationUrl() { - return documentationUrl; - } - - public void setDocumentationUrl(List> documentationUrl) { - this.documentationUrl = documentationUrl; - } - - public List getLicense() { - return license; - } - - public void setLicense(List license) { - this.license = license; - } - - public Field getCodeRepositoryUrl() { - return codeRepositoryUrl; - } - - public void setCodeRepositoryUrl(Field codeRepositoryUrl) { - this.codeRepositoryUrl = codeRepositoryUrl; - } - - public Qualifier getProgrammingLanguage() { - return programmingLanguage; - } - - public void setProgrammingLanguage(Qualifier programmingLanguage) { - this.programmingLanguage = programmingLanguage; - } - - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - - if (!Software.class.isAssignableFrom(e.getClass())) { - return; - } - - final Software s = (Software) e; - documentationUrl = mergeLists(documentationUrl, s.getDocumentationUrl()); - - license = mergeLists(license, s.getLicense()); - - codeRepositoryUrl = s.getCodeRepositoryUrl() != null && compareTrust(this, s) < 0 - ? s.getCodeRepositoryUrl() - : codeRepositoryUrl; - - programmingLanguage = s.getProgrammingLanguage() != null && compareTrust(this, s) < 0 - ? s.getProgrammingLanguage() - : programmingLanguage; - - mergeOAFDataInfo(e); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java deleted file mode 100644 index 1fa0de0be..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java +++ /dev/null @@ -1,60 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.Serializable; - -public class StructuredProperty implements Serializable { - - private String value; - - private Qualifier qualifier; - - private DataInfo dataInfo; - - public String getValue() { - return value; - } - - public void setValue(String value) { - this.value = value; - } - - public Qualifier getQualifier() { - return qualifier; - } - - public void setQualifier(Qualifier qualifier) { - this.qualifier = qualifier; - } - - public DataInfo getDataInfo() { - return dataInfo; - } - - public void setDataInfo(DataInfo dataInfo) { - this.dataInfo = dataInfo; - } - - public String toComparableString() { - return value != null ?
value.toLowerCase() : ""; - } - - @Override - public int hashCode() { - return toComparableString().hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - - StructuredProperty other = (StructuredProperty) obj; - - return toComparableString().equals(other.toComparableString()); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIDataset.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIDataset.java deleted file mode 100644 index 421b4ecaa..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIDataset.java +++ /dev/null @@ -1,89 +0,0 @@ - -package eu.dnetlib.dhp.schema.scholexplorer; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import org.apache.commons.lang3.StringUtils; - -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.OafEntity; - -public class DLIDataset extends Dataset { - - private String originalObjIdentifier; - - private List dlicollectedfrom; - - private String completionStatus; - - public String getCompletionStatus() { - return completionStatus; - } - - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } - - public List getDlicollectedfrom() { - return dlicollectedfrom; - } - - public void setDlicollectedfrom(List dlicollectedfrom) { - this.dlicollectedfrom = dlicollectedfrom; - } - - public String getOriginalObjIdentifier() { - return originalObjIdentifier; - } - - public void setOriginalObjIdentifier(String originalObjIdentifier) { - this.originalObjIdentifier = originalObjIdentifier; - } - - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - DLIDataset p = (DLIDataset) e; - if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus)) - completionStatus = p.completionStatus; - if ("complete".equalsIgnoreCase(p.completionStatus)) - completionStatus = "complete"; - dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom()); - } - - private List mergeProvenance( - final List a, final List b) { - Map result = new HashMap<>(); - if (a != null) - a - .forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } - - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); - if (b != null) - b - .forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } - - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); - - return new ArrayList<>(result.values()); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIPublication.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIPublication.java deleted file mode 100644 index c899a899c..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIPublication.java +++ /dev/null @@ -1,87 +0,0 @@ - -package 
eu.dnetlib.dhp.schema.scholexplorer; - -import java.io.Serializable; -import java.util.*; - -import org.apache.commons.lang3.StringUtils; - -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.dhp.schema.oaf.Publication; - -public class DLIPublication extends Publication implements Serializable { - - private String originalObjIdentifier; - - private List dlicollectedfrom; - - private String completionStatus; - - public String getCompletionStatus() { - return completionStatus; - } - - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } - - public List getDlicollectedfrom() { - return dlicollectedfrom; - } - - public void setDlicollectedfrom(List dlicollectedfrom) { - this.dlicollectedfrom = dlicollectedfrom; - } - - public String getOriginalObjIdentifier() { - return originalObjIdentifier; - } - - public void setOriginalObjIdentifier(String originalObjIdentifier) { - this.originalObjIdentifier = originalObjIdentifier; - } - - @Override - public void mergeFrom(OafEntity e) { - super.mergeFrom(e); - DLIPublication p = (DLIPublication) e; - if (StringUtils.isBlank(completionStatus) && StringUtils.isNotBlank(p.completionStatus)) - completionStatus = p.completionStatus; - if ("complete".equalsIgnoreCase(p.completionStatus)) - completionStatus = "complete"; - dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom()); - } - - private List mergeProvenance( - final List a, final List b) { - Map result = new HashMap<>(); - if (a != null) - a - .forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } - - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); - if (b != null) - b - .forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } - - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); - - return new ArrayList<>(result.values()); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIUnknown.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIUnknown.java deleted file mode 100644 index 5da599427..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/DLIUnknown.java +++ /dev/null @@ -1,132 +0,0 @@ - -package eu.dnetlib.dhp.schema.scholexplorer; - -import java.io.Serializable; -import java.util.*; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.StringUtils; - -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; - -public class DLIUnknown extends Oaf implements Serializable { - - private String id; - - private List pid; - - private String dateofcollection; - - private String dateoftransformation; - - private List dlicollectedfrom; - - private String completionStatus = "incomplete"; - - public String getCompletionStatus() { - return completionStatus; - } - - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } - - public List getDlicollectedfrom() { - return dlicollectedfrom; - } - 
- public void setDlicollectedfrom(List dlicollectedfrom) { - this.dlicollectedfrom = dlicollectedfrom; - } - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public List getPid() { - return pid; - } - - public void setPid(List pid) { - this.pid = pid; - } - - public String getDateofcollection() { - return dateofcollection; - } - - public void setDateofcollection(String dateofcollection) { - this.dateofcollection = dateofcollection; - } - - public String getDateoftransformation() { - return dateoftransformation; - } - - public void setDateoftransformation(String dateoftransformation) { - this.dateoftransformation = dateoftransformation; - } - - public void mergeFrom(DLIUnknown p) { - if ("complete".equalsIgnoreCase(p.completionStatus)) - completionStatus = "complete"; - dlicollectedfrom = mergeProvenance(dlicollectedfrom, p.getDlicollectedfrom()); - if (StringUtils.isEmpty(id) && StringUtils.isNoneEmpty(p.getId())) - id = p.getId(); - if (StringUtils.isEmpty(dateofcollection) && StringUtils.isNoneEmpty(p.getDateofcollection())) - dateofcollection = p.getDateofcollection(); - - if (StringUtils.isEmpty(dateoftransformation) && StringUtils.isNoneEmpty(p.getDateoftransformation())) - dateofcollection = p.getDateoftransformation(); - pid = mergeLists(pid, p.getPid()); - } - - protected List mergeLists(final List... lists) { - - return Arrays - .stream(lists) - .filter(Objects::nonNull) - .flatMap(List::stream) - .filter(Objects::nonNull) - .distinct() - .collect(Collectors.toList()); - } - - private List mergeProvenance( - final List a, final List b) { - Map result = new HashMap<>(); - if (a != null) - a - .forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } - - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); - if (b != null) - b - .forEach( - p -> { - if (p != null && StringUtils.isNotBlank(p.getId()) && result.containsKey(p.getId())) { - if ("incomplete".equalsIgnoreCase(result.get(p.getId()).getCompletionStatus()) - && StringUtils.isNotBlank(p.getCompletionStatus())) { - result.put(p.getId(), p); - } - - } else if (p != null && p.getId() != null && !result.containsKey(p.getId())) - result.put(p.getId(), p); - }); - - return new ArrayList<>(result.values()); - } -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala deleted file mode 100644 index 27eec77fa..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/OafUtils.scala +++ /dev/null @@ -1,90 +0,0 @@ -package eu.dnetlib.dhp.schema.scholexplorer - -import eu.dnetlib.dhp.schema.oaf.{DataInfo, Field, KeyValue, Qualifier, StructuredProperty} - -object OafUtils { - - - - def generateKeyValue(key: String, value: String): KeyValue = { - val kv: KeyValue = new KeyValue() - kv.setKey(key) - kv.setValue(value) - kv.setDataInfo(generateDataInfo("0.9")) - kv - } - - - def generateDataInfo(trust: String = "0.9", invisibile: Boolean = false): DataInfo = { - val di = new DataInfo - di.setDeletedbyinference(false) - di.setInferred(false) - di.setInvisible(false) - di.setTrust(trust) - di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions")) - di - } - 
- def createQualifier(cls: String, sch: String): Qualifier = { - createQualifier(cls, cls, sch, sch) - } - - - def createQualifier(classId: String, className: String, schemeId: String, schemeName: String): Qualifier = { - val q: Qualifier = new Qualifier - q.setClassid(classId) - q.setClassname(className) - q.setSchemeid(schemeId) - q.setSchemename(schemeName) - q - } - - - def asField[T](value: T): Field[T] = { - val tmp = new Field[T] - tmp.setValue(value) - tmp - - - } - - def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String): StructuredProperty = { - val sp = new StructuredProperty - sp.setQualifier(createQualifier(classId,className, schemeId, schemeName)) - sp.setValue(value) - sp - - } - - - - def createSP(value: String, classId: String,className:String, schemeId: String, schemeName:String, dataInfo: DataInfo): StructuredProperty = { - val sp = new StructuredProperty - sp.setQualifier(createQualifier(classId,className, schemeId, schemeName)) - sp.setValue(value) - sp.setDataInfo(dataInfo) - sp - - } - - def createSP(value: String, classId: String, schemeId: String): StructuredProperty = { - val sp = new StructuredProperty - sp.setQualifier(createQualifier(classId, schemeId)) - sp.setValue(value) - sp - - } - - - - def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = { - val sp = new StructuredProperty - sp.setQualifier(createQualifier(classId, schemeId)) - sp.setValue(value) - sp.setDataInfo(dataInfo) - sp - - } - - -} diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/ProvenaceInfo.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/ProvenaceInfo.java deleted file mode 100644 index b1188f064..000000000 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/scholexplorer/ProvenaceInfo.java +++ /dev/null @@ -1,47 +0,0 @@ - -package eu.dnetlib.dhp.schema.scholexplorer; - -import java.io.Serializable; - -public class ProvenaceInfo implements Serializable { - - private String id; - - private String name; - - private String completionStatus; - - private String collectionMode = "collected"; - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public String getCompletionStatus() { - return completionStatus; - } - - public void setCompletionStatus(String completionStatus) { - this.completionStatus = completionStatus; - } - - public String getCollectionMode() { - return collectionMode; - } - - public void setCollectionMode(String collectionMode) { - this.collectionMode = collectionMode; - } -} diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java deleted file mode 100644 index 4d31591a0..000000000 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/action/AtomicActionTest.java +++ /dev/null @@ -1,40 +0,0 @@ - -package eu.dnetlib.dhp.schema.action; - -import static org.junit.jupiter.api.Assertions.*; - -import java.io.IOException; - -import org.apache.commons.lang3.StringUtils; -import org.junit.jupiter.api.Test; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.schema.oaf.Relation; - -/** @author claudio.atzori */ -public class AtomicActionTest { - - @Test - public void serializationTest() throws IOException { - - Relation rel = new Relation(); - 
rel.setSource("1"); - rel.setTarget("2"); - rel.setRelType("resultResult"); - rel.setSubRelType("dedup"); - rel.setRelClass("merges"); - - AtomicAction aa1 = new AtomicAction(Relation.class, rel); - - final ObjectMapper mapper = new ObjectMapper(); - String json = mapper.writeValueAsString(aa1); - - assertTrue(StringUtils.isNotBlank(json)); - - AtomicAction aa2 = mapper.readValue(json, AtomicAction.class); - - assertEquals(aa1.getClazz(), aa2.getClazz()); - assertEquals(aa1.getPayload(), aa2.getPayload()); - } -} diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/common/ModelSupportTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/common/ModelSupportTest.java deleted file mode 100644 index 73e8c47ff..000000000 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/common/ModelSupportTest.java +++ /dev/null @@ -1,37 +0,0 @@ - -package eu.dnetlib.dhp.schema.common; - -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertTrue; - -import org.junit.jupiter.api.Nested; -import org.junit.jupiter.api.Test; - -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Result; - -public class ModelSupportTest { - - @Nested - class IsSubClass { - - @Test - public void shouldReturnFalseWhenSubClassDoesNotExtendSuperClass() { - // when - Boolean result = ModelSupport.isSubClass(Relation.class, OafEntity.class); - - // then - assertFalse(result); - } - - @Test - public void shouldReturnTrueWhenSubClassExtendsSuperClass() { - // when - Boolean result = ModelSupport.isSubClass(Result.class, OafEntity.class); - - // then - assertTrue(result); - } - } -} diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MeasureTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MeasureTest.java deleted file mode 100644 index 26b4407c9..000000000 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MeasureTest.java +++ /dev/null @@ -1,57 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import java.io.IOException; -import java.util.List; - -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - -import com.fasterxml.jackson.annotation.JsonInclude; -import com.fasterxml.jackson.core.type.TypeReference; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Lists; - -public class MeasureTest { - - public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() - .setSerializationInclusion(JsonInclude.Include.NON_NULL); - - @Test - public void testMeasureSerialization() throws IOException { - - Measure popularity = new Measure(); - popularity.setId("popularity"); - popularity - .setUnit( - Lists - .newArrayList( - unit("score", "0.5"))); - - Measure influence = new Measure(); - influence.setId("influence"); - influence - .setUnit( - Lists - .newArrayList( - unit("score", "0.3"))); - - List m = Lists.newArrayList(popularity, influence); - - String s = OBJECT_MAPPER.writeValueAsString(m); - System.out.println(s); - - List mm = OBJECT_MAPPER.readValue(s, new TypeReference>() { - }); - - Assertions.assertNotNull(mm); - } - - private KeyValue unit(String key, String value) { - KeyValue unit = new KeyValue(); - unit.setKey(key); - unit.setValue(value); - return unit; - } - -} diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java deleted file mode 100644 index f91646f2c..000000000 --- 
a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java +++ /dev/null @@ -1,88 +0,0 @@ - -package eu.dnetlib.dhp.schema.oaf; - -import static org.junit.jupiter.api.Assertions.*; - -import java.util.Arrays; -import java.util.List; - -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -public class MergeTest { - - OafEntity oaf; - - @BeforeEach - public void setUp() { - oaf = new Publication(); - } - - @Test - public void mergeListsTest() { - - // string list merge test - List a = Arrays.asList("a", "b", "c", "e"); - List b = Arrays.asList("a", "b", "c", "d"); - List c = null; - - System.out.println("merge result 1 = " + oaf.mergeLists(a, b)); - - System.out.println("merge result 2 = " + oaf.mergeLists(a, c)); - - System.out.println("merge result 3 = " + oaf.mergeLists(c, c)); - } - - @Test - public void mergePublicationCollectedFromTest() { - - Publication a = new Publication(); - Publication b = new Publication(); - - a.setCollectedfrom(Arrays.asList(setKV("a", "open"), setKV("b", "closed"))); - b.setCollectedfrom(Arrays.asList(setKV("A", "open"), setKV("b", "Open"))); - - a.mergeFrom(b); - - assertNotNull(a.getCollectedfrom()); - assertEquals(3, a.getCollectedfrom().size()); - } - - @Test - public void mergePublicationSubjectTest() { - - Publication a = new Publication(); - Publication b = new Publication(); - - a.setSubject(Arrays.asList(setSP("a", "open", "classe"), setSP("b", "open", "classe"))); - b.setSubject(Arrays.asList(setSP("A", "open", "classe"), setSP("c", "open", "classe"))); - - a.mergeFrom(b); - - assertNotNull(a.getSubject()); - assertEquals(3, a.getSubject().size()); - } - - private KeyValue setKV(final String key, final String value) { - - KeyValue k = new KeyValue(); - - k.setKey(key); - k.setValue(value); - - return k; - } - - private StructuredProperty setSP( - final String value, final String schema, final String classname) { - StructuredProperty s = new StructuredProperty(); - s.setValue(value); - Qualifier q = new Qualifier(); - q.setClassname(classname); - q.setClassid(classname); - q.setSchemename(schema); - q.setSchemeid(schema); - s.setQualifier(q); - return s; - } -} diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java deleted file mode 100644 index e4596fcdd..000000000 --- a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/scholexplorer/DLItest.java +++ /dev/null @@ -1,83 +0,0 @@ - -package eu.dnetlib.dhp.schema.scholexplorer; - -import java.io.IOException; -import java.util.Arrays; -import java.util.Collections; - -import org.junit.jupiter.api.Test; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.SerializationFeature; - -import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; - -public class DLItest { - - @Test - public void testMergePublication() throws JsonProcessingException { - DLIPublication a1 = new DLIPublication(); - a1.setPid(Arrays.asList(createSP("123456", "pdb", "dnet:pid_types"))); - a1.setTitle(Collections.singletonList(createSP("Un Titolo", "title", "dnetTitle"))); - a1.setDlicollectedfrom(Arrays.asList(createCollectedFrom("znd", "Zenodo", "complete"))); - a1.setCompletionStatus("complete"); - - DLIPublication a = new DLIPublication(); - a - .setPid( - Arrays - .asList( - 
createSP("10.11", "doi", "dnet:pid_types"), - createSP("123456", "pdb", "dnet:pid_types"))); - a.setTitle(Collections.singletonList(createSP("A Title", "title", "dnetTitle"))); - a - .setDlicollectedfrom( - Arrays - .asList( - createCollectedFrom("dct", "datacite", "complete"), - createCollectedFrom("dct", "datacite", "incomplete"))); - a.setCompletionStatus("incomplete"); - - a.mergeFrom(a1); - - ObjectMapper mapper = new ObjectMapper(); - System.out.println(mapper.writeValueAsString(a)); - } - - @Test - public void testDeserialization() throws IOException { - - final String json = "{\"dataInfo\":{\"invisible\":false,\"inferred\":null,\"deletedbyinference\":false,\"trust\":\"0.9\",\"inferenceprovenance\":null,\"provenanceaction\":null},\"lastupdatetimestamp\":null,\"id\":\"60|bd9352547098929a394655ad1a44a479\",\"originalId\":[\"bd9352547098929a394655ad1a44a479\"],\"collectedfrom\":[{\"key\":\"dli_________::datacite\",\"value\":\"Datasets in Datacite\",\"dataInfo\":null,\"blank\":false}],\"pid\":[{\"value\":\"10.7925/DRS1.DUCHAS_5078760\",\"qualifier\":{\"classid\":\"doi\",\"classname\":\"doi\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\",\"blank\":false},\"dataInfo\":null}],\"dateofcollection\":\"2020-01-09T08:29:31.885Z\",\"dateoftransformation\":null,\"extraInfo\":null,\"oaiprovenance\":null,\"author\":[{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Ireland. Department of Arts, Culture, and the Gaeltacht\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"University College Dublin\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"National Folklore Foundation\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Cathail, S. Ó\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null},{\"fullname\":\"Donnell, Breda Mc\",\"name\":null,\"surname\":null,\"rank\":null,\"pid\":null,\"affiliation\":null}],\"resulttype\":null,\"language\":null,\"country\":null,\"subject\":[{\"value\":\"Recreation\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Entertainments and recreational activities\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null},{\"value\":\"Siamsaíocht agus caitheamh aimsire\",\"qualifier\":{\"classid\":\"dnet:subject\",\"classname\":\"dnet:subject\",\"schemeid\":\"unknown\",\"schemename\":\"unknown\",\"blank\":false},\"dataInfo\":null}],\"title\":[{\"value\":\"Games We Play\",\"qualifier\":null,\"dataInfo\":null}],\"relevantdate\":[{\"value\":\"1938-09-28\",\"qualifier\":{\"classid\":\"date\",\"classname\":\"date\",\"schemeid\":\"dnet::date\",\"schemename\":\"dnet::date\",\"blank\":false},\"dataInfo\":null}],\"description\":[{\"value\":\"Story collected by Breda Mc Donnell, a student at Tenure school (Tinure, Co. 
Louth) (no informant identified).\",\"dataInfo\":null}],\"dateofacceptance\":null,\"publisher\":{\"value\":\"University College Dublin\",\"dataInfo\":null},\"embargoenddate\":null,\"source\":null,\"fulltext\":null,\"format\":null,\"contributor\":null,\"resourcetype\":null,\"coverage\":null,\"refereed\":null,\"context\":null,\"processingchargeamount\":null,\"processingchargecurrency\":null,\"externalReference\":null,\"instance\":[],\"storagedate\":null,\"device\":null,\"size\":null,\"version\":null,\"lastmetadataupdate\":null,\"metadataversionnumber\":null,\"geolocation\":null,\"dlicollectedfrom\":[{\"id\":\"dli_________::datacite\",\"name\":\"Datasets in Datacite\",\"completionStatus\":\"complete\",\"collectionMode\":\"resolved\"}],\"completionStatus\":\"complete\"}"; - - ObjectMapper mapper = new ObjectMapper(); - mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - DLIDataset dliDataset = mapper.readValue(json, DLIDataset.class); - mapper.enable(SerializationFeature.INDENT_OUTPUT); - System.out.println(mapper.writeValueAsString(dliDataset)); - } - - private ProvenaceInfo createCollectedFrom( - final String id, final String name, final String completionStatus) { - ProvenaceInfo p = new ProvenaceInfo(); - p.setId(id); - p.setName(name); - p.setCompletionStatus(completionStatus); - return p; - } - - private StructuredProperty createSP( - final String value, final String className, final String schemeName) { - StructuredProperty p = new StructuredProperty(); - p.setValue(value); - Qualifier schema = new Qualifier(); - schema.setClassname(className); - schema.setClassid(className); - schema.setSchemename(schemeName); - schema.setSchemeid(schemeName); - p.setQualifier(schema); - return p; - } -} diff --git a/dhp-workflows/dhp-actionmanager/pom.xml b/dhp-workflows/dhp-actionmanager/pom.xml index 0b4d25700..29e1fab1f 100644 --- a/dhp-workflows/dhp-actionmanager/pom.xml +++ b/dhp-workflows/dhp-actionmanager/pom.xml @@ -4,7 +4,7 @@ eu.dnetlib.dhp dhp-workflows - 1.2.4-SNAPSHOT + 1.2.5-SNAPSHOT dhp-actionmanager @@ -51,17 +51,6 @@ hadoop-distcp - - eu.dnetlib - dnet-openaire-data-protos - - - - eu.dnetlib.dhp - dhp-schemas - ${project.version} - - eu.dnetlib dnet-actionmanager-api diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java index 0f0d21e11..088e618c7 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/ISClient.java @@ -3,36 +3,37 @@ package eu.dnetlib.dhp.actionmanager; import java.io.Serializable; import java.io.StringReader; -import java.util.ArrayList; import java.util.List; import java.util.NoSuchElementException; +import java.util.Optional; +import java.util.Set; import java.util.stream.Collectors; +import org.apache.commons.lang3.tuple.Triple; import org.dom4j.Document; -import org.dom4j.Element; +import org.dom4j.DocumentException; import org.dom4j.io.SAXReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.xml.sax.SAXException; +import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; +import com.google.common.collect.Sets; import eu.dnetlib.actionmanager.rmi.ActionManagerException; -import eu.dnetlib.actionmanager.set.ActionManagerSet; -import 
eu.dnetlib.actionmanager.set.ActionManagerSet.ImpactTypes; -import eu.dnetlib.dhp.actionmanager.partition.PartitionActionSetsByPayloadTypeJob; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; public class ISClient implements Serializable { - private static final Logger log = LoggerFactory.getLogger(PartitionActionSetsByPayloadTypeJob.class); + private static final Logger log = LoggerFactory.getLogger(ISClient.class); private static final String INPUT_ACTION_SET_ID_SEPARATOR = ","; - private final ISLookUpService isLookup; + private final transient ISLookUpService isLookup; public ISClient(String isLookupUrl) { isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl); @@ -40,80 +41,54 @@ public class ISClient implements Serializable { public List<String> getLatestRawsetPaths(String setIds) { - List<String> ids = Lists - .newArrayList( + final Set<String> ids = Sets + .newHashSet( Splitter .on(INPUT_ACTION_SET_ID_SEPARATOR) .omitEmptyStrings() .trimResults() .split(setIds)); - - return ids - .stream() - .map(id -> getSet(isLookup, id)) - .map(as -> as.getPathToLatest()) - .collect(Collectors.toCollection(ArrayList::new)); - } - - private ActionManagerSet getSet(ISLookUpService isLookup, final String setId) { - - final String q = "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') " - + "where $x//SET/@id = '" - + setId - + "' return $x"; - try { final String basePath = getBasePathHDFS(isLookup); - final String setProfile = isLookup.getResourceProfileByQuery(q); - return getActionManagerSet(basePath, setProfile); - } catch (ISLookUpException | ActionManagerException e) { - throw new RuntimeException("Error accessing Sets, using query: " + q); + + // <SET id="..." directory="..." latest="..."/> + final String xquery = "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') " + + "return <SET id='{$x//SET/@id}' directory='{$x//SET/@directory}' latest='{$x//RAW_SETS/LATEST/@id}'/>"; + return Optional + .ofNullable(isLookup.quickSearchProfile(xquery)) + .map( + sets -> sets + .stream() + .map(ISClient::parseSetInfo) + .filter(t -> ids.contains(t.getLeft())) + .map(t -> buildDirectory(basePath, t)) + .collect(Collectors.toList())) + .orElseThrow(() -> new IllegalStateException("empty set list")); + } catch (ActionManagerException | ISLookUpException e) { + throw new IllegalStateException("unable to query ActionSets info from the IS"); } } - private ActionManagerSet getActionManagerSet(final String basePath, final String profile) - throws ActionManagerException { - final SAXReader reader = new SAXReader(); - final ActionManagerSet set = new ActionManagerSet(); - + private static Triple<String, String, String> parseSetInfo(String set) { try { - final Document doc = reader.read(new StringReader(profile)); - - set.setId(doc.valueOf("//SET/@id").trim()); - set.setName(doc.valueOf("//SET").trim()); - set.setImpact(ImpactTypes.valueOf(doc.valueOf("//IMPACT").trim())); - set - .setLatest( - doc.valueOf("//RAW_SETS/LATEST/@id"), - doc.valueOf("//RAW_SETS/LATEST/@creationDate"), - doc.valueOf("//RAW_SETS/LATEST/@lastUpdate")); - set.setDirectory(doc.valueOf("//SET/@directory")); - final List expiredNodes = doc.selectNodes("//RAW_SETS/EXPIRED"); - if (expiredNodes != null) { - for (int i = 0; i < expiredNodes.size(); i++) { - Element ex = (Element) expiredNodes.get(i); - set - .addExpired( - ex.attributeValue("id"), - ex.attributeValue("creationDate"), - ex.attributeValue("lastUpdate")); - } - } - - final StringBuilder sb = new StringBuilder(); - sb.append(basePath); - sb.append("/");
- sb.append(doc.valueOf("//SET/@directory")); - sb.append("/"); - sb.append(doc.valueOf("//RAW_SETS/LATEST/@id")); - set.setPathToLatest(sb.toString()); - - return set; - } catch (Exception e) { - throw new ActionManagerException("Error creating set from profile: " + profile, e); + final SAXReader reader = new SAXReader(); + reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true); + Document doc = reader.read(new StringReader(set)); + return Triple + .of( + doc.valueOf("//SET/@id"), + doc.valueOf("//SET/@directory"), + doc.valueOf("//SET/@latest")); + } catch (DocumentException | SAXException e) { + throw new IllegalStateException(e); } } + private String buildDirectory(String basePath, Triple<String, String, String> t) { + return Joiner.on("/").join(basePath, t.getMiddle(), t.getRight()); + } + private String getBasePathHDFS(ISLookUpService isLookup) throws ActionManagerException { return queryServiceProperty(isLookup, "basePath"); } @@ -123,7 +98,7 @@ public class ISClient implements Serializable { final String q = "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@key='" + propertyName + "']/@value/string()"; - log.debug("quering for service property: " + q); + log.debug("querying for service property: {}", q); try { final List<String> value = isLookup.quickSearchProfile(q); return Iterables.getOnlyElement(value); diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/LicenseComparator.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/LicenseComparator.java deleted file mode 100644 index 7b6046f8b..000000000 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/LicenseComparator.java +++ /dev/null @@ -1,69 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.migration; - -import java.util.Comparator; - -import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; - -public class LicenseComparator implements Comparator<Qualifier> { - - @Override - public int compare(Qualifier left, Qualifier right) { - - if (left == null && right == null) - return 0; - if (left == null) - return 1; - if (right == null) - return -1; - - String lClass = left.getClassid(); - String rClass = right.getClassid(); - - if (lClass.equals(rClass)) - return 0; - - if (lClass.equals("OPEN SOURCE")) - return -1; - if (rClass.equals("OPEN SOURCE")) - return 1; - - if (lClass.equals("OPEN")) - return -1; - if (rClass.equals("OPEN")) - return 1; - - if (lClass.equals("6MONTHS")) - return -1; - if (rClass.equals("6MONTHS")) - return 1; - - if (lClass.equals("12MONTHS")) - return -1; - if (rClass.equals("12MONTHS")) - return 1; - - if (lClass.equals("EMBARGO")) - return -1; - if (rClass.equals("EMBARGO")) - return 1; - - if (lClass.equals("RESTRICTED")) - return -1; - if (rClass.equals("RESTRICTED")) - return 1; - - if (lClass.equals("CLOSED")) - return -1; - if (rClass.equals("CLOSED")) - return 1; - - if (lClass.equals("UNKNOWN")) - return -1; - if (rClass.equals("UNKNOWN")) - return 1; - - // Else (but unlikely), lexicographical ordering will do. 
- return lClass.compareTo(rClass); - } -} diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/MigrateActionSet.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/MigrateActionSet.java deleted file mode 100644 index 77be7652e..000000000 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/MigrateActionSet.java +++ /dev/null @@ -1,196 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.migration; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.OutputStream; -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; -import java.util.Properties; -import java.util.stream.Collectors; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.tools.DistCp; -import org.apache.hadoop.tools.DistCpOptions; -import org.apache.hadoop.util.ToolRunner; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.google.common.base.Splitter; -import com.google.common.collect.Lists; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; - -public class MigrateActionSet { - - private static final Logger log = LoggerFactory.getLogger(MigrateActionSet.class); - - private static final String SEPARATOR = "/"; - private static final String TARGET_PATHS = "target_paths"; - private static final String RAWSET_PREFIX = "rawset_"; - - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - MigrateActionSet.class - .getResourceAsStream( - "/eu/dnetlib/dhp/actionmanager/migration/migrate_actionsets_parameters.json"))); - parser.parseArgument(args); - - new MigrateActionSet().run(parser); - } - - private void run(ArgumentApplicationParser parser) throws Exception { - - final String isLookupUrl = parser.get("isLookupUrl"); - final String sourceNN = parser.get("sourceNameNode"); - final String targetNN = parser.get("targetNameNode"); - final String workDir = parser.get("workingDirectory"); - final Integer distcp_num_maps = Integer.parseInt(parser.get("distcp_num_maps")); - - final String distcp_memory_mb = parser.get("distcp_memory_mb"); - final String distcp_task_timeout = parser.get("distcp_task_timeout"); - - final String transform_only_s = parser.get("transform_only"); - - log.info("transform only param: {}", transform_only_s); - - final Boolean transformOnly = Boolean.valueOf(parser.get("transform_only")); - - log.info("transform only: {}", transformOnly); - - ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); - - Configuration conf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps); - FileSystem targetFS = FileSystem.get(conf); - - Configuration sourceConf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps); - sourceConf.set(FileSystem.FS_DEFAULT_NAME_KEY, sourceNN); - FileSystem sourceFS = FileSystem.get(sourceConf); - - Properties props = new Properties(); - - List targetPaths = new ArrayList<>(); - - final List sourcePaths = getSourcePaths(sourceNN, isLookUp); - log - .info( - "paths to process:\n{}", 
sourcePaths - .stream() - .map(p -> p.toString()) - .collect(Collectors.joining("\n"))); - - for (Path source : sourcePaths) { - - if (!sourceFS.exists(source)) { - log.warn("skipping unexisting path: {}", source); - } else { - - LinkedList pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath())); - - final String rawSet = pathQ.pollLast(); - log.info("got RAWSET: {}", rawSet); - - if (StringUtils.isNotBlank(rawSet) && rawSet.startsWith(RAWSET_PREFIX)) { - - final String actionSetDirectory = pathQ.pollLast(); - - final Path targetPath = new Path( - targetNN + workDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawSet); - - log.info("using TARGET PATH: {}", targetPath); - - if (!transformOnly) { - if (targetFS.exists(targetPath)) { - targetFS.delete(targetPath, true); - } - runDistcp( - distcp_num_maps, distcp_memory_mb, distcp_task_timeout, conf, source, targetPath); - } - - targetPaths.add(targetPath); - } - } - } - - final String targetPathsCsv = targetPaths.stream().map(p -> p.toString()).collect(Collectors.joining(",")); - props.setProperty(TARGET_PATHS, targetPathsCsv); - File file = new File(System.getProperty("oozie.action.output.properties")); - - try (OutputStream os = new FileOutputStream(file)) { - props.store(os, ""); - } - System.out.println(file.getAbsolutePath()); - } - - private void runDistcp( - Integer distcp_num_maps, - String distcp_memory_mb, - String distcp_task_timeout, - Configuration conf, - Path source, - Path targetPath) - throws Exception { - - final DistCpOptions op = new DistCpOptions(source, targetPath); - op.setMaxMaps(distcp_num_maps); - op.preserve(DistCpOptions.FileAttribute.BLOCKSIZE); - op.preserve(DistCpOptions.FileAttribute.REPLICATION); - op.preserve(DistCpOptions.FileAttribute.CHECKSUMTYPE); - - int res = ToolRunner - .run( - new DistCp(conf, op), - new String[] { - "-Dmapred.task.timeout=" + distcp_task_timeout, - "-Dmapreduce.map.memory.mb=" + distcp_memory_mb, - "-pb", - "-m " + distcp_num_maps, - source.toString(), - targetPath.toString() - }); - - if (res != 0) { - throw new RuntimeException(String.format("distcp exited with code %s", res)); - } - } - - private Configuration getConfiguration( - String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) { - final Configuration conf = new Configuration(); - conf.set("dfs.webhdfs.socket.connect-timeout", distcp_task_timeout); - conf.set("dfs.webhdfs.socket.read-timeout", distcp_task_timeout); - conf.set("dfs.http.client.retry.policy.enabled", "true"); - conf.set("mapred.task.timeout", distcp_task_timeout); - conf.set("mapreduce.map.memory.mb", distcp_memory_mb); - conf.set("mapred.map.tasks", String.valueOf(distcp_num_maps)); - return conf; - } - - private List getSourcePaths(String sourceNN, ISLookUpService isLookUp) - throws ISLookUpException { - String XQUERY = "distinct-values(\n" - + "let $basePath := collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n" - + "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n" - + "let $setDir := $x//SET/@directory/string()\n" - + "let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n" - + "return concat($basePath, '/', $setDir, '/', $rawSet))"; - - log.info(String.format("running xquery:\n%s", XQUERY)); - return isLookUp - .quickSearchProfile(XQUERY) - .stream() - .map(p -> sourceNN + p) - .map(Path::new) - .collect(Collectors.toList()); - } -} diff --git 
a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java deleted file mode 100644 index 8ea877aec..000000000 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/ProtoConverter.java +++ /dev/null @@ -1,710 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.migration; - -import static eu.dnetlib.dhp.schema.common.ModelConstants.*; - -import java.io.Serializable; -import java.util.ArrayList; -import java.util.List; -import java.util.Optional; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.StringUtils; - -import com.google.common.collect.Lists; -import com.googlecode.protobuf.format.JsonFormat; - -import eu.dnetlib.data.proto.*; -import eu.dnetlib.dhp.schema.oaf.*; - -public class ProtoConverter implements Serializable { - - public static Oaf convert(OafProtos.Oaf oaf) { - try { - switch (oaf.getKind()) { - case entity: - return convertEntity(oaf); - case relation: - return convertRelation(oaf); - default: - throw new IllegalArgumentException("invalid kind " + oaf.getKind()); - } - } catch (Throwable e) { - throw new RuntimeException("error on getting " + JsonFormat.printToString(oaf), e); - } - } - - private static Relation convertRelation(OafProtos.Oaf oaf) { - final OafProtos.OafRel r = oaf.getRel(); - final Relation rel = new Relation(); - rel.setDataInfo(mapDataInfo(oaf.getDataInfo())); - rel.setLastupdatetimestamp(oaf.getLastupdatetimestamp()); - rel.setSource(r.getSource()); - rel.setTarget(r.getTarget()); - rel.setRelType(r.getRelType().toString()); - rel.setSubRelType(r.getSubRelType().toString()); - rel.setRelClass(r.getRelClass()); - rel - .setCollectedfrom( - r.getCollectedfromCount() > 0 - ? r.getCollectedfromList().stream().map(kv -> mapKV(kv)).collect(Collectors.toList()) - : null); - return rel; - } - - private static OafEntity convertEntity(OafProtos.Oaf oaf) { - - switch (oaf.getEntity().getType()) { - case result: - final Result r = convertResult(oaf); - r.setInstance(convertInstances(oaf)); - r.setExternalReference(convertExternalRefs(oaf)); - return r; - case project: - return convertProject(oaf); - case datasource: - return convertDataSource(oaf); - case organization: - return convertOrganization(oaf); - default: - throw new RuntimeException("received unknown type"); - } - } - - private static List convertInstances(OafProtos.Oaf oaf) { - - final ResultProtos.Result r = oaf.getEntity().getResult(); - if (r.getInstanceCount() > 0) { - return r.getInstanceList().stream().map(i -> convertInstance(i)).collect(Collectors.toList()); - } - return Lists.newArrayList(); - } - - private static Instance convertInstance(ResultProtos.Result.Instance ri) { - final Instance i = new Instance(); - i.setAccessright(mapQualifier(ri.getAccessright())); - i.setCollectedfrom(mapKV(ri.getCollectedfrom())); - i.setDateofacceptance(mapStringField(ri.getDateofacceptance())); - i.setDistributionlocation(ri.getDistributionlocation()); - i.setHostedby(mapKV(ri.getHostedby())); - i.setInstancetype(mapQualifier(ri.getInstancetype())); - i.setLicense(mapStringField(ri.getLicense())); - i - .setUrl( - ri.getUrlList() != null ? 
ri - .getUrlList() - .stream() - .distinct() - .collect(Collectors.toCollection(ArrayList::new)) : null); - i.setRefereed(mapRefereed(ri.getRefereed())); - i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount())); - i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency())); - return i; - } - - private static Qualifier mapRefereed(FieldTypeProtos.StringField refereed) { - Qualifier q = new Qualifier(); - q.setClassid(refereed.getValue()); - q.setSchemename(refereed.getValue()); - q.setSchemeid("dnet:review_levels"); - q.setSchemename("dnet:review_levels"); - return q; - } - - private static List convertExternalRefs(OafProtos.Oaf oaf) { - ResultProtos.Result r = oaf.getEntity().getResult(); - if (r.getExternalReferenceCount() > 0) { - return r - .getExternalReferenceList() - .stream() - .map(e -> convertExtRef(e)) - .collect(Collectors.toList()); - } - return Lists.newArrayList(); - } - - private static ExternalReference convertExtRef(ResultProtos.Result.ExternalReference e) { - ExternalReference ex = new ExternalReference(); - ex.setUrl(e.getUrl()); - ex.setSitename(e.getSitename()); - ex.setRefidentifier(e.getRefidentifier()); - ex.setQuery(e.getQuery()); - ex.setQualifier(mapQualifier(e.getQualifier())); - ex.setLabel(e.getLabel()); - ex.setDescription(e.getDescription()); - ex.setDataInfo(ex.getDataInfo()); - return ex; - } - - private static Organization convertOrganization(OafProtos.Oaf oaf) { - final OrganizationProtos.Organization.Metadata m = oaf.getEntity().getOrganization().getMetadata(); - final Organization org = setOaf(new Organization(), oaf); - setEntity(org, oaf); - org.setLegalshortname(mapStringField(m.getLegalshortname())); - org.setLegalname(mapStringField(m.getLegalname())); - org - .setAlternativeNames( - m - .getAlternativeNamesList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - org.setWebsiteurl(mapStringField(m.getWebsiteurl())); - org.setLogourl(mapStringField(m.getLogourl())); - org.setEclegalbody(mapStringField(m.getEclegalbody())); - org.setEclegalperson(mapStringField(m.getEclegalperson())); - org.setEcnonprofit(mapStringField(m.getEcnonprofit())); - org.setEcresearchorganization(mapStringField(m.getEcresearchorganization())); - org.setEchighereducation(mapStringField(m.getEchighereducation())); - org - .setEcinternationalorganizationeurinterests( - mapStringField(m.getEcinternationalorganizationeurinterests())); - org.setEcinternationalorganization(mapStringField(m.getEcinternationalorganization())); - org.setEcenterprise(mapStringField(m.getEcenterprise())); - org.setEcsmevalidated(mapStringField(m.getEcsmevalidated())); - org.setEcnutscode(mapStringField(m.getEcnutscode())); - org.setCountry(mapQualifier(m.getCountry())); - - return org; - } - - private static Datasource convertDataSource(OafProtos.Oaf oaf) { - final DatasourceProtos.Datasource.Metadata m = oaf.getEntity().getDatasource().getMetadata(); - final Datasource datasource = setOaf(new Datasource(), oaf); - setEntity(datasource, oaf); - datasource - .setAccessinfopackage( - m - .getAccessinfopackageList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - datasource.setCertificates(mapStringField(m.getCertificates())); - datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl())); - datasource.setContactemail(mapStringField(m.getContactemail())); - datasource.setDatabaseaccessrestriction(mapStringField(m.getDatabaseaccessrestriction())); - 
datasource.setDatabaseaccesstype(mapStringField(m.getDatabaseaccesstype())); - datasource.setDataprovider(mapBoolField(m.getDataprovider())); - datasource.setDatasourcetype(mapQualifier(m.getDatasourcetype())); - datasource.setDatauploadrestriction(mapStringField(m.getDatauploadrestriction())); - datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl())); - datasource.setDatauploadtype(mapStringField(m.getDatauploadtype())); - datasource.setDateofvalidation(mapStringField(m.getDateofvalidation())); - datasource.setDescription(mapStringField(m.getDescription())); - datasource.setEnglishname(mapStringField(m.getEnglishname())); - datasource.setLatitude(mapStringField(m.getLatitude())); - datasource.setLongitude(mapStringField(m.getLongitude())); - datasource.setLogourl(mapStringField(m.getLogourl())); - datasource.setMissionstatementurl(mapStringField(m.getMissionstatementurl())); - datasource.setNamespaceprefix(mapStringField(m.getNamespaceprefix())); - datasource - .setOdcontenttypes( - m - .getOdcontenttypesList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - datasource - .setOdlanguages( - m - .getOdlanguagesList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - datasource.setOdnumberofitems(mapStringField(m.getOdnumberofitems())); - datasource.setOdnumberofitemsdate(mapStringField(m.getOdnumberofitemsdate())); - datasource.setOdpolicies(mapStringField(m.getOdpolicies())); - datasource.setOfficialname(mapStringField(m.getOfficialname())); - datasource.setOpenairecompatibility(mapQualifier(m.getOpenairecompatibility())); - datasource.setPidsystems(mapStringField(m.getPidsystems())); - datasource - .setPolicies( - m.getPoliciesList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList())); - datasource.setQualitymanagementkind(mapStringField(m.getQualitymanagementkind())); - datasource.setReleaseenddate(mapStringField(m.getReleaseenddate())); - datasource.setServiceprovider(mapBoolField(m.getServiceprovider())); - datasource.setReleasestartdate(mapStringField(m.getReleasestartdate())); - datasource - .setSubjects( - m - .getSubjectsList() - .stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - datasource.setVersioning(mapBoolField(m.getVersioning())); - datasource.setWebsiteurl(mapStringField(m.getWebsiteurl())); - datasource.setJournal(mapJournal(m.getJournal())); - - return datasource; - } - - private static Project convertProject(OafProtos.Oaf oaf) { - final ProjectProtos.Project.Metadata m = oaf.getEntity().getProject().getMetadata(); - final Project project = setOaf(new Project(), oaf); - setEntity(project, oaf); - project.setAcronym(mapStringField(m.getAcronym())); - project.setCallidentifier(mapStringField(m.getCallidentifier())); - project.setCode(mapStringField(m.getCode())); - project.setContactemail(mapStringField(m.getContactemail())); - project.setContactfax(mapStringField(m.getContactfax())); - project.setContactfullname(mapStringField(m.getContactfullname())); - project.setContactphone(mapStringField(m.getContactphone())); - project.setContracttype(mapQualifier(m.getContracttype())); - project.setCurrency(mapStringField(m.getCurrency())); - project.setDuration(mapStringField(m.getDuration())); - project.setEcarticle29_3(mapStringField(m.getEcarticle293())); - project.setEcsc39(mapStringField(m.getEcsc39())); - project.setOamandatepublications(mapStringField(m.getOamandatepublications())); - 
project.setStartdate(mapStringField(m.getStartdate())); - project.setEnddate(mapStringField(m.getEnddate())); - project.setFundedamount(m.getFundedamount()); - project.setTotalcost(m.getTotalcost()); - project.setKeywords(mapStringField(m.getKeywords())); - project - .setSubjects( - m - .getSubjectsList() - .stream() - .map(sp -> mapStructuredProperty(sp)) - .collect(Collectors.toList())); - project.setTitle(mapStringField(m.getTitle())); - project.setWebsiteurl(mapStringField(m.getWebsiteurl())); - project - .setFundingtree( - m.getFundingtreeList().stream().map(f -> mapStringField(f)).collect(Collectors.toList())); - project.setJsonextrainfo(mapStringField(m.getJsonextrainfo())); - project.setSummary(mapStringField(m.getSummary())); - project.setOptional1(mapStringField(m.getOptional1())); - project.setOptional2(mapStringField(m.getOptional2())); - return project; - } - - private static Result convertResult(OafProtos.Oaf oaf) { - switch (oaf.getEntity().getResult().getMetadata().getResulttype().getClassid()) { - case "dataset": - return createDataset(oaf); - case "publication": - return createPublication(oaf); - case "software": - return createSoftware(oaf); - case "other": - return createORP(oaf); - default: - Result result = setOaf(new Result(), oaf); - setEntity(result, oaf); - return setResult(result, oaf); - } - } - - private static Software createSoftware(OafProtos.Oaf oaf) { - ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); - Software software = setOaf(new Software(), oaf); - setEntity(software, oaf); - setResult(software, oaf); - - software - .setDocumentationUrl( - m - .getDocumentationUrlList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - software - .setLicense( - m - .getLicenseList() - .stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - software.setCodeRepositoryUrl(mapStringField(m.getCodeRepositoryUrl())); - software.setProgrammingLanguage(mapQualifier(m.getProgrammingLanguage())); - return software; - } - - private static OtherResearchProduct createORP(OafProtos.Oaf oaf) { - ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); - OtherResearchProduct otherResearchProducts = setOaf(new OtherResearchProduct(), oaf); - setEntity(otherResearchProducts, oaf); - setResult(otherResearchProducts, oaf); - otherResearchProducts - .setContactperson( - m - .getContactpersonList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - otherResearchProducts - .setContactgroup( - m - .getContactgroupList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - otherResearchProducts - .setTool( - m.getToolList().stream().map(ProtoConverter::mapStringField).collect(Collectors.toList())); - - return otherResearchProducts; - } - - private static Publication createPublication(OafProtos.Oaf oaf) { - - ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); - Publication publication = setOaf(new Publication(), oaf); - setEntity(publication, oaf); - setResult(publication, oaf); - publication.setJournal(mapJournal(m.getJournal())); - return publication; - } - - private static Dataset createDataset(OafProtos.Oaf oaf) { - - ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); - Dataset dataset = setOaf(new Dataset(), oaf); - setEntity(dataset, oaf); - setResult(dataset, oaf); - dataset.setStoragedate(mapStringField(m.getStoragedate())); - 
dataset.setDevice(mapStringField(m.getDevice())); - dataset.setSize(mapStringField(m.getSize())); - dataset.setVersion(mapStringField(m.getVersion())); - dataset.setLastmetadataupdate(mapStringField(m.getLastmetadataupdate())); - dataset.setMetadataversionnumber(mapStringField(m.getMetadataversionnumber())); - dataset - .setGeolocation( - m - .getGeolocationList() - .stream() - .map(ProtoConverter::mapGeolocation) - .collect(Collectors.toList())); - return dataset; - } - - public static <T extends Oaf> T setOaf(T oaf, OafProtos.Oaf o) { - oaf.setDataInfo(mapDataInfo(o.getDataInfo())); - oaf.setLastupdatetimestamp(o.getLastupdatetimestamp()); - return oaf; - } - - public static <T extends OafEntity> T setEntity(T entity, OafProtos.Oaf oaf) { - // setting Entity fields - final OafProtos.OafEntity e = oaf.getEntity(); - entity.setId(e.getId()); - entity.setOriginalId(e.getOriginalIdList()); - entity - .setCollectedfrom( - e.getCollectedfromList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList())); - entity - .setPid( - e - .getPidList() - .stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - entity.setDateofcollection(e.getDateofcollection()); - entity.setDateoftransformation(e.getDateoftransformation()); - entity - .setExtraInfo( - e - .getExtraInfoList() - .stream() - .map(ProtoConverter::mapExtraInfo) - .collect(Collectors.toList())); - return entity; - } - - public static <T extends Result> T setResult(T entity, OafProtos.Oaf oaf) { - // setting Result fields - final ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata(); - entity - .setAuthor( - m.getAuthorList().stream().map(ProtoConverter::mapAuthor).collect(Collectors.toList())); - entity.setResulttype(mapQualifier(m.getResulttype())); - entity.setLanguage(mapQualifier(m.getLanguage())); - entity - .setCountry( - m - .getCountryList() - .stream() - .map(ProtoConverter::mapQualifierAsCountry) - .collect(Collectors.toList())); - entity - .setSubject( - m - .getSubjectList() - .stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - entity - .setTitle( - m - .getTitleList() - .stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - entity - .setRelevantdate( - m - .getRelevantdateList() - .stream() - .map(ProtoConverter::mapStructuredProperty) - .collect(Collectors.toList())); - entity - .setDescription( - m - .getDescriptionList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setDateofacceptance(mapStringField(m.getDateofacceptance())); - entity.setPublisher(mapStringField(m.getPublisher())); - entity.setEmbargoenddate(mapStringField(m.getEmbargoenddate())); - entity - .setSource( - m - .getSourceList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity - .setFulltext( - m - .getFulltextList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity - .setFormat( - m - .getFormatList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity - .setContributor( - m - .getContributorList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity.setResourcetype(mapQualifier(m.getResourcetype())); - entity - .setCoverage( - m - .getCoverageList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - entity - .setContext( - m.getContextList().stream().map(ProtoConverter::mapContext).collect(Collectors.toList())); - - 
entity.setBestaccessright(getBestAccessRights(oaf.getEntity().getResult().getInstanceList())); - - return entity; - } - - private static Qualifier getBestAccessRights(List instanceList) { - if (instanceList != null) { - final Optional min = instanceList - .stream() - .map(i -> i.getAccessright()) - .min(new LicenseComparator()); - - final Qualifier rights = min.isPresent() ? mapQualifier(min.get()) : new Qualifier(); - - if (StringUtils.isBlank(rights.getClassid())) { - rights.setClassid(UNKNOWN); - } - if (StringUtils.isBlank(rights.getClassname()) - || UNKNOWN.equalsIgnoreCase(rights.getClassname())) { - rights.setClassname(NOT_AVAILABLE); - } - if (StringUtils.isBlank(rights.getSchemeid())) { - rights.setSchemeid(DNET_ACCESS_MODES); - } - if (StringUtils.isBlank(rights.getSchemename())) { - rights.setSchemename(DNET_ACCESS_MODES); - } - - return rights; - } - return null; - } - - private static Context mapContext(ResultProtos.Result.Context context) { - if (context == null || StringUtils.isBlank(context.getId())) { - return null; - } - final Context entity = new Context(); - entity.setId(context.getId()); - entity - .setDataInfo( - context - .getDataInfoList() - .stream() - .map(ProtoConverter::mapDataInfo) - .collect(Collectors.toList())); - return entity; - } - - public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) { - if (kv == null || StringUtils.isBlank(kv.getKey()) & StringUtils.isBlank(kv.getValue())) { - return null; - } - - final KeyValue keyValue = new KeyValue(); - keyValue.setKey(kv.getKey()); - keyValue.setValue(kv.getValue()); - keyValue.setDataInfo(mapDataInfo(kv.getDataInfo())); - return keyValue; - } - - public static DataInfo mapDataInfo(FieldTypeProtos.DataInfo d) { - final DataInfo dataInfo = new DataInfo(); - dataInfo.setDeletedbyinference(d.getDeletedbyinference()); - dataInfo.setInferenceprovenance(d.getInferenceprovenance()); - dataInfo.setInferred(d.getInferred()); - dataInfo.setInvisible(d.getInvisible()); - dataInfo.setProvenanceaction(mapQualifier(d.getProvenanceaction())); - dataInfo.setTrust(d.getTrust()); - return dataInfo; - } - - public static Qualifier mapQualifier(FieldTypeProtos.Qualifier q) { - final Qualifier qualifier = new Qualifier(); - qualifier.setClassid(q.getClassid()); - qualifier.setClassname(q.getClassname()); - qualifier.setSchemeid(q.getSchemeid()); - qualifier.setSchemename(q.getSchemename()); - return qualifier; - } - - public static Country mapQualifierAsCountry(FieldTypeProtos.Qualifier q) { - final Country c = new Country(); - c.setClassid(q.getClassid()); - c.setClassname(q.getClassname()); - c.setSchemeid(q.getSchemeid()); - c.setSchemename(q.getSchemename()); - c.setDataInfo(mapDataInfo(q.getDataInfo())); - return c; - } - - public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) { - if (sp == null | StringUtils.isBlank(sp.getValue())) { - return null; - } - - final StructuredProperty structuredProperty = new StructuredProperty(); - structuredProperty.setValue(sp.getValue()); - structuredProperty.setQualifier(mapQualifier(sp.getQualifier())); - structuredProperty.setDataInfo(mapDataInfo(sp.getDataInfo())); - return structuredProperty; - } - - public static ExtraInfo mapExtraInfo(FieldTypeProtos.ExtraInfo extraInfo) { - final ExtraInfo entity = new ExtraInfo(); - entity.setName(extraInfo.getName()); - entity.setTypology(extraInfo.getTypology()); - entity.setProvenance(extraInfo.getProvenance()); - entity.setTrust(extraInfo.getTrust()); - entity.setValue(extraInfo.getValue()); - return 
entity; - } - - public static OAIProvenance mapOAIProvenance(FieldTypeProtos.OAIProvenance oaiProvenance) { - final OAIProvenance entity = new OAIProvenance(); - entity.setOriginDescription(mapOriginalDescription(oaiProvenance.getOriginDescription())); - return entity; - } - - public static OriginDescription mapOriginalDescription( - FieldTypeProtos.OAIProvenance.OriginDescription originDescription) { - final OriginDescription originDescriptionResult = new OriginDescription(); - originDescriptionResult.setHarvestDate(originDescription.getHarvestDate()); - originDescriptionResult.setAltered(originDescription.getAltered()); - originDescriptionResult.setBaseURL(originDescription.getBaseURL()); - originDescriptionResult.setIdentifier(originDescription.getIdentifier()); - originDescriptionResult.setDatestamp(originDescription.getDatestamp()); - originDescriptionResult.setMetadataNamespace(originDescription.getMetadataNamespace()); - return originDescriptionResult; - } - - public static Field mapStringField(FieldTypeProtos.StringField s) { - if (s == null || StringUtils.isBlank(s.getValue())) { - return null; - } - - final Field stringField = new Field<>(); - stringField.setValue(s.getValue()); - stringField.setDataInfo(mapDataInfo(s.getDataInfo())); - return stringField; - } - - public static Field mapBoolField(FieldTypeProtos.BoolField b) { - if (b == null) { - return null; - } - - final Field booleanField = new Field<>(); - booleanField.setValue(b.getValue()); - booleanField.setDataInfo(mapDataInfo(b.getDataInfo())); - return booleanField; - } - - public static Journal mapJournal(FieldTypeProtos.Journal j) { - final Journal journal = new Journal(); - journal.setConferencedate(j.getConferencedate()); - journal.setConferenceplace(j.getConferenceplace()); - journal.setEdition(j.getEdition()); - journal.setEp(j.getEp()); - journal.setIss(j.getIss()); - journal.setIssnLinking(j.getIssnLinking()); - journal.setIssnOnline(j.getIssnOnline()); - journal.setIssnPrinted(j.getIssnPrinted()); - journal.setName(j.getName()); - journal.setSp(j.getSp()); - journal.setVol(j.getVol()); - journal.setDataInfo(mapDataInfo(j.getDataInfo())); - return journal; - } - - public static Author mapAuthor(FieldTypeProtos.Author author) { - final Author entity = new Author(); - entity.setFullname(author.getFullname()); - entity.setName(author.getName()); - entity.setSurname(author.getSurname()); - entity.setRank(author.getRank()); - entity - .setPid( - author - .getPidList() - .stream() - .map( - kv -> { - final StructuredProperty sp = new StructuredProperty(); - sp.setValue(kv.getValue()); - final Qualifier q = new Qualifier(); - q.setClassid(kv.getKey()); - q.setClassname(kv.getKey()); - sp.setQualifier(q); - return sp; - }) - .collect(Collectors.toList())); - entity - .setAffiliation( - author - .getAffiliationList() - .stream() - .map(ProtoConverter::mapStringField) - .collect(Collectors.toList())); - return entity; - } - - public static GeoLocation mapGeolocation(ResultProtos.Result.GeoLocation geoLocation) { - final GeoLocation entity = new GeoLocation(); - entity.setPoint(geoLocation.getPoint()); - entity.setBox(geoLocation.getBox()); - entity.setPlace(geoLocation.getPlace()); - return entity; - } -} diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/TransformActions.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/TransformActions.java deleted file mode 100644 index 490668606..000000000 --- 
a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/migration/TransformActions.java +++ /dev/null @@ -1,172 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.migration; - -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; - -import java.io.IOException; -import java.io.Serializable; -import java.util.LinkedList; -import java.util.Objects; -import java.util.Optional; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SparkSession; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.base.Splitter; -import com.google.common.collect.Lists; -import com.google.protobuf.InvalidProtocolBufferException; - -import eu.dnetlib.data.proto.OafProtos; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.action.AtomicAction; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.utils.ISLookupClientFactory; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; -import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import scala.Tuple2; - -public class TransformActions implements Serializable { - - private static final Logger log = LoggerFactory.getLogger(TransformActions.class); - - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - - private static final String SEPARATOR = "/"; - - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser( - IOUtils - .toString( - MigrateActionSet.class - .getResourceAsStream( - "/eu/dnetlib/dhp/actionmanager/migration/transform_actionsets_parameters.json"))); - parser.parseArgument(args); - - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - - final String isLookupUrl = parser.get("isLookupUrl"); - log.info("isLookupUrl: {}", isLookupUrl); - - final String inputPaths = parser.get("inputPaths"); - - if (StringUtils.isBlank(inputPaths)) { - throw new RuntimeException("empty inputPaths"); - } - log.info("inputPaths: {}", inputPaths); - - final String targetBaseDir = getTargetBaseDir(isLookupUrl); - - SparkConf conf = new SparkConf(); - - runWithSparkSession( - conf, isSparkSessionManaged, spark -> transformActions(inputPaths, targetBaseDir, spark)); - } - - private static void transformActions(String inputPaths, String targetBaseDir, SparkSession spark) - throws IOException { - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration()); - - for (String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) { - - LinkedList pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath)); - - final String rawset = pathQ.pollLast(); - final String actionSetDirectory = pathQ.pollLast(); - - final Path targetDirectory = new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset); - - if (fs.exists(targetDirectory)) { - log.info("found target directory '{}", 
targetDirectory); - fs.delete(targetDirectory, true); - log.info("deleted target directory '{}", targetDirectory); - } - - log.info("transforming actions from '{}' to '{}'", sourcePath, targetDirectory); - - sc - .sequenceFile(sourcePath, Text.class, Text.class) - .map(a -> eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON(a._2().toString())) - .map(TransformActions::doTransform) - .filter(Objects::nonNull) - .mapToPair( - a -> new Tuple2<>(a.getClazz().toString(), OBJECT_MAPPER.writeValueAsString(a))) - .mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2()))) - .saveAsNewAPIHadoopFile( - targetDirectory.toString(), - Text.class, - Text.class, - SequenceFileOutputFormat.class, - sc.hadoopConfiguration()); - } - } - - private static AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa) - throws InvalidProtocolBufferException { - - // dedup similarity relations had empty target value, don't migrate them - if (aa.getTargetValue().length == 0) { - return null; - } - final OafProtos.Oaf proto_oaf = OafProtos.Oaf.parseFrom(aa.getTargetValue()); - final Oaf oaf = ProtoConverter.convert(proto_oaf); - switch (proto_oaf.getKind()) { - case entity: - switch (proto_oaf.getEntity().getType()) { - case datasource: - return new AtomicAction<>(Datasource.class, (Datasource) oaf); - case organization: - return new AtomicAction<>(Organization.class, (Organization) oaf); - case project: - return new AtomicAction<>(Project.class, (Project) oaf); - case result: - final String resulttypeid = proto_oaf - .getEntity() - .getResult() - .getMetadata() - .getResulttype() - .getClassid(); - switch (resulttypeid) { - case "publication": - return new AtomicAction<>(Publication.class, (Publication) oaf); - case "software": - return new AtomicAction<>(Software.class, (Software) oaf); - case "other": - return new AtomicAction<>(OtherResearchProduct.class, (OtherResearchProduct) oaf); - case "dataset": - return new AtomicAction<>(Dataset.class, (Dataset) oaf); - default: - // can be an update, where the resulttype is not specified - return new AtomicAction<>(Result.class, (Result) oaf); - } - default: - throw new IllegalArgumentException( - "invalid entity type: " + proto_oaf.getEntity().getType()); - } - case relation: - return new AtomicAction<>(Relation.class, (Relation) oaf); - default: - throw new IllegalArgumentException("invalid kind: " + proto_oaf.getKind()); - } - } - - private static String getTargetBaseDir(String isLookupUrl) throws ISLookUpException { - ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); - String XQUERY = "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()"; - return isLookUp.getResourceProfileByQuery(XQUERY); - } -} diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGet.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGet.java index fbb072957..eccfa445c 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGet.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGet.java @@ -62,6 +62,7 @@ public class MergeAndGet { x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); } + @SuppressWarnings("unchecked") private static G selectNewerAndGet(G x, A y) { if (x.getClass().equals(y.getClass()) && x.getLastupdatetimestamp() > 
y.getLastupdatetimestamp()) { diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java index 5fa9e6723..c5f252c97 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java @@ -5,12 +5,12 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; import java.io.IOException; -import java.util.Objects; import java.util.Optional; import java.util.function.BiFunction; import java.util.function.Function; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; @@ -68,7 +68,15 @@ public class PromoteActionPayloadForGraphTableJob { MergeAndGet.Strategy strategy = MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase()); logger.info("strategy: {}", strategy); + Boolean shouldGroupById = Optional + .ofNullable(parser.get("shouldGroupById")) + .map(Boolean::valueOf) + .orElse(true); + logger.info("shouldGroupById: {}", shouldGroupById); + + @SuppressWarnings("unchecked") Class rowClazz = (Class) Class.forName(graphTableClassName); + @SuppressWarnings("unchecked") Class actionPayloadClazz = (Class) Class.forName(actionPayloadClassName); throwIfGraphTableClassIsNotSubClassOfActionPayloadClass(rowClazz, actionPayloadClazz); @@ -89,7 +97,8 @@ public class PromoteActionPayloadForGraphTableJob { outputGraphTablePath, strategy, rowClazz, - actionPayloadClazz); + actionPayloadClazz, + shouldGroupById); }); } @@ -115,12 +124,12 @@ public class PromoteActionPayloadForGraphTableJob { String outputGraphTablePath, MergeAndGet.Strategy strategy, Class rowClazz, - Class actionPayloadClazz) { + Class actionPayloadClazz, Boolean shouldGroupById) { Dataset rowDS = readGraphTable(spark, inputGraphTablePath, rowClazz); Dataset actionPayloadDS = readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz); Dataset result = promoteActionPayloadForGraphTable( - rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz) + rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz, shouldGroupById) .map((MapFunction) value -> value, Encoders.bean(rowClazz)); saveGraphTable(result, outputGraphTablePath); @@ -145,7 +154,7 @@ public class PromoteActionPayloadForGraphTableJob { return spark .read() .parquet(path) - .map((MapFunction) value -> extractPayload(value), Encoders.STRING()) + .map((MapFunction) PromoteActionPayloadForGraphTableJob::extractPayload, Encoders.STRING()) .map( (MapFunction) value -> decodePayload(actionPayloadClazz, value), Encoders.bean(actionPayloadClazz)); @@ -153,9 +162,9 @@ public class PromoteActionPayloadForGraphTableJob { private static String extractPayload(Row value) { try { - return value. 
getAs("payload"); + return value.getAs("payload"); } catch (IllegalArgumentException | ClassCastException e) { - logger.error("cannot extract payload from action: {}", value.toString()); + logger.error("cannot extract payload from action: {}", value); throw e; } } @@ -174,7 +183,8 @@ public class PromoteActionPayloadForGraphTableJob { Dataset actionPayloadDS, MergeAndGet.Strategy strategy, Class rowClazz, - Class actionPayloadClazz) { + Class actionPayloadClazz, + Boolean shouldGroupById) { logger .info( "Promoting action payload for graph table: payload={}, table={}", @@ -186,7 +196,7 @@ public class PromoteActionPayloadForGraphTableJob { SerializableSupplier> mergeRowWithActionPayloadAndGetFn = MergeAndGet.functionFor(strategy); SerializableSupplier> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy); SerializableSupplier zeroFn = zeroFn(rowClazz); - SerializableSupplier> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSource; + SerializableSupplier> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSourceAndTarget; Dataset joinedAndMerged = PromoteActionPayloadFunctions .joinGraphTableWithActionPayloadAndMerge( @@ -198,9 +208,13 @@ public class PromoteActionPayloadForGraphTableJob { rowClazz, actionPayloadClazz); - return PromoteActionPayloadFunctions - .groupGraphTableByIdAndMerge( - joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz); + if (shouldGroupById) { + return PromoteActionPayloadFunctions + .groupGraphTableByIdAndMerge( + joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz); + } else { + return joinedAndMerged; + } } private static SerializableSupplier zeroFn(Class clazz) { @@ -226,12 +240,13 @@ public class PromoteActionPayloadForGraphTableJob { } } - private static Function isNotZeroFnUsingIdOrSource() { + private static Function isNotZeroFnUsingIdOrSourceAndTarget() { return t -> { if (isSubClass(t, Relation.class)) { - return Objects.nonNull(((Relation) t).getSource()); + final Relation rel = (Relation) t; + return StringUtils.isNotBlank(rel.getSource()) && StringUtils.isNotBlank(rel.getTarget()); } - return Objects.nonNull(((OafEntity) t).getId()); + return StringUtils.isNotBlank(((OafEntity) t).getId()); }; } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java index 56c8dd05a..d799c646b 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java @@ -112,6 +112,7 @@ public class PromoteActionPayloadFunctions { Class rowClazz) { TypedColumn aggregator = new TableAggregator<>(zeroFn, mergeAndGetFn, isNotZeroFn, rowClazz).toColumn(); return rowDS + .filter((FilterFunction) o -> isNotZeroFn.get().apply(o)) .groupByKey((MapFunction) x -> rowIdFn.get().apply(x), Encoders.STRING()) .agg(aggregator) .map((MapFunction, G>) Tuple2::_2, Encoders.kryo(rowClazz)); diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/migration/migrate_actionsets_parameters.json b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/migration/migrate_actionsets_parameters.json deleted file mode 100644 index c7b931c44..000000000 --- 
a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/migration/migrate_actionsets_parameters.json +++ /dev/null @@ -1,56 +0,0 @@ -[ - { - "paramName": "issm", - "paramLongName": "isSparkSessionManaged", - "paramDescription": "when true will stop SparkSession after job execution", - "paramRequired": false - }, - { - "paramName": "is", - "paramLongName": "isLookupUrl", - "paramDescription": "URL of the isLookUp Service", - "paramRequired": true - }, - { - "paramName": "sn", - "paramLongName": "sourceNameNode", - "paramDescription": "nameNode of the source cluster", - "paramRequired": true - }, - { - "paramName": "tn", - "paramLongName": "targetNameNode", - "paramDescription": "namoNode of the target cluster", - "paramRequired": true - }, - { - "paramName": "w", - "paramLongName": "workingDirectory", - "paramDescription": "working directory", - "paramRequired": true - }, - { - "paramName": "nm", - "paramLongName": "distcp_num_maps", - "paramDescription": "maximum number of map tasks used in the distcp process", - "paramRequired": true - }, - { - "paramName": "mm", - "paramLongName": "distcp_memory_mb", - "paramDescription": "memory for distcp action copying actionsets from remote cluster", - "paramRequired": true - }, - { - "paramName": "tt", - "paramLongName": "distcp_task_timeout", - "paramDescription": "timeout for distcp copying actions from remote cluster", - "paramRequired": true - }, - { - "paramName": "tr", - "paramLongName": "transform_only", - "paramDescription": "activate tranform-only mode. Only apply transformation step", - "paramRequired": true - } -] diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/migration/transform_actionsets_parameters.json b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/migration/transform_actionsets_parameters.json deleted file mode 100644 index 85c39c5b3..000000000 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/migration/transform_actionsets_parameters.json +++ /dev/null @@ -1,20 +0,0 @@ -[ - { - "paramName": "issm", - "paramLongName": "isSparkSessionManaged", - "paramDescription": "when true will stop SparkSession after job execution", - "paramRequired": false - }, - { - "paramName": "is", - "paramLongName": "isLookupUrl", - "paramDescription": "URL of the isLookUp Service", - "paramRequired": true - }, - { - "paramName": "i", - "paramLongName": "inputPaths", - "paramDescription": "URL of the isLookUp Service", - "paramRequired": true - } -] diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/promote/promote_action_payload_for_graph_table_input_parameters.json b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/promote/promote_action_payload_for_graph_table_input_parameters.json index e111f156e..00c9404ef 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/promote/promote_action_payload_for_graph_table_input_parameters.json +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/promote/promote_action_payload_for_graph_table_input_parameters.json @@ -40,5 +40,11 @@ "paramLongName": "mergeAndGetStrategy", "paramDescription": "strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET", "paramRequired": true + }, + { + "paramName": "sgid", + "paramLongName": "shouldGroupById", + "paramDescription": "indicates whether 
the promotion operation should group objects in the graph by id or not", + "paramRequired": false } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml index f95349935..4dc250c29 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/dataset/oozie_app/workflow.xml @@ -24,6 +24,10 @@ mergeAndGetStrategy strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET + + shouldGroupById + indicates whether the promotion operation should group objects in the graph by id or not + sparkDriverMemory memory for driver process @@ -111,6 +115,7 @@ --actionPayloadClassNameeu.dnetlib.dhp.schema.oaf.Dataset --outputGraphTablePath${workingDir}/dataset --mergeAndGetStrategy${mergeAndGetStrategy} + --shouldGroupById${shouldGroupById} @@ -162,6 +167,7 @@ --actionPayloadClassNameeu.dnetlib.dhp.schema.oaf.Result --outputGraphTablePath${outputGraphRootPath}/dataset --mergeAndGetStrategy${mergeAndGetStrategy} + --shouldGroupById${shouldGroupById} diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml index 25afc34c9..393f04e89 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml @@ -56,6 +56,11 @@ mergeAndGetStrategy strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET + + shouldGroupById + false + indicates whether the promotion operation should group objects in the graph by id or not + sparkDriverMemory memory for driver process diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/migration/oozie_app/config-default.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/migration/oozie_app/config-default.xml deleted file mode 100644 index 9637ebdc6..000000000 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/migration/oozie_app/config-default.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - jobTracker - yarnRM - - - nameNode - hdfs://nameservice1 - - - sourceNN - webhdfs://namenode2.hadoop.dm.openaire.eu:50071 - - - oozie.use.system.libpath - true - - - oozie.action.sharelib.for.spark - spark2 - - - spark2YarnHistoryServerAddress - http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088 - - - spark2EventLogDir - /user/spark/applicationHistory - - \ No newline at end of file diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/migration/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/migration/oozie_app/workflow.xml deleted file mode 100644 index d8888de9d..000000000 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/migration/oozie_app/workflow.xml +++ /dev/null @@ -1,138 +0,0 @@ - - - - sourceNN - the source name node - - - 
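The new `shouldGroupById` switch is optional at the job level and defaults to `true` when omitted (see the `Optional`-based parsing above); setting it to `false` skips the final group-by-id stage and returns the joined-and-merged rows as they are. A hedged invocation sketch mirroring the argument names defined in the parameters JSON above; the input/output paths are illustrative placeholders, not real locations:

```java
import eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJob;

public class PromoteJobInvocationSketch {

	public static void main(String[] args) throws Exception {
		PromoteActionPayloadForGraphTableJob.main(new String[] {
			"-isSparkSessionManaged", Boolean.FALSE.toString(),
			"-inputGraphTablePath", "/tmp/graph/publication", // placeholder path
			"-graphTableClassName", "eu.dnetlib.dhp.schema.oaf.Publication",
			"-inputActionPayloadPath", "/tmp/actions/payload", // placeholder path
			"-actionPayloadClassName", "eu.dnetlib.dhp.schema.oaf.Result",
			"-outputGraphTablePath", "/tmp/graph_promoted/publication", // placeholder path
			"-mergeAndGetStrategy", "MERGE_FROM_AND_GET",
			// optional: true (the default) groups and merges rows by id
			"--shouldGroupById", "false"
		});
	}
}
```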
isLookupUrl - the isLookup service endpoint - - - workingDirectory - working directory - - - distcp_memory_mb - 6144 - memory for distcp copying actionsets from remote cluster - - - distcp_task_timeout - 60000000 - timeout for distcp copying actions from remote cluster - - - distcp_num_maps - 1 - mmaximum number of map tasks used in the distcp process - - - transform_only - activate tranform-only mode. Only apply transformation step - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - oozieActionShareLibForSpark2 - oozie action sharelib for spark 2.* - - - spark2ExtraListeners - com.cloudera.spark.lineage.NavigatorAppListener - spark 2.* extra listeners classname - - - spark2SqlQueryExecutionListeners - com.cloudera.spark.lineage.NavigatorQueryListener - spark 2.* sql query execution listeners classname - - - spark2YarnHistoryServerAddress - spark 2.* yarn history server address - - - spark2EventLogDir - spark 2.* event log dir location - - - - - ${jobTracker} - ${nameNode} - - - mapreduce.job.queuename - ${queueName} - - - oozie.launcher.mapred.job.queue.name - ${oozieLauncherQueueName} - - - oozie.action.sharelib.for.spark - ${oozieActionShareLibForSpark2} - - - - - - - - - eu.dnetlib.dhp.actionmanager.migration.MigrateActionSet - -Dmapred.task.timeout=${distcp_task_timeout} - --isLookupUrl${isLookupUrl} - --sourceNameNode${sourceNN} - --targetNameNode${nameNode} - --workingDirectory${workingDirectory} - --distcp_num_maps${distcp_num_maps} - --distcp_memory_mb${distcp_memory_mb} - --distcp_task_timeout${distcp_task_timeout} - --transform_only${transform_only} - - - - - - - - - yarn - cluster - transform_actions - eu.dnetlib.dhp.actionmanager.migration.TransformActions - dhp-actionmanager-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - --isLookupUrl${isLookupUrl} - --inputPaths${wf:actionData('migrate_actionsets')['target_paths']} - - - - - - - migrate_actions failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml index 0deb1b945..7bac760e2 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/otherresearchproduct/oozie_app/workflow.xml @@ -24,6 +24,10 @@ mergeAndGetStrategy strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET + + shouldGroupById + indicates whether the promotion operation should group objects in the graph by id or not + sparkDriverMemory memory for driver process @@ -110,6 +114,7 @@ --actionPayloadClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct --outputGraphTablePath${workingDir}/otherresearchproduct 
--mergeAndGetStrategy${mergeAndGetStrategy} + --shouldGroupById${shouldGroupById} @@ -161,6 +166,7 @@ --actionPayloadClassNameeu.dnetlib.dhp.schema.oaf.Result --outputGraphTablePath${outputGraphRootPath}/otherresearchproduct --mergeAndGetStrategy${mergeAndGetStrategy} + --shouldGroupById${shouldGroupById} diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml index 70400a123..b1c8e7c85 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app/workflow.xml @@ -24,6 +24,10 @@ mergeAndGetStrategy strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET + + shouldGroupById + indicates whether the promotion operation should group objects in the graph by id or not + sparkDriverMemory memory for driver process @@ -103,7 +107,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=2560 + --conf spark.sql.shuffle.partitions=5000 --inputGraphTablePath${inputGraphRootPath}/publication --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication @@ -111,6 +115,7 @@ --actionPayloadClassNameeu.dnetlib.dhp.schema.oaf.Publication --outputGraphTablePath${workingDir}/publication --mergeAndGetStrategy${mergeAndGetStrategy} + --shouldGroupById${shouldGroupById} @@ -154,7 +159,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=2560 + --conf spark.sql.shuffle.partitions=5000 --inputGraphTablePath${workingDir}/publication --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication @@ -162,6 +167,7 @@ --actionPayloadClassNameeu.dnetlib.dhp.schema.oaf.Result --outputGraphTablePath${outputGraphRootPath}/publication --mergeAndGetStrategy${mergeAndGetStrategy} + --shouldGroupById${shouldGroupById} diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml index a7dce8f2f..20ffe26d3 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app/workflow.xml @@ -99,7 +99,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=2560 + --conf spark.sql.shuffle.partitions=5000 --inputGraphTablePath${inputGraphRootPath}/relation --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Relation diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml 
b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml index 396e27721..b5673b18f 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/software/oozie_app/workflow.xml @@ -24,6 +24,10 @@ mergeAndGetStrategy strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET + + shouldGroupById + indicates whether the promotion operation should group objects in the graph by id or not + sparkDriverMemory memory for driver process @@ -110,6 +114,7 @@ --actionPayloadClassNameeu.dnetlib.dhp.schema.oaf.Software --outputGraphTablePath${workingDir}/software --mergeAndGetStrategy${mergeAndGetStrategy} + --shouldGroupById${shouldGroupById} @@ -161,6 +166,7 @@ --actionPayloadClassNameeu.dnetlib.dhp.schema.oaf.Result --outputGraphTablePath${outputGraphRootPath}/software --mergeAndGetStrategy${mergeAndGetStrategy} + --shouldGroupById${shouldGroupById} diff --git a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJobTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJobTest.java index f51c697f4..62eec13d5 100644 --- a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJobTest.java +++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/partition/PartitionActionSetsByPayloadTypeJobTest.java @@ -80,7 +80,7 @@ public class PartitionActionSetsByPayloadTypeJobTest { private ISClient isClient; @Test - public void shouldPartitionActionSetsByPayloadType(@TempDir Path workingDir) throws Exception { + void shouldPartitionActionSetsByPayloadType(@TempDir Path workingDir) throws Exception { // given Path inputActionSetsBaseDir = workingDir.resolve("input").resolve("action_sets"); Path outputDir = workingDir.resolve("output"); diff --git a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGetTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGetTest.java index b2248d77a..4c88e9de3 100644 --- a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGetTest.java +++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/MergeAndGetTest.java @@ -20,7 +20,7 @@ public class MergeAndGetTest { class MergeFromAndGetStrategy { @Test - public void shouldThrowForOafAndOaf() { + void shouldThrowForOafAndOaf() { // given Oaf a = mock(Oaf.class); Oaf b = mock(Oaf.class); @@ -33,7 +33,7 @@ public class MergeAndGetTest { } @Test - public void shouldThrowForOafAndRelation() { + void shouldThrowForOafAndRelation() { // given Oaf a = mock(Oaf.class); Relation b = mock(Relation.class); @@ -46,7 +46,7 @@ public class MergeAndGetTest { } @Test - public void shouldThrowForOafAndOafEntity() { + void shouldThrowForOafAndOafEntity() { // given Oaf a = mock(Oaf.class); OafEntity b = mock(OafEntity.class); @@ -59,7 +59,7 @@ public class MergeAndGetTest { } @Test - public void shouldThrowForRelationAndOaf() { + void shouldThrowForRelationAndOaf() { // given Relation a = mock(Relation.class); Oaf b = mock(Oaf.class); @@ -72,7 +72,7 @@ public class MergeAndGetTest { } @Test - public void 
shouldThrowForRelationAndOafEntity() { + void shouldThrowForRelationAndOafEntity() { // given Relation a = mock(Relation.class); OafEntity b = mock(OafEntity.class); @@ -85,7 +85,7 @@ public class MergeAndGetTest { } @Test - public void shouldBehaveProperlyForRelationAndRelation() { + void shouldBehaveProperlyForRelationAndRelation() { // given Relation a = mock(Relation.class); Relation b = mock(Relation.class); @@ -101,7 +101,7 @@ public class MergeAndGetTest { } @Test - public void shouldThrowForOafEntityAndOaf() { + void shouldThrowForOafEntityAndOaf() { // given OafEntity a = mock(OafEntity.class); Oaf b = mock(Oaf.class); @@ -114,7 +114,7 @@ public class MergeAndGetTest { } @Test - public void shouldThrowForOafEntityAndRelation() { + void shouldThrowForOafEntityAndRelation() { // given OafEntity a = mock(OafEntity.class); Relation b = mock(Relation.class); @@ -127,7 +127,7 @@ public class MergeAndGetTest { } @Test - public void shouldThrowForOafEntityAndOafEntityButNotSubclasses() { + void shouldThrowForOafEntityAndOafEntityButNotSubclasses() { // given class OafEntitySub1 extends OafEntity { } @@ -145,7 +145,7 @@ public class MergeAndGetTest { } @Test - public void shouldBehaveProperlyForOafEntityAndOafEntity() { + void shouldBehaveProperlyForOafEntityAndOafEntity() { // given OafEntity a = mock(OafEntity.class); OafEntity b = mock(OafEntity.class); @@ -165,7 +165,7 @@ public class MergeAndGetTest { class SelectNewerAndGetStrategy { @Test - public void shouldThrowForOafEntityAndRelation() { + void shouldThrowForOafEntityAndRelation() { // given OafEntity a = mock(OafEntity.class); Relation b = mock(Relation.class); @@ -178,7 +178,7 @@ public class MergeAndGetTest { } @Test - public void shouldThrowForRelationAndOafEntity() { + void shouldThrowForRelationAndOafEntity() { // given Relation a = mock(Relation.class); OafEntity b = mock(OafEntity.class); @@ -191,7 +191,7 @@ public class MergeAndGetTest { } @Test - public void shouldThrowForOafEntityAndResult() { + void shouldThrowForOafEntityAndResult() { // given OafEntity a = mock(OafEntity.class); Result b = mock(Result.class); @@ -204,7 +204,7 @@ public class MergeAndGetTest { } @Test - public void shouldThrowWhenSuperTypeIsNewerForResultAndOafEntity() { + void shouldThrowWhenSuperTypeIsNewerForResultAndOafEntity() { // given // real types must be used because subclass-superclass resolution does not work for // mocks @@ -221,7 +221,7 @@ public class MergeAndGetTest { } @Test - public void shouldShouldReturnLeftForOafEntityAndOafEntity() { + void shouldShouldReturnLeftForOafEntityAndOafEntity() { // given OafEntity a = mock(OafEntity.class); when(a.getLastupdatetimestamp()).thenReturn(1L); @@ -238,7 +238,7 @@ public class MergeAndGetTest { } @Test - public void shouldShouldReturnRightForOafEntityAndOafEntity() { + void shouldShouldReturnRightForOafEntityAndOafEntity() { // given OafEntity a = mock(OafEntity.class); when(a.getLastupdatetimestamp()).thenReturn(2L); diff --git a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJobTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJobTest.java index 129daadcc..df9202ed8 100644 --- a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJobTest.java +++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJobTest.java @@ 
-77,7 +77,7 @@ public class PromoteActionPayloadForGraphTableJobTest { class Main { @Test - public void shouldThrowWhenGraphTableClassIsNotASubClassOfActionPayloadClass() { + void shouldThrowWhenGraphTableClassIsNotASubClassOfActionPayloadClass() { // given Class rowClazz = Relation.class; Class actionPayloadClazz = OafEntity.class; @@ -101,7 +101,9 @@ public class PromoteActionPayloadForGraphTableJobTest { "-outputGraphTablePath", "", "-mergeAndGetStrategy", - MergeAndGet.Strategy.SELECT_NEWER_AND_GET.name() + MergeAndGet.Strategy.SELECT_NEWER_AND_GET.name(), + "--shouldGroupById", + "true" })); // then @@ -114,7 +116,7 @@ public class PromoteActionPayloadForGraphTableJobTest { @ParameterizedTest(name = "strategy: {0}, graph table: {1}, action payload: {2}") @MethodSource("eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest#promoteJobTestParams") - public void shouldPromoteActionPayloadForGraphTable( + void shouldPromoteActionPayloadForGraphTable( MergeAndGet.Strategy strategy, Class rowClazz, Class actionPayloadClazz) @@ -141,7 +143,9 @@ public class PromoteActionPayloadForGraphTableJobTest { "-outputGraphTablePath", outputGraphTableDir.toString(), "-mergeAndGetStrategy", - strategy.name() + strategy.name(), + "--shouldGroupById", + "true" }); // then @@ -168,6 +172,61 @@ public class PromoteActionPayloadForGraphTableJobTest { } } + @Test + void shouldPromoteActionPayload_custom() throws Exception { + + Class rowClazz = Publication.class; + Class actionPayloadClazz = Result.class; + MergeAndGet.Strategy strategy = MergeAndGet.Strategy.MERGE_FROM_AND_GET; + + // given + Path inputGraphTableDir = createGraphTable(inputGraphRootDir, rowClazz); + Path inputActionPayloadDir = createActionPayload(inputActionPayloadRootDir, rowClazz, actionPayloadClazz); + Path outputGraphTableDir = outputDir.resolve("graph").resolve(rowClazz.getSimpleName().toLowerCase()); + + // when + PromoteActionPayloadForGraphTableJob + .main( + new String[] { + "-isSparkSessionManaged", + Boolean.FALSE.toString(), + "-inputGraphTablePath", + inputGraphTableDir.toString(), + "-graphTableClassName", + rowClazz.getCanonicalName(), + "-inputActionPayloadPath", + inputActionPayloadDir.toString(), + "-actionPayloadClassName", + actionPayloadClazz.getCanonicalName(), + "-outputGraphTablePath", + outputGraphTableDir.toString(), + "-mergeAndGetStrategy", + strategy.name(), + "--shouldGroupById", + "true" + }); + + // then + assertTrue(Files.exists(outputGraphTableDir)); + + List actualOutputRows = readGraphTableFromJobOutput(outputGraphTableDir.toString(), rowClazz) + .collectAsList() + .stream() + .sorted(Comparator.comparingInt(Object::hashCode)) + .collect(Collectors.toList()); + + Publication p = actualOutputRows + .stream() + .map(o -> (Publication) o) + .filter(o -> "50|4ScienceCRIS::6a67ed3daba1c380bf9de3c13ed9c879".equals(o.getId())) + .findFirst() + .get(); + + assertNotNull(p.getMeasures()); + assertTrue(p.getMeasures().size() > 0); + + } + public static Stream promoteJobTestParams() { return Stream .of( diff --git a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java index 477e4b204..cbc1bfaba 100644 --- a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java +++ 
b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctionsTest.java @@ -44,7 +44,7 @@ public class PromoteActionPayloadFunctionsTest { class JoinTableWithActionPayloadAndMerge { @Test - public void shouldThrowWhenTableTypeIsNotSubtypeOfActionPayloadType() { + void shouldThrowWhenTableTypeIsNotSubtypeOfActionPayloadType() { // given class OafImpl extends Oaf { } @@ -58,7 +58,7 @@ public class PromoteActionPayloadFunctionsTest { } @Test - public void shouldRunProperlyWhenActionPayloadTypeAndTableTypeAreTheSame() { + void shouldRunProperlyWhenActionPayloadTypeAndTableTypeAreTheSame() { // given String id0 = "id0"; String id1 = "id1"; @@ -138,7 +138,7 @@ public class PromoteActionPayloadFunctionsTest { } @Test - public void shouldRunProperlyWhenActionPayloadTypeIsSuperTypeOfTableType() { + void shouldRunProperlyWhenActionPayloadTypeIsSuperTypeOfTableType() { // given String id0 = "id0"; String id1 = "id1"; @@ -218,7 +218,7 @@ public class PromoteActionPayloadFunctionsTest { class GroupTableByIdAndMerge { @Test - public void shouldRunProperly() { + void shouldRunProperly() { // given String id1 = "id1"; String id2 = "id2"; diff --git a/dhp-workflows/dhp-actionmanager/src/test/resources/eu/dnetlib/dhp/actionmanager/promote/input/action_payload/publication_table/result.json b/dhp-workflows/dhp-actionmanager/src/test/resources/eu/dnetlib/dhp/actionmanager/promote/input/action_payload/publication_table/result.json index f98ee4b78..d98bfe98f 100644 --- a/dhp-workflows/dhp-actionmanager/src/test/resources/eu/dnetlib/dhp/actionmanager/promote/input/action_payload/publication_table/result.json +++ b/dhp-workflows/dhp-actionmanager/src/test/resources/eu/dnetlib/dhp/actionmanager/promote/input/action_payload/publication_table/result.json @@ -17,4 +17,5 @@ {"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018243405,"id":"50|CSC_________::00019460865d6cc381b36076131a5bc1","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"Computer Science::Networking and Internet 
Architecture","qualifier":{"classid":"arxiv","classname":"arxiv","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7416","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]} {"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018243405,"id":"50|CSC_________::00019460865d6cc381b36076131a5bc1","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"Computer Science::Networking and Internet Architecture","qualifier":{"classid":"arxiv","classname":"arxiv","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7416","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not 
available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]} {"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018240982,"id":"50|CSC_________::0001d663c95c4132355e1765375a5275","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"animal diseases","qualifier":{"classid":"mesheuropmc","classname":"mesheuropmc","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7461","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]} -{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018240982,"id":"50|CSC_________::0001d663c95c4132355e1765375a5275","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"animal 
diseases","qualifier":{"classid":"mesheuropmc","classname":"mesheuropmc","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7461","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]} \ No newline at end of file +{"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}},"lastupdatetimestamp":1572018240982,"id":"50|CSC_________::0001d663c95c4132355e1765375a5275","originalId":[],"collectedfrom":[],"pid":[],"dateofcollection":"","dateoftransformation":"","extraInfo":[],"oaiprovenance":null,"author":[],"resulttype":{"classid":"","classname":"","schemeid":"","schemename":""},"language":{"classid":"","classname":"","schemeid":"","schemename":""},"country":[],"subject":[{"value":"animal diseases","qualifier":{"classid":"mesheuropmc","classname":"mesheuropmc","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"0.7461","inferenceprovenance":"iis::document_classes","provenanceaction":{"classid":"iis","classname":"iis","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}],"title":[],"relevantdate":[],"description":[],"dateofacceptance":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"publisher":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"embargoenddate":{"value":"","dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"","inferenceprovenance":"","provenanceaction":{"classid":"","classname":"","schemeid":"","schemename":""}}},"source":[],"fulltext":[],"format":[],"contributor":[],"resourcetype":{"classid":"","classname":"","schemeid":"","schemename":""},"coverage":[],"bestaccessright":{"classid":"UNKNOWN","classname":"not 
available","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"context":[],"externalReference":null,"instance":[]} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"id":"50|4ScienceCRIS::6a67ed3daba1c380bf9de3c13ed9c879","originalId":null,"pid":null,"dateofcollection":null,"dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"processingchargeamount":null,"processingchargecurrency":null,"measures":[{"id":"influence","unit":[{"key":"score","value":"1.64385446761e-08","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"popularity_alt","unit":[{"key":"score","value":"18.9590813696","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"popularity","unit":[{"key":"score","value":"6.00577981643e-08","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]}],"author":null,"resulttype":null,"language":null,"country":null,"subject":null,"title":null,"relevantdate":null,"description":null,"dateofacceptance":null,"publisher":null,"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"context":null,"externalReference":null,"instance":null} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/.scalafmt.conf b/dhp-workflows/dhp-aggregation/.scalafmt.conf new file mode 100644 index 000000000..0b5dbe0b4 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/.scalafmt.conf @@ -0,0 +1,21 @@ +style = defaultWithAlign + +align.openParenCallSite = false +align.openParenDefnSite = false +align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}] +continuationIndent.callSite = 2 +continuationIndent.defnSite = 2 +danglingParentheses = true +indentOperator = spray +maxColumn = 120 +newlines.alwaysBeforeTopLevelStatements = true +project.excludeFilters = [".*\\.sbt"] +rewrite.rules = [AvoidInfix] +rewrite.rules = [ExpandImportSelectors] +rewrite.rules = [RedundantBraces] +rewrite.rules = [RedundantParens] +rewrite.rules = [SortImports] +rewrite.rules = [SortModifiers] +rewrite.rules = [PreferCurlyFors] +spaces.inImportCurlyBraces = false +unindentTopLevelOperators = true \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/README.md b/dhp-workflows/dhp-aggregation/README.md index 02583b443..5ed6a82d7 100644 --- a/dhp-workflows/dhp-aggregation/README.md +++ b/dhp-workflows/dhp-aggregation/README.md @@ -1,29 +1,27 @@ Description of the Module -------------------------- -This module defines a **collector worker application** that runs on Hadoop. +This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records. 
+Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure +the consistency of the read/write operations on the data, as the MdSM keeps track of the logical-physical mapping +of each MDStore. -It is responsible for harvesting metadata using different plugins. +## Metadata collection -The collector worker uses a message queue to inform the progress -of the harvesting action (using a message queue for sending **ONGOING** messages) furthermore, -It gives, at the end of the job, some information about the status -of the collection i.e Number of records collected(using a message queue for sending **REPORT** messages). +The **metadata collection workflow** is responsible for harvesting metadata records exposed via different protocols and +in different formats, and for storing them on HDFS so that they can be further processed. -To work the collection worker need some parameter like: +### Collector Plugins -* **hdfsPath**: the path where storing the sequential file -* **apidescriptor**: the JSON encoding of the API Descriptor -* **namenode**: the Name Node URI -* **userHDFS**: the user wich create the hdfs seq file -* **rabbitUser**: the user to connect with RabbitMq for messaging -* **rabbitPassWord**: the password to connect with RabbitMq for messaging -* **rabbitHost**: the host of the RabbitMq server -* **rabbitOngoingQueue**: the name of the ongoing queue -* **rabbitReportQueue**: the name of the report queue -* **workflowId**: the identifier of the dnet Workflow +Different protocols are managed by dedicated Collector plugins (as sketched in the example above), i.e. Java programs implementing a defined interface: -##Plugins -* OAI Plugin +```eu.dnetlib.dhp.collection.plugin.CollectorPlugin``` + +The list of supported plugins: + +* OAI Plugin: collects from OAI-PMH compatible endpoints +* MDStore plugin: collects from a given D-Net MetadataStore (identified by MongoDB URI, dbName, MDStoreID) +* MDStore dump plugin: collects from an MDStore dump stored at the HDFS location indicated by the `path` parameter + +## Transformation Plugins +TODO -## Usage -TODO \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/pom.xml b/dhp-workflows/dhp-aggregation/pom.xml index cf0fa0efe..53d349d2a 100644 --- a/dhp-workflows/dhp-aggregation/pom.xml +++ b/dhp-workflows/dhp-aggregation/pom.xml @@ -4,13 +4,54 @@ eu.dnetlib.dhp dhp-workflows - 1.2.4-SNAPSHOT + 1.2.5-SNAPSHOT dhp-aggregation - - + + + + net.alchim31.maven + scala-maven-plugin + ${net.alchim31.maven.version} + + + scala-compile-first + initialize + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + scala-doc + process-resources + + doc + + + + + ${scala.version} + + + + + + + + org.apache.httpcomponents + httpclient + + org.apache.spark + spark-core_2.11 @@ -24,20 +65,7 @@ eu.dnetlib.dhp dhp-common ${project.version} - - - com.sun.xml.bind - jaxb-core - - - - - eu.dnetlib.dhp - dhp-schemas - ${project.version} - - net.sf.saxon @@ -58,14 +86,11 @@ jaxen - - org.apache.commons - commons-csv - 1.8 + org.json + json - org.apache.poi @@ -78,8 +103,11 @@ commons-compress - + + org.mongodb + mongo-java-driver + - + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java new file mode 100644 index 000000000..bd223d7c9 --- /dev/null +++ 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java @@ -0,0 +1,95 @@ + +package eu.dnetlib.dhp.actionmanager; + +import java.util.Optional; + +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.schema.oaf.Subject; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; + +public class Constants { + + public static final String DOI = "doi"; + public static final String DOI_CLASSNAME = "Digital Object Identifier"; + + public static final String DEFAULT_DELIMITER = ","; + + public static final String UPDATE_DATA_INFO_TYPE = "update"; + public static final String UPDATE_SUBJECT_FOS_CLASS_ID = "subject:fos"; + public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE"; + public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip"; + public static final String UPDATE_SUBJECT_SDG_CLASS_ID = "subject:sdg"; + public static final String UPDATE_MEASURE_USAGE_COUNTS_CLASS_ID = "measure:usage_counts"; + public static final String UPDATE_KEY_USAGE_COUNTS = "count"; + + public static final String FOS_CLASS_ID = "FOS"; + public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification"; + + public static final String SDG_CLASS_ID = "SDG"; + public static final String SDG_CLASS_NAME = "Sustainable Development Goals"; + + public static final String NULL = "NULL"; + + public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private Constants() { + } + + public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) { + return Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + } + + public static Dataset readPath( + SparkSession spark, String inputPath, Class clazz) { + return spark + .read() + .textFile(inputPath) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); + } + + public static Subject getSubject(String sbj, String classid, String classname, + String diqualifierclassid) { + if (sbj.equals(NULL)) + return null; + Subject s = new Subject(); + s.setValue(sbj); + s + .setQualifier( + OafMapperUtils + .qualifier( + classid, + classname, + ModelConstants.DNET_SUBJECT_TYPOLOGIES, + ModelConstants.DNET_SUBJECT_TYPOLOGIES)); + s + .setDataInfo( + OafMapperUtils + .dataInfo( + false, + UPDATE_DATA_INFO_TYPE, + true, + false, + OafMapperUtils + .qualifier( + diqualifierclassid, + UPDATE_CLASS_NAME, + ModelConstants.DNET_PROVENANCE_ACTIONS, + ModelConstants.DNET_PROVENANCE_ACTIONS), + "")); + + return s; + + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java new file mode 100644 index 000000000..ddf5f4adf --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java @@ -0,0 +1,170 @@ + +package eu.dnetlib.dhp.actionmanager.bipfinder; + +import static eu.dnetlib.dhp.actionmanager.Constants.*; +import static 
eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize; +import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import scala.Tuple2; + +/** + * Creates the AtomicAction for each type of result. + */ +public class SparkAtomicActionScoreJob implements Serializable { + + private static final String DOI = "doi"; + private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + SparkAtomicActionScoreJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + SparkConf conf = new SparkConf(); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + removeOutputDir(spark, outputPath); + prepareResults(spark, inputPath, outputPath); + }); + } + + private static void prepareResults(SparkSession spark, String bipScorePath, String outputPath) { + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD bipDeserializeJavaRDD = sc + .textFile(bipScorePath) + .map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class)); + + Dataset bipScores = spark + .createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> { + BipScore bs = new BipScore(); + bs.setId(key); + bs.setScoreList(entry.get(key)); + return bs; + }).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class)); + + bipScores + + .map((MapFunction) bs -> { + Result ret = new Result(); + + ret.setId(bs.getId()); + + ret.setMeasures(getMeasure(bs)); + + return ret; + }, Encoders.bean(Result.class)) + .toJavaRDD() + .map(p -> new AtomicAction(Result.class, p)) + .mapToPair( + aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), + new 
Text(OBJECT_MAPPER.writeValueAsString(aa)))) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); + + } + + private static List getMeasure(BipScore value) { + return value + .getScoreList() + .stream() + .map(score -> { + Measure m = new Measure(); + m.setId(score.getId()); + m + .setUnit( + score + .getUnit() + .stream() + .map(unit -> { + KeyValue kv = new KeyValue(); + kv.setValue(unit.getValue()); + kv.setKey(unit.getKey()); + kv + .setDataInfo( + OafMapperUtils + .dataInfo( + false, + UPDATE_DATA_INFO_TYPE, + true, + false, + OafMapperUtils + .qualifier( + UPDATE_MEASURE_BIP_CLASS_ID, + UPDATE_CLASS_NAME, + ModelConstants.DNET_PROVENANCE_ACTIONS, + ModelConstants.DNET_PROVENANCE_ACTIONS), + "")); + return kv; + }) + .collect(Collectors.toList())); + return m; + }) + .collect(Collectors.toList()); + } + + private static void removeOutputDir(SparkSession spark, String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + + public static Dataset readPath( + SparkSession spark, String inputPath, Class clazz) { + return spark + .read() + .textFile(inputPath) + .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/BipDeserialize.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/BipDeserialize.java new file mode 100644 index 000000000..a70bca618 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/BipDeserialize.java @@ -0,0 +1,28 @@ + +package eu.dnetlib.dhp.actionmanager.bipmodel; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; + +/** + * Class that maps the model of the bipFinder! input data. 
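+ * Example of a single input line (shape inferred from the Score and KeyValue beans in this package): + * {"10.1000/182": [{"id": "influence", "unit": [{"key": "score", "value": "1.64e-08"}]}]}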
+ * Only needed for deserialization purposes + */ + +public class BipDeserialize extends HashMap> implements Serializable { + + public BipDeserialize() { + super(); + } + + public List get(String key) { + + if (super.get(key) == null) { + return new ArrayList<>(); + } + return super.get(key); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/BipScore.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/BipScore.java new file mode 100644 index 000000000..1ce20eaf4 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/BipScore.java @@ -0,0 +1,30 @@ + +package eu.dnetlib.dhp.actionmanager.bipmodel; + +import java.io.Serializable; +import java.util.List; + +/** + * Rewriting of the bipFinder input data by extracting the identifier of the result (doi) + */ + +public class BipScore implements Serializable { + private String id; // doi + private List scoreList; // unit as given in the inputfile + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public List getScoreList() { + return scoreList; + } + + public void setScoreList(List scoreList) { + this.scoreList = scoreList; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/KeyValue.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/KeyValue.java new file mode 100644 index 000000000..33efc8ea0 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/KeyValue.java @@ -0,0 +1,26 @@ + +package eu.dnetlib.dhp.actionmanager.bipmodel; + +import java.io.Serializable; + +public class KeyValue implements Serializable { + + private String key; + private String value; + + public String getKey() { + return key; + } + + public void setKey(String key) { + this.key = key; + } + + public String getValue() { + return value; + } + + public void setValue(String value) { + this.value = value; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/Score.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/Score.java new file mode 100644 index 000000000..5b3095cf2 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/Score.java @@ -0,0 +1,30 @@ + +package eu.dnetlib.dhp.actionmanager.bipmodel; + +import java.io.Serializable; +import java.util.List; + +/** + * represents the score in the input file + */ +public class Score implements Serializable { + + private String id; + private List unit; + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public List getUnit() { + return unit; + } + + public void setUnit(List unit) { + this.unit = unit; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFOSSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFOSSparkJob.java new file mode 100644 index 000000000..75fe42e90 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetFOSSparkJob.java @@ -0,0 +1,91 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities; + +import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER; +import static 
eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +public class GetFOSSparkJob implements Serializable { + + private static final Logger log = LoggerFactory.getLogger(GetFOSSparkJob.class); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + GetFOSSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_input_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + // the path where the original fos csv file is stored + final String sourcePath = parser.get("sourcePath"); + log.info("sourcePath {}", sourcePath); + + // the path where to put the file as json + final String outputPath = parser.get("outputPath"); + log.info("outputPath {}", outputPath); + + final String delimiter = Optional + .ofNullable(parser.get("delimiter")) + .orElse(DEFAULT_DELIMITER); + + SparkConf sconf = new SparkConf(); + runWithSparkSession( + sconf, + isSparkSessionManaged, + spark -> { + getFOS( + spark, + sourcePath, + outputPath, + delimiter); + }); + } + + private static void getFOS(SparkSession spark, String sourcePath, String outputPath, String delimiter) { + Dataset fosData = spark + .read() + .format("csv") + .option("sep", delimiter) + .option("inferSchema", "true") + .option("header", "true") + .option("quotes", "\"") + .load(sourcePath); + + fosData.map((MapFunction) r -> { + FOSDataModel fosDataModel = new FOSDataModel(); + fosDataModel.setDoi(r.getString(0).toLowerCase()); + fosDataModel.setLevel1(r.getString(1)); + fosDataModel.setLevel2(r.getString(2)); + fosDataModel.setLevel3(r.getString(3)); + return fosDataModel; + }, Encoders.bean(FOSDataModel.class)) + .write() + .mode(SaveMode.Overwrite) + .json(outputPath); + + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetSDGSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetSDGSparkJob.java new file mode 100644 index 000000000..328075389 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/GetSDGSparkJob.java @@ -0,0 +1,91 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities; + +import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER; +import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import 
org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel; +import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.SDGDataModel; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +public class GetSDGSparkJob implements Serializable { + + private static final Logger log = LoggerFactory.getLogger(GetSDGSparkJob.class); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + GetSDGSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_input_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + // the path where the original fos csv file is stored + final String sourcePath = parser.get("sourcePath"); + log.info("sourcePath {}", sourcePath); + + // the path where to put the file as json + final String outputPath = parser.get("outputPath"); + log.info("outputPath {}", outputPath); + + final String delimiter = Optional + .ofNullable(parser.get("delimiter")) + .orElse(DEFAULT_DELIMITER); + + SparkConf sconf = new SparkConf(); + runWithSparkSession( + sconf, + isSparkSessionManaged, + spark -> { + getSDG( + spark, + sourcePath, + outputPath, + delimiter); + }); + } + + private static void getSDG(SparkSession spark, String sourcePath, String outputPath, String delimiter) { + Dataset sdgData = spark + .read() + .format("csv") + .option("sep", delimiter) + .option("inferSchema", "true") + .option("header", "true") + .option("quotes", "\"") + .load(sourcePath); + + sdgData.map((MapFunction) r -> { + SDGDataModel sdgDataModel = new SDGDataModel(); + sdgDataModel.setDoi(r.getString(0).toLowerCase()); + sdgDataModel.setSbj(r.getString(1)); + + return sdgDataModel; + }, Encoders.bean(SDGDataModel.class)) + .filter((FilterFunction) sdg -> sdg.getSbj() != null) + .write() + .mode(SaveMode.Overwrite) + .json(outputPath); + + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java new file mode 100644 index 000000000..80573c71a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java @@ -0,0 +1,178 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities; + +import static eu.dnetlib.dhp.actionmanager.Constants.*; +import static eu.dnetlib.dhp.actionmanager.Constants.UPDATE_CLASS_NAME; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import 
org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize; +import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Instance; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Measure; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import eu.dnetlib.dhp.utils.DHPUtils; + +public class PrepareBipFinder implements Serializable { + + private static final Logger log = LoggerFactory.getLogger(PrepareBipFinder.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + PrepareBipFinder.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String sourcePath = parser.get("sourcePath"); + log.info("sourcePath: {}", sourcePath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + SparkConf conf = new SparkConf(); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration()); + prepareResults(spark, sourcePath, outputPath); + }); + } + + private static void prepareResults(SparkSession spark, String inputPath, String outputPath) { + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD bipDeserializeJavaRDD = sc + .textFile(inputPath) + .map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class)); + + spark + .createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> { + BipScore bs = new BipScore(); + bs.setId(key); + bs.setScoreList(entry.get(key)); + + return bs; + }).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class)) + .map((MapFunction) v -> { + Result r = new Result(); + final String cleanedPid = CleaningFunctions.normalizePidValue(DOI, v.getId()); + + r.setId(DHPUtils.generateUnresolvedIdentifier(v.getId(), DOI)); + Instance inst = new Instance(); + inst.setMeasures(getMeasure(v)); + + inst + .setPid( + Arrays + .asList( + OafMapperUtils + .structuredProperty( + cleanedPid, + OafMapperUtils + .qualifier( + DOI, DOI_CLASSNAME, + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES), + null))); + r.setInstance(Arrays.asList(inst)); + r + .setDataInfo( + OafMapperUtils + .dataInfo( + false, null, true, + false, + OafMapperUtils + .qualifier( + ModelConstants.PROVENANCE_ENRICH, + null, + ModelConstants.DNET_PROVENANCE_ACTIONS, + ModelConstants.DNET_PROVENANCE_ACTIONS), + null)); + return r; + }, Encoders.bean(Result.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath + "/bip"); + } + + private static List getMeasure(BipScore value) { + return value + 
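// one Measure per score id; each unit entry becomes a KeyValue stamped with the measure:bip update provenance +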
.getScoreList() + .stream() + .map(score -> { + Measure m = new Measure(); + m.setId(score.getId()); + m + .setUnit( + score + .getUnit() + .stream() + .map(unit -> { + KeyValue kv = new KeyValue(); + kv.setValue(unit.getValue()); + kv.setKey(unit.getKey()); + kv + .setDataInfo( + OafMapperUtils + .dataInfo( + false, + UPDATE_DATA_INFO_TYPE, + true, + false, + OafMapperUtils + .qualifier( + UPDATE_MEASURE_BIP_CLASS_ID, + UPDATE_CLASS_NAME, + ModelConstants.DNET_PROVENANCE_ACTIONS, + ModelConstants.DNET_PROVENANCE_ACTIONS), + "")); + return kv; + }) + .collect(Collectors.toList())); + return m; + }) + .collect(Collectors.toList()); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java new file mode 100644 index 000000000..4d2d25215 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java @@ -0,0 +1,116 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities; + +import static eu.dnetlib.dhp.actionmanager.Constants.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.*; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.function.MapGroupsFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.schema.oaf.Subject; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import eu.dnetlib.dhp.utils.DHPUtils; + +public class PrepareFOSSparkJob implements Serializable { + private static final Logger log = LoggerFactory.getLogger(PrepareFOSSparkJob.class); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + PrepareFOSSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String sourcePath = parser.get("sourcePath"); + log.info("sourcePath: {}", sourcePath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + distributeFOSdois( + spark, + sourcePath, + + outputPath); + }); + } + + private static void distributeFOSdois(SparkSession spark, String sourcePath, String outputPath) { + Dataset fosDataset = readPath(spark, sourcePath, FOSDataModel.class); + + fosDataset + .groupByKey((MapFunction) v -> v.getDoi().toLowerCase(), Encoders.STRING()) + 
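// one unresolved Result per DOI: the level1/level2/level3 values of all grouped rows are merged into a single subject list +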
.mapGroups((MapGroupsFunction) (k, it) -> { + Result r = new Result(); + FOSDataModel first = it.next(); + r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI)); + + HashSet level1 = new HashSet<>(); + HashSet level2 = new HashSet<>(); + HashSet level3 = new HashSet<>(); + addLevels(level1, level2, level3, first); + it.forEachRemaining(v -> addLevels(level1, level2, level3, v)); + List sbjs = new ArrayList<>(); + level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID))); + level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID))); + level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID))); + r.setSubject(sbjs); + r + .setDataInfo( + OafMapperUtils + .dataInfo( + false, null, true, + false, + OafMapperUtils + .qualifier( + ModelConstants.PROVENANCE_ENRICH, + null, + ModelConstants.DNET_PROVENANCE_ACTIONS, + ModelConstants.DNET_PROVENANCE_ACTIONS), + null)); + return r; + }, Encoders.bean(Result.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath + "/fos"); + } + + private static void addLevels(HashSet level1, HashSet level2, HashSet level3, + FOSDataModel first) { + level1.add(first.getLevel1()); + level2.add(first.getLevel2()); + level3.add(first.getLevel3()); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareSDGSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareSDGSparkJob.java new file mode 100644 index 000000000..bfdf14234 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareSDGSparkJob.java @@ -0,0 +1,105 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities; + +import static eu.dnetlib.dhp.actionmanager.Constants.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.function.MapGroupsFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.SDGDataModel; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.schema.oaf.Subject; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import eu.dnetlib.dhp.utils.DHPUtils; + +public class PrepareSDGSparkJob implements Serializable { + private static final Logger log = LoggerFactory.getLogger(PrepareSDGSparkJob.class); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + PrepareSDGSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = 
isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String sourcePath = parser.get("sourcePath"); + log.info("sourcePath: {}", sourcePath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + doPrepare( + spark, + sourcePath, + + outputPath); + }); + } + + private static void doPrepare(SparkSession spark, String sourcePath, String outputPath) { + Dataset sdgDataset = readPath(spark, sourcePath, SDGDataModel.class); + + sdgDataset + .groupByKey((MapFunction) r -> r.getDoi().toLowerCase(), Encoders.STRING()) + .mapGroups((MapGroupsFunction) (k, it) -> { + Result r = new Result(); + r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI)); + SDGDataModel first = it.next(); + List sbjs = new ArrayList<>(); + sbjs.add(getSubject(first.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)); + it + .forEachRemaining( + s -> sbjs + .add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID))); + r.setSubject(sbjs); + r + .setDataInfo( + OafMapperUtils + .dataInfo( + false, null, true, + false, + OafMapperUtils + .qualifier( + ModelConstants.PROVENANCE_ENRICH, + null, + ModelConstants.DNET_PROVENANCE_ACTIONS, + ModelConstants.DNET_PROVENANCE_ACTIONS), + null)); + return r; + }, Encoders.bean(Result.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath + "/sdg"); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/SparkSaveUnresolved.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/SparkSaveUnresolved.java new file mode 100644 index 000000000..3b9775094 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/SparkSaveUnresolved.java @@ -0,0 +1,92 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities; + +import static eu.dnetlib.dhp.actionmanager.Constants.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.function.MapGroupsFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Result; + +public class SparkSaveUnresolved implements Serializable { + private static final Logger log = LoggerFactory.getLogger(SparkSaveUnresolved.class); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + PrepareFOSSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/createunresolvedentities/produce_unresolved_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String sourcePath = parser.get("sourcePath"); + log.info("sourcePath: {}", sourcePath); + + final 
String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + saveUnresolved( + spark, + sourcePath, + + outputPath); + }); + } + + private static void saveUnresolved(SparkSession spark, String sourcePath, String outputPath) { + + spark + .read() + .textFile(sourcePath + "/*") + .map( + (MapFunction) l -> OBJECT_MAPPER.readValue(l, Result.class), + Encoders.bean(Result.class)) + .groupByKey((MapFunction) Result::getId, Encoders.STRING()) + .mapGroups((MapGroupsFunction) (k, it) -> { + Result ret = it.next(); + it.forEachRemaining(r -> { + if (r.getInstance() != null) { + ret.setInstance(r.getInstance()); + } + if (r.getSubject() != null) { + if (ret.getSubject() != null) + ret.getSubject().addAll(r.getSubject()); + else + ret.setSubject(r.getSubject()); + } + + // ret.mergeFrom(r) + }); + return ret; + }, Encoders.bean(Result.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/FOSDataModel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/FOSDataModel.java new file mode 100644 index 000000000..e98ba74a1 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/FOSDataModel.java @@ -0,0 +1,71 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model; + +import java.io.Serializable; + +import com.opencsv.bean.CsvBindByPosition; + +public class FOSDataModel implements Serializable { + @CsvBindByPosition(position = 0) +// @CsvBindByName(column = "doi") + private String doi; + + @CsvBindByPosition(position = 1) +// @CsvBindByName(column = "level1") + private String level1; + + @CsvBindByPosition(position = 2) +// @CsvBindByName(column = "level2") + private String level2; + + @CsvBindByPosition(position = 3) +// @CsvBindByName(column = "level3") + private String level3; + + public FOSDataModel() { + + } + + public FOSDataModel(String doi, String level1, String level2, String level3) { + this.doi = doi; + this.level1 = level1; + this.level2 = level2; + this.level3 = level3; + } + + public static FOSDataModel newInstance(String d, String level1, String level2, String level3) { + return new FOSDataModel(d, level1, level2, level3); + } + + public String getDoi() { + return doi; + } + + public void setDoi(String doi) { + this.doi = doi; + } + + public String getLevel1() { + return level1; + } + + public void setLevel1(String level1) { + this.level1 = level1; + } + + public String getLevel2() { + return level2; + } + + public void setLevel2(String level2) { + this.level2 = level2; + } + + public String getLevel3() { + return level3; + } + + public void setLevel3(String level3) { + this.level3 = level3; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/SDGDataModel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/SDGDataModel.java new file mode 100644 index 000000000..98ba5045c --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/model/SDGDataModel.java @@ -0,0 +1,47 @@ + +package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model; + +import 
java.io.Serializable; + +import com.opencsv.bean.CsvBindByPosition; + +public class SDGDataModel implements Serializable { + + @CsvBindByPosition(position = 0) +// @CsvBindByName(column = "doi") + private String doi; + + @CsvBindByPosition(position = 1) +// @CsvBindByName(column = "sdg") + private String sbj; + + public SDGDataModel() { + + } + + public SDGDataModel(String doi, String sbj) { + this.doi = doi; + this.sbj = sbj; + + } + + public static SDGDataModel newInstance(String d, String sbj) { + return new SDGDataModel(d, sbj); + } + + public String getDoi() { + return doi; + } + + public void setDoi(String doi) { + this.doi = doi; + } + + public String getSbj() { + return sbj; + } + + public void setSbj(String sbj) { + this.sbj = sbj; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java new file mode 100644 index 000000000..61bc3fbca --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/CreateActionSetSparkJob.java @@ -0,0 +1,190 @@ + +package eu.dnetlib.dhp.actionmanager.opencitations; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.IOException; +import java.io.Serializable; +import java.util.*; + +import org.apache.commons.cli.ParseException; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; +import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; +import scala.Tuple2; + +public class CreateActionSetSparkJob implements Serializable { + public static final String OPENCITATIONS_CLASSID = "sysimport:crosswalk:opencitations"; + public static final String OPENCITATIONS_CLASSNAME = "Imported from OpenCitations"; + private static final String ID_PREFIX = "50|doi_________::"; + private static final String TRUST = "0.91"; + + private static final Logger log = LoggerFactory.getLogger(CreateActionSetSparkJob.class); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + public static void main(final String[] args) throws IOException, ParseException { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + Objects + .requireNonNull( + CreateActionSetSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json")))); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + + log.info("isSparkSessionManaged: {}", 
isSparkSessionManaged); + + final String inputPath = parser.get("inputPath"); + log.info("inputPath {}", inputPath.toString()); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath {}", outputPath); + + final boolean shouldDuplicateRels = Optional + .ofNullable(parser.get("shouldDuplicateRels")) + .map(Boolean::valueOf) + .orElse(Boolean.FALSE); + + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + extractContent(spark, inputPath, outputPath, shouldDuplicateRels); + }); + + } + + private static void extractContent(SparkSession spark, String inputPath, String outputPath, + boolean shouldDuplicateRels) { + spark + .read() + .textFile(inputPath + "/*") + .map( + (MapFunction) value -> OBJECT_MAPPER.readValue(value, COCI.class), + Encoders.bean(COCI.class)) + .flatMap( + (FlatMapFunction) value -> createRelation(value, shouldDuplicateRels).iterator(), + Encoders.bean(Relation.class)) + .filter((FilterFunction) value -> value != null) + .toJavaRDD() + .map(p -> new AtomicAction(p.getClass(), p)) + .mapToPair( + aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), + new Text(OBJECT_MAPPER.writeValueAsString(aa)))) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); + + } + + private static List createRelation(COCI value, boolean duplicate) { + + List relationList = new ArrayList<>(); + + String citing = ID_PREFIX + + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCiting())); + final String cited = ID_PREFIX + + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", value.getCited())); + + if (!citing.equals(cited)) { + relationList + .addAll( + getRelations( + citing, + cited)); + + if (duplicate && value.getCiting().endsWith(".refs")) { + citing = ID_PREFIX + IdentifierFactory + .md5( + CleaningFunctions + .normalizePidValue( + "doi", value.getCiting().substring(0, value.getCiting().indexOf(".refs")))); + relationList.addAll(getRelations(citing, cited)); + } + } + + return relationList; + } + + private static Collection getRelations(String citing, String cited) { + + return Arrays + .asList( + getRelation(citing, cited, ModelConstants.CITES), + getRelation(cited, citing, ModelConstants.IS_CITED_BY)); + } + + public static Relation getRelation( + String source, + String target, + String relclass) { + Relation r = new Relation(); + r.setCollectedfrom(getCollectedFrom()); + r.setSource(source); + r.setTarget(target); + r.setRelClass(relclass); + r.setRelType(ModelConstants.RESULT_RESULT); + r.setSubRelType(ModelConstants.CITATION); + r + .setDataInfo( + getDataInfo()); + return r; + } + + public static List getCollectedFrom() { + KeyValue kv = new KeyValue(); + kv.setKey(ModelConstants.OPENOCITATIONS_ID); + kv.setValue(ModelConstants.OPENOCITATIONS_NAME); + + return Arrays.asList(kv); + } + + public static DataInfo getDataInfo() { + DataInfo di = new DataInfo(); + di.setInferred(false); + di.setDeletedbyinference(false); + di.setTrust(TRUST); + + di + .setProvenanceaction( + getQualifier(OPENCITATIONS_CLASSID, OPENCITATIONS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS)); + return di; + } + + public static Qualifier getQualifier(String class_id, String class_name, + String qualifierSchema) { + Qualifier pa = new Qualifier(); + pa.setClassid(class_id); + pa.setClassname(class_name); + pa.setSchemeid(qualifierSchema); + pa.setSchemename(qualifierSchema); + return pa; + } + +} diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/GetOpenCitationsRefs.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/GetOpenCitationsRefs.java new file mode 100644 index 000000000..3530c9980 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/GetOpenCitationsRefs.java @@ -0,0 +1,93 @@ + +package eu.dnetlib.dhp.actionmanager.opencitations; + +import java.io.*; +import java.io.Serializable; +import java.util.Arrays; +import java.util.Objects; +import java.util.zip.GZIPOutputStream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +import org.apache.commons.cli.ParseException; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataInputStream; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +public class GetOpenCitationsRefs implements Serializable { + private static final Logger log = LoggerFactory.getLogger(GetOpenCitationsRefs.class); + + public static void main(final String[] args) throws IOException, ParseException { + + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + Objects + .requireNonNull( + GetOpenCitationsRefs.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json")))); + + parser.parseArgument(args); + + final String[] inputFile = parser.get("inputFile").split(";"); + log.info("inputFile {}", Arrays.toString(inputFile)); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath {}", workingPath); + + final String hdfsNameNode = parser.get("hdfsNameNode"); + log.info("hdfsNameNode {}", hdfsNameNode); + + Configuration conf = new Configuration(); + conf.set("fs.defaultFS", hdfsNameNode); + + FileSystem fileSystem = FileSystem.get(conf); + + GetOpenCitationsRefs ocr = new GetOpenCitationsRefs(); + + for (String file : inputFile) { + ocr.doExtract(workingPath + "/Original/" + file, workingPath, fileSystem); + } + + } + + private void doExtract(String inputFile, String workingPath, FileSystem fileSystem) + throws IOException { + + final Path path = new Path(inputFile); + + FSDataInputStream oc_zip = fileSystem.open(path); + + int count = 1; + try (ZipInputStream zis = new ZipInputStream(oc_zip)) { + ZipEntry entry = null; + while ((entry = zis.getNextEntry()) != null) { + + if (!entry.isDirectory()) { + String fileName = entry.getName(); + fileName = fileName.substring(0, fileName.indexOf("T")) + "_" + count; + count++; + try ( + FSDataOutputStream out = fileSystem + .create(new Path(workingPath + "/COCI/" + fileName + ".gz")); + GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) { + + IOUtils.copy(zis, gzipOs); + + } + } + + } + + } + + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCI.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCI.java new file mode 100644 index 000000000..4293ca187 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/ReadCOCI.java @@ -0,0 +1,103 @@ + +package eu.dnetlib.dhp.actionmanager.opencitations; + +import static 
eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER; +import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.IOException; +import java.io.Serializable; +import java.util.Arrays; +import java.util.Optional; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.LocatedFileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.RemoteIterator; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.actionmanager.opencitations.model.COCI; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; + +public class ReadCOCI implements Serializable { + + private static final Logger log = LoggerFactory.getLogger(ReadCOCI.class); + + public static void main(String[] args) throws Exception { + String jsonConfiguration = IOUtils + .toString( + ReadCOCI.class + .getResourceAsStream( + "/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + final String[] inputFile = parser.get("inputFile").split(";"); + log.info("inputFile {}", Arrays.toString(inputFile)); + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String workingPath = parser.get("workingPath"); + log.info("workingPath {}", workingPath); + + SparkConf sconf = new SparkConf(); + + final String delimiter = Optional + .ofNullable(parser.get("delimiter")) + .orElse(DEFAULT_DELIMITER); + + runWithSparkSession( + sconf, + isSparkSessionManaged, + spark -> { + doRead( + spark, + workingPath, + inputFile, + outputPath, + delimiter); + }); + } + + private static void doRead(SparkSession spark, String workingPath, String[] inputFiles, + String outputPath, + String delimiter) throws IOException { + + for (String inputFile : inputFiles) { + String p_string = workingPath + "/" + inputFile + ".gz"; + + Dataset cociData = spark + .read() + .format("csv") + .option("sep", delimiter) + .option("inferSchema", "true") + .option("header", "true") + .option("quotes", "\"") + .load(p_string) + .repartition(100); + + cociData.map((MapFunction) row -> { + COCI coci = new COCI(); + coci.setOci(row.getString(0)); + coci.setCiting(row.getString(1)); + coci.setCited(row.getString(2)); + return coci; + }, Encoders.bean(COCI.class)) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(outputPath + inputFile); + } + + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/model/COCI.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/model/COCI.java new file mode 100644 index 000000000..c1ef1abad --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/opencitations/model/COCI.java @@ -0,0 +1,39 @@ + +package eu.dnetlib.dhp.actionmanager.opencitations.model; + +import java.io.Serializable; + +import com.opencsv.bean.CsvBindByPosition; + +public class COCI implements Serializable { + private String oci; + + private String citing; + 
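// DOI of the cited work: the fields mirror the three columns of the COCI CSV dump (oci, citing, cited; see ReadCOCI above)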
+ private String cited; + + public String getOci() { + return oci; + } + + public void setOci(String oci) { + this.oci = oci; + } + + public String getCiting() { + return citing; + } + + public void setCiting(String citing) { + this.citing = citing; + } + + public String getCited() { + return cited; + } + + public void setCited(String cited) { + this.cited = cited; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java index b2d3253d5..686b0fc7f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProgramme.java @@ -20,7 +20,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme; +import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProgramme; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import scala.Tuple2; @@ -143,24 +143,8 @@ public class PrepareProgramme { JavaRDD h2020Programmes = programme .toJavaRDD() - .filter(p -> p.getFrameworkProgramme().trim().equalsIgnoreCase("H2020")) .mapToPair(csvProgramme -> new Tuple2<>(csvProgramme.getCode(), csvProgramme)) - .reduceByKey((a, b) -> { - if (!a.getLanguage().equals("en")) { - if (b.getLanguage().equalsIgnoreCase("en")) { - a.setTitle(b.getTitle()); - a.setLanguage(b.getLanguage()); - } - } - if (StringUtils.isEmpty(a.getShortTitle())) { - if (!StringUtils.isEmpty(b.getShortTitle())) { - a.setShortTitle(b.getShortTitle()); - } - } - - return a; - - }) + .reduceByKey(PrepareProgramme::groupProgrammeByCode) .map(p -> { CSVProgramme csvProgramme = p._2(); String programmeTitle = csvProgramme.getTitle().trim(); @@ -177,25 +161,33 @@ public class PrepareProgramme { return csvProgramme; }); - // prepareClassification(h2020Programmes); - - JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext()); + final JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); JavaRDD rdd = jsc.parallelize(prepareClassification(h2020Programmes), 1); rdd - .map(csvProgramme -> { - String tmp = OBJECT_MAPPER.writeValueAsString(csvProgramme); - return tmp; - }) + .map(OBJECT_MAPPER::writeValueAsString) .saveAsTextFile(outputPath); } + private static CSVProgramme groupProgrammeByCode(CSVProgramme a, CSVProgramme b) { + if (!a.getLanguage().equals("en") && b.getLanguage().equalsIgnoreCase("en")) { + a.setTitle(b.getTitle()); + a.setLanguage(b.getLanguage()); + } + if (StringUtils.isEmpty(a.getShortTitle()) && !StringUtils.isEmpty(b.getShortTitle())) { + a.setShortTitle(b.getShortTitle()); + } + + return a; + } + + @SuppressWarnings("unchecked") private static List prepareClassification(JavaRDD h2020Programmes) { Object[] codedescription = h2020Programmes .map( value -> new Tuple2<>(value.getCode(), - new Tuple2(value.getTitle(), value.getShortTitle()))) + new Tuple2<>(value.getTitle(), value.getShortTitle()))) .collect() .toArray(); @@ -221,7 +213,7 @@ public class PrepareProgramme { String[] tmp = ent.split("\\."); if (tmp.length <= 2) { if (StringUtils.isEmpty(entry._2()._2())) { - map.put(entry._1(), new Tuple2(entry._2()._1(), entry._2()._1())); + map.put(entry._1(), new Tuple2<>(entry._2()._1(), entry._2()._1())); } else { map.put(entry._1(), 
entry._2()); } @@ -241,15 +233,15 @@ public class PrepareProgramme { if (!ent.contains("Euratom")) { String parent; - String tmp_key = tmp[0] + "."; + String tmpKey = tmp[0] + "."; for (int i = 1; i < tmp.length - 1; i++) { - tmp_key += tmp[i] + "."; - parent = map.get(tmp_key)._1().toLowerCase().trim(); + tmpKey += tmp[i] + "."; + parent = map.get(tmpKey)._1().toLowerCase().trim(); if (parent.contains("|")) { parent = parent.substring(parent.lastIndexOf("|") + 1).trim(); } if (current.trim().length() > parent.length() - && current.toLowerCase().trim().substring(0, parent.length()).equals(parent)) { + && current.toLowerCase().trim().startsWith(parent)) { current = current.substring(parent.length() + 1); if (current.trim().charAt(0) == '-' || current.trim().charAt(0) == '–') { current = current.trim().substring(1).trim(); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java index e5cae0ff7..8efc76a0e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/PrepareProjects.java @@ -18,8 +18,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme; -import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject; +import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import scala.Tuple2; @@ -30,9 +29,8 @@ import scala.Tuple2; */ public class PrepareProjects { - private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class); + private static final Logger log = LoggerFactory.getLogger(PrepareProjects.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final HashMap programmeMap = new HashMap<>(); public static void main(String[] args) throws Exception { @@ -93,7 +91,7 @@ public class PrepareProjects { } private static FlatMapFunction, CSVProject> getTuple2CSVProjectFlatMapFunction() { - return (FlatMapFunction, CSVProject>) value -> { + return value -> { Optional csvProject = Optional.ofNullable(value._2()); List csvProjectList = new ArrayList<>(); if (csvProject.isPresent()) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsFromDB.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsFromDB.java index 2bba9fb60..2cc20cb15 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsFromDB.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsFromDB.java @@ -31,15 +31,16 @@ import eu.dnetlib.dhp.common.DbClient; */ public class ReadProjectsFromDB implements Closeable { - private final DbClient dbClient; private static final Log log = LogFactory.getLog(ReadProjectsFromDB.class); + + private static final String query = "SELECT code " + + "from projects where id like 'corda__h2020%' "; + + private final DbClient dbClient; private final Configuration conf; private final BufferedWriter writer; private final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private final static String query = "SELECT code " + - "from projects where 
id like 'corda__h2020%' "; - public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils @@ -65,9 +66,9 @@ public class ReadProjectsFromDB implements Closeable { } } - public void execute(final String sql, final Function> producer) throws Exception { + public void execute(final String sql, final Function> producer) { - final Consumer consumer = rs -> producer.apply(rs).forEach(r -> writeProject(r)); + final Consumer consumer = rs -> producer.apply(rs).forEach(this::writeProject); dbClient.processResults(sql, consumer); } @@ -94,20 +95,20 @@ public class ReadProjectsFromDB implements Closeable { public ReadProjectsFromDB( final String hdfsPath, String hdfsNameNode, final String dbUrl, final String dbUser, final String dbPassword) - throws Exception { + throws IOException { this.dbClient = new DbClient(dbUrl, dbUser, dbPassword); this.conf = new Configuration(); this.conf.set("fs.defaultFS", hdfsNameNode); FileSystem fileSystem = FileSystem.get(this.conf); Path hdfsWritePath = new Path(hdfsPath); - FSDataOutputStream fsDataOutputStream = null; + if (fileSystem.exists(hdfsWritePath)) { fileSystem.delete(hdfsWritePath, false); } - fsDataOutputStream = fileSystem.create(hdfsWritePath); + FSDataOutputStream fos = fileSystem.create(hdfsWritePath); - this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8)); + this.writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8)); } @Override diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java index a583b7bfa..cc1411b31 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/SparkAtomicActionJob.java @@ -4,7 +4,6 @@ package eu.dnetlib.dhp.actionmanager.project; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.Arrays; -import java.util.HashMap; import java.util.Objects; import java.util.Optional; @@ -22,15 +21,16 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme; -import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject; -import eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic; +import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProgramme; +import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject; +import eu.dnetlib.dhp.actionmanager.project.utils.model.EXCELTopic; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.H2020Classification; import eu.dnetlib.dhp.schema.oaf.H2020Programme; +import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; @@ -47,13 +47,10 @@ import scala.Tuple2; * * To produce one single entry for each project code a step of groupoing is needed: each project can be associated to more * than one programme. 
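In the refactored job, this grouping is the `groupByKey`/`mapGroups` pair keyed on `OafEntity::getId`, with `mergeFrom` folding the duplicates into the first record. A self-contained sketch of that pattern, with a hypothetical `Rec` bean standing in for `Project` and a simplified `mergeFrom`:

import java.io.Serializable;
import java.util.Arrays;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class GroupByIdSketch {

	// Stand-in bean: merge keeps the first record and fills in missing fields.
	public static class Rec implements Serializable {
		private String id;
		private String topic;

		public String getId() { return id; }
		public void setId(String id) { this.id = id; }
		public String getTopic() { return topic; }
		public void setTopic(String topic) { this.topic = topic; }

		public static Rec of(String id, String topic) {
			Rec r = new Rec();
			r.id = id;
			r.topic = topic;
			return r;
		}

		public void mergeFrom(Rec other) {
			if (topic == null) topic = other.topic;
		}
	}

	public static void main(String[] args) {
		SparkSession spark = SparkSession.builder().master("local[*]").appName("sketch").getOrCreate();

		Dataset<Rec> recs = spark
			.createDataset(
				Arrays.asList(Rec.of("p1", null), Rec.of("p1", "topicA"), Rec.of("p2", "topicB")),
				Encoders.bean(Rec.class));

		// One output record per id, exactly like the groupByKey/mapGroups step in the job.
		Dataset<Rec> merged = recs
			.groupByKey((MapFunction<Rec, String>) Rec::getId, Encoders.STRING())
			.mapGroups((MapGroupsFunction<String, Rec, Rec>) (id, it) -> {
				Rec first = it.next();
				it.forEachRemaining(first::mergeFrom);
				return first;
			}, Encoders.bean(Rec.class));

		merged.show(); // p1's missing topic is filled from its duplicate
		spark.stop();
	}
}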
- * - * */ public class SparkAtomicActionJob { private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionJob.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final HashMap programmeMap = new HashMap<>(); public static void main(String[] args) throws Exception { @@ -120,7 +117,6 @@ public class SparkAtomicActionJob { .map((MapFunction, Project>) c -> { CSVProject csvProject = c._1(); - Optional ocsvProgramme = Optional.ofNullable(c._2()); return Optional .ofNullable(c._2()) @@ -135,37 +131,33 @@ public class SparkAtomicActionJob { H2020Programme pm = new H2020Programme(); H2020Classification h2020classification = new H2020Classification(); pm.setCode(csvProject.getProgramme()); - h2020classification.setClassification(ocsvProgramme.get().getClassification()); + h2020classification.setClassification(csvProgramme.getClassification()); h2020classification.setH2020Programme(pm); - setLevelsandProgramme(h2020classification, ocsvProgramme.get().getClassification_short()); - // setProgramme(h2020classification, ocsvProgramme.get().getClassification()); + setLevelsandProgramme(h2020classification, csvProgramme.getClassification_short()); pp.setH2020classification(Arrays.asList(h2020classification)); return pp; }) .orElse(null); - }, Encoders.bean(Project.class)); + }, Encoders.bean(Project.class)) + .filter(Objects::nonNull); aaproject - .joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code"))) + .joinWith(topic, aaproject.col("h2020topiccode").equalTo(topic.col("code")), "left") .map((MapFunction, Project>) p -> { Optional op = Optional.ofNullable(p._2()); Project rp = p._1(); - if (op.isPresent()) { - rp.setH2020topicdescription(op.get().getTitle()); - } + op.ifPresent(excelTopic -> rp.setH2020topicdescription(excelTopic.getTitle())); return rp; }, Encoders.bean(Project.class)) .filter(Objects::nonNull) .groupByKey( - (MapFunction) p -> p.getId(), + (MapFunction) OafEntity::getId, Encoders.STRING()) .mapGroups((MapGroupsFunction) (s, it) -> { Project first = it.next(); - it.forEachRemaining(p -> { - first.mergeFrom(p); - }); + it.forEachRemaining(first::mergeFrom); return first; }, Encoders.bean(Project.class)) .toJavaRDD() @@ -189,12 +181,6 @@ public class SparkAtomicActionJob { h2020Classification.getH2020Programme().setDescription(tmp[tmp.length - 1]); } -// private static void setProgramme(H2020Classification h2020Classification, String classification) { -// String[] tmp = classification.split(" \\| "); -// -// h2020Classification.getH2020Programme().setDescription(tmp[tmp.length - 1]); -// } - public static Dataset readPath( SparkSession spark, String inputPath, Class clazz) { return spark diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorPluginErrorLogList.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorPluginErrorLogList.java deleted file mode 100644 index 9d3f88265..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorPluginErrorLogList.java +++ /dev/null @@ -1,20 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.project.httpconnector; - -import java.util.LinkedList; - -public class CollectorPluginErrorLogList extends LinkedList { - - private static final long serialVersionUID = -6925786561303289704L; - - @Override - public String toString() { - String log = new String(); - int index = 0; - for (String errorMessage : 
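`setLevelsandProgramme` above derives the classification levels and the programme description from the `classification_short` string; only its tail is visible in this diff (the description is the last element of the ` | `-separated path). A hypothetical mirror of that parsing, assuming the separator shown in the removed `setProgramme` variant and that the path elements map to levels in order:

public class ClassificationLevelsSketch {

	public static void main(String[] args) {
		// Sample path, invented for illustration.
		String classificationShort = "Excellent science | Future and Emerging Technologies (FET) | FET Open";

		String[] tmp = classificationShort.split(" \\| ");
		String level1 = tmp[0];
		String level2 = tmp.length > 1 ? tmp[1] : null;
		String level3 = tmp.length > 2 ? tmp[2] : null;
		// The only part of the helper visible in the diff: the programme
		// description is the last element of the path.
		String programmeDescription = tmp[tmp.length - 1];

		System.out.printf("l1=%s, l2=%s, l3=%s, programme=%s%n", level1, level2, level3, programmeDescription);
	}
}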
this) { - log += String.format("Retry #%s: %s / ", index++, errorMessage); - } - return log; - } - -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorServiceException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorServiceException.java deleted file mode 100644 index 9167d97b4..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/CollectorServiceException.java +++ /dev/null @@ -1,20 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.project.httpconnector; - -public class CollectorServiceException extends Exception { - - private static final long serialVersionUID = 7523999812098059764L; - - public CollectorServiceException(String string) { - super(string); - } - - public CollectorServiceException(String string, Throwable exception) { - super(string, exception); - } - - public CollectorServiceException(Throwable exception) { - super(exception); - } - -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnector.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnector.java deleted file mode 100644 index e20518b55..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/httpconnector/HttpConnector.java +++ /dev/null @@ -1,240 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.project.httpconnector; - -import java.io.IOException; -import java.io.InputStream; -import java.net.*; -import java.security.GeneralSecurityException; -import java.security.cert.X509Certificate; -import java.util.List; -import java.util.Map; - -import javax.net.ssl.HttpsURLConnection; -import javax.net.ssl.SSLContext; -import javax.net.ssl.TrustManager; -import javax.net.ssl.X509TrustManager; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.math.NumberUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -/** - * @author jochen, michele, andrea - */ -public class HttpConnector { - - private static final Log log = LogFactory.getLog(HttpConnector.class); - - private int maxNumberOfRetry = 6; - private int defaultDelay = 120; // seconds - private int readTimeOut = 120; // seconds - - private String responseType = null; - - private String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; - - public HttpConnector() { - CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); - } - - /** - * Given the URL returns the content via HTTP GET - * - * @param requestUrl the URL - * @return the content of the downloaded resource - * @throws CollectorServiceException when retrying more than maxNumberOfRetry times - */ - public String getInputSource(final String requestUrl) throws CollectorServiceException { - return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList()); - } - - /** - * Given the URL returns the content as a stream via HTTP GET - * - * @param requestUrl the URL - * @return the content of the downloaded resource as InputStream - * @throws CollectorServiceException when retrying more than maxNumberOfRetry times - */ - public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorServiceException { - return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - } - - private String attemptDownlaodAsString(final String 
requestUrl, final int retryNumber, - final CollectorPluginErrorLogList errorList) - throws CollectorServiceException { - try { - InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - try { - return IOUtils.toString(s); - } catch (IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList); - } finally { - IOUtils.closeQuietly(s); - } - } catch (InterruptedException e) { - throw new CollectorServiceException(e); - } - } - - private InputStream attemptDownload(final String requestUrl, final int retryNumber, - final CollectorPluginErrorLogList errorList) - throws CollectorServiceException { - - if (retryNumber > maxNumberOfRetry) { - throw new CollectorServiceException("Max number of retries exceeded. Cause: \n " + errorList); - } - - log.debug("Downloading " + requestUrl + " - try: " + retryNumber); - try { - InputStream input = null; - - try { - final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); - urlConn.setInstanceFollowRedirects(false); - urlConn.setReadTimeout(readTimeOut * 1000); - urlConn.addRequestProperty("User-Agent", userAgent); - - if (log.isDebugEnabled()) { - logHeaderFields(urlConn); - } - - int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); - if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) { - log.warn("waiting and repeating request after " + retryAfter + " sec."); - Thread.sleep(retryAfter * 1000); - errorList.add("503 Service Unavailable"); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else if ((urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM) - || (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP)) { - final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); - log.debug("The requested url has been moved to " + newUrl); - errorList - .add( - String - .format( - "%s %s. 
Moved to: %s", urlConn.getResponseCode(), urlConn.getResponseMessage(), - newUrl)); - urlConn.disconnect(); - return attemptDownload(newUrl, retryNumber + 1, errorList); - } else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) { - log - .error( - String - .format("HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - Thread.sleep(defaultDelay * 1000); - errorList.add(String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else { - input = urlConn.getInputStream(); - responseType = urlConn.getContentType(); - return input; - } - } catch (IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } - } catch (InterruptedException e) { - throw new CollectorServiceException(e); - } - } - - private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { - log.debug("StatusCode: " + urlConn.getResponseMessage()); - - for (Map.Entry> e : urlConn.getHeaderFields().entrySet()) { - if (e.getKey() != null) { - for (String v : e.getValue()) { - log.debug(" key: " + e.getKey() + " - value: " + v); - } - } - } - } - - private int obtainRetryAfter(final Map> headerMap) { - for (String key : headerMap.keySet()) { - if ((key != null) && key.toLowerCase().equals("retry-after") && (headerMap.get(key).size() > 0) - && NumberUtils.isCreatable(headerMap.get(key).get(0))) { - return Integer - .parseInt(headerMap.get(key).get(0)) + 10; - } - } - return -1; - } - - private String obtainNewLocation(final Map> headerMap) throws CollectorServiceException { - for (String key : headerMap.keySet()) { - if ((key != null) && key.toLowerCase().equals("location") && (headerMap.get(key).size() > 0)) { - return headerMap.get(key).get(0); - } - } - throw new CollectorServiceException("The requested url has been MOVED, but 'location' param is MISSING"); - } - - /** - * register for https scheme; this is a workaround and not intended for the use in trusted environments - */ - public void initTrustManager() { - final X509TrustManager tm = new X509TrustManager() { - - @Override - public void checkClientTrusted(final X509Certificate[] xcs, final String string) { - } - - @Override - public void checkServerTrusted(final X509Certificate[] xcs, final String string) { - } - - @Override - public X509Certificate[] getAcceptedIssuers() { - return null; - } - }; - try { - final SSLContext ctx = SSLContext.getInstance("TLS"); - ctx.init(null, new TrustManager[] { - tm - }, null); - HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory()); - } catch (GeneralSecurityException e) { - log.fatal(e); - throw new IllegalStateException(e); - } - } - - public int getMaxNumberOfRetry() { - return maxNumberOfRetry; - } - - public void setMaxNumberOfRetry(final int maxNumberOfRetry) { - this.maxNumberOfRetry = maxNumberOfRetry; - } - - public int getDefaultDelay() { - return defaultDelay; - } - - public void setDefaultDelay(final int defaultDelay) { - this.defaultDelay = defaultDelay; - } - - public int getReadTimeOut() { - return readTimeOut; - } - - public void setReadTimeOut(final int readTimeOut) { - this.readTimeOut = readTimeOut; - } - - public String getResponseType() { - return responseType; - } - -} diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVParser.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVParser.java deleted file mode 100644 index 8bdce903b..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVParser.java +++ /dev/null @@ -1,40 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.project.utils; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Set; - -import org.apache.commons.csv.CSVFormat; -import org.apache.commons.csv.CSVRecord; -import org.apache.commons.lang.reflect.FieldUtils; - -/** - * Reads a generic csv and maps it into classes that mirror its schema - */ -public class CSVParser { - - public List parse(String csvFile, String classForName) - throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException { - final CSVFormat format = CSVFormat.EXCEL - .withHeader() - .withDelimiter(';') - .withQuote('"') - .withTrim(); - List ret = new ArrayList<>(); - final org.apache.commons.csv.CSVParser parser = org.apache.commons.csv.CSVParser.parse(csvFile, format); - final Set headers = parser.getHeaderMap().keySet(); - Class clazz = Class.forName(classForName); - for (CSVRecord csvRecord : parser.getRecords()) { - final Object cc = clazz.newInstance(); - for (String header : headers) { - FieldUtils.writeField(cc, header, csvRecord.get(header), true); - - } - ret.add((R) cc); - } - - return ret; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java deleted file mode 100644 index f991a4297..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProgramme.java +++ /dev/null @@ -1,146 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.project.utils; - -import java.io.Serializable; - -/** - * The model for the programme csv file - */ -public class CSVProgramme implements Serializable { - private String parentProgramme; - private String frameworkProgramme; - private String startDate; - private String endDate; - private String objective; - private String subjects; - private String legalBasis; - private String call; - private String rcn; - private String code; - - private String title; - private String shortTitle; - private String language; - private String classification; - private String classification_short; - - public String getClassification_short() { - return classification_short; - } - - public void setClassification_short(String classification_short) { - this.classification_short = classification_short; - } - - public String getClassification() { - return classification; - } - - public void setClassification(String classification) { - this.classification = classification; - } - - public String getRcn() { - return rcn; - } - - public void setRcn(String rcn) { - this.rcn = rcn; - } - - public String getCode() { - return code; - } - - public void setCode(String code) { - this.code = code; - } - - public String getTitle() { - return title; - } - - public void setTitle(String title) { - this.title = title; - } - - public String getShortTitle() { - return shortTitle; - } - - public void setShortTitle(String shortTitle) { - this.shortTitle = shortTitle; - } - - public String getLanguage() { - return language; - } - - public void 
setLanguage(String language) { - this.language = language; - } - - public String getParentProgramme() { - return parentProgramme; - } - - public void setParentProgramme(String parentProgramme) { - this.parentProgramme = parentProgramme; - } - - public String getFrameworkProgramme() { - return frameworkProgramme; - } - - public void setFrameworkProgramme(String frameworkProgramme) { - this.frameworkProgramme = frameworkProgramme; - } - - public String getStartDate() { - return startDate; - } - - public void setStartDate(String startDate) { - this.startDate = startDate; - } - - public String getEndDate() { - return endDate; - } - - public void setEndDate(String endDate) { - this.endDate = endDate; - } - - public String getObjective() { - return objective; - } - - public void setObjective(String objective) { - this.objective = objective; - } - - public String getSubjects() { - return subjects; - } - - public void setSubjects(String subjects) { - this.subjects = subjects; - } - - public String getLegalBasis() { - return legalBasis; - } - - public void setLegalBasis(String legalBasis) { - this.legalBasis = legalBasis; - } - - public String getCall() { - return call; - } - - public void setCall(String call) { - this.call = call; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProject.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProject.java deleted file mode 100644 index 268d5f28c..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/CSVProject.java +++ /dev/null @@ -1,200 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.project.utils; - -import java.io.Serializable; - -/** - * the mmodel for the projects csv file - */ -public class CSVProject implements Serializable { - private String rcn; - private String id; - private String acronym; - private String status; - private String programme; - private String topics; - private String frameworkProgramme; - private String title; - private String startDate; - private String endDate; - private String projectUrl; - private String objective; - private String totalCost; - private String ecMaxContribution; - private String call; - private String fundingScheme; - private String coordinator; - private String coordinatorCountry; - private String participants; - private String participantCountries; - private String subjects; - - public String getRcn() { - return rcn; - } - - public void setRcn(String rcn) { - this.rcn = rcn; - } - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public String getAcronym() { - return acronym; - } - - public void setAcronym(String acronym) { - this.acronym = acronym; - } - - public String getStatus() { - return status; - } - - public void setStatus(String status) { - this.status = status; - } - - public String getProgramme() { - return programme; - } - - public void setProgramme(String programme) { - this.programme = programme; - } - - public String getTopics() { - return topics; - } - - public void setTopics(String topics) { - this.topics = topics; - } - - public String getFrameworkProgramme() { - return frameworkProgramme; - } - - public void setFrameworkProgramme(String frameworkProgramme) { - this.frameworkProgramme = frameworkProgramme; - } - - public String getTitle() { - return title; - } - - public void setTitle(String title) { - this.title = title; - } - - public String getStartDate() { - return startDate; - 
} - - public void setStartDate(String startDate) { - this.startDate = startDate; - } - - public String getEndDate() { - return endDate; - } - - public void setEndDate(String endDate) { - this.endDate = endDate; - } - - public String getProjectUrl() { - return projectUrl; - } - - public void setProjectUrl(String projectUrl) { - this.projectUrl = projectUrl; - } - - public String getObjective() { - return objective; - } - - public void setObjective(String objective) { - this.objective = objective; - } - - public String getTotalCost() { - return totalCost; - } - - public void setTotalCost(String totalCost) { - this.totalCost = totalCost; - } - - public String getEcMaxContribution() { - return ecMaxContribution; - } - - public void setEcMaxContribution(String ecMaxContribution) { - this.ecMaxContribution = ecMaxContribution; - } - - public String getCall() { - return call; - } - - public void setCall(String call) { - this.call = call; - } - - public String getFundingScheme() { - return fundingScheme; - } - - public void setFundingScheme(String fundingScheme) { - this.fundingScheme = fundingScheme; - } - - public String getCoordinator() { - return coordinator; - } - - public void setCoordinator(String coordinator) { - this.coordinator = coordinator; - } - - public String getCoordinatorCountry() { - return coordinatorCountry; - } - - public void setCoordinatorCountry(String coordinatorCountry) { - this.coordinatorCountry = coordinatorCountry; - } - - public String getParticipants() { - return participants; - } - - public void setParticipants(String participants) { - this.participants = participants; - } - - public String getParticipantCountries() { - return participantCountries; - } - - public void setParticipantCountries(String participantCountries) { - this.participantCountries = participantCountries; - } - - public String getSubjects() { - return subjects; - } - - public void setSubjects(String subjects) { - this.subjects = subjects; - } - -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java index 0f83499e4..a520176f4 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELParser.java @@ -17,59 +17,63 @@ import org.apache.poi.ss.usermodel.Row; import org.apache.poi.xssf.usermodel.XSSFSheet; import org.apache.poi.xssf.usermodel.XSSFWorkbook; +import eu.dnetlib.dhp.actionmanager.project.utils.model.EXCELTopic; + /** * Reads a generic excel file and maps it into classes that mirror its schema */ public class EXCELParser { - public List parse(InputStream file, String classForName) + public List parse(InputStream file, String classForName, String sheetName) throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException, InvalidFormatException { - // OPCPackage pkg = OPCPackage.open(httpConnector.getInputSourceAsStream(URL)); - OPCPackage pkg = OPCPackage.open(file); - XSSFWorkbook wb = new XSSFWorkbook(pkg); + try (OPCPackage pkg = OPCPackage.open(file); XSSFWorkbook wb = new XSSFWorkbook(pkg)) { - XSSFSheet sheet = wb.getSheet("cordisref-H2020topics"); - - List ret = new ArrayList<>(); - - DataFormatter dataFormatter = new DataFormatter(); - Iterator rowIterator = sheet.rowIterator(); - List headers = new ArrayList<>(); - int count = 0; - 
while (rowIterator.hasNext()) { - Row row = rowIterator.next(); - - if (count == 0) { - Iterator cellIterator = row.cellIterator(); - - while (cellIterator.hasNext()) { - Cell cell = cellIterator.next(); - headers.add(dataFormatter.formatCellValue(cell)); - } - } else { - Class clazz = Class.forName("eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic"); - final Object cc = clazz.newInstance(); - - for (int i = 0; i < headers.size(); i++) { - Cell cell = row.getCell(i); - String value = dataFormatter.formatCellValue(cell); - FieldUtils.writeField(cc, headers.get(i), dataFormatter.formatCellValue(cell), true); - - } - - EXCELTopic et = (EXCELTopic) cc; - if (StringUtils.isNotBlank(et.getRcn())) { - ret.add((R) cc); - } + XSSFSheet sheet = wb.getSheet(sheetName); + if (sheet == null) { + throw new IllegalArgumentException("Sheet name " + sheetName + " not present in current file"); } - count += 1; - } + List ret = new ArrayList<>(); - return ret; + DataFormatter dataFormatter = new DataFormatter(); + Iterator rowIterator = sheet.rowIterator(); + List headers = new ArrayList<>(); + int count = 0; + while (rowIterator.hasNext()) { + Row row = rowIterator.next(); + + if (count == 0) { + Iterator cellIterator = row.cellIterator(); + + while (cellIterator.hasNext()) { + Cell cell = cellIterator.next(); + headers.add(dataFormatter.formatCellValue(cell)); + } + } else { + Class clazz = Class.forName(classForName); + final Object cc = clazz.newInstance(); + + for (int i = 0; i < headers.size(); i++) { + Cell cell = row.getCell(i); + FieldUtils.writeField(cc, headers.get(i), dataFormatter.formatCellValue(cell), true); + + } + + EXCELTopic et = (EXCELTopic) cc; + if (StringUtils.isNotBlank(et.getRcn())) { + ret.add((R) cc); + } + + } + + count += 1; + } + + return ret; + } } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELTopic.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELTopic.java deleted file mode 100644 index 5607df118..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/EXCELTopic.java +++ /dev/null @@ -1,127 +0,0 @@ - -package eu.dnetlib.dhp.actionmanager.project.utils; - -import java.io.Serializable; - -/** - * the model class for the topic excel file - */ -public class EXCELTopic implements Serializable { - private String rcn; - private String language; - private String code; - private String parentProgramme; - private String frameworkProgramme; - private String startDate; - private String endDate; - private String title; - private String shortTitle; - private String objective; - private String subjects; - private String legalBasis; - private String call; - - public String getRcn() { - return rcn; - } - - public void setRcn(String rcn) { - this.rcn = rcn; - } - - public String getLanguage() { - return language; - } - - public void setLanguage(String language) { - this.language = language; - } - - public String getCode() { - return code; - } - - public void setCode(String code) { - this.code = code; - } - - public String getParentProgramme() { - return parentProgramme; - } - - public void setParentProgramme(String parentProgramme) { - this.parentProgramme = parentProgramme; - } - - public String getFrameworkProgramme() { - return frameworkProgramme; - } - - public void setFrameworkProgramme(String frameworkProgramme) { - this.frameworkProgramme = frameworkProgramme; - } - - public String getStartDate() { - return startDate; - } - - 
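The reworked `EXCELParser` above opens the workbook in try-with-resources, fails fast when the requested sheet is missing, and reads cells through a `DataFormatter` so values arrive as displayed strings. A condensed sketch of the same POI access pattern, reading just the header row; the `topics.xlsx` path in `main` is a placeholder:

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

public class ExcelHeaderSketch {

	// Same access pattern as EXCELParser: resources closed automatically,
	// explicit null check on the sheet, DataFormatter for cell values.
	public static List<String> readHeaders(InputStream in, String sheetName)
		throws IOException, InvalidFormatException {
		try (OPCPackage pkg = OPCPackage.open(in); XSSFWorkbook wb = new XSSFWorkbook(pkg)) {
			XSSFSheet sheet = wb.getSheet(sheetName);
			if (sheet == null) {
				throw new IllegalArgumentException("Sheet name " + sheetName + " not present in current file");
			}
			DataFormatter fmt = new DataFormatter();
			List<String> headers = new ArrayList<>();
			Iterator<Row> rows = sheet.rowIterator();
			if (rows.hasNext()) {
				for (Cell cell : rows.next()) {
					headers.add(fmt.formatCellValue(cell));
				}
			}
			return headers;
		}
	}

	public static void main(String[] args) throws Exception {
		try (InputStream in = Files.newInputStream(Paths.get("topics.xlsx"))) {
			readHeaders(in, "cordisref-H2020topics").forEach(System.out::println);
		}
	}
}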
public void setStartDate(String startDate) { - this.startDate = startDate; - } - - public String getEndDate() { - return endDate; - } - - public void setEndDate(String endDate) { - this.endDate = endDate; - } - - public String getTitle() { - return title; - } - - public void setTitle(String title) { - this.title = title; - } - - public String getShortTitle() { - return shortTitle; - } - - public void setShortTitle(String shortTitle) { - this.shortTitle = shortTitle; - } - - public String getObjective() { - return objective; - } - - public void setObjective(String objective) { - this.objective = objective; - } - - public String getSubjects() { - return subjects; - } - - public void setSubjects(String subjects) { - this.subjects = subjects; - } - - public String getLegalBasis() { - return legalBasis; - } - - public void setLegalBasis(String legalBasis) { - this.legalBasis = legalBasis; - } - - public String getCall() { - return call; - } - - public void setCall(String call) { - this.call = call; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java index 9dac34a15..c967d4cae 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadCSV.java @@ -1,34 +1,21 @@ package eu.dnetlib.dhp.actionmanager.project.utils; -import java.io.BufferedWriter; -import java.io.Closeable; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.nio.charset.StandardCharsets; +import java.io.*; +import java.util.Optional; import org.apache.commons.io.IOUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.collection.GetCSV; +import eu.dnetlib.dhp.common.collection.HttpConnector2; /** * Applies the parsing of a csv file and writes the Serialization of it in hdfs */ -public class ReadCSV implements Closeable { - private static final Log log = LogFactory.getLog(ReadCSV.class); - private final Configuration conf; - private final BufferedWriter writer; - private final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private String csvFile; +public class ReadCSV { public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -44,57 +31,22 @@ public class ReadCSV implements Closeable { final String hdfsPath = parser.get("hdfsPath"); final String hdfsNameNode = parser.get("hdfsNameNode"); final String classForName = parser.get("classForName"); + Optional delimiter = Optional.ofNullable(parser.get("delimiter")); + char del = ';'; + if (delimiter.isPresent()) + del = delimiter.get().charAt(0); - try (final ReadCSV readCSV = new ReadCSV(hdfsPath, hdfsNameNode, fileURL)) { + Configuration conf = new Configuration(); + conf.set("fs.defaultFS", hdfsNameNode); - log.info("Getting CSV file..."); - readCSV.execute(classForName); + FileSystem fileSystem = FileSystem.get(conf); + BufferedReader reader 
= new BufferedReader( + new InputStreamReader(new HttpConnector2().getInputSourceAsStream(fileURL))); - } - } + GetCSV.getCsv(fileSystem, reader, hdfsPath, classForName, del); - public void execute(final String classForName) throws Exception { - CSVParser csvParser = new CSVParser(); - csvParser - .parse(csvFile, classForName) - .stream() - .forEach(p -> write(p)); + reader.close(); } - @Override - public void close() throws IOException { - writer.close(); - } - - public ReadCSV( - final String hdfsPath, - final String hdfsNameNode, - final String fileURL) - throws Exception { - this.conf = new Configuration(); - this.conf.set("fs.defaultFS", hdfsNameNode); - HttpConnector httpConnector = new HttpConnector(); - FileSystem fileSystem = FileSystem.get(this.conf); - Path hdfsWritePath = new Path(hdfsPath); - FSDataOutputStream fsDataOutputStream = null; - if (fileSystem.exists(hdfsWritePath)) { - fileSystem.delete(hdfsWritePath, false); - } - fsDataOutputStream = fileSystem.create(hdfsWritePath); - - this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8)); - this.csvFile = httpConnector.getInputSource(fileURL); - ; - } - - protected void write(final Object p) { - try { - writer.write(OBJECT_MAPPER.writeValueAsString(p)); - writer.newLine(); - } catch (final Exception e) { - throw new RuntimeException(e); - } - } - } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java index 23b58f2a0..9e73cbc37 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/ReadExcel.java @@ -11,28 +11,29 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.collection.CollectorException; +import eu.dnetlib.dhp.common.collection.HttpConnector2; /** * Applies the parsing of an excel file and writes the Serialization of it in hdfs */ - public class ReadExcel implements Closeable { - private static final Log log = LogFactory.getLog(ReadCSV.class); - private final Configuration conf; + private static final Log log = LogFactory.getLog(ReadExcel.class); + private final BufferedWriter writer; private final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private InputStream excelFile; + private final InputStream excelFile; public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils .toString( - ReadCSV.class + ReadExcel.class .getResourceAsStream( "/eu/dnetlib/dhp/actionmanager/project/parameters.json"))); @@ -42,22 +43,25 @@ public class ReadExcel implements Closeable { final String hdfsPath = parser.get("hdfsPath"); final String hdfsNameNode = parser.get("hdfsNameNode"); final String classForName = parser.get("classForName"); + final String sheetName = parser.get("sheetName"); try (final ReadExcel readExcel = new ReadExcel(hdfsPath, hdfsNameNode, fileURL)) { log.info("Getting Excel 
file..."); - readExcel.execute(classForName); + readExcel.execute(classForName, sheetName); } } - public void execute(final String classForName) throws Exception { + public void execute(final String classForName, final String sheetName) + throws IOException, ClassNotFoundException, InvalidFormatException, IllegalAccessException, + InstantiationException { + EXCELParser excelParser = new EXCELParser(); excelParser - .parse(excelFile, classForName) + .parse(excelFile, classForName, sheetName) .stream() - .forEach(p -> write(p)); - + .forEach(this::write); } @Override @@ -68,22 +72,21 @@ public class ReadExcel implements Closeable { public ReadExcel( final String hdfsPath, final String hdfsNameNode, - final String fileURL) - throws Exception { - this.conf = new Configuration(); - this.conf.set("fs.defaultFS", hdfsNameNode); - HttpConnector httpConnector = new HttpConnector(); - FileSystem fileSystem = FileSystem.get(this.conf); + final String fileURL) throws CollectorException, IOException { + + final Configuration conf = new Configuration(); + conf.set("fs.defaultFS", hdfsNameNode); + HttpConnector2 httpConnector = new HttpConnector2(); + FileSystem fileSystem = FileSystem.get(conf); Path hdfsWritePath = new Path(hdfsPath); - FSDataOutputStream fsDataOutputStream = null; + if (fileSystem.exists(hdfsWritePath)) { fileSystem.delete(hdfsWritePath, false); } - fsDataOutputStream = fileSystem.create(hdfsWritePath); + FSDataOutputStream fos = fileSystem.create(hdfsWritePath); - this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8)); + this.writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8)); this.excelFile = httpConnector.getInputSourceAsStream(fileURL); - ; } protected void write(final Object p) { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/model/CSVProgramme.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/model/CSVProgramme.java new file mode 100644 index 000000000..df06fd6b4 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/model/CSVProgramme.java @@ -0,0 +1,80 @@ + +package eu.dnetlib.dhp.actionmanager.project.utils.model; + +import java.io.Serializable; + +import com.opencsv.bean.CsvBindByName; +import com.opencsv.bean.CsvIgnore; + +/** + * The model for the programme csv file + */ +public class CSVProgramme implements Serializable { + + @CsvBindByName(column = "code") + private String code; + + @CsvBindByName(column = "title") + private String title; + + @CsvBindByName(column = "shortTitle") + private String shortTitle; + + @CsvBindByName(column = "language") + private String language; + + @CsvIgnore + private String classification; + + @CsvIgnore + private String classification_short; + + public String getClassification_short() { + return classification_short; + } + + public void setClassification_short(String classification_short) { + this.classification_short = classification_short; + } + + public String getClassification() { + return classification; + } + + public void setClassification(String classification) { + this.classification = classification; + } + + public String getCode() { + return code; + } + + public void setCode(String code) { + this.code = code; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getShortTitle() { + return shortTitle; + } + + public void 
setShortTitle(String shortTitle) { + this.shortTitle = shortTitle; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/model/CSVProject.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/model/CSVProject.java new file mode 100644 index 000000000..3ddb19636 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/model/CSVProject.java @@ -0,0 +1,46 @@ + +package eu.dnetlib.dhp.actionmanager.project.utils.model; + +import java.io.Serializable; + +import com.opencsv.bean.CsvBindByName; + +/** + * the mmodel for the projects csv file + */ +public class CSVProject implements Serializable { + + @CsvBindByName(column = "id") + private String id; + + @CsvBindByName(column = "programme") + private String programme; + + @CsvBindByName(column = "topics") + private String topics; + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public String getProgramme() { + return programme; + } + + public void setProgramme(String programme) { + this.programme = programme; + } + + public String getTopics() { + return topics; + } + + public void setTopics(String topics) { + this.topics = topics; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/model/EXCELTopic.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/model/EXCELTopic.java new file mode 100644 index 000000000..fa2e3422e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/project/utils/model/EXCELTopic.java @@ -0,0 +1,127 @@ + +package eu.dnetlib.dhp.actionmanager.project.utils.model; + +import java.io.Serializable; + +/** + * the model class for the topic excel file + */ +public class EXCELTopic implements Serializable { + private String rcn; + private String language; + private String code; + private String parentProgramme; + private String frameworkProgramme; + private String startDate; + private String endDate; + private String title; + private String shortTitle; + private String objective; + private String subjects; + private String legalBasis; + private String call; + + public String getRcn() { + return rcn; + } + + public void setRcn(String rcn) { + this.rcn = rcn; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public String getCode() { + return code; + } + + public void setCode(String code) { + this.code = code; + } + + public String getParentProgramme() { + return parentProgramme; + } + + public void setParentProgramme(String parentProgramme) { + this.parentProgramme = parentProgramme; + } + + public String getFrameworkProgramme() { + return frameworkProgramme; + } + + public void setFrameworkProgramme(String frameworkProgramme) { + this.frameworkProgramme = frameworkProgramme; + } + + public String getStartDate() { + return startDate; + } + + public void setStartDate(String startDate) { + this.startDate = startDate; + } + + public String getEndDate() { + return endDate; + } + + public void setEndDate(String endDate) { + this.endDate = endDate; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public 
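The `@CsvBindByName` annotations on the new `CSVProject` bind columns by header name, so extra columns in the source CSV are simply ignored. A sketch of how such a bean is typically parsed with opencsv (the project routes this through its `GetCSV` helper, not shown here; the sample row is invented, and `;` mirrors the default delimiter used by `ReadCSV`):

import java.io.Reader;
import java.io.StringReader;
import java.util.List;

import com.opencsv.bean.CsvToBeanBuilder;

import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject;

public class CsvBindSketch {

	public static void main(String[] args) {
		// Sample values for illustration only; "acronym" has no bound field
		// on CSVProject and is dropped during binding.
		String csv = "id;programme;topics;acronym\n"
			+ "227878;H2020-EU.1.1.;ERC-SG-LS2;HYPOTHETICAL";

		Reader reader = new StringReader(csv);
		List<CSVProject> projects = new CsvToBeanBuilder<CSVProject>(reader)
			.withType(CSVProject.class)
			.withSeparator(';')
			.build()
			.parse();

		projects.forEach(p -> System.out.println(p.getId() + " -> " + p.getProgramme()));
	}
}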
String getShortTitle() { + return shortTitle; + } + + public void setShortTitle(String shortTitle) { + this.shortTitle = shortTitle; + } + + public String getObjective() { + return objective; + } + + public void setObjective(String objective) { + this.objective = objective; + } + + public String getSubjects() { + return subjects; + } + + public void setSubjects(String subjects) { + this.subjects = subjects; + } + + public String getLegalBasis() { + return legalBasis; + } + + public void setLegalBasis(String legalBasis) { + this.legalBasis = legalBasis; + } + + public String getCall() { + return call; + } + + public void setCall(String call) { + this.call = call; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java new file mode 100644 index 000000000..6b5bed5b8 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java @@ -0,0 +1,220 @@ + +package eu.dnetlib.dhp.actionmanager.ror; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION; +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo; +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field; +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues; +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier; +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.actionmanager.ror.model.ExternalIdType; +import eu.dnetlib.dhp.actionmanager.ror.model.RorOrganization; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.utils.DHPUtils; +import scala.Tuple2; + +public class GenerateRorActionSetJob { + + private static final Logger log = LoggerFactory.getLogger(GenerateRorActionSetJob.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static final String ROR_NS_PREFIX = 
"ror_________"; + + private static final List ROR_COLLECTED_FROM = listKeyValues( + "10|openaire____::993a7ae7a863813cf95028b50708e222", "ROR"); + + private static final DataInfo ROR_DATA_INFO = dataInfo( + false, "", false, false, ENTITYREGISTRY_PROVENANCE_ACTION, "0.92"); + + private static final Qualifier ROR_PID_TYPE = qualifier( + "ROR", "ROR", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES); + + public static void main(final String[] args) throws Exception { + + final String jsonConfiguration = IOUtils + .toString( + GenerateRorActionSetJob.class + .getResourceAsStream("/eu/dnetlib/dhp/actionmanager/ror/action_set_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath {}: ", outputPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, outputPath); + processRorOrganizations(spark, inputPath, outputPath); + }); + } + + private static void removeOutputDir(final SparkSession spark, final String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + + private static void processRorOrganizations(final SparkSession spark, + final String inputPath, + final String outputPath) throws IOException { + + readInputPath(spark, inputPath) + .map(GenerateRorActionSetJob::convertRorOrg) + .flatMap(List::iterator) + .mapToPair( + aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), + new Text(OBJECT_MAPPER.writeValueAsString(aa)))) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); + } + + protected static List> convertRorOrg(final RorOrganization r) { + + final Date now = new Date(); + + final Organization o = new Organization(); + + o.setId(calculateOpenaireId(r.getId())); + o.setOriginalId(Arrays.asList(String.format("%s::%s", ROR_NS_PREFIX, r.getId()))); + o.setCollectedfrom(ROR_COLLECTED_FROM); + o.setPid(pids(r)); + o.setDateofcollection(now.toString()); + o.setDateoftransformation(now.toString()); + o.setExtraInfo(new ArrayList<>()); // Values not present in the file + o.setOaiprovenance(null); // Values not present in the file + o.setLegalshortname(field(r.getAcronyms().stream().findFirst().orElse(r.getName()), ROR_DATA_INFO)); + o.setLegalname(field(r.getName(), ROR_DATA_INFO)); + o.setAlternativeNames(alternativeNames(r)); + o.setWebsiteurl(field(r.getLinks().stream().findFirst().orElse(null), ROR_DATA_INFO)); + o.setLogourl(null); + o.setEclegalbody(null); + o.setEclegalperson(null); + o.setEcnonprofit(null); + o.setEcresearchorganization(null); + o.setEchighereducation(null); + o.setEcinternationalorganizationeurinterests(null); + o.setEcinternationalorganization(null); + o.setEcenterprise(null); + o.setEcsmevalidated(null); + o.setEcnutscode(null); + if (r.getCountry() != null) { + o + .setCountry( + qualifier( + r.getCountry().getCountryCode(), r + .getCountry() + .getCountryName(), + ModelConstants.DNET_COUNTRY_TYPE, ModelConstants.DNET_COUNTRY_TYPE)); + } else { + o.setCountry(null); + } + o.setDataInfo(ROR_DATA_INFO); + 
o.setLastupdatetimestamp(now.getTime()); + + final List> res = new ArrayList<>(); + res.add(new AtomicAction<>(Organization.class, o)); + + return res; + + } + + private static String calculateOpenaireId(final String rorId) { + return String.format("20|%s::%s", ROR_NS_PREFIX, DHPUtils.md5(rorId)); + } + + private static List pids(final RorOrganization r) { + final List pids = new ArrayList<>(); + pids.add(structuredProperty(r.getId(), ROR_PID_TYPE, ROR_DATA_INFO)); + + for (final Map.Entry e : r.getExternalIds().entrySet()) { + final String type = e.getKey(); + final List all = e.getValue().getAll(); + if (all != null) { + final Qualifier qualifier = qualifier( + type, type, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES); + for (final String pid : all) { + pids + .add(structuredProperty(pid, qualifier, ROR_DATA_INFO)); + } + } + } + + return pids; + } + + private static List> alternativeNames(final RorOrganization r) { + final Set names = new LinkedHashSet<>(); + names.addAll(r.getAliases()); + names.addAll(r.getAcronyms()); + r.getLabels().forEach(l -> names.add(l.getLabel())); + + return names + .stream() + .filter(StringUtils::isNotBlank) + .map(s -> field(s, ROR_DATA_INFO)) + .collect(Collectors.toList()); + } + + private static JavaRDD readInputPath( + final SparkSession spark, + final String path) throws IOException { + + try (final FileSystem fileSystem = FileSystem.get(new Configuration()); + final InputStream is = fileSystem.open(new Path(path))) { + final RorOrganization[] arr = OBJECT_MAPPER.readValue(is, RorOrganization[].class); + return spark.createDataset(Arrays.asList(arr), Encoders.bean(RorOrganization.class)).toJavaRDD(); + } + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Address.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Address.java new file mode 100644 index 000000000..a5acea5ae --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Address.java @@ -0,0 +1,122 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class Address implements Serializable { + + private static final long serialVersionUID = 2444635485253443195L; + + @JsonProperty("lat") + private Float lat; + + @JsonProperty("state_code") + private String stateCode; + + @JsonProperty("country_geonames_id") + private Integer countryGeonamesId; + + @JsonProperty("lng") + private Float lng; + + @JsonProperty("state") + private String state; + + @JsonProperty("city") + private String city; + + @JsonProperty("geonames_city") + private GeonamesCity geonamesCity; + + @JsonProperty("postcode") + private String postcode; + + @JsonProperty("primary") + private Boolean primary; + + @JsonProperty("line") + private String line; + + public Float getLat() { + return lat; + } + + public void setLat(final Float lat) { + this.lat = lat; + } + + public String getStateCode() { + return stateCode; + } + + public void setStateCode(final String stateCode) { + this.stateCode = stateCode; + } + + public Integer getCountryGeonamesId() { + return countryGeonamesId; + } + + public void setCountryGeonamesId(final Integer countryGeonamesId) { + this.countryGeonamesId = countryGeonamesId; + } + + public Float getLng() { + return lng; + } + + public void setLng(final Float lng) { + this.lng = lng; + } + + public String getState() { + return state; + } + + public void 
setState(final String state) { + this.state = state; + } + + public String getCity() { + return city; + } + + public void setCity(final String city) { + this.city = city; + } + + public GeonamesCity getGeonamesCity() { + return geonamesCity; + } + + public void setGeonamesCity(final GeonamesCity geonamesCity) { + this.geonamesCity = geonamesCity; + } + + public String getPostcode() { + return postcode; + } + + public void setPostcode(final String postcode) { + this.postcode = postcode; + } + + public Boolean getPrimary() { + return primary; + } + + public void setPrimary(final Boolean primary) { + this.primary = primary; + } + + public String getLine() { + return line; + } + + public void setLine(final String line) { + this.line = line; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Country.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Country.java new file mode 100644 index 000000000..1e7621f98 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Country.java @@ -0,0 +1,34 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class Country implements Serializable { + + private static final long serialVersionUID = 4357848706229493627L; + + @JsonProperty("country_code") + private String countryCode; + + @JsonProperty("country_name") + private String countryName; + + public String getCountryCode() { + return countryCode; + } + + public void setCountryCode(final String countryCode) { + this.countryCode = countryCode; + } + + public String getCountryName() { + return countryName; + } + + public void setCountryName(final String countryName) { + this.countryName = countryName; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/ExternalIdType.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/ExternalIdType.java new file mode 100644 index 000000000..5ea419b4e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/ExternalIdType.java @@ -0,0 +1,42 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; +import java.util.List; + +import com.fasterxml.jackson.databind.annotation.JsonDeserialize; + +@JsonDeserialize(using = ExternalIdTypeDeserializer.class) +public class ExternalIdType implements Serializable { + + private List all; + + private String preferred; + + private static final long serialVersionUID = 2616688352998387611L; + + public ExternalIdType() { + } + + public ExternalIdType(final List all, final String preferred) { + this.all = all; + this.preferred = preferred; + } + + public List getAll() { + return all; + } + + public void setAll(final List all) { + this.all = all; + } + + public String getPreferred() { + return preferred; + } + + public void setPreferred(final String preferred) { + this.preferred = preferred; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/ExternalIdTypeDeserializer.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/ExternalIdTypeDeserializer.java new file mode 100644 index 000000000..a744a325f --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/ExternalIdTypeDeserializer.java @@ -0,0 +1,37 @@ + +package 
eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import com.fasterxml.jackson.core.JsonParser; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.ObjectCodec; +import com.fasterxml.jackson.databind.DeserializationContext; +import com.fasterxml.jackson.databind.JsonDeserializer; +import com.fasterxml.jackson.databind.JsonNode; + +public class ExternalIdTypeDeserializer extends JsonDeserializer { + + @Override + public ExternalIdType deserialize(final JsonParser p, final DeserializationContext ctxt) throws IOException { + final ObjectCodec oc = p.getCodec(); + final JsonNode node = oc.readTree(p); + + final JsonNode allNode = node.get("all"); + + final String preferred = node.get("preferred").asText(); + + final List all = new ArrayList<>(); + + if (allNode.isArray()) { + allNode.elements().forEachRemaining(x -> all.add(x.asText())); + } else { + all.add(allNode.asText()); + } + + return new ExternalIdType(all, preferred); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/GeonamesAdmin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/GeonamesAdmin.java new file mode 100644 index 000000000..9317a777c --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/GeonamesAdmin.java @@ -0,0 +1,56 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class GeonamesAdmin implements Serializable { + + @JsonProperty("ascii_name") + private String asciiName; + + @JsonProperty("id") + private Integer id; + + @JsonProperty("name") + private String name; + + @JsonProperty("code") + private String code; + + private static final long serialVersionUID = 7294958526269195673L; + + public String getAsciiName() { + return asciiName; + } + + public void setAsciiName(final String asciiName) { + this.asciiName = asciiName; + } + + public Integer getId() { + return id; + } + + public void setId(final Integer id) { + this.id = id; + } + + public String getName() { + return name; + } + + public void setName(final String name) { + this.name = name; + } + + public String getCode() { + return code; + } + + public void setCode(final String code) { + this.code = code; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/GeonamesCity.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/GeonamesCity.java new file mode 100644 index 000000000..b13d64b10 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/GeonamesCity.java @@ -0,0 +1,100 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class GeonamesCity implements Serializable { + + @JsonProperty("geonames_admin1") + private GeonamesAdmin geonamesAdmin1; + + @JsonProperty("geonames_admin2") + private GeonamesAdmin geonamesAdmin2; + + @JsonProperty("city") + private String city; + + @JsonProperty("id") + private Integer id; + + @JsonProperty("nuts_level1") + private NameAndCode nutsLevel1; + + @JsonProperty("nuts_level2") + private NameAndCode nutsLevel2; + + @JsonProperty("nuts_level3") + private NameAndCode nutsLevel3; + + @JsonProperty("license") + private License license; + + 
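+	// The nutsLevel1..nutsLevel3 fields above follow the EU NUTS hierarchy of
+	// statistical regions, from largest to smallest unit (illustratively
+	// DE1 > DE11 > DE111).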
private static final long serialVersionUID = -8389480201526252955L; + + public NameAndCode getNutsLevel2() { + return nutsLevel2; + } + + public void setNutsLevel2(final NameAndCode nutsLevel2) { + this.nutsLevel2 = nutsLevel2; + } + + public GeonamesAdmin getGeonamesAdmin2() { + return geonamesAdmin2; + } + + public void setGeonamesAdmin2(final GeonamesAdmin geonamesAdmin2) { + this.geonamesAdmin2 = geonamesAdmin2; + } + + public GeonamesAdmin getGeonamesAdmin1() { + return geonamesAdmin1; + } + + public void setGeonamesAdmin1(final GeonamesAdmin geonamesAdmin1) { + this.geonamesAdmin1 = geonamesAdmin1; + } + + public String getCity() { + return city; + } + + public void setCity(final String city) { + this.city = city; + } + + public Integer getId() { + return id; + } + + public void setId(final Integer id) { + this.id = id; + } + + public NameAndCode getNutsLevel1() { + return nutsLevel1; + } + + public void setNutsLevel1(final NameAndCode nutsLevel1) { + this.nutsLevel1 = nutsLevel1; + } + + public NameAndCode getNutsLevel3() { + return nutsLevel3; + } + + public void setNutsLevel3(final NameAndCode nutsLevel3) { + this.nutsLevel3 = nutsLevel3; + } + + public License getLicense() { + return license; + } + + public void setLicense(final License license) { + this.license = license; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Label.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Label.java new file mode 100644 index 000000000..9a2cb39e3 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Label.java @@ -0,0 +1,34 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class Label implements Serializable { + + @JsonProperty("iso639") + private String iso639; + + @JsonProperty("label") + private String label; + + private static final long serialVersionUID = -6576156103297850809L; + + public String getIso639() { + return iso639; + } + + public void setIso639(final String iso639) { + this.iso639 = iso639; + } + + public String getLabel() { + return label; + } + + public void setLabel(final String label) { + this.label = label; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/License.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/License.java new file mode 100644 index 000000000..a0f6cf774 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/License.java @@ -0,0 +1,34 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class License implements Serializable { + + @JsonProperty("attribution") + private String attribution; + + @JsonProperty("license") + private String license; + + private static final long serialVersionUID = -194308261058176439L; + + public String getAttribution() { + return attribution; + } + + public void setAttribution(final String attribution) { + this.attribution = attribution; + } + + public String getLicense() { + return license; + } + + public void setLicense(final String license) { + this.license = license; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/NameAndCode.java 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/NameAndCode.java new file mode 100644 index 000000000..c0f5d7645 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/NameAndCode.java @@ -0,0 +1,34 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class NameAndCode implements Serializable { + + private static final long serialVersionUID = 5459836979206140843L; + + @JsonProperty("name") + private String name; + + @JsonProperty("code") + private String code; + + public String getName() { + return name; + } + + public void setName(final String name) { + this.name = name; + } + + public String getCode() { + return code; + } + + public void setCode(final String code) { + this.code = code; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Relationship.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Relationship.java new file mode 100644 index 000000000..db9f96445 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/Relationship.java @@ -0,0 +1,45 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class Relationship implements Serializable { + + private static final long serialVersionUID = 7847399503395576960L; + + @JsonProperty("type") + private String type; + + @JsonProperty("id") + private String id; + + @JsonProperty("label") + private String label; + + public String getType() { + return type; + } + + public void setType(final String type) { + this.type = type; + } + + public String getId() { + return id; + } + + public void setId(final String id) { + this.id = id; + } + + public String getLabel() { + return label; + } + + public void setLabel(final String label) { + this.label = label; + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/RorOrganization.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/RorOrganization.java new file mode 100644 index 000000000..b8041cfdf --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/model/RorOrganization.java @@ -0,0 +1,192 @@ + +package eu.dnetlib.dhp.actionmanager.ror.model; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +import com.fasterxml.jackson.annotation.JsonProperty; + +public class RorOrganization implements Serializable { + + private static final long serialVersionUID = -2658312087616043225L; + + @JsonProperty("ip_addresses") + private List ipAddresses = new ArrayList<>(); + + @JsonProperty("aliases") + private List aliases = new ArrayList<>(); + + @JsonProperty("acronyms") + private List acronyms = new ArrayList<>(); + + @JsonProperty("links") + private List links = new ArrayList<>(); + + @JsonProperty("country") + private Country country; + + @JsonProperty("name") + private String name; + + @JsonProperty("wikipedia_url") + private String wikipediaUrl; + + @JsonProperty("addresses") + private List
<Address>
addresses = new ArrayList<>(); + + @JsonProperty("types") + private List types = new ArrayList<>(); + + @JsonProperty("established") + private Integer established; + + @JsonProperty("relationships") + private List relationships = new ArrayList<>(); + + @JsonProperty("email_address") + private String emailAddress; + + @JsonProperty("external_ids") + private Map externalIds = new LinkedHashMap<>(); + + @JsonProperty("id") + private String id; + + @JsonProperty("labels") + private List
getAddresses() { + return addresses; + } + + public void setAddresses(final List
<Address>
addresses) { + this.addresses = addresses; + } + + public List getTypes() { + return types; + } + + public void setTypes(final List types) { + this.types = types; + } + + public Integer getEstablished() { + return established; + } + + public void setEstablished(final Integer established) { + this.established = established; + } + + public List getRelationships() { + return relationships; + } + + public void setRelationships(final List relationships) { + this.relationships = relationships; + } + + public String getEmailAddress() { + return emailAddress; + } + + public void setEmailAddress(final String emailAddress) { + this.emailAddress = emailAddress; + } + + public Map getExternalIds() { + return externalIds; + } + + public void setExternalIds(final Map externalIds) { + this.externalIds = externalIds; + } + + public String getId() { + return id; + } + + public void setId(final String id) { + this.id = id; + } + + public List

+ * known bug: with resumptionType 'discover', collecting fails when (resultTotal % resultSizeValue) == 0; work around it by changing resultSizeValue
+ *
+ * @author Jochen Schirrwagen, Aenne Loehden, Andreas Czerniak
+ * @date 2020-04-09
+ *
+ */
+public class RestIterator implements Iterator<String> {
+
+	private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
+	public static final String UTF_8 = "UTF-8";
+
+	private final HttpClientParams clientParams;
+
+	private final String BASIC = "basic";
+
+	private final JsonUtils jsonUtils;
+
+	private final String baseUrl;
+	private final String resumptionType;
+	private final String resumptionParam;
+	private final String resultFormatValue;
+	private String queryParams;
+	private final int resultSizeValue;
+	private int resumptionInt = 0; // integer resumption token (first record to harvest)
+	private int resultTotal = -1;
+	private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest
+	// or token scanned from results)
+	private InputStream resultStream;
+	private Transformer transformer;
+	private XPath xpath;
+	private String query;
+	private XPathExpression xprResultTotalPath;
+	private XPathExpression xprResumptionPath;
+	private XPathExpression xprEntity;
+	private final String queryFormat;
+	private final String querySize;
+	private final String authMethod;
+	private final String authToken;
+	private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
+	private int discoverResultSize = 0;
+	private int pagination = 1;
+	/*
+	 * While resultFormatValue is added to the request parameters, this field says that the results are retrieved in
+	 * json. Useful for cases when the target API expects a resultFormatValue != json, but the results are returned in
+	 * json anyway. An example is the EU Open Data Portal API: resultFormatValue=standard, results are in json format.
+	 */
+	private final String resultOutputFormat;
+
+	/**
+	 * RestIterator, compatible with version 1.3.33
+	 */
+	public RestIterator(
+		final HttpClientParams clientParams,
+		final String baseUrl,
+		final String resumptionType,
+		final String resumptionParam,
+		final String resumptionXpath,
+		final String resultTotalXpath,
+		final String resultFormatParam,
+		final String resultFormatValue,
+		final String resultSizeParam,
+		final String resultSizeValueStr,
+		final String queryParams,
+		final String entityXpath,
+		final String authMethod,
+		final String authToken,
+		final String resultOutputFormat) {
+
+		this.clientParams = clientParams;
+		this.jsonUtils = new JsonUtils();
+		this.baseUrl = baseUrl;
+		this.resumptionType = resumptionType;
+		this.resumptionParam = resumptionParam;
+		this.resultFormatValue = resultFormatValue;
+		this.resultSizeValue = Integer.valueOf(resultSizeValueStr);
+		this.queryParams = queryParams;
+		this.authMethod = authMethod;
+		this.authToken = authToken;
+		this.resultOutputFormat = resultOutputFormat;
+
+		queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
+			: "";
+		querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
"&" + resultSizeParam + "=" + resultSizeValueStr : ""; + + try { + initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath); + } catch (Exception e) { + throw new IllegalStateException("xml transformation init failed: " + e.getMessage()); + } + initQueue(); + } + + private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath) + throws TransformerConfigurationException, XPathExpressionException { + final TransformerFactory factory = TransformerFactory.newInstance(); + transformer = factory.newTransformer(); + transformer.setOutputProperty(OutputKeys.INDENT, "yes"); + transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3"); + xpath = XPathFactory.newInstance().newXPath(); + xprResultTotalPath = xpath.compile(resultTotalXpath); + xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath); + xprEntity = xpath.compile(entityXpath); + } + + private void initQueue() { + query = baseUrl + "?" + queryParams + querySize + queryFormat; + log.info("REST calls starting with {}", query); + } + + private void disconnect() { + // TODO close inputstream + } + + /* + * (non-Javadoc) + * @see java.util.Iterator#hasNext() + */ + @Override + public boolean hasNext() { + if (recordQueue.isEmpty() && query.isEmpty()) { + disconnect(); + return false; + } else { + return true; + } + } + + /* + * (non-Javadoc) + * @see java.util.Iterator#next() + */ + @Override + public String next() { + synchronized (recordQueue) { + while (recordQueue.isEmpty() && !query.isEmpty()) { + try { + query = downloadPage(query); + } catch (CollectorException e) { + log.debug("CollectorPlugin.next()-Exception: {}", e); + throw new RuntimeException(e); + } + } + return recordQueue.poll(); + } + } + + /* + * download page and return nextQuery + */ + private String downloadPage(String query) throws CollectorException { + String resultJson; + String resultXml = ""; + String nextQuery = ""; + String emptyXml = resultXml + "<" + JsonUtils.wrapName + ">"; + Node resultNode = null; + NodeList nodeList = null; + String qUrlArgument = ""; + int urlOldResumptionSize = 0; + InputStream theHttpInputStream; + + // check if cursor=* is initial set otherwise add it to the queryParam URL + if (resumptionType.equalsIgnoreCase("deep-cursor")) { + log.debug("check resumptionType deep-cursor and check cursor=*?{}", query); + if (!query.contains("&cursor=")) { + query += "&cursor=*"; + } + } + + try { + log.info("requestig URL [{}]", query); + + URL qUrl = new URL(query); + log.debug("authMethod: {}", authMethod); + if ("bearer".equalsIgnoreCase(this.authMethod)) { + log.trace("authMethod before inputStream: {}", resultXml); + HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); + conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken); + conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType()); + conn.setRequestMethod("GET"); + theHttpInputStream = conn.getInputStream(); + } else if (BASIC.equalsIgnoreCase(this.authMethod)) { + log.trace("authMethod before inputStream: {}", resultXml); + HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); + conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken); + conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType()); + conn.setRequestMethod("GET"); + theHttpInputStream = conn.getInputStream(); + } else { + theHttpInputStream = qUrl.openStream(); + } + + resultStream = 
+			if ("json".equals(resultOutputFormat)) {
+				resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
+				resultXml = jsonUtils.convertToXML(resultJson);
+				resultStream = IOUtils.toInputStream(resultXml, UTF_8);
+			}
+
+			if (!(emptyXml).equalsIgnoreCase(resultXml)) {
+				resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
+				nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
+				log.debug("nodeList.length: {}", nodeList.getLength());
+				for (int i = 0; i < nodeList.getLength(); i++) {
+					StringWriter sw = new StringWriter();
+					transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
+					String toEnqueue = sw.toString();
+					if (StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) {
+						log.warn("The following record resulted in an empty item for the feeding queue: {}", resultXml);
+					} else {
+						recordQueue.add(toEnqueue);
+					}
+				}
+			} else {
+				log.warn("resultXml is equal to emptyXml");
+			}
+
+			resumptionInt += resultSizeValue;
+
+			switch (resumptionType.toLowerCase()) {
+				case "scan": // read the resumptionToken, evaluate next results, e.g. OAI, iterate over items
+					resumptionStr = xprResumptionPath.evaluate(resultNode);
+					break;
+
+				case "count": // step through all records by a fixed increment, iterate over items
+					resumptionStr = Integer.toString(resumptionInt);
+					break;
+
+				case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
+					if (resultSizeValue < 2) {
+						throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
+					}
+					qUrlArgument = qUrl.getQuery();
+					String[] arrayQUrlArgument = qUrlArgument.split("&");
+					for (String arrayUrlArgStr : arrayQUrlArgument) {
+						if (arrayUrlArgStr.startsWith(resumptionParam)) {
+							String[] resumptionKeyValue = arrayUrlArgStr.split("=");
+							if (isInteger(resumptionKeyValue[1])) {
+								urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
+								log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize);
+							} else {
+								log.debug("discover OldResumptionSize from Url (str): {}", resumptionKeyValue[1]);
+							}
+						}
+					}
+
+					if (((emptyXml).equalsIgnoreCase(resultXml))
+						|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) {
+						// resumptionStr = "";
+						if (nodeList != null) {
+							discoverResultSize += nodeList.getLength();
+						}
+						resultTotal = discoverResultSize;
+					} else {
+						resumptionStr = Integer.toString(resumptionInt);
+						resultTotal = resumptionInt + 1;
+						if (nodeList != null) {
+							discoverResultSize += nodeList.getLength();
+						}
+					}
+					log.info("discoverResultSize: {}", discoverResultSize);
+					break;
+
+				case "pagination":
+				case "page": // pagination, iterate over page numbers
+					pagination += 1;
+					if (nodeList != null) {
+						discoverResultSize += nodeList.getLength();
+					} else {
+						resultTotal = discoverResultSize;
+						pagination = discoverResultSize;
+					}
+					resumptionInt = pagination;
+					resumptionStr = Integer.toString(resumptionInt);
+					break;
+
+				case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in
+									// solr)
+					// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
+					// deep-cursor, Param 'resultSizeValue' is less than 2");}
+
+					resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode));
+					queryParams = queryParams.replace("&cursor=*", "");
+
+					// terminating if length of nodeList is 0
+					if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
+						resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
+					} else {
+						resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue
+																					// because the iteration is over
+																					// the real length and the
+																					// resultSizeValue is added before
+																					// the switch()
+					}
+
+					discoverResultSize = nodeList.getLength();
+
+					log
+						.debug(
+							"downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams="
+								+ queryParams + " resumptionLengthIncreased: " + resumptionInt);
+
+					break;
+
+				default: // otherwise: abort
+					// resultTotal = resumptionInt;
+					break;
+			}
+
+		} catch (Exception e) {
+			log.error(e.getMessage(), e);
+			throw new IllegalStateException("collection failed: " + e.getMessage());
+		}
+
+		try {
+			if (resultTotal == -1) {
+				resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode));
+				if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) {
+					resultTotal += 1;
+				} // to correct the upper bound
+				log.info("resultTotal was -1, now set to: " + resultTotal);
+			}
+		} catch (Exception e) {
+			log.error(e.getMessage(), e);
+			throw new IllegalStateException("downloadPage resultTotal couldn't be parsed: " + e.getMessage());
+		}
+		log.debug("resultTotal: " + resultTotal);
+		log.debug("resInt: " + resumptionInt);
+		if (resumptionInt <= resultTotal) {
+			nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr
+				+ queryFormat;
+		} else {
+			nextQuery = "";
+			// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
+			// resumptionInt and prevent a NullPointerException at mdStore
+		}
+		log.debug("nextQueryUrl: " + nextQuery);
+		return nextQuery;
+
+	}
+
+	private boolean isInteger(String s) {
+		boolean isValidInteger = false;
+		try {
+			Integer.parseInt(s);
+			// s is a valid integer
+			isValidInteger = true;
+		} catch (NumberFormatException ex) {
+			// s is not an integer
+		}
+		return isValidInteger;
+	}
+
+	// encode a string value using the UTF-8 encoding scheme
+	private String encodeValue(String value) {
+		try {
+			return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
+		} catch (UnsupportedEncodingException ex) {
+			throw new RuntimeException(ex);
+		}
+	}
+
+	public String getResultFormatValue() {
+		return resultFormatValue;
+	}
+
+	public String getResultOutputFormat() {
+		return resultOutputFormat;
+	}
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtils.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtils.java
new file mode 100644
index 000000000..15401e223
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/JsonUtils.java
@@ -0,0 +1,84 @@
+
+package eu.dnetlib.dhp.collection.plugin.utils;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+public class JsonUtils {
+
+	private static final Log log = LogFactory.getLog(JsonUtils.class);
+
+	public static final String wrapName = "recordWrap";
+
+	/**
+	 * Convert, within JSON key names, whitespace and '/' to '_', and drop '(' and ')';
+	 * see the W3C XML syntax for valid tag names, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-starttags,
+	 * and work around the JSON-to-XML conversion of the org.json.XML package.
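+	 * Illustrative (hypothetical) input/output: {"deep cursor": "*", "a/b": 1, "size(mb)": 2}
+	 * becomes {"deep_cursor": "*", "a_b": 1, "sizemb": 2}.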
+ * + * known bugs: doesn't prevent "key name":" ["sexy name",": penari","erotic dance"], + * + * @param jsonInput + * @return convertedJsonKeynameOutput + */ + public String syntaxConvertJsonKeyNames(String jsonInput) { + + log.trace("before convertJsonKeyNames: " + jsonInput); + // pre-clean json - rid spaces of element names (misinterpreted as elements with attributes in xml) + // replace ' 's in JSON Namens with '_' + while (jsonInput.matches(".*\"([^\"]*)\\s+([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)\\s+([^\"]*)\":", "\"$1_$2\":"); + } + + // replace forward-slash (sign '/' ) in JSON Names with '_' + while (jsonInput.matches(".*\"([^\"]*)/([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)/([^\"]*)\":", "\"$1_$2\":"); + } + + // replace '(' in JSON Names with '' + while (jsonInput.matches(".*\"([^\"]*)[(]([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)[(]([^\"]*)\":", "\"$1$2\":"); + } + + // replace ')' in JSON Names with '' + while (jsonInput.matches(".*\"([^\"]*)[)]([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)[)]([^\"]*)\":", "\"$1$2\":"); + } + + // add prefix of startNumbers in JSON Keynames with 'n_' + while (jsonInput.matches(".*\"([^\"][0-9])([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"][0-9])([^\"]*)\":", "\"n_$1$2\":"); + } + // add prefix of only numbers in JSON Keynames with 'm_' + while (jsonInput.matches(".*\"([0-9]+)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([0-9]+)\":", "\"m_$1\":"); + } + + // replace ':' between number like '2018-08-28T11:05:00Z' in JSON keynames with '' + while (jsonInput.matches(".*\"([^\"]*[0-9]):([0-9][^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*[0-9]):([0-9][^\"]*)\":", "\"$1$2\":"); + } + + // replace ',' in JSON Keynames with '.' to prevent , in xml tagnames. 
+ // while (jsonInput.matches(".*\"([^\"]*),([^\"]*)\":.*")) { + // jsonInput = jsonInput.replaceAll("\"([^\"]*),([^\"]*)\":", "\"$1.$2\":"); + // } + + // replace '=' in JSON Keynames with '-' + while (jsonInput.matches(".*\"([^\"]*)=([^\"]*)\":.*")) { + jsonInput = jsonInput.replaceAll("\"([^\"]*)=([^\"]*)\":", "\"$1-$2\":"); + } + + log.trace("after syntaxConvertJsonKeyNames: " + jsonInput); + return jsonInput; + } + + public String convertToXML(final String jsonRecord) { + String resultXml = ""; + org.json.JSONObject jsonObject = new org.json.JSONObject(syntaxConvertJsonKeyNames(jsonRecord)); + resultXml += org.json.XML.toString(jsonObject, wrapName); // wrap xml in single root element + log.trace("before inputStream: " + resultXml); + resultXml = XmlCleaner.cleanAllEntities(resultXml); + log.trace("after cleaning: " + resultXml); + return resultXml; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java new file mode 100644 index 000000000..e05fe263a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XMLIterator.java @@ -0,0 +1,177 @@ + +package eu.dnetlib.dhp.collection.plugin.utils; + +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.io.StringWriter; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; +import java.util.Iterator; + +import javax.xml.stream.XMLEventFactory; +import javax.xml.stream.XMLEventReader; +import javax.xml.stream.XMLEventWriter; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLOutputFactory; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.events.StartElement; +import javax.xml.stream.events.XMLEvent; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class XMLIterator implements Iterator { + + private static final Log log = LogFactory.getLog(XMLIterator.class); + + private ThreadLocal inputFactory = new ThreadLocal() { + + @Override + protected XMLInputFactory initialValue() { + return XMLInputFactory.newInstance(); + } + }; + + private ThreadLocal outputFactory = new ThreadLocal() { + + @Override + protected XMLOutputFactory initialValue() { + return XMLOutputFactory.newInstance(); + } + }; + + private ThreadLocal eventFactory = new ThreadLocal() { + + @Override + protected XMLEventFactory initialValue() { + return XMLEventFactory.newInstance(); + } + }; + + public static final String UTF_8 = "UTF-8"; + + final XMLEventReader parser; + + private XMLEvent current = null; + + private String element; + + private InputStream inputStream; + + public XMLIterator(final String element, final InputStream inputStream) { + super(); + this.element = element; + this.inputStream = inputStream; + this.parser = getParser(); + try { + this.current = findElement(parser); + } catch (XMLStreamException e) { + log.warn("cannot init parser position. 
No element found: " + element); + current = null; + } + } + + @Override + public boolean hasNext() { + return current != null; + } + + @Override + public String next() { + String result = null; + try { + result = copy(parser); + current = findElement(parser); + return result; + } catch (XMLStreamException e) { + throw new RuntimeException(String.format("error copying xml, built so far: '%s'", result), e); + } + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + @SuppressWarnings("finally") + private String copy(final XMLEventReader parser) throws XMLStreamException { + final StringWriter result = new StringWriter(); + try { + final XMLEventWriter writer = outputFactory.get().createXMLEventWriter(result); + final StartElement start = current.asStartElement(); + final StartElement newRecord = eventFactory + .get() + .createStartElement(start.getName(), start.getAttributes(), start.getNamespaces()); + + // new root record + writer.add(newRecord); + + // copy the rest as it is + while (parser.hasNext()) { + final XMLEvent event = parser.nextEvent(); + + // TODO: replace with depth tracking instead of close tag tracking. + if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals(element)) { + writer.add(event); + break; + } + + writer.add(event); + } + writer.close(); + } finally { + return result.toString(); + } + } + + /** + * Looks for the next occurrence of the splitter element. + * + * @param parser + * @return + * @throws XMLStreamException + */ + private XMLEvent findElement(final XMLEventReader parser) throws XMLStreamException { + + /* + * if (current != null && element.equals(current.asStartElement().getName().getLocalPart())) { return current; } + */ + + XMLEvent peek = parser.peek(); + if (peek != null && peek.isStartElement()) { + String name = peek.asStartElement().getName().getLocalPart(); + if (element.equals(name)) { + return peek; + } + } + + while (parser.hasNext()) { + final XMLEvent event = parser.nextEvent(); + if (event != null && event.isStartElement()) { + String name = event.asStartElement().getName().getLocalPart(); + if (element.equals(name)) { + return event; + } + } + } + return null; + } + + private XMLEventReader getParser() { + try { + return inputFactory.get().createXMLEventReader(sanitize(inputStream)); + } catch (XMLStreamException e) { + throw new RuntimeException(e); + } + } + + private Reader sanitize(final InputStream in) { + final CharsetDecoder charsetDecoder = Charset.forName(UTF_8).newDecoder(); + charsetDecoder.onMalformedInput(CodingErrorAction.REPLACE); + charsetDecoder.onUnmappableCharacter(CodingErrorAction.REPLACE); + return new InputStreamReader(in, charsetDecoder); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XmlCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XmlCleaner.java new file mode 100644 index 000000000..95d1d2402 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/utils/XmlCleaner.java @@ -0,0 +1,383 @@ + +package eu.dnetlib.dhp.collection.plugin.utils; + +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.regex.Pattern; + +/** @author jochen, Andreas Czerniak */ +public class XmlCleaner { + + /** Pattern for numeric entities. 
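+	 * Matches decimal references of 2-4 digits such as "&#38;"; since the digit class stays \d
+	 * after the optional "x", hex references are matched only when their digits happen to be
+	 * all 0-9 (e.g. "&#x26;" but not "&#xA9;").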
	 */
+	private static final Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};");
+	// private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};");
+
+	// see https://www.w3.org/TR/REC-xml/#charsets (not limited to these)
+	private static final Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];");
+
+	/**
+	 * Pattern that negates the allowable XML 4-byte unicode characters. Valid are: #x9 | #xA | #xD | [#x20-#xD7FF] |
+	 * [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+	 */
+	private static final Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]");
+
+	// Map entities to their unicode equivalent
+	private static final Set<String> goodEntities = new HashSet<>();
+	private static final Map<String, String> badEntities = new HashMap<>();
+
+	static {
+		// pre-defined XML entities
+		goodEntities.add("&quot;"); // quotation mark
+		goodEntities.add("&amp;"); // ampersand
+		goodEntities.add("&lt;"); // less-than sign
+		goodEntities.add("&gt;"); // greater-than sign
+		// control entities
+		// badEntities.put(" ", "");
+		badEntities.put("&#127;", " "); // illegal HTML character
+		badEntities.put("&#128;", " "); // illegal HTML character
+		badEntities.put("&#129;", " "); // illegal HTML character
+		badEntities.put("&#130;", " "); // illegal HTML character
+		badEntities.put("&#131;", " "); // illegal HTML character
+		badEntities.put("&#132;", " "); // illegal HTML character
+		badEntities.put("&#133;", " "); // illegal HTML character
+		badEntities.put("&#134;", " "); // illegal HTML character
+		badEntities.put("&#135;", " "); // illegal HTML character
+		badEntities.put("&#136;", " "); // illegal HTML character
+		badEntities.put("&#137;", " "); // illegal HTML character
+		badEntities.put("&#138;", " "); // illegal HTML character
+		badEntities.put("&#139;", " "); // illegal HTML character
+		badEntities.put("&#140;", " "); // illegal HTML character
+		badEntities.put("&#141;", " "); // illegal HTML character
+		badEntities.put("&#142;", " "); // illegal HTML character
+		badEntities.put("&#143;", " "); // illegal HTML character
+		badEntities.put("&#144;", " "); // illegal HTML character
+		badEntities.put("&#145;", " "); // illegal HTML character
+		badEntities.put("&#146;", " "); // illegal HTML character
+		badEntities.put("&#147;", " "); // illegal HTML character
+		badEntities.put("&#148;", " "); // illegal HTML character
+		badEntities.put("&#149;", " "); // illegal HTML character
+		badEntities.put("&#150;", " "); // illegal HTML character
+		badEntities.put("&#151;", " "); // illegal HTML character
+		badEntities.put("&#152;", " "); // illegal HTML character
+		badEntities.put("&#153;", " "); // illegal HTML character
+		badEntities.put("&#154;", " "); // illegal HTML character
+		badEntities.put("&#155;", " "); // illegal HTML character
+		badEntities.put("&#156;", " "); // illegal HTML character
+		badEntities.put("&#157;", " "); // illegal HTML character
+		badEntities.put("&#158;", " "); // illegal HTML character
+		badEntities.put("&#159;", " "); // illegal HTML character
+		// misc entities
+		badEntities.put("&euro;", "\u20AC"); // euro
+		badEntities.put("&lsquo;", "\u2018"); // left single quotation mark
+		badEntities.put("&rsquo;", "\u2019"); // right single quotation mark
+		// Latin 1 entities
+		badEntities.put("&nbsp;", "\u00A0"); // no-break space
+		badEntities.put("&iexcl;", "\u00A1"); // inverted exclamation mark
+		badEntities.put("&cent;", "\u00A2"); // cent sign
+		badEntities.put("&pound;", "\u00A3"); // pound sign
+		badEntities.put("&curren;", "\u00A4"); // currency sign
+		badEntities.put("&yen;", "\u00A5"); // yen sign
+		badEntities.put("&brvbar;", "\u00A6"); // broken vertical bar
+		badEntities.put("&sect;", "\u00A7"); // section sign
+		badEntities.put("&uml;", "\u00A8"); // diaeresis
+		badEntities.put("&copy;", "\u00A9"); // copyright sign
+		badEntities.put("&ordf;", "\u00AA"); // feminine ordinal indicator
+		badEntities.put("&laquo;", "\u00AB"); // left-pointing double angle quotation mark
+		badEntities.put("&not;", "\u00AC"); // not sign
+		badEntities.put("&shy;", "\u00AD"); // soft hyphen
+		badEntities.put("&reg;", "\u00AE"); // registered sign
+		badEntities.put("&macr;", "\u00AF"); // macron
+		badEntities.put("&deg;", "\u00B0"); // degree sign
+		badEntities.put("&plusmn;", "\u00B1"); // plus-minus sign
+		badEntities.put("&sup2;", "\u00B2"); // superscript two
+		badEntities.put("&sup3;", "\u00B3"); // superscript three
+		badEntities.put("&acute;", "\u00B4"); // acute accent
+		badEntities.put("&micro;", "\u00B5"); // micro sign
+		badEntities.put("&para;", "\u00B6"); // pilcrow sign
+		badEntities.put("&middot;", "\u00B7"); // middle dot
+		badEntities.put("&cedil;", "\u00B8"); // cedilla
+		badEntities.put("&sup1;", "\u00B9"); // superscript one
+		badEntities.put("&ordm;", "\u00BA"); // masculine ordinal indicator
+		badEntities.put("&raquo;", "\u00BB"); // right-pointing double angle quotation mark
+		badEntities.put("&frac14;", "\u00BC"); // vulgar fraction one quarter
+		badEntities.put("&frac12;", "\u00BD"); // vulgar fraction one half
+		badEntities.put("&frac34;", "\u00BE"); // vulgar fraction three quarters
+		badEntities.put("&iquest;", "\u00BF"); // inverted question mark
+		badEntities.put("&Agrave;", "\u00C0"); // latin capital letter A with grave
+		badEntities.put("&Aacute;", "\u00C1"); // latin capital letter A with acute
+		badEntities.put("&Acirc;", "\u00C2"); // latin capital letter A with circumflex
+		badEntities.put("&Atilde;", "\u00C3"); // latin capital letter A with tilde
+		badEntities.put("&Auml;", "\u00C4"); // latin capital letter A with diaeresis
+		badEntities.put("&Aring;", "\u00C5"); // latin capital letter A with ring above
+		badEntities.put("&AElig;", "\u00C6"); // latin capital letter AE
+		badEntities.put("&Ccedil;", "\u00C7"); // latin capital letter C with cedilla
+		badEntities.put("&Egrave;", "\u00C8"); // latin capital letter E with grave
+		badEntities.put("&Eacute;", "\u00C9"); // latin capital letter E with acute
+		badEntities.put("&Ecirc;", "\u00CA"); // latin capital letter E with circumflex
+		badEntities.put("&Euml;", "\u00CB"); // latin capital letter E with diaeresis
+		badEntities.put("&Igrave;", "\u00CC"); // latin capital letter I with grave
+		badEntities.put("&Iacute;", "\u00CD"); // latin capital letter I with acute
+		badEntities.put("&Icirc;", "\u00CE"); // latin capital letter I with circumflex
+		badEntities.put("&Iuml;", "\u00CF"); // latin capital letter I with diaeresis
+		badEntities.put("&ETH;", "\u00D0"); // latin capital letter ETH
+		badEntities.put("&Ntilde;", "\u00D1"); // latin capital letter N with tilde
+		badEntities.put("&Ograve;", "\u00D2"); // latin capital letter O with grave
+		badEntities.put("&Oacute;", "\u00D3"); // latin capital letter O with acute
+		badEntities.put("&Ocirc;", "\u00D4"); // latin capital letter O with circumflex
+		badEntities.put("&Otilde;", "\u00D5"); // latin capital letter O with tilde
+		badEntities.put("&Ouml;", "\u00D6"); // latin capital letter O with diaeresis
+		badEntities.put("&times;", "\u00D7"); // multiplication sign
+		badEntities.put("&Oslash;", "\u00D8"); // latin capital letter O with stroke
+		badEntities.put("&Ugrave;", "\u00D9"); // latin capital letter U with grave
+		badEntities.put("&Uacute;", "\u00DA"); // latin capital letter U with acute
+		badEntities.put("&Ucirc;", "\u00DB"); // latin capital letter U with circumflex
+		badEntities.put("&Uuml;", "\u00DC"); // latin capital letter U with diaeresis
+		badEntities.put("&Yacute;", "\u00DD"); // latin capital letter Y with acute
+		badEntities.put("&THORN;", "\u00DE"); // latin capital letter THORN
+		badEntities.put("&szlig;", "\u00DF"); // latin small letter sharp s
+		badEntities.put("&agrave;", "\u00E0"); // latin small letter a with grave
+		badEntities.put("&aacute;", "\u00E1"); // latin small letter a with acute
+		badEntities.put("&acirc;", "\u00E2"); // latin small letter a with circumflex
+		badEntities.put("&atilde;", "\u00E3"); // latin small letter a with tilde
+		badEntities.put("&auml;", "\u00E4"); // latin small letter a with diaeresis
+		badEntities.put("&aring;", "\u00E5"); // latin small letter a with ring above
+		badEntities.put("&aelig;", "\u00E6"); // latin small letter ae
+		badEntities.put("&ccedil;", "\u00E7"); // latin small letter c with cedilla
+		badEntities.put("&egrave;", "\u00E8"); // latin small letter e with grave
+		badEntities.put("&eacute;", "\u00E9"); // latin small letter e with acute
+		badEntities.put("&ecirc;", "\u00EA"); // latin small letter e with circumflex
+		badEntities.put("&euml;", "\u00EB"); // latin small letter e with diaeresis
+		badEntities.put("&igrave;", "\u00EC"); // latin small letter i with grave
+		badEntities.put("&iacute;", "\u00ED"); // latin small letter i with acute
+		badEntities.put("&icirc;", "\u00EE"); // latin small letter i with circumflex
+		badEntities.put("&iuml;", "\u00EF"); // latin small letter i with diaeresis
+		badEntities.put("&eth;", "\u00F0"); // latin small letter eth
+		badEntities.put("&ntilde;", "\u00F1"); // latin small letter n with tilde
+		badEntities.put("&ograve;", "\u00F2"); // latin small letter o with grave
+		badEntities.put("&oacute;", "\u00F3"); // latin small letter o with acute
+		badEntities.put("&ocirc;", "\u00F4"); // latin small letter o with circumflex
+		badEntities.put("&otilde;", "\u00F5"); // latin small letter o with tilde
+		badEntities.put("&ouml;", "\u00F6"); // latin small letter o with diaeresis
+		badEntities.put("&divide;", "\u00F7"); // division sign
+		badEntities.put("&oslash;", "\u00F8"); // latin small letter o with stroke
+		badEntities.put("&ugrave;", "\u00F9"); // latin small letter u with grave
+		badEntities.put("&uacute;", "\u00FA"); // latin small letter u with acute
+		badEntities.put("&ucirc;", "\u00FB"); // latin small letter u with circumflex
+		badEntities.put("&uuml;", "\u00FC"); // latin small letter u with diaeresis
+		badEntities.put("&yacute;", "\u00FD"); // latin small letter y with acute
+		badEntities.put("&thorn;", "\u00FE"); // latin small letter thorn
+		badEntities.put("&yuml;", "\u00FF"); // latin small letter y with diaeresis
+	}
+
+	/**
+	 * For each entity in the input that is not allowed in XML, replace the entity with its unicode equivalent or
+	 * remove it. For each instance of a bare {@literal &}, replace it with {@literal &amp;}.
+	 * XML only allows 4 entities: {@literal &amp;}, {@literal &quot;}, {@literal &lt;} and {@literal &gt;}.
+	 *
+	 * @param broken the string to handle entities
+	 * @return the string with entities appropriately fixed up
+	 */
+	public static String cleanAllEntities(final String broken) {
+		if (broken == null) {
+			return null;
+		}
+
+		String working = invalidControlCharPattern.matcher(broken).replaceAll("");
+		working = invalidCharacterPattern.matcher(working).replaceAll("");
+
+		int cleanfrom = 0;
+
+		while (true) {
+			int amp = working.indexOf('&', cleanfrom);
+			// If there are no more amps then we are done
+			if (amp == -1) {
+				break;
+			}
+			// Skip references of the kind &#ddd;
+			if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) {
+				cleanfrom = working.indexOf(';', amp) + 1;
+				continue;
+			}
+			int i = amp + 1;
+			while (true) {
+				// if we are at the end of the string then just escape the '&';
+				if (i >= working.length()) {
+					return working.substring(0, amp) + "&amp;" + working.substring(amp + 1);
+				}
+				// if we have come to a ; then we have an entity
+				// If it is something that xml can't handle then replace it.
+				final char c = working.charAt(i);
+				if (c == ';') {
+					final String entity = working.substring(amp, i + 1);
+					final String replace = handleEntity(entity);
+					working = working.substring(0, amp) + replace + working.substring(i + 1);
+					break;
+				}
+				// Did we end an entity without finding a closing ;
+				// Then treat it as an '&' that needs to be replaced with &amp;
+				if (!Character.isLetterOrDigit(c)) {
+					working = working.substring(0, amp) + "&amp;" + working.substring(amp + 1);
+					amp = i + 4; // account for the 4 extra characters
+					break;
+				}
+				i++;
+			}
+			cleanfrom = amp + 1;
+		}
+
+		if (Pattern.compile("<<").matcher(working).find()) {
+			working = working.replaceAll("<<", "&lt;&lt;");
+		}
+
+		if (Pattern.compile(">>").matcher(working).find()) {
+			working = working.replaceAll(">>", "&gt;&gt;");
+		}
+
+		return working;
+	}
+
+	/**
+	 * Replace entity with its unicode equivalent, if it is not a valid XML entity. Otherwise strip it out. XML only
+	 * allows 4 entities: &amp;, &quot;, &lt; and &gt;.
+	 *
+	 * @param entity the entity to be replaced
+	 * @return the substitution for the entity, either itself, the unicode equivalent or an empty string.
+	 */
+	private static String handleEntity(final String entity) {
+		if (goodEntities.contains(entity)) {
+			return entity;
+		}
+
+		final String replace = badEntities.get(entity);
+		if (replace != null) {
+			return replace;
+		}
+
+		return replace != null ?
replace : ""; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java deleted file mode 100644 index f40962c21..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorException.java +++ /dev/null @@ -1,32 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker; - -public class DnetCollectorException extends Exception { - - /** */ - private static final long serialVersionUID = -290723075076039757L; - - public DnetCollectorException() { - super(); - } - - public DnetCollectorException( - final String message, - final Throwable cause, - final boolean enableSuppression, - final boolean writableStackTrace) { - super(message, cause, enableSuppression, writableStackTrace); - } - - public DnetCollectorException(final String message, final Throwable cause) { - super(message, cause); - } - - public DnetCollectorException(final String message) { - super(message); - } - - public DnetCollectorException(final Throwable cause) { - super(cause); - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java deleted file mode 100644 index e686ad518..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorker.java +++ /dev/null @@ -1,139 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker; - -import java.io.IOException; -import java.net.URI; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.atomic.AtomicInteger; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.IntWritable; -import org.apache.hadoop.io.SequenceFile; -import org.apache.hadoop.io.Text; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.collector.worker.model.ApiDescriptor; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; -import eu.dnetlib.message.Message; -import eu.dnetlib.message.MessageManager; -import eu.dnetlib.message.MessageType; - -public class DnetCollectorWorker { - - private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorker.class); - - private final CollectorPluginFactory collectorPluginFactory; - - private final ArgumentApplicationParser argumentParser; - - private final MessageManager manager; - - public DnetCollectorWorker( - final CollectorPluginFactory collectorPluginFactory, - final ArgumentApplicationParser argumentParser, - final MessageManager manager) - throws DnetCollectorException { - this.collectorPluginFactory = collectorPluginFactory; - this.argumentParser = argumentParser; - this.manager = manager; - } - - public void collect() throws DnetCollectorException { - try { - final ObjectMapper jsonMapper = new ObjectMapper(); - final ApiDescriptor api = jsonMapper.readValue(argumentParser.get("apidescriptor"), ApiDescriptor.class); - - final CollectorPlugin plugin = collectorPluginFactory.getPluginByProtocol(api.getProtocol()); - - final String hdfsuri = argumentParser.get("namenode"); - - // ====== Init HDFS File 
System Object - Configuration conf = new Configuration(); - // Set FileSystem URI - conf.set("fs.defaultFS", hdfsuri); - // Because of Maven - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - - System.setProperty("HADOOP_USER_NAME", argumentParser.get("userHDFS")); - System.setProperty("hadoop.home.dir", "/"); - // Get the filesystem - HDFS - FileSystem.get(URI.create(hdfsuri), conf); - Path hdfswritepath = new Path(argumentParser.get("hdfsPath")); - - log.info("Created path " + hdfswritepath.toString()); - - final Map ongoingMap = new HashMap<>(); - final Map reportMap = new HashMap<>(); - final AtomicInteger counter = new AtomicInteger(0); - try (SequenceFile.Writer writer = SequenceFile - .createWriter( - conf, - SequenceFile.Writer.file(hdfswritepath), - SequenceFile.Writer.keyClass(IntWritable.class), - SequenceFile.Writer.valueClass(Text.class))) { - final IntWritable key = new IntWritable(counter.get()); - final Text value = new Text(); - plugin - .collect(api) - .forEach( - content -> { - key.set(counter.getAndIncrement()); - value.set(content); - if (counter.get() % 10 == 0) { - try { - ongoingMap.put("ongoing", "" + counter.get()); - log - .debug( - "Sending message: " - + manager - .sendMessage( - new Message( - argumentParser.get("workflowId"), - "Collection", - MessageType.ONGOING, - ongoingMap), - argumentParser.get("rabbitOngoingQueue"), - true, - false)); - } catch (Exception e) { - log.error("Error on sending message ", e); - } - } - try { - writer.append(key, value); - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - } - ongoingMap.put("ongoing", "" + counter.get()); - manager - .sendMessage( - new Message( - argumentParser.get("workflowId"), "Collection", MessageType.ONGOING, ongoingMap), - argumentParser.get("rabbitOngoingQueue"), - true, - false); - reportMap.put("collected", "" + counter.get()); - manager - .sendMessage( - new Message( - argumentParser.get("workflowId"), "Collection", MessageType.REPORT, reportMap), - argumentParser.get("rabbitOngoingQueue"), - true, - false); - manager.close(); - } catch (Throwable e) { - throw new DnetCollectorException("Error on collecting ", e); - } - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java deleted file mode 100644 index da30e8793..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/DnetCollectorWorkerApplication.java +++ /dev/null @@ -1,49 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker; - -import org.apache.commons.io.IOUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; -import eu.dnetlib.message.MessageManager; - -/** - * DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. 
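As an aside, the SequenceFile plumbing in the removed DnetCollectorWorker above follows a standard Hadoop pattern: an incrementing IntWritable key and a Text value per collected record. A self-contained sketch of that write loop, with the URI, path and records all invented for illustration:

    import java.util.Arrays;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.io.IntWritable;
    import org.apache.hadoop.io.SequenceFile;
    import org.apache.hadoop.io.Text;

    public class SequenceFileWriteSketch {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            conf.set("fs.defaultFS", "hdfs://namenode:8020"); // assumed example URI
            IntWritable key = new IntWritable();
            Text value = new Text();
            try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                    SequenceFile.Writer.file(new Path("/tmp/mdstore-sketch/records.seq")), // invented path
                    SequenceFile.Writer.keyClass(IntWritable.class),
                    SequenceFile.Writer.valueClass(Text.class))) {
                int counter = 0;
                for (String record : Arrays.asList("<record>1</record>", "<record>2</record>")) {
                    key.set(counter++); // incrementing key, as in the worker above
                    value.set(record); // the collected XML payload
                    writer.append(key, value);
                }
            }
        }
    }
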
This module - * will be executed on the hadoop cluster and taking in input some parameters that tells it which is the right collector - * plugin to use and where store the data into HDFS path - * - * @author Sandro La Bruzzo - */ -public class DnetCollectorWorkerApplication { - - private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorkerApplication.class); - - private static final CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory(); - - private static ArgumentApplicationParser argumentParser; - - /** @param args */ - public static void main(final String[] args) throws Exception { - - argumentParser = new ArgumentApplicationParser( - IOUtils - .toString( - DnetCollectorWorker.class - .getResourceAsStream( - "/eu/dnetlib/collector/worker/collector_parameter.json"))); - argumentParser.parseArgument(args); - log.info("hdfsPath =" + argumentParser.get("hdfsPath")); - log.info("json = " + argumentParser.get("apidescriptor")); - final MessageManager manager = new MessageManager( - argumentParser.get("rabbitHost"), - argumentParser.get("rabbitUser"), - argumentParser.get("rabbitPassword"), - false, - false, - null); - final DnetCollectorWorker worker = new DnetCollectorWorker(collectorPluginFactory, argumentParser, manager); - worker.collect(); - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java deleted file mode 100644 index dcaf0ea56..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginErrorLogList.java +++ /dev/null @@ -1,19 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker.utils; - -import java.util.LinkedList; - -public class CollectorPluginErrorLogList extends LinkedList { - - private static final long serialVersionUID = -6925786561303289704L; - - @Override - public String toString() { - String log = ""; - int index = 0; - for (final String errorMessage : this) { - log += String.format("Retry #%s: %s / ", index++, errorMessage); - } - return log; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java deleted file mode 100644 index 7a0028e79..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/CollectorPluginFactory.java +++ /dev/null @@ -1,20 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker.utils; - -import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; -import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; - -public class CollectorPluginFactory { - - public CollectorPlugin getPluginByProtocol(final String protocol) throws DnetCollectorException { - if (protocol == null) - throw new DnetCollectorException("protocol cannot be null"); - switch (protocol.toLowerCase().trim()) { - case "oai": - return new OaiCollectorPlugin(); - default: - throw new DnetCollectorException("UNknown protocol"); - } - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java deleted file mode 100644 index 5d6108fad..000000000 --- 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/HttpConnector.java +++ /dev/null @@ -1,244 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker.utils; - -import java.io.IOException; -import java.io.InputStream; -import java.net.*; -import java.security.GeneralSecurityException; -import java.security.cert.X509Certificate; -import java.util.List; -import java.util.Map; - -import javax.net.ssl.HttpsURLConnection; -import javax.net.ssl.SSLContext; -import javax.net.ssl.TrustManager; -import javax.net.ssl.X509TrustManager; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang.math.NumberUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; - -import eu.dnetlib.dhp.collection.worker.DnetCollectorException; - -public class HttpConnector { - - private static final Log log = LogFactory.getLog(HttpConnector.class); - - private int maxNumberOfRetry = 6; - private int defaultDelay = 120; // seconds - private int readTimeOut = 120; // seconds - - private String responseType = null; - - private final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)"; - - public HttpConnector() { - CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL)); - } - - /** - * Given the URL returns the content via HTTP GET - * - * @param requestUrl the URL - * @return the content of the downloaded resource - * @throws DnetCollectorException when retrying more than maxNumberOfRetry times - */ - public String getInputSource(final String requestUrl) throws DnetCollectorException { - return attemptDownlaodAsString(requestUrl, 1, new CollectorPluginErrorLogList()); - } - - /** - * Given the URL returns the content as a stream via HTTP GET - * - * @param requestUrl the URL - * @return the content of the downloaded resource as InputStream - * @throws DnetCollectorException when retrying more than maxNumberOfRetry times - */ - public InputStream getInputSourceAsStream(final String requestUrl) throws DnetCollectorException { - return attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - } - - private String attemptDownlaodAsString( - final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) - throws DnetCollectorException { - try { - final InputStream s = attemptDownload(requestUrl, 1, new CollectorPluginErrorLogList()); - try { - return IOUtils.toString(s); - } catch (final IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownlaodAsString(requestUrl, retryNumber + 1, errorList); - } finally { - IOUtils.closeQuietly(s); - } - } catch (final InterruptedException e) { - throw new DnetCollectorException(e); - } - } - - private InputStream attemptDownload( - final String requestUrl, final int retryNumber, final CollectorPluginErrorLogList errorList) - throws DnetCollectorException { - - if (retryNumber > maxNumberOfRetry) { - throw new DnetCollectorException("Max number of retries exceeded. 
Cause: \n " + errorList); - } - - log.debug("Downloading " + requestUrl + " - try: " + retryNumber); - try { - InputStream input = null; - - try { - final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection(); - urlConn.setInstanceFollowRedirects(false); - urlConn.setReadTimeout(readTimeOut * 1000); - urlConn.addRequestProperty("User-Agent", userAgent); - - if (log.isDebugEnabled()) { - logHeaderFields(urlConn); - } - - final int retryAfter = obtainRetryAfter(urlConn.getHeaderFields()); - if (retryAfter > 0 && urlConn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE) { - log.warn("waiting and repeating request after " + retryAfter + " sec."); - Thread.sleep(retryAfter * 1000); - errorList.add("503 Service Unavailable"); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else if (urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_PERM - || urlConn.getResponseCode() == HttpURLConnection.HTTP_MOVED_TEMP) { - final String newUrl = obtainNewLocation(urlConn.getHeaderFields()); - log.debug("The requested url has been moved to " + newUrl); - errorList - .add( - String - .format( - "%s %s. Moved to: %s", - urlConn.getResponseCode(), urlConn.getResponseMessage(), newUrl)); - urlConn.disconnect(); - return attemptDownload(newUrl, retryNumber + 1, errorList); - } else if (urlConn.getResponseCode() != HttpURLConnection.HTTP_OK) { - log - .error( - String - .format( - "HTTP error: %s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - Thread.sleep(defaultDelay * 1000); - errorList - .add( - String.format("%s %s", urlConn.getResponseCode(), urlConn.getResponseMessage())); - urlConn.disconnect(); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } else { - input = urlConn.getInputStream(); - responseType = urlConn.getContentType(); - return input; - } - } catch (final IOException e) { - log.error("error while retrieving from http-connection occured: " + requestUrl, e); - Thread.sleep(defaultDelay * 1000); - errorList.add(e.getMessage()); - return attemptDownload(requestUrl, retryNumber + 1, errorList); - } - } catch (final InterruptedException e) { - throw new DnetCollectorException(e); - } - } - - private void logHeaderFields(final HttpURLConnection urlConn) throws IOException { - log.debug("StatusCode: " + urlConn.getResponseMessage()); - - for (final Map.Entry> e : urlConn.getHeaderFields().entrySet()) { - if (e.getKey() != null) { - for (final String v : e.getValue()) { - log.debug(" key: " + e.getKey() + " - value: " + v); - } - } - } - } - - private int obtainRetryAfter(final Map> headerMap) { - for (final String key : headerMap.keySet()) { - if (key != null - && key.toLowerCase().equals("retry-after") - && headerMap.get(key).size() > 0 - && NumberUtils.isNumber(headerMap.get(key).get(0))) { - return Integer.parseInt(headerMap.get(key).get(0)) + 10; - } - } - return -1; - } - - private String obtainNewLocation(final Map> headerMap) - throws DnetCollectorException { - for (final String key : headerMap.keySet()) { - if (key != null && key.toLowerCase().equals("location") && headerMap.get(key).size() > 0) { - return headerMap.get(key).get(0); - } - } - throw new DnetCollectorException( - "The requested url has been MOVED, but 'location' param is MISSING"); - } - - /** - * register for https scheme; this is a workaround and not intended for the use in trusted environments - */ - public void initTrustManager() { - final X509TrustManager tm = new X509TrustManager() { - - @Override - 
public void checkClientTrusted(final X509Certificate[] xcs, final String string) { - } - - @Override - public void checkServerTrusted(final X509Certificate[] xcs, final String string) { - } - - @Override - public X509Certificate[] getAcceptedIssuers() { - return null; - } - }; - try { - final SSLContext ctx = SSLContext.getInstance("TLS"); - ctx.init(null, new TrustManager[] { - tm - }, null); - HttpsURLConnection.setDefaultSSLSocketFactory(ctx.getSocketFactory()); - } catch (final GeneralSecurityException e) { - log.fatal(e); - throw new IllegalStateException(e); - } - } - - public int getMaxNumberOfRetry() { - return maxNumberOfRetry; - } - - public void setMaxNumberOfRetry(final int maxNumberOfRetry) { - this.maxNumberOfRetry = maxNumberOfRetry; - } - - public int getDefaultDelay() { - return defaultDelay; - } - - public void setDefaultDelay(final int defaultDelay) { - this.defaultDelay = defaultDelay; - } - - public int getReadTimeOut() { - return readTimeOut; - } - - public void setReadTimeOut(final int readTimeOut) { - this.readTimeOut = readTimeOut; - } - - public String getResponseType() { - return responseType; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java deleted file mode 100644 index 44aeb4d02..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/worker/utils/XmlCleaner.java +++ /dev/null @@ -1,383 +0,0 @@ - -package eu.dnetlib.dhp.collection.worker.utils; - -import java.util.HashMap; -import java.util.HashSet; -import java.util.Map; -import java.util.Set; -import java.util.regex.Pattern; - -/** @author jochen, Andreas Czerniak */ -public class XmlCleaner { - - /** Pattern for numeric entities. */ - private static final Pattern validCharacterEntityPattern = Pattern.compile("^&#x?\\d{2,4};"); // $NON-NLS-1$ - // private static Pattern validCharacterEntityPattern = Pattern.compile("^&#?\\d{2,4};"); - // //$NON-NLS-1$ - - // see https://www.w3.org/TR/REC-xml/#charsets , not only limited to - private static final Pattern invalidControlCharPattern = Pattern.compile("&#x?1[0-9a-fA-F];"); - - /** - * Pattern that negates the allowable XML 4 byte unicode characters. 
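Stepping back to the HttpConnector removed above: its recursive attemptDownload implements bounded retries, honouring Retry-After on 503 responses and following 301/302 redirects. An iterative sketch of the same idea, with the retry bound and delays taken from the defaults above; redirect handling is omitted and everything else is assumed:

    import java.io.IOException;
    import java.io.InputStream;
    import java.net.HttpURLConnection;
    import java.net.URL;

    public class RetrySketch {
        static InputStream fetchWithRetry(String url) throws IOException, InterruptedException {
            final int maxNumberOfRetry = 6; // default of the removed HttpConnector
            final int defaultDelay = 120; // seconds
            for (int attempt = 1; attempt <= maxNumberOfRetry; attempt++) {
                HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
                conn.setReadTimeout(120 * 1000);
                if (conn.getResponseCode() == HttpURLConnection.HTTP_OK) {
                    return conn.getInputStream();
                }
                String retryAfter = conn.getHeaderField("Retry-After");
                long waitSec = (conn.getResponseCode() == HttpURLConnection.HTTP_UNAVAILABLE
                        && retryAfter != null)
                            ? Long.parseLong(retryAfter.trim()) + 10 // mirrors obtainRetryAfter's +10; assumes a numeric header
                            : defaultDelay;
                conn.disconnect();
                Thread.sleep(waitSec * 1000L); // back off before retrying
            }
            throw new IOException("Max number of retries exceeded");
        }
    }
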
Valid are: #x9 | #xA | #xD | [#x20-#xD7FF] | - * [#xE000-#xFFFD] | [#x10000-#x10FFFF] - */ - private static final Pattern invalidCharacterPattern = Pattern.compile("[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD]"); // $NON-NLS-1$ - - // Map entities to their unicode equivalent - private static final Set goodEntities = new HashSet<>(); - private static final Map badEntities = new HashMap<>(); - - static { - // pre-defined XML entities - goodEntities.add("""); // $NON-NLS-1$ // quotation mark - goodEntities.add("&"); // $NON-NLS-1$ // ampersand - goodEntities.add("<"); // $NON-NLS-1$ // less-than sign - goodEntities.add(">"); // $NON-NLS-1$ // greater-than sign - // control entities - // badEntities.put(" ", ""); - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("€", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‚", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("ƒ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("„", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("…", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("†", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‡", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("ˆ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‰", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Š", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‹", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Œ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Ž", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("‘", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("’", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("“", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("”", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("•", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("–", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("—", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("˜", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("™", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("š", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("›", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("œ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("ž", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - badEntities.put("Ÿ", " "); // $NON-NLS-1$ //$NON-NLS-2$ // illegal HTML character - // misc entities - badEntities.put("€", "\u20AC"); // $NON-NLS-1$ 
//$NON-NLS-2$ // euro - badEntities.put("‘", "\u2018"); // $NON-NLS-1$ //$NON-NLS-2$ // left single quotation - // mark - badEntities.put("’", "\u2019"); // $NON-NLS-1$ //$NON-NLS-2$ // right single quotation - // mark - // Latin 1 entities - badEntities.put(" ", "\u00A0"); // $NON-NLS-1$ //$NON-NLS-2$ // no-break space - badEntities.put("¡", "\u00A1"); // $NON-NLS-1$ //$NON-NLS-2$ // inverted exclamation - // mark - badEntities.put("¢", "\u00A2"); // $NON-NLS-1$ //$NON-NLS-2$ // cent sign - badEntities.put("£", "\u00A3"); // $NON-NLS-1$ //$NON-NLS-2$ // pound sign - badEntities.put("¤", "\u00A4"); // $NON-NLS-1$ //$NON-NLS-2$ // currency sign - badEntities.put("¥", "\u00A5"); // $NON-NLS-1$ //$NON-NLS-2$ // yen sign - badEntities.put("¦", "\u00A6"); // $NON-NLS-1$ //$NON-NLS-2$ // broken vertical bar - badEntities.put("§", "\u00A7"); // $NON-NLS-1$ //$NON-NLS-2$ // section sign - badEntities.put("¨", "\u00A8"); // $NON-NLS-1$ //$NON-NLS-2$ // diaeresis - badEntities.put("©", "\u00A9"); // $NON-NLS-1$ //$NON-NLS-2$ // copyright sign - badEntities.put("ª", "\u00AA"); // $NON-NLS-1$ //$NON-NLS-2$ // feminine ordinal - // indicator - badEntities.put("«", "\u00AB"); // $NON-NLS-2$ // left-pointing double angle - // quotation mark - badEntities.put("¬", "\u00AC"); // $NON-NLS-1$ //$NON-NLS-2$ // not sign - badEntities.put("­", "\u00AD"); // $NON-NLS-1$ //$NON-NLS-2$ // soft hyphen - badEntities.put("®", "\u00AE"); // $NON-NLS-1$ //$NON-NLS-2$ // registered sign - badEntities.put("¯", "\u00AF"); // $NON-NLS-1$ //$NON-NLS-2$ // macron - badEntities.put("°", "\u00B0"); // $NON-NLS-1$ //$NON-NLS-2$ // degree sign - badEntities.put("±", "\u00B1"); // $NON-NLS-1$ //$NON-NLS-2$ // plus-minus sign - badEntities.put("²", "\u00B2"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript two - badEntities.put("³", "\u00B3"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript three - badEntities.put("´", "\u00B4"); // $NON-NLS-1$ //$NON-NLS-2$ // acute accent - badEntities.put("µ", "\u00B5"); // $NON-NLS-1$ //$NON-NLS-2$ // micro sign - badEntities.put("¶", "\u00B6"); // $NON-NLS-1$ //$NON-NLS-2$ // pilcrow sign - badEntities.put("·", "\u00B7"); // $NON-NLS-1$ //$NON-NLS-2$ // middle dot - badEntities.put("¸", "\u00B8"); // $NON-NLS-1$ //$NON-NLS-2$ // cedilla - badEntities.put("¹", "\u00B9"); // $NON-NLS-1$ //$NON-NLS-2$ // superscript one - badEntities.put("º", "\u00BA"); // $NON-NLS-1$ //$NON-NLS-2$ // masculine ordinal - // indicator - badEntities.put("»", "\u00BB"); // $NON-NLS-2$ // right-pointing double - // angle quotation - // mark - badEntities.put("¼", "\u00BC"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one - // quarter - badEntities.put("½", "\u00BD"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction one - // half - badEntities.put("¾", "\u00BE"); // $NON-NLS-1$ //$NON-NLS-2$ // vulgar fraction three - // quarters - badEntities.put("¿", "\u00BF"); // $NON-NLS-1$ //$NON-NLS-2$ // inverted question - // mark - badEntities.put("À", "\u00C0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // A - // with grave - badEntities.put("Á", "\u00C1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // A - // with acute - badEntities.put("Â", "\u00C2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A - // with circumflex - badEntities.put("Ã", "\u00C3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // A - // with tilde - badEntities.put("Ä", "\u00C4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter A - // with - // diaeresis - badEntities.put("Å", "\u00C5"); // $NON-NLS-1$ 
//$NON-NLS-2$ // latin capital letter A - // with ring above - badEntities.put("Æ", "\u00C6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // AE - badEntities.put("Ç", "\u00C7"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // C - // with cedilla - badEntities.put("È", "\u00C8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // E - // with grave - badEntities.put("É", "\u00C9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // E - // with acute - badEntities.put("Ê", "\u00CA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E - // with circumflex - badEntities.put("Ë", "\u00CB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter E - // with - // diaeresis - badEntities.put("Ì", "\u00CC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // I - // with grave - badEntities.put("Í", "\u00CD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // I - // with acute - badEntities.put("Î", "\u00CE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I - // with circumflex - badEntities.put("Ï", "\u00CF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter I - // with - // diaeresis - badEntities.put("Ð", "\u00D0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter ETH - badEntities.put("Ñ", "\u00D1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // N - // with tilde - badEntities.put("Ò", "\u00D2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // O - // with grave - badEntities.put("Ó", "\u00D3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // O - // with acute - badEntities.put("Ô", "\u00D4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O - // with circumflex - badEntities.put("Õ", "\u00D5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // O - // with tilde - badEntities.put("Ö", "\u00D6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter O - // with - // diaeresis - badEntities.put("×", "\u00D7"); // $NON-NLS-1$ //$NON-NLS-2$ // multiplication sign - badEntities.put("Ø", "\u00D8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // O - // with stroke - badEntities.put("Ù", "\u00D9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // U - // with grave - badEntities.put("Ú", "\u00DA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // U - // with acute - badEntities.put("Û", "\u00DB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U - // with circumflex - badEntities.put("Ü", "\u00DC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter U - // with - // diaeresis - badEntities.put("Ý", "\u00DD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // Y - // with acute - badEntities.put("Þ", "\u00DE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin capital letter - // THORN - badEntities.put("ß", "\u00DF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter - // sharp s - badEntities.put("à", "\u00E0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // grave - badEntities.put("á", "\u00E1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // acute - badEntities.put("â", "\u00E2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // circumflex - badEntities.put("ã", "\u00E3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // tilde - badEntities.put("ä", "\u00E4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // diaeresis - badEntities.put("å", "\u00E5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter a - // with - // ring above - badEntities.put("æ", "\u00E6"); 
// $NON-NLS-1$ //$NON-NLS-2$ // latin small letter ae - badEntities.put("ç", "\u00E7"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter c - // with - // cedilla - badEntities.put("è", "\u00E8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e - // with - // grave - badEntities.put("é", "\u00E9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e - // with - // acute - badEntities.put("ê", "\u00EA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e - // with - // circumflex - badEntities.put("ë", "\u00EB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter e - // with - // diaeresis - badEntities.put("ì", "\u00EC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i - // with - // grave - badEntities.put("í", "\u00ED"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i - // with - // acute - badEntities.put("î", "\u00EE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i - // with - // circumflex - badEntities.put("ï", "\u00EF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter i - // with - // diaeresis - badEntities.put("ð", "\u00F0"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter eth - badEntities.put("ñ", "\u00F1"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter n - // with - // tilde - badEntities.put("ò", "\u00F2"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // grave - badEntities.put("ó", "\u00F3"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // acute - badEntities.put("ô", "\u00F4"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // circumflex - badEntities.put("õ", "\u00F5"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // tilde - badEntities.put("ö", "\u00F6"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // diaeresis - badEntities.put("÷", "\u00F7"); // $NON-NLS-1$ //$NON-NLS-2$ // division sign - badEntities.put("ø", "\u00F8"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter o - // with - // stroke - badEntities.put("ù", "\u00F9"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u - // with - // grave - badEntities.put("ú", "\u00FA"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u - // with - // acute - badEntities.put("û", "\u00FB"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u - // with - // circumflex - badEntities.put("ü", "\u00FC"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter u - // with - // diaeresis - badEntities.put("ý", "\u00FD"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter y - // with - // acute - badEntities.put("þ", "\u00FE"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter - // thorn - badEntities.put("ÿ", "\u00FF"); // $NON-NLS-1$ //$NON-NLS-2$ // latin small letter y - // with - // diaeresis - } - - /** - * For each entity in the input that is not allowed in XML, replace the entity with its unicode equivalent or remove - * it. For each instance of a bare {@literal &}, replace it with {@literal - * &
- } XML only allows 4 entities: {@literal &amp;}, {@literal &quot;}, {@literal - * &lt;} and {@literal &gt;}. - * - * @param broken the string to handle entities - * @return the string with entities appropriately fixed up - */ - public static String cleanAllEntities(final String broken) { - if (broken == null) { - return null; - } - - String working = invalidControlCharPattern.matcher(broken).replaceAll(""); - working = invalidCharacterPattern.matcher(working).replaceAll(""); - - int cleanfrom = 0; - - while (true) { - int amp = working.indexOf('&', cleanfrom); - // If there are no more amps then we are done - if (amp == -1) { - break; - } - // Skip references of the kind &#ddd; - if (validCharacterEntityPattern.matcher(working.substring(amp)).find()) { - cleanfrom = working.indexOf(';', amp) + 1; - continue; - } - int i = amp + 1; - while (true) { - // if we are at the end of the string then just escape the '&'; - if (i >= working.length()) { - return working.substring(0, amp) + "&amp;" + working.substring(amp + 1); // $NON-NLS-1$ - } - // if we have come to a ; then we have an entity - // If it is something that xml can't handle then replace it. - final char c = working.charAt(i); - if (c == ';') { - final String entity = working.substring(amp, i + 1); - final String replace = handleEntity(entity); - working = working.substring(0, amp) + replace + working.substring(i + 1); - break; - } - // Did we end an entity without finding a closing ; - // Then treat it as an '&' that needs to be replaced with &amp; - if (!Character.isLetterOrDigit(c)) { - working = working.substring(0, amp) + "&amp;" + working.substring(amp + 1); // $NON-NLS-1$ - amp = i + 4; // account for the 4 extra characters - break; - } - i++; - } - cleanfrom = amp + 1; - } - - if (Pattern.compile("<<").matcher(working).find()) { - working = working.replaceAll("<<", "&lt;&lt;"); - } - - if (Pattern.compile(">>").matcher(working).find()) { - working = working.replaceAll(">>", "&gt;&gt;"); - } - - return working; - } - - /** - * Replace entity with its unicode equivalent, if it is not a valid XML entity. Otherwise strip it out. XML only - * allows 4 entities: &amp;, &quot;, &lt; and &gt;. - * - * @param entity the entity to be replaced - * @return the substitution for the entity, either itself, the unicode equivalent or an empty string. - */ - private static String handleEntity(final String entity) { - if (goodEntities.contains(entity)) { - return entity; - } - - final String replace = badEntities.get(entity); - if (replace != null) { - return replace; - } - - return replace != null ? replace : ""; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java new file mode 100644 index 000000000..3fb814606 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java @@ -0,0 +1,266 @@ + +package eu.dnetlib.dhp.sx.bio.pubmed; + +import java.io.Serializable; + +import java.util.ArrayList; +import java.util.List; + +/** + * This class represents an instance of a Pubmed Article extracted from the native XML + * + * @author Sandro La Bruzzo + */ + +public class PMArticle implements Serializable { + + /** + * the Pubmed Identifier + */ + private String pmid; + + private String pmcId; + + /** + * the DOI + */ + private String doi; + /** + * the Pubmed Date extracted from <PubMedPubDate>, which specifies a date significant to either the article's history or the citation's processing.
+ * All dates will have <Year>, <Month>, and <Day> elements. Some may have <Hour>, <Minute>, and <Second> element(s). + */ + private String date; + /** + * This is an 'envelope' element that contains various elements describing the journal cited; i.e., ISSN, Volume, Issue, and PubDate and author name(s), however, it does not contain data itself. + */ + private PMJournal journal; + /** + * The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. The NLM journal title abbreviation is exported in the <MedlineTA> element. + */ + private String title; + /** + * English-language abstracts are taken directly from the published article. + * If the article does not have a published abstract, the National Library of Medicine does not create one, + * thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally + * labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used. + */ + private String description; + /** + * the language in which an article was published is recorded in <Language>. + * All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single + * record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value. + * Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined. + */ + private String language; + + /** + * NLM controlled vocabulary, Medical Subject Headings (MeSH®), is used to characterize the content of the articles represented by MEDLINE citations. + */ + private final List<PMSubject> subjects = new ArrayList<>(); + /** + * This element is used to identify the type of article indexed for MEDLINE; + * it characterizes the nature of the information or the manner in which it is conveyed as well as the type of + * research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural). + */ + private final List<String> publicationTypes = new ArrayList<>(); + /** + * Personal and collective (corporate) author names published with the article are found in <AuthorList>. + */ + private List<PMAuthor> authors = new ArrayList<>(); + + /** + * <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service + * or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations. + */ + private final List<PMGrant> grants = new ArrayList<>(); + + /** + * get the DOI + * @return a DOI + */ + public String getDoi() { + return doi; + } + + /** + * Set the DOI + * @param doi a DOI + */ + public void setDoi(String doi) { + this.doi = doi; + } + + /** + * get the Pubmed Identifier + * @return the PMID + */ + public String getPmid() { + return pmid; + } + + /** + * set the Pubmed Identifier + * @param pmid the Pubmed Identifier + */ + public void setPmid(String pmid) { + this.pmid = pmid; + } + + /** + * the Pubmed Date extracted from <PubMedPubDate>, which specifies a date significant to either the article's history or the citation's processing. + * All dates will have <Year>, <Month>, and <Day> elements. Some may have <Hour>, <Minute>, and <Second> element(s).
+ * + * @return the Pubmed Date + */ + public String getDate() { + return date; + } + + /** + * Set the pubmed Date + * @param date the Pubmed Date + */ + public void setDate(String date) { + this.date = date; + } + + /** + * The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. + * Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. + * The NLM journal title abbreviation is exported in the <MedlineTA> element. + * + * @return the pubmed Journal Extracted + */ + public PMJournal getJournal() { + return journal; + } + + /** + * Set the mapped pubmed Journal + * @param journal the mapped pubmed Journal + */ + public void setJournal(PMJournal journal) { + this.journal = journal; + } + + /** + * <ArticleTitle> contains the entire title of the journal article. <ArticleTitle> is always in English; + * those titles originally published in a non-English language and translated for MEDLINE are enclosed in square brackets. + * All titles end with a period unless another punctuation mark such as a question mark or bracket is present. + * Explanatory information about the title itself is enclosed in parentheses, e.g.: (author's transl). + * Corporate/collective authors may appear at the end of <ArticleTitle> for citations up to about the year 2000. + * + * @return the extracted pubmed Title + */ + public String getTitle() { + return title; + } + + /** + * set the pubmed title + * @param title the pubmed title + */ + public void setTitle(String title) { + this.title = title; + } + + /** + * English-language abstracts are taken directly from the published article. + * If the article does not have a published abstract, the National Library of Medicine does not create one, + * thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally + * labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used. + * + * @return the Mapped Pubmed Article Abstracts + */ + public String getDescription() { + return description; + } + + /** + * Set the Mapped Pubmed Article Abstracts + * @param description the Mapped Pubmed Article Abstracts + */ + public void setDescription(String description) { + this.description = description; + } + + /** + * Personal and collective (corporate) author names published with the article are found in <AuthorList>. + * + * @return get the Mapped Authors lists + */ + public List<PMAuthor> getAuthors() { + return authors; + } + + /** + * Set the Mapped Authors lists + * @param authors the Mapped Authors lists + */ + public void setAuthors(List<PMAuthor> authors) { + this.authors = authors; + } + + /** + * NLM controlled vocabulary, Medical Subject Headings (MeSH®), is used to characterize the content of the articles + * represented by MEDLINE citations. + * + * @return the mapped Subjects + */ + public List<PMSubject> getSubjects() { + return subjects; + } + + /** + * + * the language in which an article was published is recorded in <Language>. + * All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single + * record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value. + * Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined.
+ * + * @return The mapped Language + */ + public String getLanguage() { + return language; + } + + /** + * + * Set The mapped Language + * + * @param language the mapped Language + */ + public void setLanguage(String language) { + this.language = language; + } + + /** + * This element is used to identify the type of article indexed for MEDLINE; + * it characterizes the nature of the information or the manner in which it is conveyed as well as the type of + * research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural). + * + * @return the mapped Publication Type + */ + public List<String> getPublicationTypes() { + return publicationTypes; + } + + /** + * <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service + * or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations. + * @return the mapped grants + */ + + public List<PMGrant> getGrants() { + return grants; + } + + public String getPmcId() { + return pmcId; + } + + public PMArticle setPmcId(String pmcId) { + this.pmcId = pmcId; + return this; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java new file mode 100644 index 000000000..68ef6459e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java @@ -0,0 +1,62 @@ + +package eu.dnetlib.dhp.sx.bio.pubmed; + +import java.io.Serializable; + +/** + * The type Pubmed author. + * + * @author Sandro La Bruzzo + */ +public class PMAuthor implements Serializable { + + private String lastName; + private String foreName; + + /** + * Gets last name. + * + * @return the last name + */ + public String getLastName() { + return lastName; + } + + /** + * Sets last name. + * + * @param lastName the last name + */ + public void setLastName(String lastName) { + this.lastName = lastName; + } + + /** + * Gets fore name. + * + * @return the fore name + */ + public String getForeName() { + return foreName; + } + + /** + * Sets fore name. + * + * @param foreName the fore name + */ + public void setForeName(String foreName) { + this.foreName = foreName; + } + + /** + * Gets full name. + * + * @return the full name + */ + public String getFullName() { + return String + .format("%s, %s", this.foreName != null ? this.foreName : "", this.lastName != null ? this.lastName : ""); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMGrant.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMGrant.java new file mode 100644 index 000000000..abb908483 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMGrant.java @@ -0,0 +1,87 @@ + +package eu.dnetlib.dhp.sx.bio.pubmed; + +/** + * The type Pm grant. + * + * @author Sandro La Bruzzo + */ +public class PMGrant { + + private String grantID; + private String agency; + private String country; + + /** + * Instantiates a new Pm grant. + */ + public PMGrant() { + } + + /** + * Instantiates a new Pm grant.
+ * + * @param grantID the grant id + * @param agency the agency + * @param country the country + */ + public PMGrant(String grantID, String agency, String country) { + this.grantID = grantID; + this.agency = agency; + this.country = country; + } + + /** + * Gets grant id. + * + * @return the grant id + */ + public String getGrantID() { + return grantID; + } + + /** + * Sets grant id. + * + * @param grantID the grant id + */ + public void setGrantID(String grantID) { + this.grantID = grantID; + } + + /** + * Gets agency. + * + * @return the agency + */ + public String getAgency() { + return agency; + } + + /** + * Sets agency. + * + * @param agency the agency + */ + public void setAgency(String agency) { + this.agency = agency; + } + + /** + * Gets country. + * + * @return the country + */ + public String getCountry() { + return country; + } + + /** + * Sets country. + * + * @param country the country + */ + public void setCountry(String country) { + this.country = country; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMJournal.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMJournal.java new file mode 100644 index 000000000..731648839 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMJournal.java @@ -0,0 +1,108 @@ + +package eu.dnetlib.dhp.sx.bio.pubmed; + +import java.io.Serializable; + +/** + * The type Pm journal. + * + * @author Sandro La Bruzzo + */ +public class PMJournal implements Serializable { + + private String issn; + private String volume; + private String issue; + private String date; + private String title; + + /** + * Gets issn. + * + * @return the issn + */ + public String getIssn() { + return issn; + } + + /** + * Sets issn. + * + * @param issn the issn + */ + public void setIssn(String issn) { + this.issn = issn; + } + + /** + * Gets volume. + * + * @return the volume + */ + public String getVolume() { + return volume; + } + + /** + * Sets volume. + * + * @param volume the volume + */ + public void setVolume(String volume) { + this.volume = volume; + } + + /** + * Gets issue. + * + * @return the issue + */ + public String getIssue() { + return issue; + } + + /** + * Sets issue. + * + * @param issue the issue + */ + public void setIssue(String issue) { + this.issue = issue; + } + + /** + * Gets date. + * + * @return the date + */ + public String getDate() { + return date; + } + + /** + * Sets date. + * + * @param date the date + */ + public void setDate(String date) { + this.date = date; + } + + /** + * Gets title. + * + * @return the title + */ + public String getTitle() { + return title; + } + + /** + * Sets title. + * + * @param title the title + */ + public void setTitle(String title) { + this.title = title; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMSubject.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMSubject.java new file mode 100644 index 000000000..e3829bb7b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMSubject.java @@ -0,0 +1,84 @@ + +package eu.dnetlib.dhp.sx.bio.pubmed; + +/** + * The type Pubmed subject. + */ +public class PMSubject { + private String value; + private String meshId; + private String registryNumber; + + /** + * Instantiates a new Pm subject. + */ + public PMSubject() { + } + + /** + * Instantiates a new Pm subject. 
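Pausing the model listing for a moment: to show how the new PubMed beans compose, here is a hedged assembly sketch, as a parser might produce it. All literals are invented, except "Humans"/D006801, which is a real MeSH heading:

    PMArticle article = new PMArticle();
    article.setPmid("31000000"); // invented PMID
    article.setDoi("10.1000/example-doi"); // invented DOI
    article.setTitle("An example article title.");
    article.setLanguage("eng");
    article.setDate("2020-01-15");

    PMAuthor author = new PMAuthor();
    author.setLastName("Doe");
    author.setForeName("Jane");
    article.getAuthors().add(author);

    PMJournal journal = new PMJournal();
    journal.setIssn("1234-5678"); // invented ISSN
    journal.setTitle("Journal of Examples");
    journal.setVolume("12");
    journal.setIssue("3");
    article.setJournal(journal);

    article.getSubjects().add(new PMSubject("Humans", "D006801", ""));
    article.getGrants().add(new PMGrant("R01-EXAMPLE", "Example Agency", "United States"));

Note the design choice visible above: subjects, publicationTypes and grants are final lists exposed only through getters, so callers append to them in place, while authors and journal also have setters.
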
+ * + * @param value the value + * @param meshId the mesh id + * @param registryNumber the registry number + */ + public PMSubject(String value, String meshId, String registryNumber) { + this.value = value; + this.meshId = meshId; + this.registryNumber = registryNumber; + } + + /** + * Gets value. + * + * @return the value + */ + public String getValue() { + return value; + } + + /** + * Sets value. + * + * @param value the value + */ + public void setValue(String value) { + this.value = value; + } + + /** + * Gets mesh id. + * + * @return the mesh id + */ + public String getMeshId() { + return meshId; + } + + /** + * Sets mesh id. + * + * @param meshId the mesh id + */ + public void setMeshId(String meshId) { + this.meshId = meshId; + } + + /** + * Gets registry number. + * + * @return the registry number + */ + public String getRegistryNumber() { + return registryNumber; + } + + /** + * Sets registry number. + * + * @param registryNumber the registry number + */ + public void setRegistryNumber(String registryNumber) { + this.registryNumber = registryNumber; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java new file mode 100644 index 000000000..45bd844e2 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/DnetTransformationException.java @@ -0,0 +1,29 @@ + +package eu.dnetlib.dhp.transformation; + +public class DnetTransformationException extends Exception { + + public DnetTransformationException() { + super(); + } + + public DnetTransformationException( + final String message, + final Throwable cause, + final boolean enableSuppression, + final boolean writableStackTrace) { + super(message, cause, enableSuppression, writableStackTrace); + } + + public DnetTransformationException(final String message, final Throwable cause) { + super(message, cause); + } + + public DnetTransformationException(final String message) { + super(message); + } + + public DnetTransformationException(final Throwable cause) { + super(cause); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java deleted file mode 100644 index f4bf78e18..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformFunction.java +++ /dev/null @@ -1,74 +0,0 @@ - -package eu.dnetlib.dhp.transformation; - -import java.io.ByteArrayInputStream; -import java.io.StringWriter; -import java.util.Map; - -import javax.xml.transform.stream.StreamSource; - -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.util.LongAccumulator; - -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.transformation.functions.Cleaner; -import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; -import net.sf.saxon.s9api.*; - -public class TransformFunction implements MapFunction { - - private final LongAccumulator totalItems; - private final LongAccumulator errorItems; - private final LongAccumulator transformedItems; - private final String transformationRule; - private final Cleaner cleanFunction; - - private final long dateOfTransformation; - - public TransformFunction( - LongAccumulator totalItems, - LongAccumulator errorItems, - LongAccumulator transformedItems, - final String 
transformationRule, - long dateOfTransformation, - final Map vocabularies) - throws Exception { - this.totalItems = totalItems; - this.errorItems = errorItems; - this.transformedItems = transformedItems; - this.transformationRule = transformationRule; - this.dateOfTransformation = dateOfTransformation; - cleanFunction = new Cleaner(vocabularies); - } - - @Override - public MetadataRecord call(MetadataRecord value) { - totalItems.add(1); - try { - Processor processor = new Processor(false); - processor.registerExtensionFunction(cleanFunction); - final XsltCompiler comp = processor.newXsltCompiler(); - XsltExecutable xslt = comp - .compile(new StreamSource(new ByteArrayInputStream(transformationRule.getBytes()))); - XdmNode source = processor - .newDocumentBuilder() - .build(new StreamSource(new ByteArrayInputStream(value.getBody().getBytes()))); - XsltTransformer trans = xslt.load(); - trans.setInitialContextNode(source); - final StringWriter output = new StringWriter(); - Serializer out = processor.newSerializer(output); - out.setOutputProperty(Serializer.Property.METHOD, "xml"); - out.setOutputProperty(Serializer.Property.INDENT, "yes"); - trans.setDestination(out); - trans.transform(); - final String xml = output.toString(); - value.setBody(xml); - value.setDateOfTransformation(dateOfTransformation); - transformedItems.add(1); - return value; - } catch (Throwable e) { - errorItems.add(1); - return null; - } - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java index 8737d36ef..ed867c7f2 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformSparkJobNode.java @@ -1,43 +1,44 @@ package eu.dnetlib.dhp.transformation; +import static eu.dnetlib.dhp.common.Constants.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.utils.DHPUtils.*; -import java.io.ByteArrayInputStream; -import java.util.HashMap; +import java.io.IOException; import java.util.Map; import java.util.Objects; import java.util.Optional; -import org.apache.commons.cli.*; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.apache.spark.util.LongAccumulator; -import org.dom4j.Document; -import org.dom4j.DocumentException; -import org.dom4j.Node; -import org.dom4j.io.SAXReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import eu.dnetlib.dhp.aggregation.common.AggregationCounter; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob; -import eu.dnetlib.dhp.model.mdstore.MetadataRecord; -import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; -import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper; -import eu.dnetlib.dhp.utils.DHPUtils; -import eu.dnetlib.message.Message; -import eu.dnetlib.message.MessageManager; -import eu.dnetlib.message.MessageType; +import eu.dnetlib.dhp.common.aggregation.AggregatorReport; +import 
eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.message.MessageSender; +import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion; +import eu.dnetlib.dhp.schema.mdstore.MetadataRecord; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; public class TransformSparkJobNode { private static final Logger log = LoggerFactory.getLogger(TransformSparkJobNode.class); + private static final int RECORDS_PER_TASK = 200; + public static void main(String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -55,67 +56,108 @@ public class TransformSparkJobNode { .orElse(Boolean.TRUE); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String inputPath = parser.get("input"); - final String outputPath = parser.get("output"); - final String workflowId = parser.get("workflowId"); - final String trasformationRule = extractXSLTFromTR( - Objects.requireNonNull(DHPUtils.decompressString(parser.get("transformationRule")))); + final String mdstoreInputVersion = parser.get("mdstoreInputVersion"); + final String mdstoreOutputVersion = parser.get("mdstoreOutputVersion"); - final String rabbitUser = parser.get("rabbitUser"); - final String rabbitPassword = parser.get("rabbitPassword"); - final String rabbitHost = parser.get("rabbitHost"); - final String rabbitReportQueue = parser.get("rabbitReportQueue"); - final long dateOfCollection = new Long(parser.get("dateOfCollection")); - final boolean test = parser.get("isTest") == null ? false : Boolean.valueOf(parser.get("isTest")); + final MDStoreVersion nativeMdStoreVersion = MAPPER.readValue(mdstoreInputVersion, MDStoreVersion.class); + final String inputPath = nativeMdStoreVersion.getHdfsPath() + MDSTORE_DATA_PATH; + log.info("inputPath: {}", inputPath); + + final MDStoreVersion cleanedMdStoreVersion = MAPPER.readValue(mdstoreOutputVersion, MDStoreVersion.class); + final String outputBasePath = cleanedMdStoreVersion.getHdfsPath(); + log.info("outputBasePath: {}", outputBasePath); + + final String isLookupUrl = parser.get("isLookupUrl"); + log.info("isLookupUrl: {}", isLookupUrl); + + final String dateOfTransformation = parser.get("dateOfTransformation"); + log.info("dateOfTransformation: {}", dateOfTransformation); + + final Integer rpt = Optional + .ofNullable(parser.get("recordsPerTask")) + .map(Integer::valueOf) + .orElse(RECORDS_PER_TASK); + + final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl); + + final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService); + + log.info("Retrieved {} vocabularies", vocabularies.vocabularyNames().size()); SparkConf conf = new SparkConf(); runWithSparkSession( conf, isSparkSessionManaged, spark -> { - final Encoder encoder = Encoders.bean(MetadataRecord.class); - final Dataset mdstoreInput = spark.read().format("parquet").load(inputPath).as(encoder); - final LongAccumulator totalItems = spark.sparkContext().longAccumulator("TotalItems"); - final LongAccumulator errorItems = spark.sparkContext().longAccumulator("errorItems"); - final LongAccumulator transformedItems = spark.sparkContext().longAccumulator("transformedItems"); - final Map vocabularies = new HashMap<>(); - vocabularies.put("dnet:languages", VocabularyHelper.getVocabularyFromAPI("dnet:languages")); - final TransformFunction transformFunction = new TransformFunction( - totalItems, - errorItems, - transformedItems, - trasformationRule, - dateOfCollection, - 
vocabularies); - mdstoreInput.map(transformFunction, encoder).write().format("parquet").save(outputPath); - if (rabbitHost != null) { - System.out.println("SEND FINAL REPORT"); - final Map reportMap = new HashMap<>(); - reportMap.put("inputItem", "" + totalItems.value()); - reportMap.put("invalidRecords", "" + errorItems.value()); - reportMap.put("mdStoreSize", "" + transformedItems.value()); - System.out.println(new Message(workflowId, "Transform", MessageType.REPORT, reportMap)); - if (!test) { - final MessageManager manager = new MessageManager(rabbitHost, rabbitUser, rabbitPassword, false, - false, - null); - manager - .sendMessage( - new Message(workflowId, "Transform", MessageType.REPORT, reportMap), - rabbitReportQueue, - true, - false); - manager.close(); - } - } + transformRecords( + parser.getObjectMap(), isLookupService, spark, inputPath, outputBasePath, rpt); }); - } - private static String extractXSLTFromTR(final String tr) throws DocumentException { - SAXReader reader = new SAXReader(); - Document document = reader.read(new ByteArrayInputStream(tr.getBytes())); - Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']"); - return node.asXML(); + public static void transformRecords(final Map args, final ISLookUpService isLookUpService, + final SparkSession spark, final String inputPath, final String outputBasePath, final Integer rpt) + throws DnetTransformationException, IOException { + + final LongAccumulator totalItems = spark.sparkContext().longAccumulator(CONTENT_TOTALITEMS); + final LongAccumulator errorItems = spark.sparkContext().longAccumulator(CONTENT_INVALIDRECORDS); + final LongAccumulator transformedItems = spark.sparkContext().longAccumulator(CONTENT_TRANSFORMEDRECORDS); + final AggregationCounter ct = new AggregationCounter(totalItems, errorItems, transformedItems); + final Encoder encoder = Encoders.bean(MetadataRecord.class); + + final String dnetMessageManagerURL = args.get(DNET_MESSAGE_MGR_URL); + log.info("dnetMessageManagerURL is {}", dnetMessageManagerURL); + + final String workflowId = args.get("workflowId"); + log.info("workflowId is {}", workflowId); + + MapFunction x = TransformationFactory + .getTransformationPlugin(args, ct, isLookUpService); + + final Dataset inputMDStore = spark + .read() + .format("parquet") + .load(inputPath) + .as(encoder); + + final long totalInput = inputMDStore.count(); + + final MessageSender messageSender = new MessageSender(dnetMessageManagerURL, workflowId); + try (AggregatorReport report = new AggregatorReport(messageSender)) { + try { + JavaRDD mdstore = inputMDStore + .javaRDD() + .repartition(getRepartitionNumber(totalInput, rpt)) + .map((Function) x::call) + .filter((Function) Objects::nonNull); + saveDataset(spark.createDataset(mdstore.rdd(), encoder), outputBasePath + MDSTORE_DATA_PATH); + + log.info("Transformed item {}", ct.getProcessedItems().count()); + log.info("Total item {}", ct.getTotalItems().count()); + log.info("Transformation Error item {}", ct.getErrorItems().count()); + + final long mdStoreSize = spark.read().load(outputBasePath + MDSTORE_DATA_PATH).count(); + writeHdfsFile( + spark.sparkContext().hadoopConfiguration(), + "" + mdStoreSize, outputBasePath + MDSTORE_SIZE_PATH); + } catch (Throwable e) { + log.error("error during record transformation", e); + report.put(TransformSparkJobNode.class.getSimpleName(), e.getMessage()); + report.put(CONTENT_TOTALITEMS, ct.getTotalItems().value().toString()); + report.put(CONTENT_INVALIDRECORDS, ct.getErrorItems().value().toString()); + 
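transformRecords relies on a simple convention: XSLTTransformationFunction.call returns null for records that fail transformation, the accumulators record the failure, and the filter(Objects::nonNull) step drops the nulls before the dataset is saved. A self-contained sketch of the same pattern, with plain streams and AtomicLong standing in for the RDD and the Spark LongAccumulators:

import java.util.List;
import java.util.Objects;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class NullFilterSketch {

	public static void main(String[] args) {
		final AtomicLong total = new AtomicLong();
		final AtomicLong errors = new AtomicLong();

		final List<String> out = Stream.of("<r1/>", "not-xml", "<r2/>")
			.map(body -> { // mirrors XSLTTransformationFunction.call
				total.incrementAndGet();
				try {
					if (!body.startsWith("<"))
						throw new IllegalArgumentException(body);
					return body.toUpperCase(); // stand-in for the XSLT transformation
				} catch (Exception e) {
					errors.incrementAndGet();
					return null; // failed records become null...
				}
			})
			.filter(Objects::nonNull) // ...and are dropped here, as in the RDD pipeline
			.collect(Collectors.toList());

		System.out.println(out + " total=" + total + " errors=" + errors); // [<R1/>, <R2/>] total=3 errors=1
	}
}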
report.put(CONTENT_TRANSFORMEDRECORDS, ct.getProcessedItems().value().toString()); + throw e; + } + } } + + /** + * Calculates the number of partitions, allocating at most {@code rpt} records to a single transformation task. + * @param totalInput the total number of input records + * @param rpt the maximum number of records assigned to a single task + * @return the number of partitions, never less than 1 + */ + private static int getRepartitionNumber(long totalInput, Integer rpt) { + return Math.max(1, (int) (totalInput / rpt)); + } + } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java new file mode 100644 index 000000000..e93f3b518 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/TransformationFactory.java @@ -0,0 +1,74 @@ + +package eu.dnetlib.dhp.transformation; + +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.api.java.function.MapFunction; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.aggregation.common.AggregationCounter; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.mdstore.MetadataRecord; +import eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; + +public class TransformationFactory { + + private static final Logger log = LoggerFactory.getLogger(TransformationFactory.class); + public static final String TRULE_XQUERY = "for $x in collection('/db/DRIVER/TransformationRuleDSResources/TransformationRuleDSResourceType') " + + + "where $x//RESOURCE_IDENTIFIER/@value = \"%s\" return $x//CODE/*[local-name() =\"stylesheet\"]"; + + private TransformationFactory() { + } + + public static MapFunction getTransformationPlugin( + final Map jobArgument, final AggregationCounter counters, final ISLookUpService isLookupService) + throws DnetTransformationException { + + try { + final String transformationPlugin = jobArgument.get("transformationPlugin"); + + log.info("Transformation plugin required {}", transformationPlugin); + switch (transformationPlugin) { + case "XSLT_TRANSFORM": { + final String transformationRuleId = jobArgument.get("transformationRuleId"); + if (StringUtils.isBlank(transformationRuleId)) + throw new DnetTransformationException("Missing parameter: transformationRuleId"); + final VocabularyGroup vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService); + + final String transformationRule = queryTransformationRuleFromIS( + transformationRuleId, isLookupService); + + final long dateOfTransformation = Long.parseLong(jobArgument.get("dateOfTransformation")); + return new XSLTTransformationFunction(counters, transformationRule, dateOfTransformation, + vocabularies); + + } + default: + throw new DnetTransformationException( + "transformation plugin does not exist for " + transformationPlugin); + } + + } catch (Throwable e) { + throw new DnetTransformationException(e); + } + } + + private static String queryTransformationRuleFromIS(final String transformationRuleId, + final ISLookUpService isLookUpService) throws DnetTransformationException, ISLookUpException { + final String query = String.format(TRULE_XQUERY, transformationRuleId); + log.info("querying the IS with: {}", query); + List result = isLookUpService.quickSearchProfile(query); + + if (result == null || result.isEmpty()) + throw new DnetTransformationException( + "Unable to 
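The repartition count used above is plain integer division with a floor of one; a few worked values for getRepartitionNumber with the default RECORDS_PER_TASK of 200:

public class RepartitionSketch {

	static int getRepartitionNumber(long totalInput, int rpt) {
		return Math.max(1, (int) (totalInput / rpt));
	}

	public static void main(String[] args) {
		System.out.println(getRepartitionNumber(1_000_000, 200)); // 5000 partitions
		System.out.println(getRepartitionNumber(500, 200)); // 2 (integer division truncates 2.5)
		System.out.println(getRepartitionNumber(100, 200)); // 1 (the Math.max floor avoids zero partitions)
	}
}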
find transformation rule with name: " + transformationRuleId); + return result.get(0); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java deleted file mode 100644 index 7f9b6646c..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/functions/Cleaner.java +++ /dev/null @@ -1,52 +0,0 @@ - -package eu.dnetlib.dhp.transformation.functions; - -import java.util.Map; -import java.util.Optional; - -import eu.dnetlib.dhp.transformation.vocabulary.Term; -import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary; -import net.sf.saxon.s9api.*; -import scala.Serializable; - -public class Cleaner implements ExtensionFunction, Serializable { - - private final Map vocabularies; - - public Cleaner(Map vocabularies) { - this.vocabularies = vocabularies; - } - - @Override - public QName getName() { - return new QName("http://eu/dnetlib/trasform/extension", "clean"); - } - - @Override - public SequenceType getResultType() { - return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE_OR_MORE); - } - - @Override - public SequenceType[] getArgumentTypes() { - return new SequenceType[] { - SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE), - SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE) - }; - } - - @Override - public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { - final String currentValue = xdmValues[0].itemAt(0).getStringValue(); - final String vocabularyName = xdmValues[1].itemAt(0).getStringValue(); - Optional cleanedValue = vocabularies - .get(vocabularyName) - .getTerms() - .stream() - .filter(it -> it.getNativeName().equalsIgnoreCase(currentValue)) - .findAny(); - - return new XdmAtomicValue( - cleanedValue.isPresent() ? 
cleanedValue.get().getCode() : currentValue); - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java deleted file mode 100644 index b5ac18169..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Term.java +++ /dev/null @@ -1,53 +0,0 @@ - -package eu.dnetlib.dhp.transformation.vocabulary; - -import java.io.Serializable; - -public class Term implements Serializable { - - private String englishName; - private String nativeName; - private String encoding; - private String code; - private String synonyms; - - public String getEnglishName() { - return englishName; - } - - public void setEnglishName(String englishName) { - this.englishName = englishName; - } - - public String getNativeName() { - return nativeName; - } - - public void setNativeName(String nativeName) { - this.nativeName = nativeName; - } - - public String getEncoding() { - return encoding; - } - - public void setEncoding(String encoding) { - this.encoding = encoding; - } - - public String getCode() { - return code; - } - - public void setCode(String code) { - this.code = code; - } - - public String getSynonyms() { - return synonyms; - } - - public void setSynonyms(String synonyms) { - this.synonyms = synonyms; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java deleted file mode 100644 index a9da6b725..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/Vocabulary.java +++ /dev/null @@ -1,54 +0,0 @@ - -package eu.dnetlib.dhp.transformation.vocabulary; - -import java.io.Serializable; -import java.util.List; - -public class Vocabulary implements Serializable { - - private String id; - private String name; - private String description; - private String code; - private List terms; - - public String getId() { - return id; - } - - public void setId(String id) { - this.id = id; - } - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public String getDescription() { - return description; - } - - public void setDescription(String description) { - this.description = description; - } - - public String getCode() { - return code; - } - - public void setCode(String code) { - this.code = code; - } - - public List getTerms() { - return terms; - } - - public void setTerms(List terms) { - this.terms = terms; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java deleted file mode 100644 index 10e959be0..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/vocabulary/VocabularyHelper.java +++ /dev/null @@ -1,24 +0,0 @@ - -package eu.dnetlib.dhp.transformation.vocabulary; - -import java.io.Serializable; -import java.net.URL; -import java.nio.charset.Charset; - -import org.apache.commons.io.IOUtils; - -import com.fasterxml.jackson.databind.ObjectMapper; - -public class VocabularyHelper implements Serializable { - - private static final String OPENAIRE_URL = "http://api.openaire.eu/vocabularies/%s.json"; - - public static Vocabulary 
getVocabularyFromAPI(final String vocabularyName) throws Exception { - final URL url = new URL(String.format(OPENAIRE_URL, vocabularyName)); - - final String response = IOUtils.toString(url, Charset.defaultCharset()); - final ObjectMapper jsonMapper = new ObjectMapper(); - final Vocabulary vocabulary = jsonMapper.readValue(response, Vocabulary.class); - return vocabulary; - } -} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java new file mode 100644 index 000000000..664215c0e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/Cleaner.java @@ -0,0 +1,51 @@ + +package eu.dnetlib.dhp.transformation.xslt; + +import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI; + +import java.io.Serializable; + +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import net.sf.saxon.s9api.*; + +public class Cleaner implements ExtensionFunction, Serializable { + + private final VocabularyGroup vocabularies; + + public Cleaner(final VocabularyGroup vocabularies) { + this.vocabularies = vocabularies; + } + + @Override + public QName getName() { + return new QName(QNAME_BASE_URI + "/clean", "clean"); + } + + @Override + public SequenceType getResultType() { + return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE_OR_MORE); + } + + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { + SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_MORE), + SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ONE) + }; + } + + @Override + public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { + XdmValue r = xdmValues[0]; + if (r.size() == 0) { + return new XdmAtomicValue(""); + } + final String currentValue = xdmValues[0].itemAt(0).getStringValue(); + final String vocabularyName = xdmValues[1].itemAt(0).getStringValue(); + Qualifier cleanedValue = vocabularies.getSynonymAsQualifier(vocabularyName, currentValue); + + return new XdmAtomicValue( + cleanedValue != null ? 
cleanedValue.getClassid() : currentValue); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java new file mode 100644 index 000000000..3d57b966f --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/DateCleaner.java @@ -0,0 +1,44 @@ + +package eu.dnetlib.dhp.transformation.xslt; + +import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI; + +import java.io.Serializable; + +import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions; +import net.sf.saxon.s9api.*; + +public class DateCleaner implements ExtensionFunction, Serializable { + + @Override + public QName getName() { + return new QName(QNAME_BASE_URI + "/dateISO", "dateISO"); + } + + @Override + public SequenceType getResultType() { + return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE); + } + + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { + SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE) + }; + } + + @Override + public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { + XdmValue r = xdmValues[0]; + if (r.size() == 0) { + return new XdmAtomicValue(""); + } + final String currentValue = xdmValues[0].itemAt(0).getStringValue(); + return new XdmAtomicValue(clean(currentValue)); + } + + // for backward compatibility with the existing unit tests + public String clean(String date) { + return GraphCleaningFunctions.cleanDate(date); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java new file mode 100644 index 000000000..1aa549c09 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/PersonCleaner.java @@ -0,0 +1,172 @@ + +package eu.dnetlib.dhp.transformation.xslt; + +import static eu.dnetlib.dhp.transformation.xslt.XSLTTransformationFunction.QNAME_BASE_URI; + +import java.io.Serializable; + +import java.nio.charset.StandardCharsets; + +import java.text.Normalizer; +import java.util.List; +import java.util.Set; + +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.hash.Hashing; + +import eu.dnetlib.dhp.transformation.xslt.utils.Capitalize; +import eu.dnetlib.dhp.transformation.xslt.utils.DotAbbreviations; +import net.sf.saxon.s9api.*; + +public class PersonCleaner implements ExtensionFunction, Serializable { + + private static final long serialVersionUID = 1L; + private List firstname = Lists.newArrayList(); + private List surname = Lists.newArrayList(); + private List fullname = Lists.newArrayList(); + + private static final Set particles = null; + + private String normalize(String s) { + s = Normalizer.normalize(s, Normalizer.Form.NFD); // canonical decomposition + s = s.replaceAll("\\(.+\\)", ""); + s = s.replaceAll("\\[.+\\]", ""); + s = s.replaceAll("\\{.+\\}", ""); + s = s.replaceAll("\\s+-\\s+", "-"); + s = s.replaceAll("[\\p{Punct}&&[^-,]]", " "); + s = s.replaceAll("\\d", " "); // regex replacements: digits, newlines and dots become spaces + s = s.replaceAll("\\n", " "); + s = s.replaceAll("\\.", " "); + s = s.replaceAll("\\s+", " "); + + if (s.contains(",")) { + String[] arr = s.split(","); + if (arr.length == 1) { + + 
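Since DateCleaner.clean delegates to GraphCleaningFunctions.cleanDate, the dateISO extension can be exercised without a Saxon pipeline. A sketch; the second input/output pair is an assumption about cleanDate (it is expected to normalise parseable dates to ISO-8601 yyyy-MM-dd), not a documented contract:

import eu.dnetlib.dhp.transformation.xslt.DateCleaner;

public class DateCleanerSketch {

	public static void main(String[] args) {
		final DateCleaner dc = new DateCleaner();
		System.out.println(dc.clean("2021-05-02")); // already ISO: expected unchanged
		System.out.println(dc.clean("02/05/2021")); // assumed to be normalised to 2021-05-02
	}
}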
fullname = splitTerms(arr[0]); + } else if (arr.length > 1) { + surname = splitTerms(arr[0]); + firstname = splitTermsFirstName(arr[1]); + fullname.addAll(surname); + fullname.addAll(firstname); + } + } else { + fullname = splitTerms(s); + + int lastInitialPosition = fullname.size(); + boolean hasSurnameInUpperCase = false; + + for (int i = 0; i < fullname.size(); i++) { + String term = fullname.get(i); + if (term.length() == 1) { + lastInitialPosition = i; + } else if (term.equals(term.toUpperCase())) { + hasSurnameInUpperCase = true; + } + } + if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini + firstname = fullname.subList(0, lastInitialPosition + 1); + surname = fullname.subList(lastInitialPosition + 1, fullname.size()); + } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI + for (String term : fullname) { + if (term.length() > 1 && term.equals(term.toUpperCase())) { + surname.add(term); + } else { + firstname.add(term); + } + } + } else if (lastInitialPosition == fullname.size()) { + surname = fullname.subList(lastInitialPosition - 1, fullname.size()); + firstname = fullname.subList(0, lastInitialPosition - 1); + } + + } + return getNormalisedFullname(); // hand the normalised form back to the Saxon wrapper + } + + private List splitTermsFirstName(String s) { + List list = Lists.newArrayList(); + for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) { + if (part.trim().matches("\\p{Lu}{2,3}")) { + String[] parts = part.trim().split("(?=\\p{Lu})"); // split runs of initials on (Unicode) upper-case boundaries + for (String p : parts) { + if (p.length() > 0) + list.add(p); + } + } else { + list.add(part); + } + + } + return list; + } + + private List splitTerms(String s) { + List list = Lists.newArrayList(); + for (String part : Splitter.on(" ").omitEmptyStrings().split(s)) { + list.add(part); + } + return list; + } + + public List getFirstname() { + return firstname; + } + + public List getSurname() { + return surname; + } + + public List getFullname() { + return fullname; + } + + public String hash() { + return Hashing.murmur3_128().hashString(getNormalisedFullname(), StandardCharsets.UTF_8).toString(); + } + + public String getNormalisedFullname() { + return isAccurate() ? 
Joiner.on(" ").join(getSurname()) + ", " + Joiner.on(" ").join(getNameWithAbbreviations()) + : Joiner.on(" ").join(fullname); + } + + public List getCapitalSurname() { + return Lists.newArrayList(Iterables.transform(surname, new Capitalize())); + } + + public List getNameWithAbbreviations() { + return Lists.newArrayList(Iterables.transform(firstname, new DotAbbreviations())); + } + + public boolean isAccurate() { + return (firstname != null && surname != null && !firstname.isEmpty() && !surname.isEmpty()); + } + + @Override + public QName getName() { + return new QName(QNAME_BASE_URI + "/person", "normalize"); + } + + @Override + public SequenceType getResultType() { + return SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE); + } + + @Override + public SequenceType[] getArgumentTypes() { + return new SequenceType[] { + SequenceType.makeSequenceType(ItemType.STRING, OccurrenceIndicator.ZERO_OR_ONE) + }; + } + + @Override + public XdmValue call(XdmValue[] xdmValues) throws SaxonApiException { + XdmValue r = xdmValues[0]; + if (r.size() == 0) { + return new XdmAtomicValue(""); + } + final String currentValue = xdmValues[0].itemAt(0).getStringValue(); + return new XdmAtomicValue(normalize(currentValue)); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java new file mode 100644 index 000000000..54192a7be --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/XSLTTransformationFunction.java @@ -0,0 +1,108 @@ + +package eu.dnetlib.dhp.transformation.xslt; + +import java.io.Serializable; +import java.io.StringWriter; +import java.nio.charset.StandardCharsets; + +import javax.xml.transform.stream.StreamSource; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.api.java.function.MapFunction; + +import eu.dnetlib.dhp.aggregation.common.AggregationCounter; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.mdstore.MetadataRecord; +import net.sf.saxon.s9api.*; + +public class XSLTTransformationFunction implements MapFunction, Serializable { + + public static final String QNAME_BASE_URI = "http://eu/dnetlib/transform"; + + private static final String DATASOURCE_ID_PARAM = "varDataSourceId"; + + private static final String DATASOURCE_NAME_PARAM = "varOfficialName"; + + private final AggregationCounter aggregationCounter; + + private final String transformationRule; + + private final Cleaner cleanFunction; + + private final long dateOfTransformation; + + private final VocabularyGroup vocabularies; + + public XSLTTransformationFunction( + final AggregationCounter aggregationCounter, + final String transformationRule, + long dateOfTransformation, + final VocabularyGroup vocabularies) { + this.aggregationCounter = aggregationCounter; + this.transformationRule = transformationRule; + this.vocabularies = vocabularies; + this.dateOfTransformation = dateOfTransformation; + cleanFunction = new Cleaner(vocabularies); + } + + @Override + public MetadataRecord call(MetadataRecord value) { + aggregationCounter.getTotalItems().add(1); + try { + Processor processor = new Processor(false); + + processor.registerExtensionFunction(cleanFunction); + processor.registerExtensionFunction(new DateCleaner()); + processor.registerExtensionFunction(new PersonCleaner()); + + final XsltCompiler comp = 
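The three branches of PersonCleaner.normalize cover the name shapes mentioned in its inline comments ("Surname, Firstname", "Michele G. Artini", "Michele ARTINI"). A sketch driving it through the Saxon entry point; the expected value is derived by hand from the code above (normalize returns the normalised full name), not from a test suite:

import net.sf.saxon.s9api.SaxonApiException;
import net.sf.saxon.s9api.XdmAtomicValue;
import net.sf.saxon.s9api.XdmValue;

import eu.dnetlib.dhp.transformation.xslt.PersonCleaner;

public class PersonCleanerSketch {

	public static void main(String[] args) throws SaxonApiException {
		final PersonCleaner pc = new PersonCleaner();
		final XdmValue out = pc.call(new XdmValue[] { new XdmAtomicValue("Michele G. Artini") });
		// initials precede the surname, so the expected normalisation is "Artini, Michele G."
		System.out.println(out.itemAt(0).getStringValue());
	}
}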
processor.newXsltCompiler(); + QName datasourceIDParam = new QName(DATASOURCE_ID_PARAM); + comp.setParameter(datasourceIDParam, new XdmAtomicValue(value.getProvenance().getDatasourceId())); + QName datasourceNameParam = new QName(DATASOURCE_NAME_PARAM); + comp.setParameter(datasourceNameParam, new XdmAtomicValue(value.getProvenance().getDatasourceName())); + XsltExecutable xslt = comp + .compile(new StreamSource(IOUtils.toInputStream(transformationRule, StandardCharsets.UTF_8))); + XdmNode source = processor + .newDocumentBuilder() + .build(new StreamSource(IOUtils.toInputStream(value.getBody(), StandardCharsets.UTF_8))); + XsltTransformer trans = xslt.load(); + trans.setInitialContextNode(source); + final StringWriter output = new StringWriter(); + Serializer out = processor.newSerializer(output); + out.setOutputProperty(Serializer.Property.METHOD, "xml"); + out.setOutputProperty(Serializer.Property.INDENT, "yes"); + + trans.setDestination(out); + trans.transform(); + final String xml = output.toString(); + value.setBody(xml); + value.setDateOfTransformation(dateOfTransformation); + aggregationCounter.getProcessedItems().add(1); + return value; + } catch (Throwable e) { + aggregationCounter.getErrorItems().add(1); + return null; +// throw new RuntimeException(e); + } + } + + public AggregationCounter getAggregationCounter() { + return aggregationCounter; + } + + public String getTransformationRule() { + return transformationRule; + } + + public Cleaner getCleanFunction() { + return cleanFunction; + } + + public long getDateOfTransformation() { + return dateOfTransformation; + } + + public VocabularyGroup getVocabularies() { + return vocabularies; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/Capitalize.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/Capitalize.java new file mode 100644 index 000000000..7143df2ec --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/Capitalize.java @@ -0,0 +1,12 @@ + +package eu.dnetlib.dhp.transformation.xslt.utils; + +import com.google.common.base.Function; + +public class Capitalize implements Function { + + @Override + public String apply(String s) { + return org.apache.commons.lang3.text.WordUtils.capitalize(s.toLowerCase()); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java new file mode 100644 index 000000000..01174bf04 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/transformation/xslt/utils/DotAbbreviations.java @@ -0,0 +1,12 @@ + +package eu.dnetlib.dhp.transformation.xslt.utils; + +import com.google.common.base.Function; + +public class DotAbbreviations implements Function { + + @Override + public String apply(String s) { + return s.length() == 1 ? s + "." 
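XSLTTransformationFunction wires the extension functions into a Saxon Processor and runs each record body through the compiled rule. The same wiring in miniature, with a throwaway stylesheet that calls the dateISO extension; stylesheet and record are illustrative (in the job the stylesheet is the transformation rule fetched from the IS), and the printed value assumes cleanDate keeps an already-ISO date unchanged:

import java.io.StringReader;
import java.io.StringWriter;

import javax.xml.transform.stream.StreamSource;

import eu.dnetlib.dhp.transformation.xslt.DateCleaner;
import net.sf.saxon.s9api.*;

public class SaxonWiringSketch {

	public static void main(String[] args) throws SaxonApiException {
		final Processor processor = new Processor(false);
		processor.registerExtensionFunction(new DateCleaner()); // as in XSLTTransformationFunction.call

		final String xslt = "<xsl:stylesheet version='2.0'"
			+ " xmlns:xsl='http://www.w3.org/1999/XSL/Transform'"
			+ " xmlns:date='http://eu/dnetlib/transform/dateISO'>"
			+ "<xsl:template match='/'>"
			+ "<dateCleaned><xsl:value-of select='date:dateISO(string(//date))'/></dateCleaned>"
			+ "</xsl:template></xsl:stylesheet>";

		final XsltExecutable exec = processor.newXsltCompiler().compile(new StreamSource(new StringReader(xslt)));
		final XdmNode source = processor.newDocumentBuilder()
			.build(new StreamSource(new StringReader("<record><date>2021-05-02</date></record>")));

		final StringWriter output = new StringWriter();
		final XsltTransformer trans = exec.load();
		trans.setInitialContextNode(source);
		trans.setDestination(processor.newSerializer(output));
		trans.transform();
		System.out.println(output); // ...<dateCleaned ...>2021-05-02</dateCleaned>
	}
}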
: s; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json new file mode 100644 index 000000000..7663a454b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json @@ -0,0 +1,20 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "ip", + "paramLongName": "inputPath", + "paramDescription": "the URL from where to get the programme file", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "outputPath", + "paramDescription": "the path of the new ActionSet", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/config-default.xml new file mode 100644 index 000000000..d262cb6e0 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/config-default.xml @@ -0,0 +1,30 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 + + + hiveDbName + openaire + + + oozie.launcher.mapreduce.user.classpath.first + true + + diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml new file mode 100644 index 000000000..a6fba853d --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/oozie_app/workflow.xml @@ -0,0 +1,108 @@ + + + + + bipScorePath + the path where to find the bipFinder scores + + + outputPath + the path where to store the actionset + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + yarn + cluster + Produces the atomic action with the bip finder scores for publications + eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} 
+ --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${bipScorePath} + --outputPath${outputPath} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_input_parameters.json new file mode 100644 index 000000000..5a6a63774 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_input_parameters.json @@ -0,0 +1,27 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName":"out", + "paramLongName":"outputPath", + "paramDescription": "the output path", + "paramRequired": true + }, + + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + }, + { + "paramName": "d", + "paramLongName": "delimiter", + "paramDescription": "the delimiter if different from the default one (,)", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/config-default.xml new file mode 100644 index 000000000..d262cb6e0 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/config-default.xml @@ -0,0 +1,30 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 + + + hiveDbName + openaire + + + oozie.launcher.mapreduce.user.classpath.first + true + + diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/workflow.xml new file mode 100644 index 000000000..a80bf4fbd --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/oozie_app/workflow.xml @@ -0,0 +1,237 @@ + + + + + fosPath + the input path of the resources to be extended + + + + bipScorePath + the path where to find the bipFinder scores + + + outputPath + the path where to store the actionset + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + 
spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + yarn + cluster + Produces the unresolved from bip finder! + eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareBipFinder + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${bipScorePath} + --outputPath${workingDir}/prepared + + + + + + + + yarn + cluster + Gets Data from FOS csv file + eu.dnetlib.dhp.actionmanager.createunresolvedentities.GetFOSSparkJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${fosPath} + --outputPath${workingDir}/input/fos + + + + + + + + + yarn + cluster + Produces the unresolved from FOS! + eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareFOSSparkJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${workingDir}/input/fos + --outputPath${workingDir}/prepared + + + + + + + + + yarn + cluster + Gets Data from SDG csv file + eu.dnetlib.dhp.actionmanager.createunresolvedentities.GetSDGSparkJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${sdgPath} + --outputPath${workingDir}/input/sdg + + + + + + + + + yarn + cluster + Produces the unresolved from SDG! 
+ eu.dnetlib.dhp.actionmanager.createunresolvedentities.PrepareSDGSparkJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${workingDir}/input/sdg + --outputPath${workingDir}/prepared + + + + + + + + + + + + yarn + cluster + Saves the result produced for bip and fos by grouping results with the same id + eu.dnetlib.dhp.actionmanager.createunresolvedentities.SparkSaveUnresolved + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --sourcePath${workingDir}/prepared + --outputPath${outputPath} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json new file mode 100644 index 000000000..b7bad73e6 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json @@ -0,0 +1,20 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "sp", + "paramLongName": "sourcePath", + "paramDescription": "the URL from where to get the programme file", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "outputPath", + "paramDescription": "the path of the new ActionSet", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/produce_unresolved_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/produce_unresolved_parameters.json new file mode 100644 index 000000000..b7bad73e6 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/createunresolvedentities/produce_unresolved_parameters.json @@ -0,0 +1,20 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "sp", + "paramLongName": "sourcePath", + "paramDescription": "the URL from where to get the programme file", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "outputPath", + "paramDescription": "the path of the new ActionSet", + "paramRequired": true + } +] \ No newline at end of file diff --git 
a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json new file mode 100644 index 000000000..308e02026 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json @@ -0,0 +1,25 @@ +[ + { + "paramName": "ip", + "paramLongName": "inputPath", + "paramDescription": "the path of the COCI JSON files produced by ReadCOCI", + "paramRequired": true + }, + { + "paramName": "op", + "paramLongName": "outputPath", + "paramDescription": "the path of the new ActionSet", + "paramRequired": true + }, + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, { + "paramName": "sdr", + "paramLongName": "shouldDuplicateRels", + "paramDescription": "when true, the inverse of each citation relation is also produced", + "paramRequired": false +} +] diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json new file mode 100644 index 000000000..4910ad11d --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json @@ -0,0 +1,20 @@ +[ + { + "paramName": "if", + "paramLongName": "inputFile", + "paramDescription": "the zipped opencitations file", + "paramRequired": true + }, + { + "paramName": "wp", + "paramLongName": "workingPath", + "paramDescription": "the working path", + "paramRequired": true + }, + { + "paramName": "hnn", + "paramLongName": "hdfsNameNode", + "paramDescription": "the hdfs name node", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json new file mode 100644 index 000000000..b57cb5d9a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/input_readcoci_parameters.json @@ -0,0 +1,37 @@ +[ + { + "paramName": "wp", + "paramLongName": "workingPath", + "paramDescription": "the working path", + "paramRequired": true + }, + + + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "d", + "paramLongName": "delimiter", + "paramDescription": "the delimiter between fields, if different from the default one (,)", + "paramRequired": false + }, + { + "paramName": "op", + "paramLongName": "outputPath", + "paramDescription": "the path where to store the COCI JSON files", + "paramRequired": true + }, + { + "paramName": "if", + "paramLongName": "inputFile", + "paramDescription": "the input file(s) to process", + "paramRequired": true + } +] + + + diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/config-default.xml new file mode 100644 index 000000000..a1755f329 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/config-default.xml @@ -0,0 +1,58 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + 
true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + + + oozie.launcher.mapreduce.user.classpath.first + true + + + sparkExecutorNumber + 4 + + + spark2EventLogDir + /user/spark/spark2ApplicationHistory + + + sparkDriverMemory + 15G + + + sparkExecutorMemory + 6G + + + sparkExecutorCores + 1 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/download.sh b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/download.sh new file mode 100644 index 000000000..7a34f3c4e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/download.sh @@ -0,0 +1,2 @@ +#!/bin/bash +for file in $(echo $1 | tr ";" "\n"); do curl -L $(echo $file | cut -d '@' -f 1 ) | hdfs dfs -put - $2/$(echo $file | cut -d '@' -f 2) ; done; \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml new file mode 100644 index 000000000..0f01039f7 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/oozie_app/workflow.xml @@ -0,0 +1,118 @@ + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + + + ${wf:conf('resumeFrom') eq 'DownloadDump'} + ${wf:conf('resumeFrom') eq 'ExtractContent'} + ${wf:conf('resumeFrom') eq 'ReadContent'} + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + ${jobTracker} + ${nameNode} + + + mapred.job.queue.name + ${queueName} + + + download.sh + ${filelist} + ${workingPath}/Original + HADOOP_USER_NAME=${wf:user()} + download.sh + + + + + + + + eu.dnetlib.dhp.actionmanager.opencitations.GetOpenCitationsRefs + --hdfsNameNode${nameNode} + --inputFile${inputFile} + --workingPath${workingPath} + + + + + + + + yarn + cluster + Produces the AS for OC + eu.dnetlib.dhp.actionmanager.opencitations.ReadCOCI + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --workingPath${workingPath}/COCI + --outputPath${workingPath}/COCI_JSON/ + --delimiter${delimiter} + --inputFile${inputFileCoci} + + + + + + + + yarn + cluster + Produces the AS for OC + eu.dnetlib.dhp.actionmanager.opencitations.CreateActionSetSparkJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + 
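The download.sh helper above consumes ${filelist} as a semicolon-separated list of url@targetName pairs: each URL is streamed through curl -L into hdfs dfs -put under the directory passed as the second argument. The same parsing, sketched in Java with illustrative values:

public class FileListSketch {

	public static void main(String[] args) {
		// Illustrative value; the real list comes from the ${filelist} workflow parameter.
		final String filelist = "http://example.org/2020.zip@2020.zip;http://example.org/2021.zip@2021.zip";
		final String targetDir = "/data/opencitations/Original";
		for (String entry : filelist.split(";")) {
			final int at = entry.indexOf('@');
			final String url = entry.substring(0, at); // piped through curl -L by download.sh
			final String name = entry.substring(at + 1); // destination file name under $2
			System.out.println(url + " -> " + targetDir + "/" + name);
		}
	}
}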
--driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${workingPath}/COCI_JSON + --outputPath${outputPath} + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/opencitations_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/opencitations_parameters.json new file mode 100644 index 000000000..258d6816e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/opencitations/opencitations_parameters.json @@ -0,0 +1,8 @@ +[ + {"paramName":"n", "paramLongName":"hdfsServerUri", "paramDescription": "the server uri", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the default work path", "paramRequired": true}, + {"paramName":"f", "paramLongName":"opencitationFile", "paramDescription": "the name of the file", "paramRequired": true}, + {"paramName":"issm", "paramLongName":"isSparkSessionManaged", "paramDescription": "the name of the activities orcid file", "paramRequired": false}, + {"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the name of the activities orcid file", "paramRequired": true} + +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml index c710c8b55..bd864a6aa 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + projectFileURL @@ -18,6 +18,10 @@ outputPath path where to store the action set + + sheetName + the name of the sheet to read + @@ -31,19 +35,32 @@ - + + + + + + + + + + + + + + eu.dnetlib.dhp.actionmanager.project.utils.ReadCSV --hdfsNameNode${nameNode} --fileURL${projectFileURL} --hdfsPath${workingDir}/projects - --classForNameeu.dnetlib.dhp.actionmanager.project.utils.CSVProject + --classForNameeu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject - + @@ -53,9 +70,9 @@ --hdfsNameNode${nameNode} --fileURL${programmeFileURL} --hdfsPath${workingDir}/programme - --classForNameeu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme + --classForNameeu.dnetlib.dhp.actionmanager.project.utils.model.CSVProgramme - + @@ -65,9 +82,10 @@ --hdfsNameNode${nameNode} --fileURL${topicFileURL} --hdfsPath${workingDir}/topic - --classForNameeu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic + --sheetName${sheetName} + --classForNameeu.dnetlib.dhp.actionmanager.project.utils.model.EXCELTopic - + @@ -80,7 +98,7 @@ --postgresUser${postgresUser} --postgresPassword${postgresPassword} - + @@ -104,10 +122,15 @@ --programmePath${workingDir}/programme --outputPath${workingDir}/preparedProgramme - + + + + + + yarn @@ -129,7 +152,7 @@ --outputPath${workingDir}/preparedProjects --dbProjectPath${workingDir}/dbProjects - + diff --git 
a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/parameters.json index dd3de70f6..9ccb70a9f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/parameters.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/project/parameters.json @@ -23,6 +23,16 @@ "paramLongName" : "classForName", "paramDescription" : "the name of the class to deserialize the csv to", "paramRequired" : true +}, { + "paramName": "sn", + "paramLongName" : "sheetName", + "paramDescription" : "the name of the sheet in case the file is excel", + "paramRequired" : false +}, { + "paramName": "d", + "paramLongName" : "delimiter", + "paramDescription" : "the delimiter between fields in case it is not ;", + "paramRequired" : false } diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/ror/action_set_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/ror/action_set_parameters.json new file mode 100644 index 000000000..765e9c0af --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/ror/action_set_parameters.json @@ -0,0 +1,14 @@ +[ + { + "paramName": "i", + "paramLongName": "inputPath", + "paramDescription": "the path of the input json", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "outputPath", + "paramDescription": "the path of the new ActionSet", + "paramRequired": true + } +] diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/ror/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/ror/oozie_app/config-default.xml new file mode 100644 index 000000000..a1755f329 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/ror/oozie_app/config-default.xml @@ -0,0 +1,58 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + + + oozie.launcher.mapreduce.user.classpath.first + true + + + sparkExecutorNumber + 4 + + + spark2EventLogDir + /user/spark/spark2ApplicationHistory + + + sparkDriverMemory + 15G + + + sparkExecutorMemory + 6G + + + sparkExecutorCores + 1 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/ror/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/ror/oozie_app/workflow.xml new file mode 100644 index 000000000..3d00b80a8 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/ror/oozie_app/workflow.xml @@ -0,0 +1,55 @@ + + + + rorJsonInputPath + the path of the json + + + rorActionSetPath + path where to store the action set + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + yarn + cluster + ProcessRorFile + eu.dnetlib.dhp.actionmanager.ror.GenerateRorActionSetJob + 
dhp-aggregation-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --inputPath${rorJsonInputPath} + --outputPath${rorActionSetPath} + + + + + + + diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/usagestats/input_actionset_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/usagestats/input_actionset_parameter.json new file mode 100644 index 000000000..e9200d3ad --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/usagestats/input_actionset_parameter.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "hmu", + "paramLongName": "hive_metastore_uris", + "paramDescription": "the URI for the hive metastore", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "outputPath", + "paramDescription": "the path of the new ActionSet", + "paramRequired": true + }, + { + "paramName": "sdb", + "paramLongName": "usagestatsdb", + "paramDescription": "the name of the db to be used", + "paramRequired": true + }, + { + "paramName": "wp", + "paramLongName": "workingPath", + "paramDescription": "the workingPath where to save the content of the usage_stats table", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/usagestats/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/usagestats/oozie_app/config-default.xml new file mode 100644 index 000000000..d262cb6e0 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/usagestats/oozie_app/config-default.xml @@ -0,0 +1,30 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 + + + hiveDbName + openaire + + + oozie.launcher.mapreduce.user.classpath.first + true + + diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/usagestats/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/usagestats/oozie_app/workflow.xml new file mode 100644 index 000000000..d94cf7d53 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/usagestats/oozie_app/workflow.xml @@ -0,0 +1,99 @@ + + + + outputPath + the path where to store the actionset + + + usagestatsdb + the name of the db to be used + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname 
+ + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + yarn + cluster + Produces the atomic action with the usage stats count for results + eu.dnetlib.dhp.actionmanager.usagestats.SparkAtomicActionUsageJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --hive_metastore_uris${hiveMetastoreUris} + --outputPath${outputPath} + --usagestatsdb${usagestatsdb} + --workingPath${workingDir}/usageDb + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json deleted file mode 100644 index 4a6aec5ee..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collection_input_parameters.json +++ /dev/null @@ -1,86 +0,0 @@ -[ - { - "paramName": "issm", - "paramLongName": "isSparkSessionManaged", - "paramDescription": "when true will stop SparkSession after job execution", - "paramRequired": false - }, - { - "paramName": "e", - "paramLongName": "encoding", - "paramDescription": "the encoding of the input record should be JSON or XML", - "paramRequired": true - }, - { - "paramName": "d", - "paramLongName": "dateOfCollection", - "paramDescription": "the date when the record has been stored", - "paramRequired": true - }, - { - "paramName": "p", - "paramLongName": "provenance", - "paramDescription": "the infos about the provenance of the collected records", - "paramRequired": true - }, - { - "paramName": "x", - "paramLongName": "xpath", - "paramDescription": "the xpath to identify the record identifier", - "paramRequired": true - }, - { - "paramName": "i", - "paramLongName": "input", - "paramDescription": "the path of the sequencial file to read", - "paramRequired": true - }, - { - "paramName": "o", - "paramLongName": "output", - "paramDescription": "the path of the result DataFrame on HDFS", - "paramRequired": true - }, - { - "paramName": "ru", - "paramLongName": "rabbitUser", - "paramDescription": "the user to connect with RabbitMq for messaging", - "paramRequired": true - }, - { - "paramName": "rp", - "paramLongName": "rabbitPassword", - "paramDescription": "the password to connect with RabbitMq for messaging", - "paramRequired": true - }, - { - "paramName": "rh", - "paramLongName": "rabbitHost", - "paramDescription": "the host of the RabbitMq server", - "paramRequired": true - }, - { - "paramName": "ro", - "paramLongName": "rabbitOngoingQueue", - "paramDescription": "the name of the ongoing 
queue", - "paramRequired": true - }, - { - "paramName": "rr", - "paramLongName": "rabbitReportQueue", - "paramDescription": "the name of the report queue", - "paramRequired": true - }, - { - "paramName": "w", - "paramLongName": "workflowId", - "paramDescription": "the identifier of the dnet Workflow", - "paramRequired": true - }, - { - "paramName": "t", - "paramLongName": "isTest", - "paramDescription": "the name of the report queue", - "paramRequired": false - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json new file mode 100644 index 000000000..cd4b8224b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/collector_worker_input_parameter.json @@ -0,0 +1,62 @@ +[ + { + "paramName": "a", + "paramLongName": "apidescriptor", + "paramDescription": "the JSON encoding of the API Descriptor", + "paramRequired": true + }, + { + "paramName": "n", + "paramLongName": "namenode", + "paramDescription": "the Name Node URI", + "paramRequired": true + }, + { + "paramName": "mv", + "paramLongName": "mdStoreVersion", + "paramDescription": "the MDStore Version bean", + "paramRequired": true + }, + { + "paramName": "dm", + "paramLongName": "dnetMessageManagerURL", + "paramDescription": "the End point URL to send Messages", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workflowId", + "paramDescription": "the identifier of the dnet Workflow", + "paramRequired": true + }, + { + "paramName": "mnr", + "paramLongName": "maxNumberOfRetry", + "paramDescription": "the maximum number of admitted connection retries", + "paramRequired": false + }, + { + "paramName": "rqd", + "paramLongName": "requestDelay", + "paramDescription": "the delay (ms) between requests", + "paramRequired": false + }, + { + "paramName": "rtd", + "paramLongName": "retryDelay", + "paramDescription": "the delay (ms) between retries", + "paramRequired": false + }, + { + "paramName": "cto", + "paramLongName": "connectTimeOut", + "paramDescription": "the maximum allowed time (ms) to connect to the remote host", + "paramRequired": false + }, + { + "paramName": "rto", + "paramLongName": "readTimeOut", + "paramDescription": "the maximum allowed time (ms) to receive content from the remote host", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/generate_native_input_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/generate_native_input_parameters.json new file mode 100644 index 000000000..987f004bb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/generate_native_input_parameters.json @@ -0,0 +1,50 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "e", + "paramLongName": "encoding", + "paramDescription": "the encoding of the input record should be JSON or XML", + "paramRequired": true + }, + { + "paramName": "d", + "paramLongName": "dateOfCollection", + "paramDescription": "the date when the record has been stored", + "paramRequired": true + }, + { + "paramName": "p", + "paramLongName": "provenance", + "paramDescription": "the 
infos about the provenance of the collected records", + "paramRequired": true + }, + { + "paramName": "x", + "paramLongName": "xpath", + "paramDescription": "the xpath to identify the record identifier", + "paramRequired": true + }, + { + "paramName": "mv", + "paramLongName": "mdStoreVersion", + "paramDescription": "the Metadata Store Version Info", + "paramRequired": true + }, + { + "paramName": "rmv", + "paramLongName": "readMdStoreVersion", + "paramDescription": "the Read Lock Metadata Store Version bean", + "paramRequired": false + }, + { + "paramName": "w", + "paramLongName": "workflowId", + "paramDescription": "the identifier of the dnet Workflow", + "paramRequired": false + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/mdstore_action_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/mdstore_action_parameters.json new file mode 100644 index 000000000..57a218a34 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/mdstore_action_parameters.json @@ -0,0 +1,45 @@ +[ + { + "paramName": "a", + "paramLongName": "action", + "paramDescription": "the JSON encoding of the API Descriptor", + "paramRequired": true + }, + { + "paramName": "mu", + "paramLongName": "mdStoreManagerURI", + "paramDescription": "the MDStore Manager URI", + "paramRequired": true + }, + { + "paramName": "mi", + "paramLongName": "mdStoreID", + "paramDescription": "the Metadata Store ID", + "paramRequired": false + }, + { + "paramName": "ms", + "paramLongName": "mdStoreSize", + "paramDescription": "the Metadata Store Size", + "paramRequired": false + }, + { + "paramName": "mv", + "paramLongName": "mdStoreVersion", + "paramDescription": "the Metadata Version Bean", + "paramRequired": false + }, + { + "paramName": "n", + "paramLongName": "namenode", + "paramDescription": "the Name Node URI", + "paramRequired": false + }, + { + "paramName": "rm", + "paramLongName": "readMDStoreId", + "paramDescription": "the ID Locked to Read", + "paramRequired": false + } + +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml new file mode 100644 index 000000000..e77dd09c9 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/config-default.xml @@ -0,0 +1,22 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + oozie.launcher.mapreduce.user.classpath.first + true + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml index 3e7f68401..0678eed11 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collection/oozie_app/workflow.xml @@ -1,112 +1,212 @@ - - sequenceFilePath - the path to store the sequence file of the native metadata collected - - - - mdStorePath - the path of the native mdstore - - apiDescription A json encoding of the API Description class - dataSourceInfo A json encoding of the Datasource Info identifierPath - 
An xpath to retrieve the metadata idnentifier for the generation of DNet Identifier + An xpath to retrieve the metadata identifier for the generation of DNet Identifier - metadataEncoding The type of the metadata XML/JSON - timestamp The timestamp of the collection date - workflowId The identifier of the workflow + + mdStoreID + The identifier of the mdStore + + + mdStoreManagerURI + The URI of the MDStore Manager + + + + dnetMessageManagerURL + The URI of the Dnet Message Manager + + + collectionMode + Should be REFRESH or INCREMENTAL + + + + collection_java_xmx + -Xmx200m + Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb. + + + - + + ${jobTracker} + ${nameNode} + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - + + + + ${wf:conf('collectionMode') eq 'REFRESH'} + ${wf:conf('collectionMode') eq 'INCREMENTAL'} + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + ${collection_java_xmx} + --actionREAD_LOCK + --mdStoreID${mdStoreID} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + ${collection_java_xmx} + --actionNEW_VERSION + --mdStoreID${mdStoreID} + --mdStoreManagerURI${mdStoreManagerURI} + + + - ${jobTracker} - ${nameNode} - eu.dnetlib.dhp.collection.worker.DnetCollectorWorker - -p${sequenceFilePath} - -a${apiDescription} - -n${nameNode} - -rh${rmq_host} - -ru${rmq_user} - -rp${rmq_pwd} - -rr${rmq_report} - -ro${rmq_ongoing} - -usandro.labruzzo - -w${workflowId} + eu.dnetlib.dhp.collection.CollectorWorkerApplication + ${collection_java_xmx} + --apidescriptor${apiDescription} + --namenode${nameNode} + --workflowId${workflowId} + --dnetMessageManagerURL${dnetMessageManagerURL} + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --maxNumberOfRetry${maxNumberOfRetry} + --requestDelay${requestDelay} + --retryDelay${retryDelay} + --connectTimeOut${connectTimeOut} + --readTimeOut${readTimeOut} - - - - - ${jobTracker} - ${nameNode} - yarn - cluster - GenerateNativeStoreSparkJob - eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob - dhp-aggregations-1.0.0-SNAPSHOT.jar - --num-executors 50 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" - --encoding ${metadataEncoding} - --dateOfCollection ${timestamp} - --provenance ${dataSourceInfo} - --xpath${identifierPath} - --input${sequenceFilePath} - --output${mdStorePath} - -rh${rmq_host} - -ru${rmq_user} - -rp${rmq_pwd} - -rr${rmq_report} - -ro${rmq_ongoing} - -w${workflowId} - - - + - - - - + + + yarn + cluster + Generate Native MetadataStore + eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --encoding${metadataEncoding} + --dateOfCollection${timestamp} + --provenance${dataSourceInfo} + --xpath${identifierPath} + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --readMdStoreVersion${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + + + + + + + + ${wf:conf('collectionMode') eq 'REFRESH'} + ${wf:conf('collectionMode') eq 
'INCREMENTAL'} + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + ${collection_java_xmx} + --actionREAD_UNLOCK + --mdStoreManagerURI${mdStoreManagerURI} + --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + ${collection_java_xmx} + --actionCOMMIT + --namenode${nameNode} + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + + + + ${wf:conf('collectionMode') eq 'REFRESH'} + ${wf:conf('collectionMode') eq 'INCREMENTAL'} + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + ${collection_java_xmx} + --actionREAD_UNLOCK + --mdStoreManagerURI${mdStoreManagerURI} + --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + + + + + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + ${collection_java_xmx} + --actionROLLBACK + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdStoreManagerURI${mdStoreManagerURI} + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collector/worker/collector_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collector/worker/collector_parameter.json deleted file mode 100644 index c247d15e4..000000000 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/collector/worker/collector_parameter.json +++ /dev/null @@ -1,12 +0,0 @@ -[ - {"paramName":"p", "paramLongName":"hdfsPath", "paramDescription": "the path where storing the sequential file", "paramRequired": true}, - {"paramName":"a", "paramLongName":"apidescriptor", "paramDescription": "the JSON encoding of the API Descriptor", "paramRequired": true}, - {"paramName":"n", "paramLongName":"namenode", "paramDescription": "the Name Node URI", "paramRequired": true}, - {"paramName":"u", "paramLongName":"userHDFS", "paramDescription": "the user wich create the hdfs seq file", "paramRequired": true}, - {"paramName":"ru", "paramLongName":"rabbitUser", "paramDescription": "the user to connect with RabbitMq for messaging", "paramRequired": true}, - {"paramName":"rp", "paramLongName":"rabbitPassword", "paramDescription": "the password to connect with RabbitMq for messaging", "paramRequired": true}, - {"paramName":"rh", "paramLongName":"rabbitHost", "paramDescription": "the host of the RabbitMq server", "paramRequired": true}, - {"paramName":"ro", "paramLongName":"rabbitOngoingQueue", "paramDescription": "the name of the ongoing queue", "paramRequired": true}, - {"paramName":"rr", "paramLongName":"rabbitReportQueue", "paramDescription": "the name of the report queue", "paramRequired": true}, - {"paramName":"w", "paramLongName":"workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": true} -] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/collection/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/collection/oozie_app/config-default.xml new file mode 100644 index 000000000..dd3c32c62 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/collection/oozie_app/config-default.xml @@ -0,0 +1,23 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + + oozie.launcher.mapreduce.user.classpath.first + true + + 
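The rewritten collection workflow above brackets harvesting in an MDStore transaction: an optional READ_LOCK on the current store when collectionMode is INCREMENTAL, a NEW_VERSION before collecting, then READ_UNLOCK and COMMIT on success, or READ_UNLOCK and ROLLBACK on failure, so a failed harvest never replaces the last committed version. A rough Python model of that control flow (the client object and its method names are assumptions, not the project's API, and the double-unlock edge case on late failures is ignored for brevity):

    # Rough model of the MDStore transaction the workflow implements
    # via MDStoreActionNode; names in comments match the workflow nodes.
    def run_collection(client, collect, mode="REFRESH"):
        read_id = client.read_lock() if mode == "INCREMENTAL" else None  # BeginRead
        version = client.new_version()                                   # StartTransaction
        try:
            collect(version, read_id)  # CollectionWorker + GenerateNativeStoreSparkJob
            if read_id is not None:
                client.read_unlock(read_id)                              # EndRead
            client.commit(version)                                       # CommitVersion
        except Exception:
            if read_id is not None:
                client.read_unlock(read_id)                              # EndReadRollBack
            client.rollback(version)                                     # RollBack
            raise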
\ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/collection/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/collection/oozie_app/workflow.xml new file mode 100644 index 000000000..6989eed66 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/collection/oozie_app/workflow.xml @@ -0,0 +1,52 @@ + + + + mainPath + the working path of Datacite stores + + + isLookupUrl + The IS lookUp service endopoint + + + blocksize + 100 + The request block size + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + yarn-cluster + cluster + ImportDatacite + eu.dnetlib.dhp.datacite.ImportDatacite + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --targetPath${mainPath}/datacite_update + --dataciteDumpPath${mainPath}/datacite_dump + --namenode${nameNode} + --masteryarn-cluster + --blocksize${blocksize} + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/create_updated_hb_map.py b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/create_updated_hb_map.py new file mode 100644 index 000000000..db0431aae --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/create_updated_hb_map.py @@ -0,0 +1,63 @@ +from urllib.request import urlopen +import json + + +def retrieve_datacite_clients(base_url): + datacite_clients = {} + while base_url is not None: + with urlopen(base_url) as response: + print(f"requesting {base_url}") + response_content = response.read() + data = json.loads(response_content) + if 'data' in data and len(data['data'])>0: + for item in data['data']: + datacite_clients[item['id'].lower()]= item['attributes']['re3data'].lower().replace("https://doi.org/","") + base_url = data['links']['next'] + else: + base_url = None + return datacite_clients + + +def retrieve_r3data(start_url): + r3data_clients = {} + page_number = 1 + base_url = start_url + while base_url is not None: + with urlopen(base_url) as response: + print(f"requesting {base_url}") + response_content = response.read() + data = json.loads(response_content) + if 'data' in data and len(data['data'])>0: + for item in data['data']: + r3data_clients[item['id'].lower()]= dict( + openaire_id= "re3data_____::"+item['attributes']['re3dataId'].lower(), + official_name=item['attributes']['repositoryName'] + ) + page_number +=1 + base_url = f"{start_url}&page[number]={page_number}" + else: + base_url = None + return r3data_clients + + + + + + +base_url ="https://api.datacite.org/clients?query=re3data_id:*&page[size]=250" + +dc = retrieve_datacite_clients(base_url) +r3 = retrieve_r3data("https://api.datacite.org/re3data?page[size]=250") + +result = {} + +for item in dc: + res = dc[item].lower() + if res not in r3: + print(f"missing {res} for {item} in dictionary") + else: + result[item.upper()]= dict(openaire_id=r3[res]["openaire_id"],datacite_name=r3[res]["official_name"], official_name=r3[res]["official_name"] ) + + +with open('hostedBy_map.json', 
'w', encoding='utf8') as json_file: + json.dump(result, json_file, ensure_ascii=False, indent=1) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/datacite_filter b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/datacite_filter new file mode 100644 index 000000000..ad80d6998 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/datacite_filter @@ -0,0 +1,28 @@ +TUBYDI - Assistir Filmes e Series Online Grátis +123Movies +WATCH FULL MOVIE +Movierulz +Full Movie Online +MOVIé WatcH +The King of Staten Island 2020 Online For Free +Watch Train to Busan 2 2020 online for free +Sixth Sense Movie Novelization +Film Complet streaming vf gratuit en ligne +watch now free +LIVE stream watch +LIVE stream UFC +RBC Heritage live stream +MLBStreams Free +NFL Live Stream +Live Stream Free +Royal Ascot 2020 Live Stream +TV Shows Full Episodes Official +FuboTV +Gomovies +Online Free Trial Access +123watch +DÜŞÜK HAPI +Bebek Düşürme Yöntemleri +WHATSAP İLETİŞİM +Cytotec +düşük hapı \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/exportDataset_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/exportDataset_parameters.json new file mode 100644 index 000000000..63e080337 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/exportDataset_parameters.json @@ -0,0 +1,21 @@ +[ + { + "paramName": "s", + "paramLongName": "sourcePath", + "paramDescription": "the source mdstore path", + "paramRequired": true + }, + + { + "paramName": "t", + "paramLongName": "targetPath", + "paramDescription": "the target mdstore path", + "paramRequired": true + }, + { + "paramName": "m", + "paramLongName": "master", + "paramDescription": "the master name", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/filter_crossref_param.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/filter_crossref_param.json new file mode 100644 index 000000000..63e080337 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/filter_crossref_param.json @@ -0,0 +1,21 @@ +[ + { + "paramName": "s", + "paramLongName": "sourcePath", + "paramDescription": "the source mdstore path", + "paramRequired": true + }, + + { + "paramName": "t", + "paramLongName": "targetPath", + "paramDescription": "the target mdstore path", + "paramRequired": true + }, + { + "paramName": "m", + "paramLongName": "master", + "paramDescription": "the master name", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/generate_dataset_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/generate_dataset_params.json new file mode 100644 index 000000000..04dc8b942 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/generate_dataset_params.json @@ -0,0 +1,34 @@ +[ + { + "paramName": "s", + "paramLongName": "sourcePath", + "paramDescription": "the source mdstore path", + "paramRequired": true + }, + + { + "paramName": "mo", + "paramLongName": "mdstoreOutputVersion", + "paramDescription": "the target mdstore path", + "paramRequired": true + }, + { + "paramName": "m", + "paramLongName": "master", + "paramDescription": "the master 
name", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "isLookupUrl", + "paramDescription": "the isLookup URL", + "paramRequired": true + }, + { + "paramName": "l", + "paramLongName": "exportLinks", + "paramDescription": "should export also links", + "paramRequired": false + } + +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json new file mode 100644 index 000000000..ecae6811a --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/hostedBy_map.json @@ -0,0 +1,1047 @@ +{ + "GESIS.RKI": { + "openaire_id": "re3data_____::r3d100010436", + "datacite_name": "Forschungsdatenzentrum am Robert Koch Institut", + "official_name": "Forschungsdatenzentrum am Robert Koch Institut" + }, + "DELFT.DATA4TU": { + "openaire_id": "re3data_____::r3d100010216", + "datacite_name": "4TU.ResearchData | science.engineering.design", + "official_name": "4TU.ResearchData | science.engineering.design" + }, + "FBTK.DMXFNR": { + "openaire_id": "re3data_____::r3d100013444", + "datacite_name": "Aperta TÜBİTAK Open Archive", + "official_name": "Aperta TÜBİTAK Open Archive" + }, + "BL.CAM": { + "openaire_id": "opendoar____::109", + "datacite_name": "Apollo", + "official_name": "Apollo" + }, + "NU.ARCH": { + "openaire_id": "re3data_____::r3d100012925", + "datacite_name": "Arch", + "official_name": "Arch" + }, + "BL.ADS": { + "openaire_id": "re3data_____::r3d100000006", + "datacite_name": "Archaeology Data Service", + "official_name": "Archaeology Data Service" + }, + "FZJ.B2SHARE": { + "openaire_id": "re3data_____::r3d100011394", + "datacite_name": "B2SHARE", + "official_name": "B2SHARE" + }, + "STSCI.MAST": { + "openaire_id": "re3data_____::r3d100010403", + "datacite_name": "Barbara A. Mikulski Archive for Space Telescopes", + "official_name": "Barbara A. 
Mikulski Archive for Space Telescopes" + }, + "CBG.DATASETS": { + "openaire_id": "re3data_____::r3d100010927", + "datacite_name": "Barcode of Life Data Systems", + "official_name": "Barcode of Life Data Systems" + }, + "TIB.BEILST": { + "openaire_id": "re3data_____::r3d100012329", + "datacite_name": "STRENDA DB", + "official_name": "STRENDA DB" + }, + "MLBS.SKUXGS": { + "openaire_id": "re3data_____::r3d100011696", + "datacite_name": "biodiversity.aq", + "official_name": "biodiversity.aq" + }, + "BL.BIRKBECK": { + "openaire_id": "re3data_____::r3d100012185", + "datacite_name": "Birkbeck Research Data", + "official_name": "Birkbeck Research Data" + }, + "SND.BOLIN": { + "openaire_id": "re3data_____::r3d100011699", + "datacite_name": "Bolin Centre Database", + "official_name": "Bolin Centre Database" + }, + "BROWN.BDR": { + "openaire_id": "re3data_____::r3d100011654", + "datacite_name": "Brown Digital Repository", + "official_name": "Brown Digital Repository" + }, + "BL.BRUNEL": { + "openaire_id": "re3data_____::r3d100012140", + "datacite_name": "Brunel figshare", + "official_name": "Brunel figshare" + }, + "TIB.BAFG": { + "openaire_id": "re3data_____::r3d100011664", + "datacite_name": "Geoportal der BFG", + "official_name": "Geoportal der BFG" + }, + "TIND.CALTECH": { + "openaire_id": "re3data_____::r3d100012384", + "datacite_name": "CaltechDATA", + "official_name": "CaltechDATA" + }, + "CUL.CIESIN": { + "openaire_id": "re3data_____::r3d100010207", + "datacite_name": "Center for International Earth Science Information Network", + "official_name": "Center for International Earth Science Information Network" + }, + "TIB.KIT-IOC": { + "openaire_id": "re3data_____::r3d100010748", + "datacite_name": "chemotion", + "official_name": "chemotion" + }, + "CORNELL.CISER": { + "openaire_id": "re3data_____::r3d100011056", + "datacite_name": "CISER Data & Reproduction Archive", + "official_name": "CISER Data & Reproduction Archive" + }, + "CLARIN.CLARIN": { + "openaire_id": "re3data_____::r3d100010209", + "datacite_name": "CLARIN-ERIC", + "official_name": "CLARIN-ERIC" + }, + "OCEAN.OCEAN": { + "openaire_id": "re3data_____::r3d100012369", + "datacite_name": "Code Ocean", + "official_name": "Code Ocean" + }, + "CORNELL.LIBRARY": { + "openaire_id": "re3data_____::r3d100012322", + "datacite_name": "eCommons - Cornell's digital repository", + "official_name": "eCommons - Cornell's digital repository" + }, + "BL.CRAN": { + "openaire_id": "re3data_____::r3d100012068", + "datacite_name": "Cranfield Online Research Data", + "official_name": "Cranfield Online Research Data" + }, + "DARTLIB.CRAWDAD": { + "openaire_id": "re3data_____::r3d100010716", + "datacite_name": "CRAWDAD", + "official_name": "CRAWDAD" + }, + "GESIS.CSDA": { + "openaire_id": "re3data_____::r3d100010484", + "datacite_name": "Czech Social Science Data Archive", + "official_name": "Czech Social Science Data Archive" + }, + "PSU.DATACOM": { + "openaire_id": "re3data_____::r3d100012927", + "datacite_name": "Data Commons", + "official_name": "Data Commons" + }, + "INIST.INRA": { + "openaire_id": "re3data_____::r3d100012673", + "datacite_name": "Data INRAE", + "official_name": "Data INRAE" + }, + "UMN.DRUM": { + "openaire_id": "re3data_____::r3d100011393", + "datacite_name": "Data Repository for the University of Minnesota", + "official_name": "Data Repository for the University of Minnesota" + }, + "ESTDOI.REPO": { + "openaire_id": "re3data_____::r3d100012333", + "datacite_name": "DataDOI", + "official_name": "DataDOI" + }, + "DAFI.CLIENT": { + 
"openaire_id": "re3data_____::r3d100010217", + "datacite_name": "DataFirst", + "official_name": "DataFirst" + }, + "UNM.DATAONE": { + "openaire_id": "re3data_____::r3d100000045", + "datacite_name": "DataONE", + "official_name": "DataONE" + }, + "FCT.UMINHO": { + "openaire_id": "re3data_____::r3d100013173", + "datacite_name": "DataRepositoriUM", + "official_name": "DataRepositoriUM" + }, + "FIGSHARE.IASTATE": { + "openaire_id": "re3data_____::r3d100012696", + "datacite_name": "DataShare: the Open Data Repository of Iowa State University", + "official_name": "DataShare: the Open Data Repository of Iowa State University" + }, + "PU.DATASPACE": { + "openaire_id": "re3data_____::r3d100012513", + "datacite_name": "DataSpace", + "official_name": "DataSpace" + }, + "DANS.DATAVERSENL": { + "openaire_id": "re3data_____::r3d100011201", + "datacite_name": "DataverseNL", + "official_name": "DataverseNL" + }, + "BIBSYS.UIT-ORD": { + "openaire_id": "re3data_____::r3d100012538", + "datacite_name": "DataverseNO", + "official_name": "DataverseNO" + }, + "GESIS.SSRI": { + "openaire_id": "re3data_____::r3d100013494", + "datacite_name": "DATICE", + "official_name": "DATICE" + }, + "SML.TDAR": { + "openaire_id": "re3data_____::r3d100010347", + "datacite_name": "tDAR", + "official_name": "tDAR" + }, + "CSIC.DIGITAL": { + "openaire_id": "re3data_____::r3d100011076", + "datacite_name": "Digital CSIC", + "official_name": "DIGITAL.CSIC" + }, + "BL.DRI": { + "openaire_id": "re3data_____::r3d100011805", + "datacite_name": "Digital Repository of Ireland", + "official_name": "Digital Repository of Ireland" + }, + "SUBGOE.DARIAH": { + "openaire_id": "re3data_____::r3d100011345", + "datacite_name": "DARIAH-DE Repository", + "official_name": "DARIAH-DE Repository" + }, + "DRYAD.DRYAD": { + "openaire_id": "re3data_____::r3d100000044", + "datacite_name": "DRYAD", + "official_name": "DRYAD" + }, + "ZBMED.DSMZ": { + "openaire_id": "re3data_____::r3d100010219", + "datacite_name": "DSMZ", + "official_name": "DSMZ" + }, + "DKRZ.ESGF": { + "openaire_id": "re3data_____::r3d100011159", + "datacite_name": "Earth System Grid Federation", + "official_name": "Earth System Grid Federation" + }, + "KTSW.AEZVVV": { + "openaire_id": "re3data_____::r3d100013469", + "datacite_name": "EarthEnv", + "official_name": "EarthEnv" + }, + "DANS.ARCHIVE": { + "openaire_id": "re3data_____::r3d100010214", + "datacite_name": "EASY", + "official_name": "EASY" + }, + "ETHZ.WSL": { + "openaire_id": "re3data_____::r3d100012587", + "datacite_name": "EnviDat", + "official_name": "EnviDat" + }, + "ETHZ.E-COLL": { + "openaire_id": "re3data_____::r3d100012557", + "datacite_name": "ETH Zürich Research Collection", + "official_name": "ETH Zürich Research Collection" + }, + "ETHZ.DA-RD": { + "openaire_id": "re3data_____::r3d100011626", + "datacite_name": "ETH Data Archive", + "official_name": "ETH Data Archive" + }, + "BL.ECMWF": { + "openaire_id": "re3data_____::r3d100011726", + "datacite_name": "European Centre for Medium-Range Weather Forecasts", + "official_name": "European Centre for Medium-Range Weather Forecasts" + }, + "CARL.FRDR": { + "openaire_id": "re3data_____::r3d100012646", + "datacite_name": "Federated Research Data Repository", + "official_name": "Federated Research Data Repository" + }, + "FIGSHARE.ARS": { + "openaire_id": "re3data_____::r3d100010066", + "datacite_name": "figshare", + "official_name": "figshare" + }, + "TIB.FLOSS": { + "openaire_id": "re3data_____::r3d100010863", + "datacite_name": "FLOSSmole", + "official_name": "FLOSSmole" + }, + 
"LXKC.DSKYFI": { + "openaire_id": "re3data_____::r3d100010976", + "datacite_name": "ForestPlots.net", + "official_name": "ForestPlots.net" + }, + "YKDK.ZUYSQI": { + "openaire_id": "re3data_____::r3d100010368", + "datacite_name": "FORS DARIS", + "official_name": "FORS DARIS" + }, + "TIB.LUIS": { + "openaire_id": "re3data_____::r3d100012825", + "datacite_name": "Forschungsdaten-Repositorium der LUH", + "official_name": "Forschungsdaten-Repositorium der LUH" + }, + "GESIS.BIBB-FDZ": { + "openaire_id": "re3data_____::r3d100010190", + "datacite_name": "Forschungsdatenzentrum im Bundesinstitut für Berufsbildung", + "official_name": "Forschungsdatenzentrum im Bundesinstitut für Berufsbildung" + }, + "GESIS.ZPID": { + "openaire_id": "re3data_____::r3d100010328", + "datacite_name": "PsychData", + "official_name": "PsychData" + }, + "TIB.GFZ": { + "openaire_id": "re3data_____::r3d100012335", + "datacite_name": "GFZ Data Services", + "official_name": "GFZ Data Services" + }, + "CNGB.GIGADB": { + "openaire_id": "re3data_____::r3d100010478", + "datacite_name": "GigaDB", + "official_name": "GigaDB" + }, + "GBIF.GBIF": { + "openaire_id": "re3data_____::r3d100000039", + "datacite_name": "Global Biodiversity Information Facility", + "official_name": "Global Biodiversity Information Facility" + }, + "ARDCX.GRIFFITH": { + "openaire_id": "re3data_____::r3d100010864", + "datacite_name": "Griffith University Research Data Repository", + "official_name": "Griffith University Research Data Repository" + }, + "GDCC.HARVARD-SBGR": { + "openaire_id": "re3data_____::r3d100011601", + "datacite_name": "Structural Biology Data Grid", + "official_name": "Structural Biology Data Grid" + }, + "GDCC.HARVARD-DV": { + "openaire_id": "re3data_____::r3d100010051", + "datacite_name": "Harvard Dataverse", + "official_name": "Harvard Dataverse" + }, + "CERN.HEPDATA": { + "openaire_id": "re3data_____::r3d100010081", + "datacite_name": "HEPData", + "official_name": "HEPData" + }, + "SND.ICOS": { + "openaire_id": "re3data_____::r3d100012203", + "datacite_name": "ICOS Carbon Portal", + "official_name": "ICOS Carbon Portal" + }, + "GESIS.ICPSR": { + "openaire_id": "re3data_____::r3d100010255", + "datacite_name": "Inter-university Consortium for Political and Social Research", + "official_name": "Inter-university Consortium for Political and Social Research" + }, + "IEEE.DATAPORT": { + "openaire_id": "re3data_____::r3d100012569", + "datacite_name": "IEEE DataPort", + "official_name": "IEEE DataPort" + }, + "IIASA.DARE": { + "openaire_id": "re3data_____::r3d100012932", + "datacite_name": "IIASA DARE", + "official_name": "IIASA DARE" + }, + "ILLINOIS.DATABANK": { + "openaire_id": "re3data_____::r3d100012001", + "datacite_name": "Illinois Data Bank", + "official_name": "Illinois Data Bank" + }, + "IRIS.IRIS": { + "openaire_id": "re3data_____::r3d100010268", + "datacite_name": "Incorporated Research Institutions for Seismology", + "official_name": "Incorporated Research Institutions for Seismology" + }, + "GESIS.INDEPTH": { + "openaire_id": "re3data_____::r3d100011392", + "datacite_name": "INDEPTH Data Repository", + "official_name": "INDEPTH Data Repository" + }, + "JCVI.GXPWAQ": { + "openaire_id": "re3data_____::r3d100011558", + "datacite_name": "Influenza Research Database", + "official_name": "Influenza Research Database" + }, + "TIB.INP": { + "openaire_id": "re3data_____::r3d100013120", + "datacite_name": "INPTDAT", + "official_name": "INPTDAT" + }, + "CERN.INSPIRE": { + "openaire_id": "re3data_____::r3d100011077", + "datacite_name": 
"Inspire-HEP", + "official_name": "Inspire-HEP" + }, + "INIST.IFREMER": { + "openaire_id": "re3data_____::r3d100012965", + "datacite_name": "IFREMER-SISMER Portail de données marines", + "official_name": "IFREMER-SISMER Portail de données marines" + }, + "INIST.ILL": { + "openaire_id": "re3data_____::r3d100012072", + "datacite_name": "ILL Data Portal", + "official_name": "ILL Data Portal" + }, + "TIB.KIT-IMK": { + "openaire_id": "re3data_____::r3d100011956", + "datacite_name": "CARIBIC", + "official_name": "CARIBIC" + }, + "WWPX.INTR2": { + "openaire_id": "re3data_____::r3d100012347", + "datacite_name": "²Dok[§]", + "official_name": "²Dok[§]" + }, + "BL.IITA": { + "openaire_id": "re3data_____::r3d100012883", + "datacite_name": "International Institute of Tropical Agriculture datasets", + "official_name": "International Institute of Tropical Agriculture datasets" + }, + "TIB.IPK": { + "openaire_id": "re3data_____::r3d100011647", + "datacite_name": "IPK Gatersleben", + "official_name": "IPK Gatersleben" + }, + "IST.REX": { + "openaire_id": "re3data_____::r3d100012394", + "datacite_name": "IST Austria Research Explorer", + "official_name": "IST Austria Research Explorer" + }, + "GDCC.JHU": { + "openaire_id": "re3data_____::r3d100011836", + "datacite_name": "Johns Hopkins Data Archive Dataverse Network", + "official_name": "Johns Hopkins Data Archive Dataverse Network" + }, + "KAGGLE.KAGGLE": { + "openaire_id": "re3data_____::r3d100012705", + "datacite_name": "Kaggle", + "official_name": "Kaggle" + }, + "ESTDOI.KEEL": { + "openaire_id": "re3data_____::r3d100011941", + "datacite_name": "Center of Estonian Language Resources", + "official_name": "Center of Estonian Language Resources" + }, + "RICE.KINDER": { + "openaire_id": "re3data_____::r3d100012884", + "datacite_name": "Kinder Institute Urban Data Platform", + "official_name": "Kinder Institute Urban Data Platform" + }, + "DELFT.KNMI": { + "openaire_id": "re3data_____::r3d100011879", + "datacite_name": "KNMI Data Platform", + "official_name": "KNMI Data Platform" + }, + "LANDCARE.SOILS": { + "openaire_id": "re3data_____::r3d100010835", + "datacite_name": "Land Resource Information Systems Portal", + "official_name": "Land Resource Information Systems Portal" + }, + "LANDCARE.GENERAL": { + "openaire_id": "re3data_____::r3d100011662", + "datacite_name": "Landcare Research Data Repository", + "official_name": "Landcare Research Data Repository" + }, + "TIB.LDEO": { + "openaire_id": "re3data_____::r3d100012547", + "datacite_name": "Lamont-Doherty Core Repository", + "official_name": "Lamont-Doherty Core Repository" + }, + "ZBMED.LERNZDB": { + "openaire_id": "re3data_____::r3d100010066", + "datacite_name": "figshare", + "official_name": "figshare" + }, + "GESIS.NEPS": { + "openaire_id": "re3data_____::r3d100010736", + "datacite_name": "Nationales Bildungspanel", + "official_name": "Nationales Bildungspanel" + }, + "BL.LINCOLN": { + "openaire_id": "re3data_____::r3d100012407", + "datacite_name": "Lincoln repository", + "official_name": "Lincoln repository" + }, + "LDC.CATALOG": { + "openaire_id": "re3data_____::r3d100011940", + "datacite_name": "Linguistic Data Consortium", + "official_name": "Linguistic Data Consortium" + }, + "ZBW.IFO": { + "openaire_id": "re3data_____::r3d100010201", + "datacite_name": "LMU-ifo Economics & Business Data Center", + "official_name": "LMU-ifo Economics & Business Data Center" + }, + "DK.SB": { + "openaire_id": "re3data_____::r3d100012617", + "datacite_name": "LOAR", + "official_name": "LOAR" + }, + "BL.LSHTM": { + 
"openaire_id": "re3data_____::r3d100011800", + "datacite_name": "LSHTM Data Compass", + "official_name": "LSHTM Data Compass" + }, + "BL.LBORO": { + "openaire_id": "re3data_____::r3d100012143", + "datacite_name": "Loughborough Data Repository", + "official_name": "Loughborough Data Repository" + }, + "DELFT.MAASTRO": { + "openaire_id": "re3data_____::r3d100011086", + "datacite_name": "CancerData.org", + "official_name": "CancerData.org" + }, + "OIBK.OHYCFA": { + "openaire_id": "re3data_____::r3d100013499", + "datacite_name": "Materials Data Repository", + "official_name": "Materials Data Repository" + }, + "MDW.REPOSITORY": { + "openaire_id": "re3data_____::r3d100012108", + "datacite_name": "mdw Repository", + "official_name": "mdw Repository" + }, + "ELSEVIER.MD": { + "openaire_id": "re3data_____::r3d100011868", + "datacite_name": "Mendeley Data", + "official_name": "Mendeley Data" + }, + "BL.MENDELEY": { + "openaire_id": "re3data_____::r3d100011868", + "datacite_name": "Mendeley Data", + "official_name": "Mendeley Data" + }, + "BKMX.AZJWZC": { + "openaire_id": "re3data_____::r3d100011394", + "datacite_name": "B2SHARE", + "official_name": "B2SHARE" + }, + "CSC.NRD": { + "openaire_id": "re3data_____::r3d100012157", + "datacite_name": "Fairdata IDA Research Data Storage Service", + "official_name": "Fairdata IDA Research Data Storage Service" + }, + "UMN.IPUMS": { + "openaire_id": "re3data_____::r3d100010794", + "datacite_name": "Minnesota Population Center", + "official_name": "Minnesota Population Center" + }, + "PHBI.REPO": { + "openaire_id": "re3data_____::r3d100010101", + "datacite_name": "MorphoBank", + "official_name": "MorphoBank" + }, + "TIB.UKON": { + "openaire_id": "re3data_____::r3d100010469", + "datacite_name": "Movebank Data Repository", + "official_name": "Movebank Data Repository" + }, + "INIST.HUMANUM": { + "openaire_id": "re3data_____::r3d100012102", + "datacite_name": "NAKALA", + "official_name": "NAKALA" + }, + "GDCC.NTU": { + "openaire_id": "re3data_____::r3d100012440", + "datacite_name": "DR-NTU (Data)", + "official_name": "DR-NTU (Data)" + }, + "CORNELL.NDACAN": { + "openaire_id": "re3data_____::r3d100011036", + "datacite_name": "National Data Archive on Child Abuse and Neglect", + "official_name": "National Data Archive on Child Abuse and Neglect" + }, + "NOAA.NCEI": { + "openaire_id": "re3data_____::r3d100011801", + "datacite_name": "NCEI", + "official_name": "NCEI" + }, + "GDCC.HARVARD-SLP": { + "openaire_id": "re3data_____::r3d100011861", + "datacite_name": "National Sleep Research Resource", + "official_name": "National Sleep Research Resource" + }, + "NSIDC.DATACTR": { + "openaire_id": "re3data_____::r3d100010110", + "datacite_name": "National Snow and Ice Data Center", + "official_name": "National Snow and Ice Data Center" + }, + "NUS.SB": { + "openaire_id": "re3data_____::r3d100012564", + "datacite_name": "ScholarBank@NUS", + "official_name": "ScholarBank@NUS" + }, + "BL.NHM": { + "openaire_id": "re3data_____::r3d100011675", + "datacite_name": "Natural History Museum, Data Portal", + "official_name": "Natural History Museum, Data Portal" + }, + "ESDIS.ORNL": { + "openaire_id": "re3data_____::r3d100000037", + "datacite_name": "Oak Ridge National Laboratory Distributed Active Archive Center for Biogeochemical Dynamics", + "official_name": "Oak Ridge National Laboratory Distributed Active Archive Center for Biogeochemical Dynamics" + }, + "INIST.OTELO": { + "openaire_id": "re3data_____::r3d100012505", + "datacite_name": "ORDaR", + "official_name": "ORDaR" + }, + 
"EUROP.ODIN": { + "openaire_id": "re3data_____::r3d100011378", + "datacite_name": "MatDB", + "official_name": "MatDB" + }, + "GDCC.ODUM-DV": { + "openaire_id": "re3data_____::r3d100000005", + "datacite_name": "UNC Dataverse", + "official_name": "UNC Dataverse" + }, + "OHSU.OHSU": { + "openaire_id": "re3data_____::r3d100012244", + "datacite_name": "OHSU Digital Commons", + "official_name": "OHSU Digital Commons" + }, + "KIM.OPENKIM": { + "openaire_id": "re3data_____::r3d100011864", + "datacite_name": "OpenKIM", + "official_name": "OpenKIM" + }, + "COS.OSF": { + "openaire_id": "re3data_____::r3d100011137", + "datacite_name": "Open Science Framework", + "official_name": "Open Science Framework" + }, + "SUL.OPENNEURO": { + "openaire_id": "re3data_____::r3d100010924", + "datacite_name": "OpenNeuro", + "official_name": "OpenNeuro" + }, + "BL.SHEF": { + "openaire_id": "re3data_____::r3d100012124", + "datacite_name": "ORDA - The University of Sheffield Research Data Catalogue and Repository", + "official_name": "ORDA - The University of Sheffield Research Data Catalogue and Repository" + }, + "BL.BROOKES": { + "openaire_id": "re3data_____::r3d100012929", + "datacite_name": "Oxford Brookes University: RADAR", + "official_name": "Oxford Brookes University: RADAR" + }, + "BL.OXDB": { + "openaire_id": "re3data_____::r3d100011653", + "datacite_name": "DataBank, Bodleian Libraries, University of Oxford", + "official_name": "DataBank, Bodleian Libraries, University of Oxford" + }, + "PANGAEA.REPOSITORY": { + "openaire_id": "re3data_____::r3d100010134", + "datacite_name": "PANGAEA", + "official_name": "PANGAEA" + }, + "TIB.PANGAEA": { + "openaire_id": "re3data_____::r3d100010134", + "datacite_name": "PANGAEA", + "official_name": "PANGAEA" + }, + "NASAPDS.NASAPDS": { + "openaire_id": "re3data_____::r3d100010121", + "datacite_name": "PDS", + "official_name": "PDS" + }, + "BF.DISCOVER": { + "openaire_id": "re3data_____::r3d100013148", + "datacite_name": "Blackfynn Discover", + "official_name": "Blackfynn Discover" + }, + "MIT.PHYSIO": { + "openaire_id": "re3data_____::r3d100011561", + "datacite_name": "PhysioNet", + "official_name": "PhysioNet" + }, + "ZBMED.BIOFRESH": { + "openaire_id": "re3data_____::r3d100011651", + "datacite_name": "Freshwater Biodiversity Data Portal", + "official_name": "Freshwater Biodiversity Data Portal" + }, + "PDS.DATA": { + "openaire_id": "re3data_____::r3d100013015", + "datacite_name": "Project Data Sphere", + "official_name": "Project Data Sphere" + }, + "ESTDOI.QDB": { + "openaire_id": "re3data_____::r3d100013451", + "datacite_name": "QsarDB", + "official_name": "QsarDB" + }, + "INIST.ADISP": { + "openaire_id": "re3data_____::r3d100010494", + "datacite_name": "Quetelet PROGEDO Diffusion", + "official_name": "Quetelet PROGEDO Diffusion" + }, + "TIB.RADAR": { + "openaire_id": "re3data_____::r3d100012330", + "datacite_name": "RADAR", + "official_name": "RADAR" + }, + "UCHILE.DATAVERSE": { + "openaire_id": "re3data_____::r3d100013108", + "datacite_name": "Repositorio de Datos de Investigación de la Universidad de Chile", + "official_name": "Repositorio de Datos de Investigación de la Universidad de Chile" + }, + "UDR.RESEARCHDATA": { + "openaire_id": "re3data_____::r3d100013212", + "datacite_name": "Repositorio de datos de investigación de la Universidad del Rosario", + "official_name": "Repositorio de datos de investigación de la Universidad del Rosario" + }, + "GESIS.DIPF": { + "openaire_id": "re3data_____::r3d100010390", + "datacite_name": "Forschungsdatenzentrum Bildung", + 
"official_name": "Forschungsdatenzentrum Bildung" + }, + "RG.RG": { + "openaire_id": "re3data_____::r3d100012227", + "datacite_name": "ResearchGate Data", + "official_name": "ResearchGate Data" + }, + "INIST.RESIF": { + "openaire_id": "re3data_____::r3d100012222", + "datacite_name": "Résif Seismological Data Portal", + "official_name": "Résif Seismological Data Portal" + }, + "TIB.HZDR": { + "openaire_id": "re3data_____::r3d100012757", + "datacite_name": "RODARE", + "official_name": "RODARE" + }, + "OCUL.SPDV": { + "openaire_id": "re3data_____::r3d100010691", + "datacite_name": "Scholars Portal Dataverse", + "official_name": "Scholars Portal Dataverse" + }, + "PSU.SCHOLAR": { + "openaire_id": "re3data_____::r3d100010701", + "datacite_name": "ScholarSphere", + "official_name": "ScholarSphere" + }, + "TIB.BIKF": { + "openaire_id": "re3data_____::r3d100012379", + "datacite_name": "Senckenberg (meta) data portal", + "official_name": "Senckenberg (meta) data portal" + }, + "GESIS.SHARE": { + "openaire_id": "re3data_____::r3d100010430", + "datacite_name": "Survey of Health, Ageing and Retirement in Europe", + "official_name": "Survey of Health, Ageing and Retirement in Europe" + }, + "BL.HALLAM": { + "openaire_id": "re3data_____::r3d100011909", + "datacite_name": "Sheffield Hallam University Research Data Archive", + "official_name": "Sheffield Hallam University Research Data Archive" + }, + "ETHZ.SICAS": { + "openaire_id": "re3data_____::r3d100011560", + "datacite_name": "Sicas Medical Image Repository", + "official_name": "Sicas Medical Image Repository" + }, + "SUL.SIMTK": { + "openaire_id": "re3data_____::r3d100012486", + "datacite_name": "SimTK", + "official_name": "SimTK" + }, + "SI.SI": { + "openaire_id": "re3data_____::r3d100012274", + "datacite_name": "Smithsonian Research Online", + "official_name": "Smithsonian Research Online" + }, + "CONCOR.KCYDCU": { + "openaire_id": "re3data_____::r3d100012818", + "datacite_name": "Spectrum Research Repository", + "official_name": "Spectrum Research Repository" + }, + "SUL.SDR": { + "openaire_id": "re3data_____::r3d100010710", + "datacite_name": "Stanford Digital Repository", + "official_name": "Stanford Digital Repository" + }, + "SND.SU": { + "openaire_id": "re3data_____::r3d100012147", + "datacite_name": "Stockholm University Figshare Repository", + "official_name": "Stockholm University Figshare Repository" + }, + "INIST.CDS": { + "openaire_id": "re3data_____::r3d100010584", + "datacite_name": "Strasbourg Astronomical Data Center", + "official_name": "Strasbourg Astronomical Data Center" + }, + "DELFT.SURFSARA": { + "openaire_id": "re3data_____::r3d100013084", + "datacite_name": "SURF Data Repository", + "official_name": "SURF Data Repository" + }, + "SND.SMHI": { + "openaire_id": "re3data_____::r3d100011776", + "datacite_name": "Swedish Meteorological and Hydrological Institute open data", + "official_name": "Swedish Meteorological and Hydrological Institute open data" + }, + "SND.SND": { + "openaire_id": "re3data_____::r3d100010146", + "datacite_name": "Swedish National Data Service", + "official_name": "Swedish National Data Service" + }, + "SAGEBIO.SYNAPSE": { + "openaire_id": "re3data_____::r3d100011894", + "datacite_name": "Synapse", + "official_name": "Synapse" + }, + "GDCC.SYR-QDR": { + "openaire_id": "re3data_____::r3d100011038", + "datacite_name": "Qualitative Data Repository", + "official_name": "Qualitative Data Repository" + }, + "FZJ.TERENO": { + "openaire_id": "re3data_____::r3d100011471", + "datacite_name": "TERENO Data 
Discovery Portal", + "official_name": "TERENO Data Discovery Portal" + }, + "TUW.TETHYS": { + "openaire_id": "re3data_____::r3d100013400", + "datacite_name": "Tethys", + "official_name": "Tethys" + }, + "GESIS.AUSSDA": { + "openaire_id": "re3data_____::r3d100010483", + "datacite_name": "AUSSDA Dataverse", + "official_name": "AUSSDA Dataverse" + }, + "CCDC.CSD": { + "openaire_id": "re3data_____::r3d100010197", + "datacite_name": "The Cambridge Structural Database", + "official_name": "The Cambridge Structural Database" + }, + "SML.TCIA": { + "openaire_id": "re3data_____::r3d100011559", + "datacite_name": "The Cancer Imaging Archive", + "official_name": "The Cancer Imaging Archive" + }, + "SI.CDA": { + "openaire_id": "re3data_____::r3d100010035", + "datacite_name": "The Chandra Data Archive", + "official_name": "The Chandra Data Archive" + }, + "HLQC.ZNXELI": { + "openaire_id": "re3data_____::r3d100013029", + "datacite_name": "TUdatalib", + "official_name": "TUdatalib" + }, + "TIB.TUHH": { + "openaire_id": "re3data_____::r3d100013076", + "datacite_name": "TUHH Open Research - Research Data TUHH", + "official_name": "TUHH Open Research - Research Data TUHH" + }, + "BL.UEL": { + "openaire_id": "re3data_____::r3d100012414", + "datacite_name": "UEL Research Repository", + "official_name": "UEL Research Repository" + }, + "ARFM.UFZDRP": { + "openaire_id": "re3data_____::r3d100013674", + "datacite_name": "Datenrechercheportal UFZ", + "official_name": "Datenrechercheportal UFZ" + }, + "BL.UKDA": { + "openaire_id": "re3data_____::r3d100010215", + "datacite_name": "UK Data Archive", + "official_name": "UK Data Archive" + }, + "GDCC.ODUM-LIBRARY": { + "openaire_id": "re3data_____::r3d100000005", + "datacite_name": "UNC Dataverse", + "official_name": "UNC Dataverse" + }, + "CRUI.UNIBO": { + "openaire_id": "re3data_____::r3d100012604", + "datacite_name": "AMS Acta", + "official_name": "AMS Acta" + }, + "LMU.UB": { + "openaire_id": "re3data_____::r3d100010731", + "datacite_name": "Open Data LMU", + "official_name": "Open Data LMU" + }, + "INIST.IFSTTAR": { + "openaire_id": "re3data_____::r3d100013062", + "datacite_name": "Data Univ Gustave Eiffel", + "official_name": "Data Univ Gustave Eiffel" + }, + "BL.UCLD": { + "openaire_id": "re3data_____::r3d100012417", + "datacite_name": "UCL Discovery", + "official_name": "UCL Discovery" + }, + "NZAU.DATA": { + "openaire_id": "re3data_____::r3d100012110", + "datacite_name": "University of Auckland Data Repository", + "official_name": "University of Auckland Data Repository" + }, + "BL.BATH": { + "openaire_id": "re3data_____::r3d100011947", + "datacite_name": "University of Bath Research Data Archive", + "official_name": "University of Bath Research Data Archive" + }, + "BL.BRISTOL": { + "openaire_id": "re3data_____::r3d100011099", + "datacite_name": "data.bris Research Data Repository", + "official_name": "data.bris Research Data Repository" + }, + "FIGSHARE.UCT": { + "openaire_id": "re3data_____::r3d100012633", + "datacite_name": "University of Cape Town (UCT)", + "official_name": "ZivaHub" + }, + "BL.UCLAN": { + "openaire_id": "re3data_____::r3d100012019", + "datacite_name": "UCLanData", + "official_name": "UCLanData" + }, + "BL.ED": { + "openaire_id": "re3data_____::r3d100000047", + "datacite_name": "Edinburgh DataShare", + "official_name": "Edinburgh DataShare" + }, + "BL.ESSEX": { + "openaire_id": "re3data_____::r3d100012405", + "datacite_name": "Research Data at Essex", + "official_name": "Research Data at Essex" + }, + "BL.EXETER": { + "openaire_id": 
"re3data_____::r3d100011202", + "datacite_name": "Open Research Exeter", + "official_name": "Open Research Exeter" + }, + "BL.HERTS": { + "openaire_id": "re3data_____::r3d100013116", + "datacite_name": "University of Hertfordshire Research Archive", + "official_name": "University of Hertfordshire Research Archive" + }, + "NKN.NKN": { + "openaire_id": "re3data_____::r3d100011587", + "datacite_name": "Northwest Knowledge Network", + "official_name": "Northwest Knowledge Network" + }, + "BL.LEEDS": { + "openaire_id": "re3data_____::r3d100011945", + "datacite_name": "Research Data Leeds Repository", + "official_name": "Research Data Leeds Repository" + }, + "UNIMELB.REPO1": { + "openaire_id": "re3data_____::r3d100012145", + "datacite_name": "melbourne.figshare.com", + "official_name": "melbourne.figshare.com" + }, + "BL.READING": { + "openaire_id": "re3data_____::r3d100012064", + "datacite_name": "University of Reading Research Data Archive", + "official_name": "University of Reading Research Data Archive" + }, + "BL.SALFORD": { + "openaire_id": "re3data_____::r3d100012144", + "datacite_name": "University of Salford Data Repository", + "official_name": "University of Salford Data Repository" + }, + "BL.SOTON": { + "openaire_id": "re3data_____::r3d100011245", + "datacite_name": "University of Southampton Institutional Research Repository", + "official_name": "University of Southampton Institutional Research Repository" + }, + "ARDCX.USQ": { + "openaire_id": "re3data_____::r3d100011638", + "datacite_name": "University of Southern Queensland research data collection", + "official_name": "University of Southern Queensland research data collection" + }, + "BL.STANDREW": { + "openaire_id": "re3data_____::r3d100012411", + "datacite_name": "St Andrews Research portal - Research Data", + "official_name": "St Andrews Research portal - Research Data" + }, + "BL.STRATH": { + "openaire_id": "re3data_____::r3d100012412", + "datacite_name": "University of Strathclyde KnowledgeBase Datasets", + "official_name": "University of Strathclyde KnowledgeBase Datasets" + }, + "BL.SURREY": { + "openaire_id": "re3data_____::r3d100012232", + "datacite_name": "Surrey Research Insight", + "official_name": "Surrey Research Insight" + }, + "USDA.USDA": { + "openaire_id": "re3data_____::r3d100011890", + "datacite_name": "Ag Data Commons", + "official_name": "Ag Data Commons" + }, + "USGS.PROD": { + "openaire_id": "re3data_____::r3d100010054", + "datacite_name": "U.S. Geological Survey", + "official_name": "U.S. 
Geological Survey" + }, + "DELFT.UU": { + "openaire_id": "re3data_____::r3d100012623", + "datacite_name": "Yoda", + "official_name": "Yoda" + }, + "VT.VTECHDATA": { + "openaire_id": "re3data_____::r3d100012601", + "datacite_name": "Virginia Tech Data Repository", + "official_name": "Virginia Tech Data Repository" + }, + "JCVI.EIVBWB": { + "openaire_id": "re3data_____::r3d100011931", + "datacite_name": "Virus Pathogen Resource", + "official_name": "Virus Pathogen Resource" + }, + "VIVLI.SEARCH": { + "openaire_id": "re3data_____::r3d100012823", + "datacite_name": "Vivli", + "official_name": "Vivli" + }, + "DELFT.VLIZ": { + "openaire_id": "re3data_____::r3d100010661", + "datacite_name": "Flanders Marine Institute", + "official_name": "Flanders Marine Institute" + }, + "WH.WHOAS": { + "openaire_id": "re3data_____::r3d100010423", + "datacite_name": "Woods Hole Open Access Server", + "official_name": "Woods Hole Open Access Server" + }, + "DKRZ.WDCC": { + "openaire_id": "re3data_____::r3d100010299", + "datacite_name": "World Data Center for Climate", + "official_name": "World Data Center for Climate" + }, + "ETHZ.WGMS": { + "openaire_id": "re3data_____::r3d100010627", + "datacite_name": "World Glacier Monitoring Service", + "official_name": "World Glacier Monitoring Service" + }, + "ZBW.ZBW-JDA": { + "openaire_id": "re3data_____::r3d100012190", + "datacite_name": "ZBW Journal Data Archive", + "official_name": "ZBW Journal Data Archive" + }, + "CERN.ZENODO": { + "openaire_id": "opendoar____::2659", + "datacite_name": "Zenodo", + "official_name": "ZENODO" + }, + "ZBW.ZEW": { + "openaire_id": "re3data_____::r3d100010399", + "datacite_name": "ZEW Forschungsdatenzentrum", + "official_name": "ZEW Forschungsdatenzentrum" + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/import_from_api.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/import_from_api.json new file mode 100644 index 000000000..a37ae4bba --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/import_from_api.json @@ -0,0 +1,39 @@ +[ + { + "paramName": "t", + "paramLongName": "targetPath", + "paramDescription": "the path of the sequence file to write", + "paramRequired": true + }, + + { + "paramName": "d", + "paramLongName": "dataciteDumpPath", + "paramDescription": "the path of the Datacite dump", + "paramRequired": true + }, + { + "paramName": "s", + "paramLongName": "skipImport", + "paramDescription": "skip the download of new items and only apply the previous update", + "paramRequired": false + }, + { + "paramName": "bs", + "paramLongName": "blocksize", + "paramDescription": "define the requests block size", + "paramRequired": false + }, + { + "paramName": "n", + "paramLongName": "namenode", + "paramDescription": "the namenode URI", + "paramRequired": true + }, + { + "paramName": "m", + "paramLongName": "master", + "paramDescription": "the master name", + "paramRequired": true + } +] \ No newline at end of file
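The import_from_api.json spec above drives the incremental harvesting of Datacite: the job pages through the REST API in blocks of blocksize records and appends them to the dump under targetPath, unless skipImport is set. A minimal sketch of that loop in Scala, reusing the DataciteAPIImporter iterator introduced later in this patch (the sequence-file writing and timestamp bookkeeping are elided, and the values shown are illustrative):

package eu.dnetlib.dhp.datacite

object DataciteImportSketch {

  def main(args: Array[String]): Unit = {
    // illustrative values: the real job takes these from the import_from_api.json CLI spec
    val blocksize = 100L
    val lastUpdateTimestamp = 0L // highest 'updated' value already present in the dump

    // DataciteAPIImporter is an Iterator[String] over the JSON records returned by the API
    val importer = new DataciteAPIImporter(lastUpdateTimestamp, blocksize)

    var imported = 0L
    while (importer.hasNext) {
      val record = importer.next()
      // the real job appends each record to a Hadoop SequenceFile under targetPath; elided here
      imported += 1
    }
    println(s"imported $imported records")
  }
}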
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/transformation/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/transformation/oozie_app/config-default.xml new file mode 100644 index 000000000..dd3c32c62 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/transformation/oozie_app/config-default.xml @@ -0,0 +1,23 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + + oozie.launcher.mapreduce.user.classpath.first + true + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/transformation/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/transformation/oozie_app/workflow.xml new file mode 100644 index 000000000..d5bae7305 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/datacite/transformation/oozie_app/workflow.xml @@ -0,0 +1,126 @@ + + + + mainPath + the working path of Datacite stores + + + isLookupUrl + The IS lookUp service endpoint + + + mdStoreOutputId + the identifier of the cleaned MDStore + + + mdStoreManagerURI + the URI of the MDStore manager + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionNEW_VERSION + --mdStoreID${mdStoreOutputId} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + + + + + yarn-cluster + cluster + TransformJob + eu.dnetlib.dhp.datacite.GenerateDataciteDatasetSpark + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${mainPath}/datacite_dump + --mdstoreOutputVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --isLookupUrl${isLookupUrl} + --exportLinkstrue + --masteryarn-cluster + + + + + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionCOMMIT + --namenode${nameNode} + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionREAD_UNLOCK + --mdStoreManagerURI${mdStoreManagerURI} + --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + + + + + + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionROLLBACK + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + + + \ No newline at end of file
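The workflow above brackets the Spark transformation inside an MDStore transaction: StartTransaction creates a new version, the TransformJob writes into it, CommitVersion promotes it, and any failure routes to RollBack. The same lifecycle expressed as a Scala sketch; the manager calls here are hypothetical stand-ins for what eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode does against the MDStore manager:

import scala.util.{Failure, Success, Try}

object MdStoreTransactionSketch {

  // hypothetical stand-ins for the NEW_VERSION / COMMIT / ROLLBACK actions
  def newVersion(managerURI: String, mdStoreId: String): String =
    s"$mdStoreId-${System.currentTimeMillis()}"

  def commit(managerURI: String, versionId: String): Unit = println(s"COMMIT $versionId")

  def rollback(managerURI: String, versionId: String): Unit = println(s"ROLLBACK $versionId")

  def withNewVersion(managerURI: String, mdStoreId: String)(transform: String => Unit): Unit = {
    val versionId = newVersion(managerURI, mdStoreId) // StartTransaction
    Try(transform(versionId)) match {
      case Success(_) => commit(managerURI, versionId) // CommitVersion
      case Failure(e) =>
        rollback(managerURI, versionId) // RollBack, then fail the workflow
        throw e
    }
  }
}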
"paramRequired": true}, + {"paramName":"t", "paramLongName":"targetPath","paramDescription": "the target path ", "paramRequired": true} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/config-default.xml new file mode 100644 index 000000000..dd3c32c62 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/config-default.xml @@ -0,0 +1,23 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + + oozie.launcher.mapreduce.user.classpath.first + true + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml new file mode 100644 index 000000000..2d97b5163 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/oozie_app/workflow.xml @@ -0,0 +1,76 @@ + + + + sourcePath + the path of the consistent graph + + + workingDirFolder + the path of working dir ActionSet + + + outputPath + the path of Scholexplorer ActionSet + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + yarn-cluster + cluster + Create Action Set + eu.dnetlib.dhp.actionmanager.scholix.SparkCreateActionset + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath} + --targetPath${outputPath} + --workingDirFolder${workingDirFolder} + --masteryarn-cluster + + + + + + + + + yarn-cluster + cluster + Save Action Set + eu.dnetlib.dhp.actionmanager.scholix.SparkSaveActionSet + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=3840 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${workingDirFolder}/actionSetOaf + --targetPath${outputPath} + --masteryarn-cluster + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/save_actionset.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/save_actionset.json new file mode 100644 index 000000000..0264c825f --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/actionset/save_actionset.json @@ -0,0 +1,5 @@ +[ + {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"s", "paramLongName":"sourcePath","paramDescription": "source path", "paramRequired": true}, + 
{"paramName":"t", "paramLongName":"targetPath","paramDescription": "the target path ", "paramRequired": true} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/db/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/db/oozie_app/config-default.xml new file mode 100644 index 000000000..bdd48b0ab --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/db/oozie_app/config-default.xml @@ -0,0 +1,19 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/db/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/db/oozie_app/workflow.xml new file mode 100644 index 000000000..071d202b6 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/db/oozie_app/workflow.xml @@ -0,0 +1,51 @@ + + + + sourcePath + the PDB Database Working Path + + + database + the PDB Database Working Path + + + + targetPath + the Target Working dir path + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + yarn + cluster + Convert Bio DB to OAF Dataset + eu.dnetlib.dhp.sx.bio.SparkTransformBioDatabaseToOAF + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.shuffle.partitions=2000 + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --masteryarn + --dbPath${sourcePath} + --database${database} + --targetPath${targetPath} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json new file mode 100644 index 000000000..8dc8a2aae --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json @@ -0,0 +1,8 @@ +[ + {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true}, + {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the oaf path ", "paramRequired": true}, + {"paramName":"s", "paramLongName":"skipUpdate", "paramDescription": "skip update ", "paramRequired": false}, + {"paramName":"h", "paramLongName":"hdfsServerUri", "paramDescription": "the working path ", "paramRequired": true} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json new file mode 100644 index 000000000..76d0bfd6d --- /dev/null +++ 
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json new file mode 100644 index 000000000..76d0bfd6d --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json @@ -0,0 +1,6 @@ +[ + {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"db", "paramLongName":"database", "paramDescription": "should be PDB or UNIPROT", "paramRequired": true}, + {"paramName":"p", "paramLongName":"dbPath", "paramDescription": "the path of the database to transform", "paramRequired": true}, + {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the OAF target path", "paramRequired": true} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json new file mode 100644 index 000000000..0860ed558 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json @@ -0,0 +1,5 @@ +[ + {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the source path", "paramRequired": true}, + {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the working path", "paramRequired": true} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json new file mode 100644 index 000000000..8039131b2 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json @@ -0,0 +1,5 @@ +[ + {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"s", "paramLongName":"sourcePath","paramDescription": "the source path", "paramRequired": true}, + {"paramName":"t", "paramLongName":"targetPath","paramDescription": "the OAF path", "paramRequired": true} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/sx/ebi/oozie_app/config-default.xml rename to dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/config-default.xml
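The EBI workflow that follows can resume from an intermediate step via the resumeFrom property, implemented with an Oozie decision node. The dispatch it encodes, restated as a Scala sketch for clarity:

object ResumeFromSketch {

  // mirrors the decision node: unknown values fall back to the default entry point
  def entryPoint(resumeFrom: String): String = resumeFrom match {
    case "CreateEBIDataSet" => "CreateEBIDataSet" // skip the incremental download
    case _                  => "DownloadEBILinks" // default starting node
  }
}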
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/workflow.xml new file mode 100644 index 000000000..4b47ae38e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/oozie_app/workflow.xml @@ -0,0 +1,105 @@ + + + + sourcePath + the source path + + + workingPath + the working path + + + targetPath + the OAF MDStore path + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + resumeFrom + DownloadEBILinks + node to start + + + + + + + + ${wf:conf('resumeFrom') eq 'DownloadEBILinks'} + ${wf:conf('resumeFrom') eq 'CreateEBIDataSet'} + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + yarn-cluster + cluster + Incremental Download EBI Links + eu.dnetlib.dhp.sx.bio.ebi.SparkDownloadEBILinks + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.shuffle.partitions=2000 + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath} + --workingPath${workingPath} + --masteryarn + + + + + + + + + + + + + + + + yarn-cluster + cluster + Create OAF DataSet + eu.dnetlib.dhp.sx.bio.ebi.SparkEBILinksToOaf + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.sql.shuffle.partitions=2000 + ${sparkExtraOPT} + + --sourcePath${sourcePath}/ebi_links_dataset + --targetPath${targetPath} + --masteryarn + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/config-default.xml new file mode 100644 index 000000000..bdd48b0ab --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/config-default.xml @@ -0,0 +1,19 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml new file mode 100644 index 000000000..8915a090b --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml @@ -0,0 +1,58 @@ + + + + baselineWorkingPath + the Baseline Working Path + + + isLookupUrl + The IS lookUp service endpoint + + + targetPath + The target path + + + skipUpdate + false + whether to skip the baseline update + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + yarn + cluster + Convert Baseline to OAF Dataset + eu.dnetlib.dhp.sx.bio.ebi.SparkCreateBaselineDataFrame + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.shuffle.partitions=2000 + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --workingPath${baselineWorkingPath} + --targetPath${targetPath} + --masteryarn + --isLookupUrl${isLookupUrl} + --hdfsServerUri${nameNode} + --skipUpdate${skipUpdate} + + + + + + + + \ No newline at end of file
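In the PubMed workflow above, skipUpdate=true makes SparkCreateBaselineDataFrame reuse the baseline already present under baselineWorkingPath instead of downloading a fresh one. A sketch of that gating; the download and parse steps are placeholders, not the job's actual functions:

object BaselineGateSketch {

  def prepareBaseline(baselineWorkingPath: String, skipUpdate: Boolean): Unit = {
    if (!skipUpdate) {
      // refresh the PubMed baseline dump under baselineWorkingPath (elided)
      println(s"updating baseline under $baselineWorkingPath")
    }
    // either way, the job then parses the baseline into the target OAF dataset
    println(s"building the OAF dataset from $baselineWorkingPath")
  }
}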
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/graph/datacite/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/graph/datacite/oozie_app/config-default.xml new file mode 100644 index 000000000..bdd48b0ab --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/graph/datacite/oozie_app/config-default.xml @@ -0,0 +1,19 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/graph/datacite/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/graph/datacite/oozie_app/workflow.xml new file mode 100644 index 000000000..751b124cf --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/graph/datacite/oozie_app/workflow.xml @@ -0,0 +1,62 @@ + + + + sourcePath + the source path of the scholix graph + + + datacitePath + the datacite native path + + + workingSupportPath + the working support path + + + isLookupUrl + The IS lookUp service endpoint + + + updateDS + false + whether to regenerate all the support Datasets + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + yarn + cluster + New Update from Datacite to Scholix + eu.dnetlib.dhp.sx.graph.SparkRetrieveDataciteDelta + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.shuffle.partitions=6000 + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePath${sourcePath} + --datacitePath${datacitePath} + --masteryarn + --workingSupportPath${workingSupportPath} + --isLookupUrl${isLookupUrl} + --updateDS${updateDS} + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/graph/retrieve_datacite_delta_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/graph/retrieve_datacite_delta_params.json new file mode 100644 index 000000000..78777ffff --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/graph/retrieve_datacite_delta_params.json @@ -0,0 +1,41 @@ +[ + { + "paramName": "s", + "paramLongName": "sourcePath", + "paramDescription": "the source mdstore path", + "paramRequired": true + }, + + { + "paramName": "d", + "paramLongName": "datacitePath", + "paramDescription": "the datacite native path", + "paramRequired": true + }, + + { + "paramName": "w", + "paramLongName": "workingSupportPath", + "paramDescription": "the working support path", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "isLookupUrl", + "paramDescription": "the isLookup URL", + "paramRequired": true + }, + { + "paramName": "m", + "paramLongName": "master", + "paramDescription": "the master name", + "paramRequired": true + }, + { + "paramName": "u", + "paramLongName": "updateDS", + "paramDescription": "whether to regenerate all the support Datasets", + "paramRequired": false + } + +] \ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml new file mode 100644 index 000000000..bdd48b0ab --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/config-default.xml @@ -0,0 +1,19 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml index 4b1e3d84b..fd17289a3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/transformation/oozie_app/workflow.xml @@ -1,76 +1,194 @@ - + - mdstoreInputPath - the path of the input MDStore + mdStoreInputId + the identifier of the native MDStore - - mdstoreOutputPath + mdStoreOutputId + the identifier of the cleaned MDStore + + + mdStoreManagerURI + the URI of the MDStore manager - - transformationRule + transformationRuleId The transformation Rule to apply - - timestamp - The timestamp of the collection date + transformationPlugin + XSLT_TRANSFORM + The transformation Plugin + + + dateOfTransformation + The timestamp of the transformation date + + + isLookupUrl + The IS lookUp service endpoint - workflowId The identifier of the workflow + + dnetMessageManagerURL + The URI of the Dnet Message Manager + + + recordsPerTask + 200 + the number of records transformed by a single task + + - + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionREAD_LOCK + --mdStoreID${mdStoreInputId} + --mdStoreManagerURI${mdStoreManagerURI} + + + - - - ${jobTracker} - ${nameNode} - yarn - cluster - MDBuilder - eu.dnetlib.dhp.transformation.TransformSparkJobNode - dhp-aggregations-1.0.0-SNAPSHOT.jar - --num-executors 50 --conf spark.yarn.jars="hdfs://hadoop-rm1.garr-pa1.d4science.org:8020/user/oozie/share/lib/lib_20180405103059/spark2" - --dateOfCollection ${timestamp} - -mt yarn - --input${mdstoreInputPath} - --output${mdstoreOutputPath} - -w${workflowId} - -tr${transformationRule} - -ru${rmq_user} - -rp${rmq_pwd} - -rh${rmq_host} - -ro${rmq_ongoing} - -rr${rmq_report} - - - + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionNEW_VERSION + --mdStoreID${mdStoreOutputId} + --mdStoreManagerURI${mdStoreManagerURI} + + + + - - - - + + + yarn + cluster + Transform MetadataStore + eu.dnetlib.dhp.transformation.TransformSparkJobNode + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --mdstoreOutputVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdstoreInputVersion${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + --dateOfTransformation${dateOfTransformation} + --transformationPlugin${transformationPlugin} + --transformationRuleId${transformationRuleId} + --isLookupUrl${isLookupUrl} + --recordsPerTask${recordsPerTask} + 
--workflowId${workflowId} + --dnetMessageManagerURL${dnetMessageManagerURL} + + + + + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionREAD_UNLOCK + --mdStoreManagerURI${mdStoreManagerURI} + --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + + + + + + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionCOMMIT + --namenode${nameNode} + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdStoreManagerURI${mdStoreManagerURI} + + + + + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionREAD_UNLOCK + --mdStoreManagerURI${mdStoreManagerURI} + --readMDStoreId${wf:actionData('BeginRead')['mdStoreReadLockVersion']} + + + + + + + + + + + oozie.launcher.mapreduce.user.classpath.first + true + + + eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode + --actionROLLBACK + --mdStoreVersion${wf:actionData('StartTransaction')['mdStoreVersion']} + --mdStoreManagerURI${mdStoreManagerURI} + - + + \ No newline at end of file
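All of the *_params.json specs in this patch share one shape: paramName, paramLongName, paramDescription, paramRequired. A json4s sketch of reading such a spec, standing in for the project's actual argument-parsing utility (which is not shown in this excerpt):

import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse

object ParamSpecSketch {

  case class ParamSpec(
    paramName: String,
    paramLongName: String,
    paramDescription: String,
    paramRequired: Boolean
  )

  // parses a spec file such as transformation_input_parameters.json below
  def read(specJson: String): List[ParamSpec] = {
    implicit val formats: DefaultFormats.type = DefaultFormats
    parse(specJson).extract[List[ParamSpec]]
  }
}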
"the host of the RabbitMq server", - "paramRequired": true - }, - { - "paramName": "ro", - "paramLongName": "rabbitOngoingQueue", - "paramDescription": "the name of the ongoing queue", - "paramRequired": true - }, - { - "paramName": "rr", - "paramLongName": "rabbitReportQueue", - "paramDescription": "the name of the report queue", - "paramRequired": true - }, - { - "paramName": "t", - "paramLongName": "isTest", - "paramDescription": "the name of the report queue", + "paramName": "rpt", + "paramLongName": "recordsPerTask", + "paramDescription": "the number of records transformed by a single Task", "paramRequired": false + }, + { + "paramName": "tp", + "paramLongName": "transformationPlugin", + "paramDescription": "the transformation plugin to apply", + "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/log4j.properties b/dhp-workflows/dhp-aggregation/src/main/resources/log4j.properties index 63cba917e..81458d1f7 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/log4j.properties +++ b/dhp-workflows/dhp-aggregation/src/main/resources/log4j.properties @@ -7,3 +7,6 @@ log4j.appender.A1=org.apache.log4j.ConsoleAppender # A1 uses PatternLayout. log4j.appender.A1.layout=org.apache.log4j.PatternLayout log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n + +log4j.logger.org.apache.spark=FATAL +log4j.logger.org.spark_project=FATAL diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala new file mode 100644 index 000000000..85f5a3082 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala @@ -0,0 +1,63 @@ +package eu.dnetlib.dhp.collection + +import com.fasterxml.jackson.databind.ObjectMapper +import eu.dnetlib.dhp.schema.common.ModelSupport +import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation} +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode} + +object CollectionUtils { + + /** This method in pipeline to the transformation phase, + * generates relations in both verse, typically it should be a phase of flatMap + * + * @param i input OAF + * @return + * If the input OAF is an entity -> List(i) + * If the input OAF is a relation -> List(relation, inverseRelation) + */ + + def fixRelations(i: Oaf): List[Oaf] = { + if (i.isInstanceOf[OafEntity]) + return List(i) + else { + val r: Relation = i.asInstanceOf[Relation] + val currentRel = ModelSupport.findRelation(r.getRelClass) + if (currentRel != null) { + + // Cleaning relation + r.setRelType(currentRel.getRelType) + r.setSubRelType(currentRel.getSubReltype) + r.setRelClass(currentRel.getRelClass) + val inverse = new Relation + inverse.setSource(r.getTarget) + inverse.setTarget(r.getSource) + inverse.setRelType(currentRel.getRelType) + inverse.setSubRelType(currentRel.getSubReltype) + inverse.setRelClass(currentRel.getInverseRelClass) + inverse.setCollectedfrom(r.getCollectedfrom) + inverse.setDataInfo(r.getDataInfo) + inverse.setProperties(r.getProperties) + inverse.setLastupdatetimestamp(r.getLastupdatetimestamp) + inverse.setValidated(r.getValidated) + inverse.setValidationDate(r.getValidationDate) + return List(r, inverse) + } + } + List() + } + + def saveDataset(dataset: Dataset[Oaf], targetPath: String): Unit = { + implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) + val mapper = new ObjectMapper + + dataset + 
.flatMap(i => CollectionUtils.fixRelations(i)) + .filter(i => i != null) + .map(r => mapper.writeValueAsString(r))(Encoders.STRING) + .write + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .text(targetPath) + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/AbstractRestClient.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/AbstractRestClient.scala new file mode 100644 index 000000000..b09ffcfd5 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/AbstractRestClient.scala @@ -0,0 +1,86 @@ +package eu.dnetlib.dhp.datacite + +import org.apache.commons.io.IOUtils +import org.apache.http.client.config.RequestConfig +import org.apache.http.client.methods.{HttpGet, HttpPost, HttpUriRequest} +import org.apache.http.entity.StringEntity +import org.apache.http.impl.client.HttpClientBuilder + +abstract class AbstractRestClient extends Iterator[String] { + + var buffer: List[String] = List() + var current_index: Int = 0 + + var scroll_value: Option[String] = None + + var complete: Boolean = false + + def extractInfo(input: String): Unit + + protected def getBufferData(): Unit + + def doHTTPGETRequest(url: String): String = { + val httpGet = new HttpGet(url) + doHTTPRequest(httpGet) + + } + + def doHTTPPOSTRequest(url: String, json: String): String = { + val httpPost = new HttpPost(url) + if (json != null) { + val entity = new StringEntity(json) + httpPost.setEntity(entity) + httpPost.setHeader("Accept", "application/json") + httpPost.setHeader("Content-type", "application/json") + } + doHTTPRequest(httpPost) + } + + def hasNext: Boolean = { + buffer.nonEmpty && current_index < buffer.size + } + + override def next(): String = { + val next_item: String = buffer(current_index) + current_index = current_index + 1 + if (current_index == buffer.size) + getBufferData() + next_item + } + + private def doHTTPRequest[A <: HttpUriRequest](r: A): String = { + val timeout = 600; // seconds + val config = RequestConfig + .custom() + .setConnectTimeout(timeout * 1000) + .setConnectionRequestTimeout(timeout * 1000) + .setSocketTimeout(timeout * 1000) + .build() + val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build() + try { + var tries = 4 + while (tries > 0) { + println(s"requesting ${r.getURI}") + try { + val response = client.execute(r) + println(s"get response with status${response.getStatusLine.getStatusCode}") + if (response.getStatusLine.getStatusCode > 400) { + tries -= 1 + } else + return IOUtils.toString(response.getEntity.getContent) + } catch { + case e: Throwable => + println(s"Error on requesting ${r.getURI}") + e.printStackTrace() + tries -= 1 + } + } + "" + } finally { + if (client != null) + client.close() + } + } + + getBufferData() +} diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala new file mode 100644 index 000000000..d2fd709aa --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteAPIImporter.scala @@ -0,0 +1,33 @@ +package eu.dnetlib.dhp.datacite + +import org.json4s.jackson.JsonMethods.{compact, parse, render} +import org.json4s.{DefaultFormats, JValue} + +class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until: Long = -1) extends AbstractRestClient { + + override def extractInfo(input: String): Unit = { + implicit lazy val formats: 
DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: org.json4s.JValue = parse(input) + buffer = (json \ "data").extract[List[JValue]].map(s => compact(render(s))) + val next_url = (json \ "links" \ "next").extractOrElse[String](null) + scroll_value = if (next_url != null && next_url.nonEmpty) Some(next_url) else None + if (scroll_value.isEmpty) + complete = true + current_index = 0 + } + + def get_url(): String = { + val to = if (until > 0) s"$until" else "*" + s"https://api.datacite.org/dois?page[cursor]=1&page[size]=$blocks&query=updated:[$timestamp%20TO%20$to]" + + } + + override def getBufferData(): Unit = { + if (!complete) { + val response = + if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) + else doHTTPGETRequest(get_url()) + extractInfo(response) + } + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala new file mode 100644 index 000000000..a59779387 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala @@ -0,0 +1,278 @@ +package eu.dnetlib.dhp.datacite + +import eu.dnetlib.dhp.schema.common.ModelConstants +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils +import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue} + +import java.io.InputStream +import java.time.format.DateTimeFormatter +import java.util.Locale +import java.util.regex.Pattern +import scala.io.Source + +/** This class represents the data model of the input Datacite Dataset + * @param doi the DOI + * @param timestamp timestamp of the last update date + * @param isActive whether the record is active or deleted + * @param json the native json record + */ +case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {} + +/* + The following classes are utility classes used for the mapping from + Datacite json to the OAF schema + */ +case class RelatedIdentifierType( + relationType: String, + relatedIdentifier: String, + relatedIdentifierType: String +) {} + +case class NameIdentifiersType( + nameIdentifierScheme: Option[String], + schemeUri: Option[String], + nameIdentifier: Option[String] +) {} + +case class CreatorType( + nameType: Option[String], + nameIdentifiers: Option[List[NameIdentifiersType]], + name: Option[String], + familyName: Option[String], + givenName: Option[String], + affiliation: Option[List[String]] +) {} + +case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {} + +case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {} + +case class DescriptionType(descriptionType: Option[String], description: Option[String]) {} + +case class FundingReferenceType( + funderIdentifierType: Option[String], + awardTitle: Option[String], + awardUri: Option[String], + funderName: Option[String], + funderIdentifier: Option[String], + awardNumber: Option[String] +) {} + +case class DateType(date: Option[String], dateType: Option[String]) {} + +case class OAFRelations(relation: String, inverse: String, relType: String) + +class DataciteModelConstants extends Serializable {} + +object DataciteModelConstants { + + val REL_TYPE_VALUE: String = "resultResult" + val DATE_RELATION_KEY = "RelationDate" + val DATACITE_FILTER_PATH = "/eu/dnetlib/dhp/datacite/datacite_filter" + val DOI_CLASS = "doi" + val SUBJ_CLASS = "keywords" + val DATACITE_NAME = "Datacite" + val dataInfo: DataInfo = dataciteDataInfo("0.9") 
+ + val DATACITE_COLLECTED_FROM: KeyValue = + OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME) + + val subRelTypeMapping: Map[String, OAFRelations] = Map( + ModelConstants.REFERENCES -> OAFRelations( + ModelConstants.REFERENCES, + ModelConstants.IS_REFERENCED_BY, + ModelConstants.RELATIONSHIP + ), + ModelConstants.IS_REFERENCED_BY -> OAFRelations( + ModelConstants.IS_REFERENCED_BY, + ModelConstants.REFERENCES, + ModelConstants.RELATIONSHIP + ), + ModelConstants.IS_SUPPLEMENTED_BY -> OAFRelations( + ModelConstants.IS_SUPPLEMENTED_BY, + ModelConstants.IS_SUPPLEMENT_TO, + ModelConstants.SUPPLEMENT + ), + ModelConstants.IS_SUPPLEMENT_TO -> OAFRelations( + ModelConstants.IS_SUPPLEMENT_TO, + ModelConstants.IS_SUPPLEMENTED_BY, + ModelConstants.SUPPLEMENT + ), + ModelConstants.HAS_PART -> OAFRelations( + ModelConstants.HAS_PART, + ModelConstants.IS_PART_OF, + ModelConstants.PART + ), + ModelConstants.IS_PART_OF -> OAFRelations( + ModelConstants.IS_PART_OF, + ModelConstants.HAS_PART, + ModelConstants.PART + ), + ModelConstants.IS_VERSION_OF -> OAFRelations( + ModelConstants.IS_VERSION_OF, + ModelConstants.HAS_VERSION, + ModelConstants.VERSION + ), + ModelConstants.HAS_VERSION -> OAFRelations( + ModelConstants.HAS_VERSION, + ModelConstants.IS_VERSION_OF, + ModelConstants.VERSION + ), + ModelConstants.IS_IDENTICAL_TO -> OAFRelations( + ModelConstants.IS_IDENTICAL_TO, + ModelConstants.IS_IDENTICAL_TO, + ModelConstants.RELATIONSHIP + ), + ModelConstants.IS_CONTINUED_BY -> OAFRelations( + ModelConstants.IS_CONTINUED_BY, + ModelConstants.CONTINUES, + ModelConstants.RELATIONSHIP + ), + ModelConstants.CONTINUES -> OAFRelations( + ModelConstants.CONTINUES, + ModelConstants.IS_CONTINUED_BY, + ModelConstants.RELATIONSHIP + ), + ModelConstants.IS_NEW_VERSION_OF -> OAFRelations( + ModelConstants.IS_NEW_VERSION_OF, + ModelConstants.IS_PREVIOUS_VERSION_OF, + ModelConstants.VERSION + ), + ModelConstants.IS_PREVIOUS_VERSION_OF -> OAFRelations( + ModelConstants.IS_PREVIOUS_VERSION_OF, + ModelConstants.IS_NEW_VERSION_OF, + ModelConstants.VERSION + ), + ModelConstants.IS_DOCUMENTED_BY -> OAFRelations( + ModelConstants.IS_DOCUMENTED_BY, + ModelConstants.DOCUMENTS, + ModelConstants.RELATIONSHIP + ), + ModelConstants.DOCUMENTS -> OAFRelations( + ModelConstants.DOCUMENTS, + ModelConstants.IS_DOCUMENTED_BY, + ModelConstants.RELATIONSHIP + ), + ModelConstants.IS_SOURCE_OF -> OAFRelations( + ModelConstants.IS_SOURCE_OF, + ModelConstants.IS_DERIVED_FROM, + ModelConstants.VERSION + ), + ModelConstants.IS_DERIVED_FROM -> OAFRelations( + ModelConstants.IS_DERIVED_FROM, + ModelConstants.IS_SOURCE_OF, + ModelConstants.VERSION + ), + ModelConstants.CITES -> OAFRelations( + ModelConstants.CITES, + ModelConstants.IS_CITED_BY, + ModelConstants.CITATION + ), + ModelConstants.IS_CITED_BY -> OAFRelations( + ModelConstants.IS_CITED_BY, + ModelConstants.CITES, + ModelConstants.CITATION + ), + ModelConstants.IS_VARIANT_FORM_OF -> OAFRelations( + ModelConstants.IS_VARIANT_FORM_OF, + ModelConstants.IS_DERIVED_FROM, + ModelConstants.VERSION + ), + ModelConstants.IS_OBSOLETED_BY -> OAFRelations( + ModelConstants.IS_OBSOLETED_BY, + ModelConstants.IS_NEW_VERSION_OF, + ModelConstants.VERSION + ), + ModelConstants.REVIEWS -> OAFRelations( + ModelConstants.REVIEWS, + ModelConstants.IS_REVIEWED_BY, + ModelConstants.REVIEW + ), + ModelConstants.IS_REVIEWED_BY -> OAFRelations( + ModelConstants.IS_REVIEWED_BY, + ModelConstants.REVIEWS, + ModelConstants.REVIEW + ), + ModelConstants.DOCUMENTS -> OAFRelations( + ModelConstants.DOCUMENTS, 
+ ModelConstants.IS_DOCUMENTED_BY, + ModelConstants.RELATIONSHIP + ), + ModelConstants.IS_DOCUMENTED_BY -> OAFRelations( + ModelConstants.IS_DOCUMENTED_BY, + ModelConstants.DOCUMENTS, + ModelConstants.RELATIONSHIP + ), + ModelConstants.COMPILES -> OAFRelations( + ModelConstants.COMPILES, + ModelConstants.IS_COMPILED_BY, + ModelConstants.RELATIONSHIP + ), + ModelConstants.IS_COMPILED_BY -> OAFRelations( + ModelConstants.IS_COMPILED_BY, + ModelConstants.COMPILES, + ModelConstants.RELATIONSHIP + ) + ) + + val datacite_filter: List[String] = { + val stream: InputStream = getClass.getResourceAsStream(DATACITE_FILTER_PATH) + require(stream != null) + Source.fromInputStream(stream).getLines().toList + } + + def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo( + false, + null, + false, + false, + ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, + trust + ) + + val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern( + "[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", + Locale.ENGLISH + ) + + val df_it: DateTimeFormatter = + DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN) + + val funder_regex: List[(Pattern, String)] = List( + ( + Pattern.compile( + "(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", + Pattern.MULTILINE | Pattern.CASE_INSENSITIVE + ), + "40|corda__h2020::" + ), + ( + Pattern.compile( + "(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", + Pattern.MULTILINE | Pattern.CASE_INSENSITIVE + ), + "40|corda_______::" + ) + ) + + val Date_regex: List[Pattern] = List( + //Y-M-D + Pattern.compile( + "(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", + Pattern.MULTILINE + ), + //M-D-Y + Pattern.compile( + "((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", + Pattern.MULTILINE + ), + //D-M-Y + Pattern.compile( + "(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", + Pattern.MULTILINE + ), + //Y + Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE) + ) + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala new file mode 100644 index 000000000..a7ad9e2d6 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala @@ -0,0 +1,685 @@ +package eu.dnetlib.dhp.datacite + +import com.fasterxml.jackson.databind.ObjectMapper +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup +import eu.dnetlib.dhp.datacite.DataciteModelConstants._ +import eu.dnetlib.dhp.schema.action.AtomicAction +import eu.dnetlib.dhp.schema.common.ModelConstants +import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils} +import eu.dnetlib.dhp.schema.oaf.{Dataset => OafDataset, _} +import eu.dnetlib.dhp.utils.DHPUtils +import org.apache.commons.lang3.StringUtils +import org.json4s.DefaultFormats +import org.json4s.JsonAST.{JField, JObject, 
JString} +import org.json4s.jackson.JsonMethods.parse + +import java.text.SimpleDateFormat +import java.time.LocalDate +import java.time.chrono.ThaiBuddhistDate +import java.time.format.DateTimeFormatter +import java.util.{Date, Locale} +import scala.collection.JavaConverters._ +import scala.io.Source + +object DataciteToOAFTransformation { + + case class HostedByMapType( + openaire_id: String, + datacite_name: String, + official_name: String, + similarity: Option[Float] + ) {} + + val mapper = new ObjectMapper() + + val unknown_repository: HostedByMapType = HostedByMapType( + ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID, + ModelConstants.UNKNOWN_REPOSITORY.getValue, + ModelConstants.UNKNOWN_REPOSITORY.getValue, + Some(1.0f) + ) + + val hostedByMap: Map[String, HostedByMapType] = { + val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: org.json4s.JValue = parse(s) + json.extract[Map[String, HostedByMapType]] + } + + /** This method should skip record if json contains invalid text + * defined in file datacite_filter + * + * @param record : not parsed Datacite record + * @param json : parsed record + * @return True if the record should be skipped + */ + def skip_record(record: String, json: org.json4s.JValue): Boolean = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + datacite_filter.exists(f => record.contains(f)) || (json \\ "publisher") + .extractOrElse[String]("") + .equalsIgnoreCase("FAIRsharing") + + } + + @deprecated("this method will be removed", "dhp") + def toActionSet(item: Oaf): (String, String) = { + val mapper = new ObjectMapper() + + item match { + case dataset: OafDataset => + val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset] + a.setClazz(classOf[OafDataset]) + a.setPayload(dataset) + (dataset.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case publication: Publication => + val a: AtomicAction[Publication] = new AtomicAction[Publication] + a.setClazz(classOf[Publication]) + a.setPayload(publication) + (publication.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case software: Software => + val a: AtomicAction[Software] = new AtomicAction[Software] + a.setClazz(classOf[Software]) + a.setPayload(software) + (software.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case orp: OtherResearchProduct => + val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct] + a.setClazz(classOf[OtherResearchProduct]) + a.setPayload(orp) + (orp.getClass.getCanonicalName, mapper.writeValueAsString(a)) + + case relation: Relation => + val a: AtomicAction[Relation] = new AtomicAction[Relation] + a.setClazz(classOf[Relation]) + a.setPayload(relation) + (relation.getClass.getCanonicalName, mapper.writeValueAsString(a)) + case _ => + null + } + + } + + /** This utility method indicates whether the embargo date has been reached + * @param embargo_end_date + * @return True if the embargo date has been reached, false otherwise + */ + def embargo_end(embargo_end_date: String): Boolean = { + val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]")) + val td = LocalDate.now() + td.isAfter(dt) + } + + def extract_date(input: String): Option[String] = { + val d = Date_regex + .map(pattern => { + val matcher = pattern.matcher(input) + if (matcher.find()) + matcher.group(0) + else + null + }) + .find(s => s != null) + + if (d.isDefined) { + val a_date 
= if (d.get.length == 4) s"01-01-${d.get}" else d.get + try { + return Some(LocalDate.parse(a_date, df_en).toString) + } catch { + case _: Throwable => + try { + return Some(LocalDate.parse(a_date, df_it).toString) + } catch { + case _: Throwable => + return None + } + } + } + d + } + + def fix_thai_date(input: String, format: String): String = { + try { + val a_date = LocalDate.parse(input, DateTimeFormatter.ofPattern(format)) + val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth) + LocalDate.from(d).toString + } catch { + case _: Throwable => "" + } + } + + /** * + * Use the vocabulary dnet:publication_resource to find a synonym to one of these terms and get the instance.type. + * Using the dnet:result_typologies vocabulary, we look up the instance.type synonym + * to generate one of the following main entities: + * - publication + * - dataset + * - software + * - otherresearchproduct + * + * @param resourceType + * @param resourceTypeGeneral + * @param schemaOrg + * @param vocabularies + * @return + */ + def getTypeQualifier( + resourceType: String, + resourceTypeGeneral: String, + schemaOrg: String, + vocabularies: VocabularyGroup + ): (Qualifier, Qualifier) = { + if (resourceType != null && resourceType.nonEmpty) { + val typeQualifier = + vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType) + if (typeQualifier != null) + return ( + typeQualifier, + vocabularies.getSynonymAsQualifier( + ModelConstants.DNET_RESULT_TYPOLOGIES, + typeQualifier.getClassid + ) + ) + } + if (schemaOrg != null && schemaOrg.nonEmpty) { + val typeQualifier = + vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg) + if (typeQualifier != null) + return ( + typeQualifier, + vocabularies.getSynonymAsQualifier( + ModelConstants.DNET_RESULT_TYPOLOGIES, + typeQualifier.getClassid + ) + ) + + } + if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) { + val typeQualifier = vocabularies.getSynonymAsQualifier( + ModelConstants.DNET_PUBLICATION_RESOURCE, + resourceTypeGeneral + ) + if (typeQualifier != null) + return ( + typeQualifier, + vocabularies.getSynonymAsQualifier( + ModelConstants.DNET_RESULT_TYPOLOGIES, + typeQualifier.getClassid + ) + ) + + } + null + } + + def getResult( + resourceType: String, + resourceTypeGeneral: String, + schemaOrg: String, + vocabularies: VocabularyGroup + ): Result = { + val typeQualifiers: (Qualifier, Qualifier) = + getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies) + if (typeQualifiers == null) + return null + val i = new Instance + i.setInstancetype(typeQualifiers._1) + typeQualifiers._2.getClassname match { + case "dataset" => + val r = new OafDataset + r.setInstance(List(i).asJava) + return r + case "publication" => + val r = new Publication + r.setInstance(List(i).asJava) + return r + case "software" => + val r = new Software + r.setInstance(List(i).asJava) + return r + case "other" => + val r = new OtherResearchProduct + r.setInstance(List(i).asJava) + return r + } + null + } + + def available_date(input: String): Boolean = { + + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json: org.json4s.JValue = parse(input) + val l: List[String] = for { + JObject(dates) <- json \\ "dates" + JField("dateType", JString(dateTypes)) <- dates + } yield dateTypes + + l.exists(p => p.equalsIgnoreCase("available")) + + } + + /** As describe in ticket #6377 + * when the result come from figshare we need to remove 
subject + * and set Access rights OPEN. + * + * @param r + */ + def fix_figshare(r: Result): Unit = { + + if (r.getInstance() != null) { + val hosted_by_figshare = r + .getInstance() + .asScala + .exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue)) + if (hosted_by_figshare) { + r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT())) + val l: List[Subject] = List() + r.setSubject(l.asJava) + } + } + + } + + def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = { + val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_') + s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}" + } + + def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = { + OafMapperUtils.structuredProperty(dt, q, null) + } + + def generateRelation( + sourceId: String, + targetId: String, + relClass: String, + cf: KeyValue, + di: DataInfo + ): Relation = { + + val r = new Relation + r.setSource(sourceId) + r.setTarget(targetId) + r.setRelType(ModelConstants.RESULT_PROJECT) + r.setRelClass(relClass) + r.setSubRelType(ModelConstants.OUTCOME) + r.setCollectedfrom(List(cf).asJava) + r.setDataInfo(di) + r + + } + + def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = { + val match_pattern = funder_regex.find(s => s._1.matcher(awardUri).find()) + + if (match_pattern.isDefined) { + val m = match_pattern.get._1 + val p = match_pattern.get._2 + val grantId = m.matcher(awardUri).replaceAll("$2") + val targetId = s"$p${DHPUtils.md5(grantId)}" + List(generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo)) + } else + List() + + } + + def generateOAF( + input: String, + ts: Long, + dateOfCollection: Long, + vocabularies: VocabularyGroup, + exportLinks: Boolean + ): List[Oaf] = { + + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json = parse(input) + + if (skip_record(input, json)) + return List() + + val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null) + val resourceTypeGeneral = + (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null) + val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null) + + val doi = (json \ "attributes" \ "doi").extract[String] + if (doi.isEmpty) + return List() + + //Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies + val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies) + if (result == null) + return List() + + // DOI is mapped on a PID inside a Instance object + val doi_q = OafMapperUtils.qualifier( + "doi", + "doi", + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES + ) + val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo) + result.setPid(List(pid).asJava) + + // This identifiere will be replaced in a second moment using the PID logic generation + result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true)) + result.setOriginalId(List(doi).asJava) + + val d = new Date(dateOfCollection * 1000) + val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US) + + result.setDateofcollection(ISO8601FORMAT.format(d)) + result.setDateoftransformation(ISO8601FORMAT.format(d)) + result.setDataInfo(dataInfo) + + val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List()) + + val authors = creators.zipWithIndex.map { case (c, idx) => + val a = new 
Author + a.setFullname(c.name.orNull) + a.setName(c.givenName.orNull) + a.setSurname(c.familyName.orNull) + if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) { + a.setPid( + c.nameIdentifiers.get + .map(ni => { + val q = + if (ni.nameIdentifierScheme.isDefined) + vocabularies.getTermAsQualifier( + ModelConstants.DNET_PID_TYPES, + ni.nameIdentifierScheme.get.toLowerCase() + ) + else null + if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) { + OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo) + } else + null + + }) + .asJava + ) + } + if (c.affiliation.isDefined) + a.setAffiliation( + c.affiliation.get + .filter(af => af.nonEmpty) + .map(af => OafMapperUtils.field(af, dataInfo)) + .asJava + ) + a.setRank(idx + 1) + a + } + + if (authors == null || authors.isEmpty || !authors.exists(a => a != null)) + return List() + result.setAuthor(authors.asJava) + + val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List()) + + result.setTitle( + titles + .filter(t => t.title.nonEmpty) + .map(t => { + if (t.titleType.isEmpty) { + OafMapperUtils + .structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null) + } else { + OafMapperUtils.structuredProperty( + t.title.get, + t.titleType.get, + t.titleType.get, + ModelConstants.DNET_DATACITE_TITLE, + ModelConstants.DNET_DATACITE_TITLE, + null + ) + } + }) + .asJava + ) + + val dates = (json \\ "dates").extract[List[DateType]] + val publication_year = (json \\ "publicationYear").extractOrElse[String](null) + + val i_date = dates + .filter(d => d.date.isDefined && d.dateType.isDefined) + .find(d => d.dateType.get.equalsIgnoreCase("issued")) + .map(d => extract_date(d.date.get)) + val a_date: Option[String] = dates + .filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available")) + .map(d => extract_date(d.date.get)) + .find(d => d != null && d.isDefined) + .map(d => d.get) + + if (a_date.isDefined) { + if (doi.startsWith("10.14457")) + result.setEmbargoenddate( + OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null) + ) + else + result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null)) + } + if (i_date.isDefined && i_date.get.isDefined) { + if (doi.startsWith("10.14457")) { + result.setDateofacceptance( + OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null) + ) + result + .getInstance() + .get(0) + .setDateofacceptance( + OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null) + ) + } else { + result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null)) + result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null)) + } + } else if (publication_year != null) { + if (doi.startsWith("10.14457")) { + result.setDateofacceptance( + OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null) + ) + result + .getInstance() + .get(0) + .setDateofacceptance( + OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null) + ) + + } else { + result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null)) + result + .getInstance() + .get(0) + .setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null)) + } + } + + result.setRelevantdate( + dates + .filter(d => d.date.isDefined && d.dateType.isDefined) + .map(d => (extract_date(d.date.get), d.dateType.get)) + .filter(d => d._1.isDefined) + .map(d => + ( + d._1.get, + 
vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase()) + ) + ) + .filter(d => d._2 != null) + .map(d => generateOAFDate(d._1, d._2)) + .asJava + ) + + val subjects = (json \\ "subjects").extract[List[SubjectType]] + + result.setSubject( + subjects + .filter(s => s.subject.nonEmpty) + .map(s => + OafMapperUtils.subject( + s.subject.get, + SUBJ_CLASS, + SUBJ_CLASS, + ModelConstants.DNET_SUBJECT_TYPOLOGIES, + ModelConstants.DNET_SUBJECT_TYPOLOGIES, + null + ) + ) + .asJava + ) + + result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) + + val descriptions = (json \\ "descriptions").extract[List[DescriptionType]] + + result.setDescription( + descriptions + .filter(d => d.description.isDefined) + .map(d => OafMapperUtils.field(d.description.get, null)) + .filter(s => s != null) + .asJava + ) + + val publisher = (json \\ "publisher").extractOrElse[String](null) + if (publisher != null) + result.setPublisher(OafMapperUtils.field(publisher, null)) + + val language: String = (json \\ "language").extractOrElse[String](null) + + if (language != null) + result.setLanguage( + vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language) + ) + + val instance = result.getInstance().get(0) + + val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String] + + val accessRights: List[String] = for { + JObject(rightsList) <- json \\ "rightsList" + JField("rightsUri", JString(rightsUri)) <- rightsList + } yield rightsUri + + val aRights: Option[AccessRight] = accessRights + .map(r => { + vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r) + }) + .find(q => q != null) + .map(q => { + val a = new AccessRight + a.setClassid(q.getClassid) + a.setClassname(q.getClassname) + a.setSchemeid(q.getSchemeid) + a.setSchemename(q.getSchemename) + a + }) + + val access_rights_qualifier = + if (aRights.isDefined) aRights.get + else + OafMapperUtils.accessRight( + ModelConstants.UNKNOWN, + ModelConstants.NOT_AVAILABLE, + ModelConstants.DNET_ACCESS_MODES, + ModelConstants.DNET_ACCESS_MODES + ) + + if (client.isDefined) { + + val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository) + instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name)) + + instance.setCollectedfrom(DATACITE_COLLECTED_FROM) + instance.setUrl(List(s"https://dx.doi.org/$doi").asJava) + instance.setAccessright(access_rights_qualifier) + instance.setPid(result.getPid) + val license = accessRights + .find(r => + r.startsWith("http") && r.matches( + ".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*" + ) + ) + if (license.isDefined) + instance.setLicense(OafMapperUtils.field(license.get, null)) + } + + val awardUris: List[String] = for { + JObject(fundingReferences) <- json \\ "fundingReferences" + JField("awardUri", JString(awardUri)) <- fundingReferences + } yield awardUri + + val oid = result.getId + result.setId(IdentifierFactory.createIdentifier(result)) + if (!result.getId.equalsIgnoreCase(oid)) { + result.setOriginalId((oid :: List(doi)).asJava) + } + + var relations: List[Relation] = + awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null) + + fix_figshare(result) + + if (result.getId == null) + return List() + + if (exportLinks) { + val rels: List[RelatedIdentifierType] = for { + JObject(relIdentifier) <- json \\ "relatedIdentifiers" + JField("relationType", JString(relationType)) <- relIdentifier + 
JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier + JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier + } yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType) + + relations = relations ::: generateRelations( + rels, + result.getId, + if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null + ) + } + if (relations != null && relations.nonEmpty) { + List(result) ::: relations + } else + List(result) + } + + private def generateRelations( + rels: List[RelatedIdentifierType], + id: String, + date: String + ): List[Relation] = { + rels + .filter(r => + subRelTypeMapping + .contains(r.relationType) && (r.relatedIdentifierType.equalsIgnoreCase("doi") || + r.relatedIdentifierType.equalsIgnoreCase("pmid") || + r.relatedIdentifierType.equalsIgnoreCase("arxiv")) + ) + .map(r => { + val rel = new Relation + rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) + rel.setDataInfo(dataInfo) + + val subRelType = subRelTypeMapping(r.relationType).relType + rel.setRelType(REL_TYPE_VALUE) + rel.setSubRelType(subRelType) + rel.setRelClass(r.relationType) + + val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date) + + rel.setProperties(List(dateProps).asJava) + + rel.setSource(id) + rel.setTarget( + DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType) + ) + rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) + rel.getCollectedfrom.asScala.map(c => c.getValue).toList + rel + }) + } + + def generateDSId(input: String): String = { + val b = StringUtils.substringBefore(input, "::") + val a = StringUtils.substringAfter(input, "::") + s"10|$b::${DHPUtils.md5(a)}" + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala new file mode 100644 index 000000000..046290969 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/GenerateDataciteDatasetSpark.scala @@ -0,0 +1,110 @@ +package eu.dnetlib.dhp.datacite + +import com.fasterxml.jackson.databind.ObjectMapper +import eu.dnetlib.dhp.application.AbstractScalaApplication +import eu.dnetlib.dhp.collection.CollectionUtils +import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH} +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup +import eu.dnetlib.dhp.schema.mdstore.{MDStoreVersion, MetadataRecord} +import eu.dnetlib.dhp.schema.oaf.Oaf +import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile +import eu.dnetlib.dhp.utils.ISLookupClientFactory +import org.apache.spark.sql.{Encoder, Encoders, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +class GenerateDataciteDatasetSpark(propertyPath: String, args: Array[String], log: Logger) + extends AbstractScalaApplication(propertyPath, args, log: Logger) { + + /** Here all the spark applications runs this method + * where the whole logic of the spark node is defined + */ + override def run(): Unit = { + + val sourcePath = parser.get("sourcePath") + log.info(s"SourcePath is '$sourcePath'") + val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks")) + log.info(s"exportLinks is '$exportLinks'") + val isLookupUrl: String = parser.get("isLookupUrl") + log.info("isLookupUrl: {}", isLookupUrl) + + val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl) + val vocabularies = 
VocabularyGroup.loadVocsFromIS(isLookupService)
+ require(vocabularies != null)
+
+ val mdstoreOutputVersion = parser.get("mdstoreOutputVersion")
+ log.info(s"mdstoreOutputVersion is '$mdstoreOutputVersion'")
+
+ val mapper = new ObjectMapper()
+ val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion])
+ val outputBasePath = cleanedMdStoreVersion.getHdfsPath
+ log.info(s"outputBasePath is '$outputBasePath'")
+ val targetPath = s"$outputBasePath$MDSTORE_DATA_PATH"
+ log.info(s"targetPath is '$targetPath'")
+
+ generateDataciteDataset(sourcePath, exportLinks, vocabularies, targetPath, spark)
+
+ reportTotalSize(targetPath, outputBasePath)
+ }
+
+ /** When working with an MDStore we need to store the size of
+ * the current dataset in a file on HDFS
+ * @param targetPath the path of the generated dataset
+ * @param outputBasePath the MDStore base path where the size file is written
+ */
+ def reportTotalSize(targetPath: String, outputBasePath: String): Unit = {
+ val total_items = spark.read.text(targetPath).count()
+ writeHdfsFile(
+ spark.sparkContext.hadoopConfiguration,
+ s"$total_items",
+ outputBasePath + MDSTORE_SIZE_PATH
+ )
+ }
+
+ /** Generate the transformed and cleaned OAF Dataset from the native one
+ *
+ * @param sourcePath sourcePath of the native Dataset in format JSON/Datacite
+ * @param exportLinks If true it generates unresolved links
+ * @param vocabularies vocabularies for cleaning
+ * @param targetPath the targetPath of the result Dataset
+ */
+ def generateDataciteDataset(
+ sourcePath: String,
+ exportLinks: Boolean,
+ vocabularies: VocabularyGroup,
+ targetPath: String,
+ spark: SparkSession
+ ): Unit = {
+ require(spark != null)
+ import spark.implicits._
+
+ implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
+
+ implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
+ CollectionUtils.saveDataset(
+ spark.read
+ .load(sourcePath)
+ .as[DataciteType]
+ .filter(d => d.isActive)
+ .flatMap(d =>
+ DataciteToOAFTransformation
+ .generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks)
+ )
+ .filter(d => d != null),
+ targetPath
+ )
+ }
+
+}
+
+object GenerateDataciteDatasetSpark {
+
+ val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
+
+ def main(args: Array[String]): Unit = {
+ new GenerateDataciteDatasetSpark(
+ "/eu/dnetlib/dhp/datacite/generate_dataset_params.json",
+ args,
+ log
+ ).initialize().run()
+ }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/ImportDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/ImportDatacite.scala new file mode 100644 index 000000000..6e2cc798c --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/ImportDatacite.scala @@ -0,0 +1,211 @@
+package eu.dnetlib.dhp.datacite
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path}
+import org.apache.hadoop.hdfs.DistributedFileSystem
+import org.apache.hadoop.io.{IntWritable, SequenceFile, Text}
+import org.apache.spark.SparkContext
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.expressions.Aggregator
+import org.apache.spark.sql.functions.max
+import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession}
+import org.json4s.DefaultFormats
+import org.json4s.jackson.JsonMethods.parse
+import org.slf4j.{Logger, LoggerFactory}
+
+import java.time.format.DateTimeFormatter.ISO_DATE_TIME
+import java.time.{LocalDateTime, ZoneOffset}
+import scala.io.Source
+
+object ImportDatacite {
+
+ val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)
+
+ def convertAPIStringToDataciteItem(input: String): DataciteType = {
+ implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
+ lazy val json: org.json4s.JValue = parse(input)
+ val doi = (json \ "attributes" \ "doi").extract[String].toLowerCase
+
+ val isActive = (json \ "attributes" \ "isActive").extract[Boolean]
+
+ val timestamp_string = (json \ "attributes" \ "updated").extract[String]
+ val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME)
+ DataciteType(
+ doi = doi,
+ timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000,
+ isActive = isActive,
+ json = input
+ )
+
+ }
+
+ def main(args: Array[String]): Unit = {
+
+ val parser = new ArgumentApplicationParser(
+ Source
+ .fromInputStream(
+ getClass.getResourceAsStream(
+ "/eu/dnetlib/dhp/datacite/import_from_api.json"
+ )
+ )
+ .mkString
+ )
+ parser.parseArgument(args)
+ val master = parser.get("master")
+
+ val hdfsuri = parser.get("namenode")
+ log.info(s"namenode is $hdfsuri")
+
+ val targetPath = parser.get("targetPath")
+ log.info(s"targetPath is $targetPath")
+
+ val dataciteDump = parser.get("dataciteDumpPath")
+ log.info(s"dataciteDump is $dataciteDump")
+
+ val hdfsTargetPath = new Path(targetPath)
+ log.info(s"hdfsTargetPath is $hdfsTargetPath")
+
+ val bs = if (parser.get("blocksize") == null) 100 else parser.get("blocksize").toInt
+
+ val skipImport = parser.get("skipImport")
+ log.info(s"skipImport is $skipImport")
+
+ val spark: SparkSession = SparkSession
+ .builder()
+ .appName(ImportDatacite.getClass.getSimpleName)
+ .master(master)
+ .getOrCreate()
+
+ // ====== Init HDFS File System Object
+ val conf = new Configuration
+ // Set FileSystem URI
+ conf.set("fs.defaultFS", hdfsuri)
+
+ // Set the FileSystem implementations explicitly: the Maven-assembled fat jar may lose their service definitions
+ conf.set("fs.hdfs.impl", classOf[DistributedFileSystem].getName)
+ conf.set("fs.file.impl", classOf[LocalFileSystem].getName)
+ val sc: SparkContext = spark.sparkContext
+ sc.setLogLevel("ERROR")
+
+ import spark.implicits._
+
+ val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] =
+ new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
+
+ override def zero: DataciteType = null
+
+ override def reduce(a: DataciteType, b: DataciteType): DataciteType = {
+ if (b == null)
+ return a
+ if (a == null)
+ return b
+ if (a.timestamp > b.timestamp) {
+ return a
+ }
+ b
+ }
+
+ override def merge(a: DataciteType, b: DataciteType): DataciteType = {
+ reduce(a, b)
+ }
+
+ override def bufferEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
+
+ override def outputEncoder: Encoder[DataciteType] = implicitly[Encoder[DataciteType]]
+
+ override def finish(reduction: DataciteType): DataciteType = reduction
+ }
+
+ val dump: Dataset[DataciteType] = spark.read.load(dataciteDump).as[DataciteType]
+ val ts = dump.select(max("timestamp")).first().getLong(0)
+
+ println(s"last Timestamp is $ts")
+
+ val cnt =
+ if ("true".equalsIgnoreCase(skipImport)) 1
+ else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
+
+ println(s"Imported from Datacite API $cnt documents")
+
+ if (cnt > 0) {
+
+ val inputRdd: RDD[DataciteType] = sc
+ .sequenceFile(targetPath, classOf[Int], classOf[Text])
+ .map(s => s._2.toString)
+ .map(s => convertAPIStringToDataciteItem(s))
+ spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")
+
+ val ds: Dataset[DataciteType] = 
spark.read.load(s"${targetPath}_dataset").as[DataciteType] + + dump + .union(ds) + .groupByKey(_.doi) + .agg(dataciteAggregator.toColumn) + .map(s => s._2) + .repartition(4000) + .write + .mode(SaveMode.Overwrite) + .save(s"${dataciteDump}_updated") + + val fs = FileSystem.get(sc.hadoopConfiguration) + fs.delete(new Path(s"$dataciteDump"), true) + fs.rename(new Path(s"${dataciteDump}_updated"), new Path(s"$dataciteDump")) + } + } + + private def writeSequenceFile( + hdfsTargetPath: Path, + timestamp: Long, + conf: Configuration, + bs: Int + ): Long = { + var from: Long = timestamp * 1000 + val delta: Long = 100000000L + var client: DataciteAPIImporter = null + val now: Long = System.currentTimeMillis() + var i = 0 + try { + val writer = SequenceFile.createWriter( + conf, + SequenceFile.Writer.file(hdfsTargetPath), + SequenceFile.Writer.keyClass(classOf[IntWritable]), + SequenceFile.Writer.valueClass(classOf[Text]) + ) + try { + var start: Long = System.currentTimeMillis + while (from < now) { + client = new DataciteAPIImporter(from, bs, from + delta) + var end: Long = 0 + val key: IntWritable = new IntWritable(i) + val value: Text = new Text + while (client.hasNext) { + key.set { + i += 1; + i - 1 + } + value.set(client.next()) + writer.append(key, value) + writer.hflush() + if (i % 1000 == 0) { + end = System.currentTimeMillis + val time = (end - start) / 1000.0f + println(s"Imported $i in $time seconds") + start = System.currentTimeMillis + } + } + println(s"updating from value: $from -> ${from + delta}") + from = from + delta + } + } catch { + case e: Throwable => + println("Error", e) + } finally if (writer != null) writer.close() + } catch { + case e: Throwable => + log.error("Error", e) + } + i + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala new file mode 100644 index 000000000..3e61edf02 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/SparkDownloadUpdateDatacite.scala @@ -0,0 +1,61 @@ +package eu.dnetlib.dhp.datacite + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.schema.oaf.{Oaf, Result} +import org.apache.spark.SparkConf +import org.apache.spark.sql.functions.max +import org.apache.spark.sql.{Encoder, Encoders, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +import java.text.SimpleDateFormat +import java.util.Locale +import scala.io.Source + +object SparkDownloadUpdateDatacite { + val log: Logger = LoggerFactory.getLogger(getClass) + + def main(args: Array[String]): Unit = { + + val conf = new SparkConf + val parser = new ArgumentApplicationParser( + Source + .fromInputStream( + getClass.getResourceAsStream("/eu/dnetlib/dhp/datacite/generate_dataset_params.json") + ) + .mkString + ) + parser.parseArgument(args) + val master = parser.get("master") + val sourcePath = parser.get("sourcePath") + val workingPath = parser.get("workingPath") + + val hdfsuri = parser.get("namenode") + log.info(s"namenode is $hdfsuri") + + val spark: SparkSession = SparkSession + .builder() + .config(conf) + .appName(getClass.getSimpleName) + .master(master) + .getOrCreate() + + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val resEncoder: Encoder[Result] = Encoders.kryo[Result] + + import spark.implicits._ + + val maxDate: String = spark.read + .load(workingPath) + .as[Oaf] + .filter(s => s.isInstanceOf[Result]) + 
.map(r => r.asInstanceOf[Result].getDateofcollection) + .select(max("value")) + .first() + .getString(0) + val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US) + val string_to_date = ISO8601FORMAT.parse(maxDate) + val ts = string_to_date.getTime + + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala new file mode 100644 index 000000000..670323598 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala @@ -0,0 +1,597 @@ +package eu.dnetlib.dhp.sx.bio + +import eu.dnetlib.dhp.schema.common.ModelConstants +import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, OafMapperUtils} +import eu.dnetlib.dhp.schema.oaf._ +import org.json4s.DefaultFormats +import org.json4s.JsonAST.{JField, JObject, JString} +import org.json4s.jackson.JsonMethods.{compact, parse, render} +import collection.JavaConverters._ + +object BioDBToOAF { + + case class EBILinkItem(id: Long, links: String) {} + + case class EBILinks( + relType: String, + date: String, + title: String, + pmid: String, + targetPid: String, + targetPidType: String, + targetUrl: String + ) {} + + case class UniprotDate(date: String, date_info: String) {} + + case class ScholixResolved( + pid: String, + pidType: String, + typology: String, + tilte: List[String], + datasource: List[String], + date: List[String], + authors: List[String] + ) {} + + val DATA_INFO: DataInfo = OafMapperUtils.dataInfo( + false, + null, + false, + false, + ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, + "0.9" + ) + val SUBJ_CLASS = "Keywords" + + val DATE_RELATION_KEY = "RelationDate" + + val resolvedURL: Map[String, String] = Map( + "genbank" -> "https://www.ncbi.nlm.nih.gov/nuccore/", + "ncbi-n" -> "https://www.ncbi.nlm.nih.gov/nuccore/", + "ncbi-wgs" -> "https://www.ncbi.nlm.nih.gov/nuccore/", + "ncbi-p" -> "https://www.ncbi.nlm.nih.gov/protein/", + "ena" -> "https://www.ebi.ac.uk/ena/browser/view/", + "clinicaltrials.gov" -> "https://clinicaltrials.gov/ct2/show/", + "onim" -> "https://omim.org/entry/", + "refseq" -> "https://www.ncbi.nlm.nih.gov/nuccore/", + "geo" -> "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" + ) + + val collectedFromMap: Map[String, KeyValue] = { + val PDBCollectedFrom: KeyValue = OafMapperUtils.keyValue( + "10|opendoar____::d1c373ab1570cfb9a7dbb53c186b37a2", + "Protein Data Bank" + ) + val enaCollectedFrom: KeyValue = OafMapperUtils.keyValue( + "10|re3data_____::c2a591f440598b63d854556beaf01591", + "European Nucleotide Archive" + ) + val ncbiCollectedFrom: KeyValue = OafMapperUtils.keyValue( + "10|re3data_____::7d4f90870fe1e493232c9e86c43ae6f6", + "NCBI Nucleotide" + ) + val UNIPROTCollectedFrom: KeyValue = OafMapperUtils.keyValue( + "10|re3data_____::296e1abaf1302897a6838d3588cd0310", + "UniProtKB/Swiss-Prot" + ) + val ElsevierCollectedFrom: KeyValue = + OafMapperUtils.keyValue("10|openaire____::8f87e10869299a5fe80b315695296b88", "Elsevier") + val springerNatureCollectedFrom: KeyValue = OafMapperUtils.keyValue( + "10|openaire____::6e380d9cf51138baec8480f5a0ce3a2e", + "Springer Nature" + ) + val EBICollectedFrom: KeyValue = OafMapperUtils.keyValue( + "10|opendoar____::83e60e09c222f206c725385f53d7e567c", + "EMBL-EBIs Protein Data Bank in Europe (PDBe)" + ) + val pubmedCollectedFrom: KeyValue = + OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central") + + 
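+ // Every collectedFrom KeyValue below carries the same provenance DataInfo,
+ // so records and relations derived from these sources share one trust level ("0.9")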
UNIPROTCollectedFrom.setDataInfo(DATA_INFO) + PDBCollectedFrom.setDataInfo(DATA_INFO) + ElsevierCollectedFrom.setDataInfo(DATA_INFO) + EBICollectedFrom.setDataInfo(DATA_INFO) + pubmedCollectedFrom.setDataInfo(DATA_INFO) + enaCollectedFrom.setDataInfo(DATA_INFO) + ncbiCollectedFrom.setDataInfo(DATA_INFO) + springerNatureCollectedFrom.setDataInfo(DATA_INFO) + + Map( + "uniprot" -> UNIPROTCollectedFrom, + "pdb" -> PDBCollectedFrom, + "elsevier" -> ElsevierCollectedFrom, + "ebi" -> EBICollectedFrom, + "Springer Nature" -> springerNatureCollectedFrom, + "NCBI Nucleotide" -> ncbiCollectedFrom, + "European Nucleotide Archive" -> enaCollectedFrom, + "Europe PMC" -> pubmedCollectedFrom + ) + } + + def crossrefLinksToOaf(input: String): Oaf = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json = parse(input) + val source_pid = (json \ "Source" \ "Identifier" \ "ID").extract[String].toLowerCase + val source_pid_type = (json \ "Source" \ "Identifier" \ "IDScheme").extract[String].toLowerCase + + val target_pid = (json \ "Target" \ "Identifier" \ "ID").extract[String].toLowerCase + val target_pid_type = (json \ "Target" \ "Identifier" \ "IDScheme").extract[String].toLowerCase + + val relation_semantic = (json \ "RelationshipType" \ "Name").extract[String] + + val date = GraphCleaningFunctions.cleanDate((json \ "LinkedPublicationDate").extract[String]) + + createRelation( + target_pid, + target_pid_type, + generate_unresolved_id(source_pid, source_pid_type), + collectedFromMap("elsevier"), + "relationship", + relation_semantic, + date + ) + + } + + def scholixResolvedToOAF(input: ScholixResolved): Oaf = { + + val d = new Dataset + + d.setPid( + List( + OafMapperUtils.structuredProperty( + input.pid.toLowerCase, + input.pidType.toLowerCase, + input.pidType.toLowerCase, + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES, + DATA_INFO + ) + ).asJava + ) + + d.setDataInfo(DATA_INFO) + + val nsPrefix = input.pidType.toLowerCase.padTo(12, '_') + d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.pid.toLowerCase}", true)) + + if (input.tilte != null && input.tilte.nonEmpty) + d.setTitle( + List( + OafMapperUtils.structuredProperty( + input.tilte.head, + ModelConstants.MAIN_TITLE_QUALIFIER, + DATA_INFO + ) + ).asJava + ) + + d.setOriginalId(List(input.pid).asJava) + val i = new Instance + + i.setPid(d.getPid) + + if (resolvedURL.contains(input.pidType)) { + i.setUrl(List(s"${resolvedURL(input.pidType)}${input.pid}").asJava) + } + + if (input.pidType.equalsIgnoreCase("clinicaltrials.gov")) + i.setInstancetype( + OafMapperUtils.qualifier( + "0037", + "Clinical Trial", + ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE + ) + ) + else + i.setInstancetype( + OafMapperUtils.qualifier( + "0046", + "Bioentity", + ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE + ) + ) + + if (input.datasource == null || input.datasource.isEmpty) + return null + + val ds = input.datasource.head + d.setCollectedfrom(List(collectedFromMap(ds)).asJava) + i.setCollectedfrom(collectedFromMap(ds)) + d.setInstance(List(i).asJava) + + if (input.authors != null && input.authors.nonEmpty) { + val authors = input.authors.map(a => { + val authorOAF = new Author + authorOAF.setFullname(a) + authorOAF + }) + d.setAuthor(authors.asJava) + } + if (input.date != null && input.date.nonEmpty) { + val dt = input.date.head + i.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO)) 
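+ // the cleaned date is applied both at instance and at result level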
+ d.setDateofacceptance(OafMapperUtils.field(GraphCleaningFunctions.cleanDate(dt), DATA_INFO)) + } + d + } + + def uniprotToOAF(input: String): List[Oaf] = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json = parse(input) + val pid = (json \ "pid").extract[String] + + val d = new Dataset + + d.setPid( + List( + OafMapperUtils.structuredProperty( + pid, + "uniprot", + "uniprot", + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES, + DATA_INFO + ) + ).asJava + ) + + d.setDataInfo(DATA_INFO) + d.setId(OafMapperUtils.createOpenaireId(50, s"uniprot_____::$pid", true)) + d.setCollectedfrom(List(collectedFromMap("uniprot")).asJava) + + val title: String = (json \ "title").extractOrElse[String](null) + + if (title != null) + d.setTitle( + List( + OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO) + ).asJava + ) + + d.setOriginalId(List(pid).asJava) + val i = new Instance + + i.setPid(d.getPid) + i.setUrl(List(s"https://www.uniprot.org/uniprot/$pid").asJava) + i.setInstancetype( + OafMapperUtils.qualifier( + "0046", + "Bioentity", + ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE + ) + ) + + i.setCollectedfrom(collectedFromMap("uniprot")) + d.setInstance(List(i).asJava) + + val dates: List[UniprotDate] = for { + JObject(dateOBJ) <- json \ "dates" + JField("date", JString(date)) <- dateOBJ + JField("date_info", JString(date_info)) <- dateOBJ + } yield UniprotDate(GraphCleaningFunctions.cleanDate(date), date_info) + + val subjects: List[String] = (json \\ "subjects").extractOrElse[List[String]](null) + + if (subjects != null) { + d.setSubject( + subjects + .map(s => + OafMapperUtils.subject( + s, + SUBJ_CLASS, + SUBJ_CLASS, + ModelConstants.DNET_SUBJECT_TYPOLOGIES, + ModelConstants.DNET_SUBJECT_TYPOLOGIES, + null + ) + ) + .asJava + ) + } + var i_date: Option[UniprotDate] = None + + if (dates.nonEmpty) { + i_date = dates.find(d => d.date_info.contains("entry version")) + if (i_date.isDefined) { + i.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO)) + d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO)) + } + val relevant_dates: List[StructuredProperty] = dates + .filter(d => !d.date_info.contains("entry version")) + .map(date => + OafMapperUtils.structuredProperty( + date.date, + ModelConstants.UNKNOWN, + ModelConstants.UNKNOWN, + ModelConstants.DNET_DATACITE_DATE, + ModelConstants.DNET_DATACITE_DATE, + DATA_INFO + ) + ) + if (relevant_dates != null && relevant_dates.nonEmpty) + d.setRelevantdate(relevant_dates.asJava) + d.setDateofacceptance(OafMapperUtils.field(i_date.get.date, DATA_INFO)) + } + + val references_pmid: List[String] = for { + JObject(reference) <- json \ "references" + JField("PubMed", JString(pid)) <- reference + } yield pid + + val references_doi: List[String] = for { + JObject(reference) <- json \ "references" + JField(" DOI", JString(pid)) <- reference + } yield pid + + if (references_pmid != null && references_pmid.nonEmpty) { + val rel = createRelation( + references_pmid.head, + "pmid", + d.getId, + collectedFromMap("uniprot"), + ModelConstants.RELATIONSHIP, + ModelConstants.IS_RELATED_TO, + if (i_date.isDefined) i_date.get.date else null + ) + rel.getCollectedfrom + List(d, rel) + } else if (references_doi != null && references_doi.nonEmpty) { + val rel = createRelation( + references_doi.head, + "doi", + d.getId, + collectedFromMap("uniprot"), + ModelConstants.RELATIONSHIP, + ModelConstants.IS_RELATED_TO, + if 
(i_date.isDefined) i_date.get.date else null + ) + List(d, rel) + } else + List(d) + } + + def generate_unresolved_id(pid: String, pidType: String): String = { + s"unresolved::$pid::$pidType" + } + + def createRelation( + pid: String, + pidType: String, + sourceId: String, + collectedFrom: KeyValue, + subRelType: String, + relClass: String, + date: String + ): Relation = { + + val rel = new Relation + rel.setCollectedfrom(List(collectedFromMap("pdb")).asJava) + rel.setDataInfo(DATA_INFO) + + rel.setRelType(ModelConstants.RESULT_RESULT) + rel.setSubRelType(subRelType) + rel.setRelClass(relClass) + + rel.setSource(sourceId) + rel.setTarget(s"unresolved::$pid::$pidType") + + val dateProps: KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date) + + rel.setProperties(List(dateProps).asJava) + + rel.getTarget.startsWith("unresolved") + rel.setCollectedfrom(List(collectedFrom).asJava) + rel + + } + + def createSupplementaryRelation( + pid: String, + pidType: String, + sourceId: String, + collectedFrom: KeyValue, + date: String + ): Relation = { + createRelation( + pid, + pidType, + sourceId, + collectedFrom, + ModelConstants.SUPPLEMENT, + ModelConstants.IS_SUPPLEMENT_TO, + date + ) + } + + def pdbTOOaf(input: String): List[Oaf] = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json = parse(input) + val pdb = (json \ "pdb").extract[String].toLowerCase + + if (pdb.isEmpty) + return List() + + val d = new Dataset + + d.setPid( + List( + OafMapperUtils.structuredProperty( + pdb, + "pdb", + "Protein Data Bank Identifier", + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES, + DATA_INFO + ) + ).asJava + ) + + d.setCollectedfrom(List(collectedFromMap("pdb")).asJava) + d.setDataInfo(DATA_INFO) + d.setId(OafMapperUtils.createOpenaireId(50, s"pdb_________::$pdb", true)) + d.setOriginalId(List(pdb).asJava) + + val title = (json \ "title").extractOrElse[String](null) + + if (title == null) + return List() + d.setTitle( + List( + OafMapperUtils.structuredProperty(title, ModelConstants.MAIN_TITLE_QUALIFIER, DATA_INFO) + ).asJava + ) + + val authors: List[String] = (json \ "authors").extractOrElse[List[String]](null) + + if (authors != null) { + val convertedAuthors = authors.zipWithIndex.map { a => + val res = new Author + res.setFullname(a._1) + res.setRank(a._2 + 1) + res + } + + d.setAuthor(convertedAuthors.asJava) + } + + val i = new Instance + + i.setPid(d.getPid) + i.setUrl(List(s"https://www.rcsb.org/structure/$pdb").asJava) + i.setInstancetype( + OafMapperUtils.qualifier( + "0046", + "Bioentity", + ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE + ) + ) + + i.setCollectedfrom(collectedFromMap("pdb")) + d.setInstance(List(i).asJava) + + val pmid = (json \ "pmid").extractOrElse[String](null) + + if (pmid != null) + List(d, createSupplementaryRelation(pmid, "pmid", d.getId, collectedFromMap("pdb"), null)) + else + List(d) + } + + def extractEBILinksFromDump(input: String): EBILinkItem = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json = parse(input) + + val pmid = (json \ "publication" \ "pmid").extract[String] + val links = (json \ "links").extract[JObject] + EBILinkItem(pmid.toLong, compact(render(links))) + } + + def EBITargetLinksFilter(input: EBILinks): Boolean = { + + input.targetPidType.equalsIgnoreCase("ena") || input.targetPidType.equalsIgnoreCase( + "pdb" + ) || input.targetPidType.equalsIgnoreCase("uniprot") + + } + + def parse_ebi_links(input: String): 
List[EBILinks] = { + implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats + lazy val json = parse(input) + val pmid = (json \ "request" \ "id").extract[String] + for { + JObject(link) <- json \\ "Link" + JField("Target", JObject(target)) <- link + JField("RelationshipType", JObject(relType)) <- link + JField("Name", JString(relation)) <- relType + JField("PublicationDate", JString(publicationDate)) <- link + JField("Title", JString(title)) <- target + JField("Identifier", JObject(identifier)) <- target + JField("IDScheme", JString(idScheme)) <- identifier + JField("IDURL", JString(idUrl)) <- identifier + JField("ID", JString(id)) <- identifier + + } yield EBILinks( + relation, + GraphCleaningFunctions.cleanDate(publicationDate), + title, + pmid, + id, + idScheme, + idUrl + ) + } + + def convertEBILinksToOaf(input: EBILinks): List[Oaf] = { + val d = new Dataset + d.setCollectedfrom(List(collectedFromMap("ebi")).asJava) + d.setDataInfo(DATA_INFO) + d.setTitle( + List( + OafMapperUtils.structuredProperty( + input.title, + ModelConstants.MAIN_TITLE_QUALIFIER, + DATA_INFO + ) + ).asJava + ) + + val nsPrefix = input.targetPidType.toLowerCase.padTo(12, '_') + + d.setId(OafMapperUtils.createOpenaireId(50, s"$nsPrefix::${input.targetPid.toLowerCase}", true)) + d.setOriginalId(List(input.targetPid.toLowerCase).asJava) + + d.setPid( + List( + OafMapperUtils.structuredProperty( + input.targetPid.toLowerCase, + input.targetPidType.toLowerCase, + "Protein Data Bank Identifier", + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES, + DATA_INFO + ) + ).asJava + ) + + val i = new Instance + + i.setPid(d.getPid) + i.setUrl(List(input.targetUrl).asJava) + i.setInstancetype( + OafMapperUtils.qualifier( + "0046", + "Bioentity", + ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE + ) + ) + + i.setCollectedfrom(collectedFromMap("ebi")) + d.setInstance(List(i).asJava) + i.setDateofacceptance( + OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO) + ) + d.setDateofacceptance( + OafMapperUtils.field(GraphCleaningFunctions.cleanDate(input.date), DATA_INFO) + ) + + List( + d, + createRelation( + input.pmid, + "pmid", + d.getId, + collectedFromMap("ebi"), + ModelConstants.RELATIONSHIP, + ModelConstants.IS_RELATED_TO, + GraphCleaningFunctions.cleanDate(input.date) + ) + ) + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala new file mode 100644 index 000000000..96075b4f3 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/SparkTransformBioDatabaseToOAF.scala @@ -0,0 +1,66 @@ +package eu.dnetlib.dhp.sx.bio + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.collection.CollectionUtils +import eu.dnetlib.dhp.schema.oaf.Oaf +import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved +import org.apache.commons.io.IOUtils +import org.apache.spark.SparkConf +import org.apache.spark.sql.{Encoder, Encoders, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +object SparkTransformBioDatabaseToOAF { + + def main(args: Array[String]): Unit = { + val conf: SparkConf = new SparkConf() + val log: Logger = LoggerFactory.getLogger(getClass) + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/bio_to_oaf_params.json") + ) + ) + 
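+ // Expected arguments (a sketch, assuming the descriptor above and the standard flag syntax):
+ // --master yarn --database (UNIPROT | PDB | SCHOLIX | CROSSREF_LINKS) --dbPath <input> --targetPath <output>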
parser.parseArgument(args)
+ val database: String = parser.get("database")
+ log.info("database: {}", database)
+
+ val dbPath: String = parser.get("dbPath")
+ log.info("dbPath: {}", dbPath)
+ val targetPath: String = parser.get("targetPath")
+ log.info("targetPath: {}", targetPath)
+
+ val spark: SparkSession =
+ SparkSession
+ .builder()
+ .config(conf)
+ .appName(getClass.getSimpleName)
+ .master(parser.get("master"))
+ .getOrCreate()
+ val sc = spark.sparkContext
+
+ implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf])
+ import spark.implicits._
+ database.toUpperCase() match {
+ case "UNIPROT" =>
+ CollectionUtils.saveDataset(
+ spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.uniprotToOAF(i))),
+ targetPath
+ )
+ case "PDB" =>
+ CollectionUtils.saveDataset(
+ spark.createDataset(sc.textFile(dbPath).flatMap(i => BioDBToOAF.pdbTOOaf(i))),
+ targetPath
+ )
+ case "SCHOLIX" =>
+ CollectionUtils.saveDataset(
+ spark.read.load(dbPath).as[ScholixResolved].map(i => BioDBToOAF.scholixResolvedToOAF(i)),
+ targetPath
+ )
+ case "CROSSREF_LINKS" =>
+ CollectionUtils.saveDataset(
+ spark.createDataset(sc.textFile(dbPath).map(i => BioDBToOAF.crossrefLinksToOaf(i))),
+ targetPath
+ )
+ }
+ }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala new file mode 100644 index 000000000..9c55ec7be --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala @@ -0,0 +1,222 @@
+package eu.dnetlib.dhp.sx.bio.ebi
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser
+import eu.dnetlib.dhp.collection.CollectionUtils
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
+import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
+import eu.dnetlib.dhp.sx.bio.pubmed._
+import eu.dnetlib.dhp.utils.ISLookupClientFactory
+import org.apache.commons.io.IOUtils
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FSDataOutputStream, FileSystem, Path}
+import org.apache.http.client.config.RequestConfig
+import org.apache.http.client.methods.HttpGet
+import org.apache.http.impl.client.HttpClientBuilder
+import org.apache.spark.SparkConf
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.expressions.Aggregator
+import org.apache.spark.sql._
+import org.slf4j.{Logger, LoggerFactory}
+
+import java.io.InputStream
+import scala.io.Source
+import scala.xml.pull.XMLEventReader
+
+object SparkCreateBaselineDataFrame {
+
+ def requestBaseLineUpdatePage(maxFile: String): List[(String, String)] = {
+ val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/")
+
+ val result = data.lines
+ .filter(l => l.startsWith("<a href="))
+ .map { l =>
+ val end = l.lastIndexOf("\">")
+ val start = l.indexOf("<a href=\"")
+
+ if (start >= 0 && end > start)
+ l.substring(start + 9, end - start)
+ else
+ ""
+ }
+ .filter(s => s.endsWith(".gz"))
+ .filter(s => s > maxFile)
+ .map(s => (s, s"https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/$s"))
+ .toList
+
+ result
+ }
+
+ def downloadBaselinePart(url: String): InputStream = {
+ val r = new HttpGet(url)
+ val timeout = 60; // seconds
+ val config = RequestConfig
+ .custom()
+ .setConnectTimeout(timeout * 1000)
+ .setConnectionRequestTimeout(timeout * 1000)
+ .setSocketTimeout(timeout * 1000)
+ .build()
+ val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
+ val response = client.execute(r)
+ println(s"get response with status ${response.getStatusLine.getStatusCode}")
+ response.getEntity.getContent
+
+ }
+
+ def requestPage(url: String): String = {
+ val r = new HttpGet(url)
+ val timeout = 60; // seconds
+ val config = RequestConfig
+ .custom()
+ .setConnectTimeout(timeout * 1000)
+ .setConnectionRequestTimeout(timeout * 1000)
+ .setSocketTimeout(timeout * 1000)
+ .build()
+ val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build()
+ try {
+ var tries = 4
+ while (tries > 0) {
+ println(s"requesting ${r.getURI}")
+ try {
+ val response = client.execute(r)
+ println(s"get response with status ${response.getStatusLine.getStatusCode}")
+ if (response.getStatusLine.getStatusCode > 400) {
+ tries -= 1
+ } else
+ return IOUtils.toString(response.getEntity.getContent)
+ } catch {
+ case e: Throwable =>
+ println(s"Error on requesting ${r.getURI}")
+ e.printStackTrace()
+ tries -= 1
+ }
+ }
+ ""
+ } finally {
+ if (client != null)
+ client.close()
+ }
+ }
+
+ def downloadBaseLineUpdate(baselinePath: String, hdfsServerUri: String): Unit = {
+
+ val conf = new Configuration
+ conf.set("fs.defaultFS", hdfsServerUri)
+ val fs = FileSystem.get(conf)
+ val p = new Path(baselinePath)
+ val files = fs.listFiles(p, false)
+ var max_file = ""
+ while (files.hasNext) {
+ val c = files.next()
+ val data = c.getPath.toString
+ val fileName = data.substring(data.lastIndexOf("/") + 1)
+
+ if (fileName > max_file)
+ max_file = fileName
+ }
+
+ val files_to_download = requestBaseLineUpdatePage(max_file)
+
+ files_to_download.foreach { u =>
+ val hdfsWritePath: Path = new Path(s"$baselinePath/${u._1}")
+ val fsDataOutputStream: FSDataOutputStream = fs.create(hdfsWritePath, true)
+ val i = downloadBaselinePart(u._2)
+ IOUtils.copy(i, fsDataOutputStream)
+ println(s"Downloaded ${u._2} into $baselinePath/${u._1}")
+ fsDataOutputStream.close()
+ }
+
+ }
+
+ val pmArticleAggregator: Aggregator[(String, PMArticle), PMArticle, PMArticle] =
+ new Aggregator[(String, PMArticle), PMArticle, PMArticle] with Serializable {
+ override def zero: PMArticle = new PMArticle
+
+ override def reduce(b: PMArticle, a: (String, PMArticle)): PMArticle = {
+ if (b != null && b.getPmid != null) b else a._2
+ }
+
+ override def merge(b1: PMArticle, b2: PMArticle): PMArticle = {
+ if (b1 != null && b1.getPmid != null) b1 else b2
+
+ }
+
+ override def finish(reduction: PMArticle): PMArticle = reduction
+
+ override def bufferEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
+
+ override def outputEncoder: Encoder[PMArticle] = Encoders.kryo[PMArticle]
+ }
+
+ def main(args: Array[String]): Unit = {
+ val conf: SparkConf = new SparkConf()
+ val log: Logger = LoggerFactory.getLogger(getClass)
+ val parser = new ArgumentApplicationParser(
+ IOUtils.toString(
+ SparkEBILinksToOaf.getClass.getResourceAsStream(
"/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json" + ) + ) + ) + parser.parseArgument(args) + val isLookupUrl: String = parser.get("isLookupUrl") + log.info("isLookupUrl: {}", isLookupUrl) + val workingPath = parser.get("workingPath") + log.info("workingPath: {}", workingPath) + + val targetPath = parser.get("targetPath") + log.info("targetPath: {}", targetPath) + + val hdfsServerUri = parser.get("hdfsServerUri") + log.info("hdfsServerUri: {}", targetPath) + + val skipUpdate = parser.get("skipUpdate") + log.info("skipUpdate: {}", skipUpdate) + + val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl) + val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService) + val spark: SparkSession = + SparkSession + .builder() + .config(conf) + .appName(SparkEBILinksToOaf.getClass.getSimpleName) + .master(parser.get("master")) + .getOrCreate() + + val sc = spark.sparkContext + import spark.implicits._ + + implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle]) + implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal]) + implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor]) + implicit val resultEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) + + if (!"true".equalsIgnoreCase(skipUpdate)) { + downloadBaseLineUpdate(s"$workingPath/baseline", hdfsServerUri) + val k: RDD[(String, String)] = sc.wholeTextFiles(s"$workingPath/baseline", 2000) + val ds: Dataset[PMArticle] = spark.createDataset( + k.filter(i => i._1.endsWith(".gz")) + .flatMap(i => { + val xml = new XMLEventReader(Source.fromBytes(i._2.getBytes())) + new PMParser(xml) + }) + ) + ds.map(p => (p.getPmid, p))(Encoders.tuple(Encoders.STRING, PMEncoder)) + .groupByKey(_._1) + .agg(pmArticleAggregator.toColumn) + .map(p => p._2) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/baseline_dataset") + } + + val exported_dataset = spark.read.load(s"$workingPath/baseline_dataset").as[PMArticle] + CollectionUtils.saveDataset( + exported_dataset + .map(a => PubMedToOaf.convert(a, vocabularies)) + .as[Oaf] + .filter(p => p != null), + targetPath + ) + + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala new file mode 100644 index 000000000..44e9e22ea --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkDownloadEBILinks.scala @@ -0,0 +1,145 @@ +package eu.dnetlib.dhp.sx.bio.ebi + +import eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem +import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal} +import org.apache.commons.io.IOUtils +import org.apache.http.client.config.RequestConfig +import org.apache.http.client.methods.HttpGet +import org.apache.http.impl.client.HttpClientBuilder +import org.apache.spark.SparkConf +import org.apache.spark.sql.functions.max +import org.apache.spark.sql._ +import org.slf4j.{Logger, LoggerFactory} + +object SparkDownloadEBILinks { + + def createEBILinks(pmid: Long): EBILinkItem = { + + val res = requestLinks(pmid) + if (res != null) + return EBILinkItem(pmid, res) + null + } + + def requestPage(url: String): String = { + val r = new HttpGet(url) + val timeout = 60; // seconds + val config = RequestConfig + .custom() + .setConnectTimeout(timeout * 1000) + .setConnectionRequestTimeout(timeout * 1000) + .setSocketTimeout(timeout * 1000) + 
.build() + val client = HttpClientBuilder.create().setDefaultRequestConfig(config).build() + try { + var tries = 4 + while (tries > 0) { + println(s"requesting ${r.getURI}") + try { + val response = client.execute(r) + println(s"get response with status${response.getStatusLine.getStatusCode}") + if (response.getStatusLine.getStatusCode > 400) { + tries -= 1 + } else + return IOUtils.toString(response.getEntity.getContent) + } catch { + case e: Throwable => + println(s"Error on requesting ${r.getURI}") + e.printStackTrace() + tries -= 1 + } + } + "" + } finally { + if (client != null) + client.close() + } + } + + def requestLinks(PMID: Long): String = { + requestPage(s"https://www.ebi.ac.uk/europepmc/webservices/rest/MED/$PMID/datalinks?format=json") + + } + + def main(args: Array[String]): Unit = { + + val log: Logger = LoggerFactory.getLogger(getClass) + val MAX_ITEM_PER_PARTITION = 20000 + val conf: SparkConf = new SparkConf() + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_download_update.json") + ) + ) + parser.parseArgument(args) + val spark: SparkSession = + SparkSession + .builder() + .config(conf) + .appName(SparkEBILinksToOaf.getClass.getSimpleName) + .master(parser.get("master")) + .getOrCreate() + + import spark.implicits._ + + implicit val PMEncoder: Encoder[PMArticle] = Encoders.kryo(classOf[PMArticle]) + implicit val PMJEncoder: Encoder[PMJournal] = Encoders.kryo(classOf[PMJournal]) + implicit val PMAEncoder: Encoder[PMAuthor] = Encoders.kryo(classOf[PMAuthor]) + + val sourcePath = parser.get("sourcePath") + log.info(s"sourcePath -> $sourcePath") + val workingPath = parser.get("workingPath") + log.info(s"workingPath -> $workingPath") + + log.info("Getting max pubmedId where the links have already requested") + val links: Dataset[EBILinkItem] = + spark.read.load(s"$sourcePath/ebi_links_dataset").as[EBILinkItem] + val lastPMIDRequested = links.map(l => l.id).select(max("value")).first.getLong(0) + + log.info("Retrieving PMID to request links") + val pubmed = spark.read.load(s"$sourcePath/baseline_dataset").as[PMArticle] + pubmed + .map(p => p.getPmid.toLong) + .where(s"value > $lastPMIDRequested") + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/id_to_request") + + val pmidToReq: Dataset[Long] = spark.read.load(s"$workingPath/id_to_request").as[Long] + + val total = pmidToReq.count() + + spark + .createDataset( + pmidToReq.rdd + .repartition((total / MAX_ITEM_PER_PARTITION).toInt) + .map(pmid => createEBILinks(pmid)) + .filter(l => l != null) + ) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/links_update") + + val updates: Dataset[EBILinkItem] = + spark.read.load(s"$workingPath/links_update").as[EBILinkItem] + + links + .union(updates) + .groupByKey(_.id) + .reduceGroups { (x, y) => + if (x == null || x.links == null) + y + if (y == null || y.links == null) + x + if (x.links.length > y.links.length) + x + else + y + } + .map(_._2) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/links_final") + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala new file mode 100644 index 000000000..7cb6153ff --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkEBILinksToOaf.scala @@ -0,0 +1,52 @@ +package eu.dnetlib.dhp.sx.bio.ebi + +import 
eu.dnetlib.dhp.application.ArgumentApplicationParser +import eu.dnetlib.dhp.collection.CollectionUtils +import eu.dnetlib.dhp.schema.oaf.Oaf +import eu.dnetlib.dhp.sx.bio.BioDBToOAF +import eu.dnetlib.dhp.sx.bio.BioDBToOAF.EBILinkItem +import org.apache.commons.io.IOUtils +import org.apache.spark.SparkConf +import org.apache.spark.sql._ +import org.slf4j.{Logger, LoggerFactory} + +object SparkEBILinksToOaf { + + def main(args: Array[String]): Unit = { + val log: Logger = LoggerFactory.getLogger(getClass) + val conf: SparkConf = new SparkConf() + val parser = new ArgumentApplicationParser( + IOUtils.toString( + getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/bio/ebi/ebi_to_df_params.json") + ) + ) + parser.parseArgument(args) + val spark: SparkSession = + SparkSession + .builder() + .config(conf) + .appName(SparkEBILinksToOaf.getClass.getSimpleName) + .master(parser.get("master")) + .getOrCreate() + + import spark.implicits._ + val sourcePath = parser.get("sourcePath") + log.info(s"sourcePath -> $sourcePath") + val targetPath = parser.get("targetPath") + log.info(s"targetPath -> $targetPath") + implicit val PMEncoder: Encoder[Oaf] = Encoders.kryo(classOf[Oaf]) + + val ebLinks: Dataset[EBILinkItem] = spark.read + .load(sourcePath) + .as[EBILinkItem] + .filter(l => l.links != null && l.links.startsWith("{")) + + CollectionUtils.saveDataset( + ebLinks + .flatMap(j => BioDBToOAF.parse_ebi_links(j.links)) + .filter(p => BioDBToOAF.EBITargetLinksFilter(p)) + .flatMap(p => BioDBToOAF.convertEBILinksToOaf(p)), + targetPath + ) + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala new file mode 100644 index 000000000..9102c12c4 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser.scala @@ -0,0 +1,134 @@ +package eu.dnetlib.dhp.sx.bio.pubmed + +import scala.xml.MetaData +import scala.xml.pull.{EvElemEnd, EvElemStart, EvText, XMLEventReader} + +/** @param xml + */ +class PMParser(xml: XMLEventReader) extends Iterator[PMArticle] { + + var currentArticle: PMArticle = generateNextArticle() + + override def hasNext: Boolean = currentArticle != null + + override def next(): PMArticle = { + val tmp = currentArticle + currentArticle = generateNextArticle() + tmp + } + + def extractAttributes(attrs: MetaData, key: String): String = { + + val res = attrs.get(key) + if (res.isDefined) { + val s = res.get + if (s != null && s.nonEmpty) + s.head.text + else + null + } else null + } + + def validate_Date(year: String, month: String, day: String): String = { + try { + f"${year.toInt}-${month.toInt}%02d-${day.toInt}%02d" + + } catch { + case _: Throwable => null + } + } + + def generateNextArticle(): PMArticle = { + + var currentSubject: PMSubject = null + var currentAuthor: PMAuthor = null + var currentJournal: PMJournal = null + var currentGrant: PMGrant = null + var currNode: String = null + var currentYear = "0" + var currentMonth = "01" + var currentDay = "01" + var currentArticleType: String = null + + while (xml.hasNext) { + xml.next match { + case EvElemStart(_, label, attrs, _) => + currNode = label + + label match { + case "PubmedArticle" => currentArticle = new PMArticle + case "Author" => currentAuthor = new PMAuthor + case "Journal" => currentJournal = new PMJournal + case "Grant" => currentGrant = new PMGrant + case "PublicationType" | "DescriptorName" => + currentSubject = new PMSubject + 
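+ // the "UI" attribute of DescriptorName/PublicationType carries the MeSH identifier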
currentSubject.setMeshId(extractAttributes(attrs, "UI")) + case "ArticleId" => currentArticleType = extractAttributes(attrs, "IdType") + case _ => + } + case EvElemEnd(_, label) => + label match { + case "PubmedArticle" => return currentArticle + case "Author" => currentArticle.getAuthors.add(currentAuthor) + case "Journal" => currentArticle.setJournal(currentJournal) + case "Grant" => currentArticle.getGrants.add(currentGrant) + case "PubMedPubDate" => + if (currentArticle.getDate == null) + currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay)) + case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay") + case "DescriptorName" => currentArticle.getSubjects.add(currentSubject) + case "PublicationType" => currentArticle.getPublicationTypes.add(currentSubject) + case _ => + } + case EvText(text) => + if (currNode != null && text.trim.nonEmpty) + currNode match { + case "ArticleTitle" => { + if (currentArticle.getTitle == null) + currentArticle.setTitle(text.trim) + else + currentArticle.setTitle(currentArticle.getTitle + text.trim) + } + case "AbstractText" => { + if (currentArticle.getDescription == null) + currentArticle.setDescription(text.trim) + else + currentArticle.setDescription(currentArticle.getDescription + text.trim) + } + case "PMID" => currentArticle.setPmid(text.trim) + case "ArticleId" => + if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim) + if ("pmc".equalsIgnoreCase(currentArticleType)) currentArticle.setPmcId(text.trim) + case "Language" => currentArticle.setLanguage(text.trim) + case "ISSN" => currentJournal.setIssn(text.trim) + case "GrantID" => currentGrant.setGrantID(text.trim) + case "Agency" => currentGrant.setAgency(text.trim) + case "Country" => if (currentGrant != null) currentGrant.setCountry(text.trim) + case "Year" => currentYear = text.trim + case "Month" => currentMonth = text.trim + case "Day" => currentDay = text.trim + case "Volume" => currentJournal.setVolume(text.trim) + case "Issue" => currentJournal.setIssue(text.trim) + case "PublicationType" | "DescriptorName" => currentSubject.setValue(text.trim) + case "LastName" => { + if (currentAuthor != null) + currentAuthor.setLastName(text.trim) + } + case "ForeName" => + if (currentAuthor != null) + currentAuthor.setForeName(text.trim) + case "Title" => + if (currentJournal.getTitle == null) + currentJournal.setTitle(text.trim) + else + currentJournal.setTitle(currentJournal.getTitle + text.trim) + case _ => + + } + case _ => + } + + } + null + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala new file mode 100644 index 000000000..410686f97 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala @@ -0,0 +1,312 @@ +package eu.dnetlib.dhp.sx.bio.pubmed + +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup +import eu.dnetlib.dhp.schema.common.ModelConstants +import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactory, OafMapperUtils, PidType} +import eu.dnetlib.dhp.schema.oaf._ +import eu.dnetlib.dhp.utils.DHPUtils +import org.apache.commons.lang3.StringUtils + +import collection.JavaConverters._ +import java.util.regex.Pattern +import scala.collection.mutable.ListBuffer + +/** + */ +object PubMedToOaf { + + val SUBJ_CLASS = "keywords" + + val OAI_HEADER = "oai:pubmedcentral.nih.gov:" + val 
+  val OLD_PMC_PREFIX = "od_______267::"
+
+  val urlMap = Map(
+    "pmid" -> "https://pubmed.ncbi.nlm.nih.gov/",
+    "doi"  -> "https://dx.doi.org/"
+  )
+
+  val dataInfo: DataInfo = OafMapperUtils.dataInfo(
+    false,
+    null,
+    false,
+    false,
+    ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER,
+    "0.9"
+  )
+
+  val collectedFrom: KeyValue =
+    OafMapperUtils.keyValue(ModelConstants.EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central")
+
+  /** Clean the DOI by applying a regex, in order to discard
+    * DOIs that do not match the canonical 10.prefix/suffix form
+    *
+    * @param doi the input DOI
+    * @return the cleaned DOI, or null when it does not match
+    */
+  def cleanDoi(doi: String): String = {
+
+    val regex = "^10\\.\\d{4,9}\\/[\\[\\]\\-\\<\\>._;()\\/:A-Z0-9]+$"
+
+    val pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE)
+    val matcher = pattern.matcher(doi)
+
+    if (matcher.find) {
+      return matcher.group(0)
+    }
+    null
+  }
+
+  /** Generate the original OpenAIRE identifier used in the past for PMC records,
+    * based on the legacy PMC namespace prefix.
+    */
+  def createOriginalOpenaireId(article: PMArticle): String = {
+    if (StringUtils.isNotEmpty(article.getPmcId)) {
+      val md5 = DHPUtils.md5(s"$OAI_HEADER${article.getPmcId.replace("PMC", "")}")
+      s"$OLD_PMC_PREFIX$md5"
+    } else
+      null
+  }
+
+  /** Create an instance of a class extending Result,
+    * starting from the OAF instanceType value
+    *
+    * @param cobjQualifier the OAF instance type
+    * @param vocabularies  all the dnet vocabularies
+    * @return the correct Result subclass instance
+    */
+  def createResult(cobjQualifier: Qualifier, vocabularies: VocabularyGroup): Result = {
+    val result_typologies = getVocabularyTerm(
+      ModelConstants.DNET_RESULT_TYPOLOGIES,
+      vocabularies,
+      cobjQualifier.getClassid
+    )
+    result_typologies.getClassid match {
+      case "dataset"     => new Dataset
+      case "publication" => new Publication
+      case "other"       => new OtherResearchProduct
+      case "software"    => new Software
+      case _             => null
+    }
+  }
+
+  /** Map the PubMed journal info into the OAF Journal
+    *
+    * @param j the PubMed journal
+    * @return the OAF Journal
+    */
+  def mapJournal(j: PMJournal): Journal = {
+    if (j == null)
+      return null
+    val journal = new Journal
+
+    journal.setDataInfo(dataInfo)
+    journal.setName(j.getTitle)
+    journal.setConferencedate(j.getDate)
+    journal.setVol(j.getVolume)
+    journal.setIssnPrinted(j.getIssn)
+    journal.setIss(j.getIssue)
+    journal
+  }
+
+  /** Find a vocabulary term, looking it up first among the synonyms
+    * and then among the terms of the vocabulary
+    *
+    * @param vocabularyName the input vocabulary name
+    * @param vocabularies   all the vocabularies
+    * @param term           the term to find
+    * @return the cleaned term value
+    */
+  def getVocabularyTerm(
+    vocabularyName: String,
+    vocabularies: VocabularyGroup,
+    term: String
+  ): Qualifier = {
+    val a = vocabularies.getSynonymAsQualifier(vocabularyName, term)
+    val b = vocabularies.getTermAsQualifier(vocabularyName, term)
+    if (a == null) b else a
+  }
+
+  /** Map the PubMed article into the OAF result
+    *
+    * @param article      the PubMed article
+    * @param vocabularies the vocabularies
+    * @return the OAF result, or null if the mapping failed
+    */
+  def convert(article: PMArticle, vocabularies: VocabularyGroup): Oaf = {
+
+    if (article.getPublicationTypes == null)
+      return null
+
+    // MAP PMID into pid with classid = classname = pmid
+    val pidList = ListBuffer[StructuredProperty]()
+
+    pidList += OafMapperUtils.structuredProperty(
+      article.getPmid,
+      PidType.pmid.toString,
+      PidType.pmid.toString,
+      ModelConstants.DNET_PID_TYPES,
+      ModelConstants.DNET_PID_TYPES,
+      dataInfo
+    )
+
+    if (StringUtils.isNotBlank(article.getPmcId)) {
+      pidList += OafMapperUtils.structuredProperty(
+        article.getPmcId,
+        PidType.pmc.toString,
+        PidType.pmc.toString,
+        ModelConstants.DNET_PID_TYPES,
+        ModelConstants.DNET_PID_TYPES,
+        dataInfo
+      )
+    }
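+    // the pids collected here also feed IdentifierFactory.createIdentifier at the end
+    // of the mapping, which mints the final OpenAIRE identifier of the record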
+    if (pidList.isEmpty)
+      return null
+
+    // MAP //ArticleId[./@IdType="doi"] into alternateIdentifier with classid = classname = doi
+    var alternateIdentifier: StructuredProperty = null
+    if (article.getDoi != null) {
+      val normalizedPid = cleanDoi(article.getDoi)
+      if (normalizedPid != null)
+        alternateIdentifier = OafMapperUtils.structuredProperty(
+          normalizedPid,
+          PidType.doi.toString,
+          PidType.doi.toString,
+          ModelConstants.DNET_PID_TYPES,
+          ModelConstants.DNET_PID_TYPES,
+          dataInfo
+        )
+    }
+
+    // INSTANCE MAPPING
+    //--------------------------------------------------------------------------------------
+
+    // If the article contains the typology Journal Article we apply that instance type,
+    // otherwise we look for a publication type matching the vocabulary and discard the
+    // record when none is found
+    val ja =
+      article.getPublicationTypes.asScala.find(s => "Journal Article".equalsIgnoreCase(s.getValue))
+    val pubmedInstance = new Instance
+    if (ja.isDefined) {
+      val cobjCategory =
+        getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, ja.get.getValue)
+      pubmedInstance.setInstancetype(cobjCategory)
+    } else {
+      val i_type = article.getPublicationTypes.asScala
+        .map(s => getVocabularyTerm(ModelConstants.DNET_PUBLICATION_RESOURCE, vocabularies, s.getValue))
+        .find(q => q != null)
+      if (i_type.isDefined)
+        pubmedInstance.setInstancetype(i_type.get)
+      else
+        return null
+    }
+    val result = createResult(pubmedInstance.getInstancetype, vocabularies)
+    if (result == null)
+      return null
+    result.setDataInfo(dataInfo)
+    pubmedInstance.setPid(pidList.asJava)
+    if (alternateIdentifier != null)
+      pubmedInstance.setAlternateIdentifier(List(alternateIdentifier).asJava)
+    result.setInstance(List(pubmedInstance).asJava)
+
+    // CREATE URLs from the pids whose type has a known resolver base URL (pmid, doi)
+    val urlLists: List[String] = pidList
+      .map(s => (urlMap.getOrElse(s.getQualifier.getClassid, ""), s.getValue))
+      .filter(t => t._1.nonEmpty)
+      .toList
+      .map(t => t._1 + t._2)
+    if (urlLists.nonEmpty)
+      pubmedInstance.setUrl(urlLists.asJava)
+
+    // ASSIGN DateofAcceptance
+    pubmedInstance.setDateofacceptance(
+      OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)
+    )
+    // ASSIGN COLLECTEDFROM
+    pubmedInstance.setCollectedfrom(collectedFrom)
+    result.setPid(pidList.asJava)
+
+    // END INSTANCE MAPPING
+    //--------------------------------------------------------------------------------------
+
+    // JOURNAL MAPPING
+    //--------------------------------------------------------------------------------------
+    if (article.getJournal != null && result.isInstanceOf[Publication])
+      result.asInstanceOf[Publication].setJournal(mapJournal(article.getJournal))
+    result.setCollectedfrom(List(collectedFrom).asJava)
+    // END JOURNAL MAPPING
+    //--------------------------------------------------------------------------------------
+
+    // RESULT MAPPING
+    //--------------------------------------------------------------------------------------
+    result.setDateofacceptance(
+      OafMapperUtils.field(GraphCleaningFunctions.cleanDate(article.getDate), dataInfo)
+    )
+
+    if (article.getTitle == null || article.getTitle.isEmpty)
+      return null
+    result.setTitle(
+      List(
+        OafMapperUtils.structuredProperty(
+          article.getTitle,
+          ModelConstants.MAIN_TITLE_QUALIFIER,
+          dataInfo
+        )
+      ).asJava
+    )
+
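+    // the parser concatenates multiple AbstractText nodes into a single description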
+    if (article.getDescription != null && article.getDescription.nonEmpty)
+      result.setDescription(List(OafMapperUtils.field(article.getDescription, dataInfo)).asJava)
+
+    if (article.getLanguage != null) {
+      val term =
+        vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, article.getLanguage)
+      if (term != null)
+        result.setLanguage(term)
+    }
+
+    val subjects: List[Subject] = article.getSubjects.asScala.map(s =>
+      OafMapperUtils.subject(
+        s.getValue,
+        SUBJ_CLASS,
+        SUBJ_CLASS,
+        ModelConstants.DNET_SUBJECT_TYPOLOGIES,
+        ModelConstants.DNET_SUBJECT_TYPOLOGIES,
+        dataInfo
+      )
+    )(collection.breakOut)
+    if (subjects.nonEmpty)
+      result.setSubject(subjects.asJava)
+
+    val authors: List[Author] = article.getAuthors.asScala.zipWithIndex.map { case (a, index) =>
+      val author = new Author()
+      author.setName(a.getForeName)
+      author.setSurname(a.getLastName)
+      author.setFullname(a.getFullName)
+      author.setRank(index + 1)
+      author
+    }(collection.breakOut)
+
+    if (authors.nonEmpty)
+      result.setAuthor(authors.asJava)
+
+    if (StringUtils.isNotEmpty(article.getPmcId)) {
+      val originalIDS = ListBuffer[String]()
+      originalIDS += createOriginalOpenaireId(article)
+      pidList.map(s => s.getValue).foreach(p => originalIDS += p)
+      result.setOriginalId(originalIDS.asJava)
+    } else
+      result.setOriginalId(pidList.map(s => s.getValue).asJava)
+
+    // set a provisional id, then mint the OpenAIRE identifier: when minting leaves
+    // the pmid unchanged the record is discarded
+    result.setId(article.getPmid)
+
+    // END RESULT MAPPING
+    //--------------------------------------------------------------------------------------
+    val id = IdentifierFactory.createIdentifier(result)
+    if (article.getPmid.equalsIgnoreCase(id))
+      return null
+    result.setId(id)
+    result
+  }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkRetrieveDataciteDelta.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkRetrieveDataciteDelta.scala
new file mode 100644
index 000000000..2618d466a
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkRetrieveDataciteDelta.scala
@@ -0,0 +1,345 @@
+package eu.dnetlib.dhp.sx.graph
+
+import eu.dnetlib.dhp.application.AbstractScalaApplication
+import eu.dnetlib.dhp.collection.CollectionUtils.fixRelations
+import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
+import eu.dnetlib.dhp.datacite.{DataciteToOAFTransformation, DataciteType}
+import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation, Result}
+import eu.dnetlib.dhp.schema.sx.scholix.{Scholix, ScholixResource}
+import eu.dnetlib.dhp.schema.sx.summary.ScholixSummary
+import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
+import eu.dnetlib.dhp.utils.{DHPUtils, ISLookupClientFactory}
+import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.spark.sql.functions.max
+import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
+import org.slf4j.{Logger, LoggerFactory}
+
+import scala.collection.JavaConverters._
+import java.text.SimpleDateFormat
+
+class SparkRetrieveDataciteDelta(propertyPath: String, args: Array[String], log: Logger)
+    extends AbstractScalaApplication(propertyPath, args, log) {
+
+  val ISO_DATE_PATTERN = "yyyy-MM-dd'T'HH:mm:ssZ"
+  val simpleFormatter = new SimpleDateFormat(ISO_DATE_PATTERN)
+
+  val SCHOLIX_RESOURCE_PATH_NAME = "scholixResource"
+  val DATACITE_OAF_PATH_NAME = "dataciteOAFUpdate"
+  val PID_MAP_PATH_NAME = "pidMap"
+  val RESOLVED_REL_PATH_NAME = "resolvedRelation"
+  val SCHOLIX_PATH_NAME = "scholix"
+
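+  // locations of the intermediate datasets, one folder per step under the working path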
s"$workingPath/$DATACITE_OAF_PATH_NAME" + def pidMapPath(workingPath: String) = s"$workingPath/$PID_MAP_PATH_NAME" + def resolvedRelationPath(workingPath: String) = s"$workingPath/$RESOLVED_REL_PATH_NAME" + def scholixPath(workingPath: String) = s"$workingPath/$SCHOLIX_PATH_NAME" + + /** Utility to parse Date in ISO8601 to epochMillis + * @param inputDate The String represents an input date in ISO8601 + * @return The relative epochMillis of parsed date + */ + def ISO8601toEpochMillis(inputDate: String): Long = { + simpleFormatter.parse(inputDate).getTime + } + + /** This method tries to retrieve the last collection date from all datacite + * records in HDFS. + * This method should be called before indexing scholexplorer to retrieve + * the delta of Datacite record to download, since from the generation of + * raw graph to the generation of Scholexplorer sometimes it takes 20 days + * @param spark + * @param entitiesPath + * @return the last collection date from the current scholexplorer Graph of the datacite records + */ + def retrieveLastCollectedFrom(spark: SparkSession, entitiesPath: String): Long = { + log.info("Retrieve last entities collected From") + + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result] + import spark.implicits._ + + val entitiesDS = spark.read + .load(s"$entitiesPath/*") + .as[Oaf] + .filter(o => o.isInstanceOf[Result]) + .map(r => r.asInstanceOf[Result]) + + val date = entitiesDS + .filter(r => r.getDateofcollection != null) + .map(_.getDateofcollection) + .select(max("value")) + .first + .getString(0) + + ISO8601toEpochMillis(date) / 1000 + } + + /** The method of update Datacite relationships on Scholexplorer + * needs some utilities data structures + * One is the scholixResource DS that stores all the nodes in the Scholix Graph + * in format ScholixResource + * @param summaryPath the path of the summary in Scholix + * @param workingPath the working path + * @param spark the spark session + */ + def generateScholixResource( + summaryPath: String, + workingPath: String, + spark: SparkSession + ): Unit = { + implicit val summaryEncoder: Encoder[ScholixSummary] = Encoders.kryo[ScholixSummary] + implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource] + + log.info("Convert All summary to ScholixResource") + spark.read + .load(summaryPath) + .as[ScholixSummary] + .map(ScholixUtils.generateScholixResourceFromSummary)(scholixResourceEncoder) + .filter(r => r.getIdentifier != null && r.getIdentifier.size > 0) + .write + .mode(SaveMode.Overwrite) + .save(s"${scholixResourcePath(workingPath)}_native") + } + + /** This method convert the new Datacite Resource into Scholix Resource + * Needed to fill the source and the type of Scholix Relationships + * @param workingPath the Working Path + * @param spark The spark Session + */ + def addMissingScholixResource(workingPath: String, spark: SparkSession): Unit = { + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource] + implicit val resultEncoder: Encoder[Result] = Encoders.kryo[Result] + import spark.implicits._ + + spark.read + .load(dataciteOAFPath(workingPath)) + .as[Oaf] + .filter(_.isInstanceOf[Result]) + .map(_.asInstanceOf[Result]) + .map(ScholixUtils.generateScholixResourceFromResult) + .filter(r => r.getIdentifier != null && r.getIdentifier.size > 0) + .write + .mode(SaveMode.Overwrite) + 
.save(s"${scholixResourcePath(workingPath)}_update") + + val update = spark.read.load(s"${scholixResourcePath(workingPath)}_update").as[ScholixResource] + val native = spark.read.load(s"${scholixResourcePath(workingPath)}_native").as[ScholixResource] + val graph = update + .union(native) + .groupByKey(_.getDnetIdentifier) + .reduceGroups((a, b) => if (a != null && a.getDnetIdentifier != null) a else b) + .map(_._2) + graph.write.mode(SaveMode.Overwrite).save(s"${scholixResourcePath(workingPath)}_graph") + } + + /** This method get and Transform only datacite records with + * timestamp greater than timestamp + * @param datacitePath the datacite input Path + * @param timestamp the timestamp + * @param workingPath the working path where save the generated Dataset + * @param spark SparkSession + * @param vocabularies Vocabularies needed for transformation + */ + + def getDataciteUpdate( + datacitePath: String, + timestamp: Long, + workingPath: String, + spark: SparkSession, + vocabularies: VocabularyGroup + ): Long = { + import spark.implicits._ + val ds = spark.read.load(datacitePath).as[DataciteType] + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + val total = ds.filter(_.timestamp >= timestamp).count() + if (total > 0) { + ds.filter(_.timestamp >= timestamp) + .flatMap(d => + DataciteToOAFTransformation + .generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks = true) + ) + .flatMap(i => fixRelations(i)) + .filter(i => i != null) + .write + .mode(SaveMode.Overwrite) + .save(dataciteOAFPath(workingPath)) + } + total + } + + /** After added the new ScholixResource, we need to update the scholix Pid Map + * to intersected with the new Datacite Relations + * + * @param workingPath The working Path starting from save the new Map + * @param spark the spark session + */ + def generatePidMap(workingPath: String, spark: SparkSession): Unit = { + implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource] + import spark.implicits._ + spark.read + .load(s"${scholixResourcePath(workingPath)}_graph") + .as[ScholixResource] + .flatMap(r => + r.getIdentifier.asScala + .map(i => DHPUtils.generateUnresolvedIdentifier(i.getIdentifier, i.getSchema)) + .map(t => (t, r.getDnetIdentifier)) + )(Encoders.tuple(Encoders.STRING, Encoders.STRING)) + .groupByKey(_._1) + .reduceGroups((a, b) => if (a != null && a._2 != null) a else b) + .map(_._2)(Encoders.tuple(Encoders.STRING, Encoders.STRING)) + .write + .mode(SaveMode.Overwrite) + .save(pidMapPath(workingPath)) + } + + /** This method resolve the datacite relation and filter the resolved + * relation + * @param workingPath the working path + * @param spark the spark session + */ + + def resolveUpdateRelation(workingPath: String, spark: SparkSession): Unit = { + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation] + import spark.implicits._ + + val pidMap = spark.read.load(pidMapPath(workingPath)).as[(String, String)] + + val unresolvedRelations: Dataset[(String, Relation)] = spark.read + .load(dataciteOAFPath(workingPath)) + .as[Oaf] + .filter(_.isInstanceOf[Relation]) + .map(_.asInstanceOf[Relation]) + .map { r => + if (r.getSource.startsWith("unresolved")) + (r.getSource, r) + else + (r.getTarget, r) + }(Encoders.tuple(Encoders.STRING, relationEncoder)) + + unresolvedRelations + .joinWith(pidMap, unresolvedRelations("_1").equalTo(pidMap("_1"))) + .map(t => { + val r = t._1._2 + val resolvedIdentifier = t._2._2 + if 
(r.getSource.startsWith("unresolved")) + r.setSource(resolvedIdentifier) + else + r.setTarget(resolvedIdentifier) + r + })(relationEncoder) + .filter(r => !(r.getSource.startsWith("unresolved") || r.getTarget.startsWith("unresolved"))) + .write + .mode(SaveMode.Overwrite) + .save(resolvedRelationPath(workingPath)) + } + + /** This method generate scholix starting from resolved relation + * + * @param workingPath + * @param spark + */ + def generateScholixUpdate(workingPath: String, spark: SparkSession): Unit = { + implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] + implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix] + implicit val scholixResourceEncoder: Encoder[ScholixResource] = Encoders.kryo[ScholixResource] + implicit val relationEncoder: Encoder[Relation] = Encoders.kryo[Relation] + implicit val intermediateEncoder: Encoder[(String, Scholix)] = + Encoders.tuple(Encoders.STRING, scholixEncoder) + + val relations: Dataset[(String, Relation)] = spark.read + .load(resolvedRelationPath(workingPath)) + .as[Relation] + .map(r => (r.getSource, r))(Encoders.tuple(Encoders.STRING, relationEncoder)) + + val id_summary: Dataset[(String, ScholixResource)] = spark.read + .load(s"${scholixResourcePath(workingPath)}_graph") + .as[ScholixResource] + .map(r => (r.getDnetIdentifier, r))(Encoders.tuple(Encoders.STRING, scholixResourceEncoder)) + + id_summary.cache() + + relations + .joinWith(id_summary, relations("_1").equalTo(id_summary("_1")), "inner") + .map(t => (t._1._2.getTarget, ScholixUtils.scholixFromSource(t._1._2, t._2._2))) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/scholix_one_verse") + + val source_scholix: Dataset[(String, Scholix)] = + spark.read.load(s"$workingPath/scholix_one_verse").as[(String, Scholix)] + + source_scholix + .joinWith(id_summary, source_scholix("_1").equalTo(id_summary("_1")), "inner") + .map(t => { + val target: ScholixResource = t._2._2 + val scholix: Scholix = t._1._2 + ScholixUtils.generateCompleteScholix(scholix, target) + })(scholixEncoder) + .write + .mode(SaveMode.Overwrite) + .save(s"$workingPath/scholix") + } + + /** Here all the spark applications runs this method + * where the whole logic of the spark node is defined + */ + override def run(): Unit = { + val sourcePath = parser.get("sourcePath") + log.info(s"SourcePath is '$sourcePath'") + + val datacitePath = parser.get("datacitePath") + log.info(s"DatacitePath is '$datacitePath'") + + val workingPath = parser.get("workingSupportPath") + log.info(s"workingPath is '$workingPath'") + + val isLookupUrl: String = parser.get("isLookupUrl") + log.info("isLookupUrl: {}", isLookupUrl) + + val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl) + val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService) + require(vocabularies != null) + + val updateDS: Boolean = "true".equalsIgnoreCase(parser.get("updateDS")) + log.info(s"updateDS is '$updateDS'") + + var lastCollectionDate = 0L + if (updateDS) { + generateScholixResource(s"$sourcePath/provision/summaries", workingPath, spark) + log.info("Retrieve last entities collected From starting from scholix Graph") + lastCollectionDate = retrieveLastCollectedFrom(spark, s"$sourcePath/entities") + } else { + val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration) + fs.delete(new Path(s"${scholixResourcePath(workingPath)}_native"), true) + fs.rename( + new Path(s"${scholixResourcePath(workingPath)}_graph"), + new Path(s"${scholixResourcePath(workingPath)}_native") + ) + lastCollectionDate = 
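+    // incremental update: only the Datacite records collected after lastCollectionDate are transformed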
+
+    val numRecords =
+      getDataciteUpdate(datacitePath, lastCollectionDate, workingPath, spark, vocabularies)
+    if (numRecords > 0) {
+      addMissingScholixResource(workingPath, spark)
+      generatePidMap(workingPath, spark)
+      resolveUpdateRelation(workingPath, spark)
+      generateScholixUpdate(workingPath, spark)
+    }
+  }
+}
+
+object SparkRetrieveDataciteDelta {
+  val log: Logger = LoggerFactory.getLogger(SparkRetrieveDataciteDelta.getClass)
+
+  def main(args: Array[String]): Unit = {
+    new SparkRetrieveDataciteDelta(
+      "/eu/dnetlib/dhp/sx/graph/retrieve_datacite_delta_params.json",
+      args,
+      log
+    ).initialize().run()
+  }
+}
diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/datacite.md b/dhp-workflows/dhp-aggregation/src/site/markdown/datacite.md
new file mode 100644
index 000000000..e69de29bb
diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/index.md b/dhp-workflows/dhp-aggregation/src/site/markdown/index.md
new file mode 100644
index 000000000..6c4e05d5f
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/site/markdown/index.md
@@ -0,0 +1,20 @@
+## DHP-Aggregation
+
+This module defines a set of oozie workflows for
+
+1. the **collection** and **transformation** of metadata records;
+2. the **integration** of new external information in the results.
+
+
+### Collection and Transformation
+
+The workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
+the consistency of the read/write operations on the data; the MdSM keeps track of the logical-physical mapping
+of each MDStore.
+
+It defines [mappings](mappings.md) for the transformation of the different datasources (see the mappings section).
+
+### Integration of external information in the results
+
+The workflows create new entities in the OpenAIRE format (OAF) whose aim is to enrich the results already contained in the graph.
+See the integration section for more insight.
diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/integration.md b/dhp-workflows/dhp-aggregation/src/site/markdown/integration.md
new file mode 100644
index 000000000..7b763c681
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/site/markdown/integration.md
@@ -0,0 +1,36 @@
+DHP Aggregation - Integration method
+=====================================
+
+The integration method can be applied every time new information, which is neither aggregated from the repositories
+nor computed directly by OpenAIRE, should be added to the results of the graph.
+
+The information integrated so far is:
+
+1. Article impact measures
+    1. [Bip!Finder](https://dl.acm.org/doi/10.1145/3357384.3357850) scores
+2. Result subjects
+    1. Integration of the Fields of Science and Technology ([FOS](https://www.qnrf.org/en-us/FOS)) classification in
+    the result subjects.
+
+
+The method always consists in the creation of a new entity in the OpenAIRE format (OAF entity) containing only the id
+and the element in the OAF model that should be used to map the information we want to integrate.
+
+The id is set by using a particular encoding of the given PID:
+
+*unresolved::[pid]::[pidtype]*
+
+where
+
+1. *unresolved* is a constant value
+2. *pid* is the persistent id value, e.g. 10.5281/zenodo.4707307
+3. *pidtype* is the persistent id type, e.g. doi
+
+Such entities are matched against those available in the graph using the result.instance.pid values.
+
+This mechanism can be used to integrate enrichments produced in association with a given PID.
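+As a minimal sketch of the encoding (the `unresolvedId` helper below is illustrative, and the trimming
+and lower-casing are assumptions; in the codebase the identifier is produced by
+`DHPUtils.generateUnresolvedIdentifier`):
+
+```scala
+// illustrative helper showing the unresolved identifier encoding
+def unresolvedId(pid: String, pidType: String): String =
+  s"unresolved::${pid.trim.toLowerCase}::${pidType.trim.toLowerCase}"
+
+unresolvedId("10.5281/zenodo.4707307", "doi")
+// unresolved::10.5281/zenodo.4707307::doi
+```
+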
+If a match is found with one of the results already in the graph, that result is enriched with the information
+present in the new OAF entity.
+All the entities for which no match is found are discarded.
+
+
diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/introduction.md b/dhp-workflows/dhp-aggregation/src/site/markdown/introduction.md
new file mode 100644
index 000000000..9da46a27e
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/site/markdown/introduction.md
@@ -0,0 +1,7 @@
+## DHP-Aggregation
+
+This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records.
+
+Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
+the consistency of the read/write operations on the data; the MdSM keeps track of the logical-physical mapping
+of each MDStore.
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/mappings.md b/dhp-workflows/dhp-aggregation/src/site/markdown/mappings.md
new file mode 100644
index 000000000..576c4b6be
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/site/markdown/mappings.md
@@ -0,0 +1,18 @@
+DHP Aggregation
+===============
+
+DHP-Aggregation contains the different mappings from the original data formats into the OAF data format,
+which converge into the graph in different ways:
+
+- via the Action Manager
+- directly in the MDStore on Hadoop
+
+Below is the list of the implemented mappings.
+
+
+Mappings
+========
+
+1. [PubMed](pubmed.md)
+2. [Datacite](datacite.md)
+
diff --git a/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md b/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md
new file mode 100644
index 000000000..c1813394b
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/site/markdown/pubmed.md
@@ -0,0 +1,66 @@
+# PubMed Mapping
+This section describes the mapping implemented for [MEDLINE/PubMed](https://pubmed.ncbi.nlm.nih.gov/).
+
+Collection
+----------
+The native data is collected from the [FTP baseline](https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/), which contains
+XML files conforming to the following [schema](https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html).
+
+
+Parsing
+-------
+The class responsible for the parsing is [PMParser](/dnet-hadoop/scaladocs/#eu.dnetlib.dhp.sx.bio.pubmed.PMParser), which generates
+an intermediate mapping of the PubMed article, defined [here](/dnet-hadoop/apidocs/eu/dnetlib/dhp/sx/bio/pubmed/package-summary.html).
+
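+A minimal usage sketch (the baseline file name is illustrative, and the file is assumed to be decompressed locally):
+
+```scala
+import scala.io.Source
+import scala.xml.pull.XMLEventReader
+import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser}
+
+// PMParser consumes the streamed XML events and behaves as an Iterator[PMArticle]
+val reader = new XMLEventReader(Source.fromFile("pubmed22n0001.xml"))
+val articles: Iterator[PMArticle] = new PMParser(reader)
+articles.take(5).foreach(a => println(a.getPmid))
+```
+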
+
+Mapping
+-------
+
+The table below describes the mapping from the native XML to the OAF model.
+
+| XPath Source | OAF Field | Notes |
+| ----------- | ----------- | ----------- |
+| //PMID | pid | classid = classname = pmid |
+| | **Instance Mapping** | |
+| //PublicationType | InstanceType | if the article contains the typology **Journal Article** we apply this type, otherwise we look for a term matching the vocabulary and discard the record when none is found |
+| //PMID | instance/PID | the pmid is also mapped into the pid of the instance |
+| //ArticleId[./@IdType="doi"] | instance/alternateIdentifier | classid = classname = doi |
+| //PMID | instance/URL | prepend the base URL https://pubmed.ncbi.nlm.nih.gov/ to the PMID |
+| //PubmedPubDate | instance/Dateofacceptance | apply the function GraphCleaningFunctions.cleanDate before assigning it |
+| FOR ALL INSTANCES | CollectedFrom | datasourceName: *Europe PubMed Central* DatasourceId: |
+| | **Journal Mapping** | |
+| //Journal/PubDate | Journal/Conferencedate | map the date of the Journal |
+| //Journal/Title | Journal/Name | |
+| //Journal/Volume | Journal/Vol | |
+| //Journal/ISSN | Journal/IssnPrinted | |
+| //Journal/Issue | Journal/Iss | |
+| | **Publication Mapping** | |
+| //PubmedPubDate | Dateofacceptance | apply the function GraphCleaningFunctions.cleanDate before assigning it |
+| //Title | title | with qualifier ModelConstants.MAIN_TITLE_QUALIFIER |
+| //AbstractText | Description | |
+| //Language | Language | cleaned against the vocabulary *dnet:languages* |
+| //DescriptorName | Subject | classid = classname = keywords |
+| | **Author Mapping** | |
+| //Author/LastName | author.Surname | |
+| //Author/ForeName | author.Forename | |
+| //Author/FullName | author.Fullname | concatenation of ForeName + LastName, when they exist |
+| FOR ALL AUTHORS | author.rank | sequential number starting from 1 |
+
+# TODO
+
+Missing items still to be mapped
+
diff --git a/dhp-workflows/dhp-aggregation/src/site/resources/images/openaire.png b/dhp-workflows/dhp-aggregation/src/site/resources/images/openaire.png
new file mode 100644
index 000000000..00d320c39
Binary files /dev/null and b/dhp-workflows/dhp-aggregation/src/site/resources/images/openaire.png differ
diff --git a/dhp-workflows/dhp-aggregation/src/site/site.xml b/dhp-workflows/dhp-aggregation/src/site/site.xml
new file mode 100644
index 000000000..75fc5032e
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/site/site.xml
@@ -0,0 +1,34 @@
+<project>
+
+    <skin>
+        <groupId>org.apache.maven.skins</groupId>
+        <artifactId>maven-fluido-skin</artifactId>
+        <version>1.8</version>
+    </skin>