Compare commits


No commits in common. "beta" and "dump" have entirely different histories.
beta ... dump

811 changed files with 17687 additions and 49546 deletions

.gitignore (vendored)

@ -3,6 +3,8 @@
*.iws *.iws
*.ipr *.ipr
*.iml *.iml
*.ipr
*.iws
*~ *~
.vscode .vscode
.metals .metals
@ -25,4 +27,4 @@ spark-warehouse
/**/job-override.properties /**/job-override.properties
/**/*.log /**/*.log
/**/.factorypath /**/.factorypath
/**/.scalafmt.conf


@ -8,6 +8,8 @@ import java.util.List;
import org.apache.commons.lang.ArrayUtils; import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
import org.apache.maven.plugin.AbstractMojo; import org.apache.maven.plugin.AbstractMojo;
import org.apache.maven.plugin.MojoExecutionException;
import org.apache.maven.plugin.MojoFailureException;
/** /**
* Generates oozie properties which were not provided from commandline. * Generates oozie properties which were not provided from commandline.
@ -25,7 +27,7 @@ public class GenerateOoziePropertiesMojo extends AbstractMojo {
}; };
@Override @Override
public void execute() { public void execute() throws MojoExecutionException, MojoFailureException {
if (System.getProperties().containsKey(PROPERTY_NAME_WF_SOURCE_DIR) if (System.getProperties().containsKey(PROPERTY_NAME_WF_SOURCE_DIR)
&& !System.getProperties().containsKey(PROPERTY_NAME_SANDBOX_NAME)) { && !System.getProperties().containsKey(PROPERTY_NAME_SANDBOX_NAME)) {
String generatedSandboxName = generateSandboxName( String generatedSandboxName = generateSandboxName(
@ -44,24 +46,24 @@ public class GenerateOoziePropertiesMojo extends AbstractMojo {
/** /**
* Generates sandbox name from workflow source directory. * Generates sandbox name from workflow source directory.
* *
* @param wfSourceDir workflow source directory * @param wfSourceDir
* @return generated sandbox name * @return generated sandbox name
*/ */
private String generateSandboxName(String wfSourceDir) { private String generateSandboxName(String wfSourceDir) {
// utilize all dir names until finding one of the limiters // utilize all dir names until finding one of the limiters
List<String> sandboxNameParts = new ArrayList<>(); List<String> sandboxNameParts = new ArrayList<String>();
String[] tokens = StringUtils.split(wfSourceDir, File.separatorChar); String[] tokens = StringUtils.split(wfSourceDir, File.separatorChar);
ArrayUtils.reverse(tokens); ArrayUtils.reverse(tokens);
if (tokens.length > 0) { if (tokens.length > 0) {
for (String token : tokens) { for (String token : tokens) {
for (String limiter : limiters) { for (String limiter : limiters) {
if (limiter.equals(token)) { if (limiter.equals(token)) {
return !sandboxNameParts.isEmpty() return sandboxNameParts.size() > 0
? StringUtils.join(sandboxNameParts.toArray()) ? StringUtils.join(sandboxNameParts.toArray())
: null; : null;
} }
} }
if (!sandboxNameParts.isEmpty()) { if (sandboxNameParts.size() > 0) {
sandboxNameParts.add(0, File.separator); sandboxNameParts.add(0, File.separator);
} }
sandboxNameParts.add(0, token); sandboxNameParts.add(0, token);


@ -16,7 +16,6 @@ import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections; import java.util.Collections;
@ -290,7 +289,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo {
*/ */
protected List<String> getEscapeChars(String escapeChars) { protected List<String> getEscapeChars(String escapeChars) {
List<String> tokens = getListFromCSV(escapeChars); List<String> tokens = getListFromCSV(escapeChars);
List<String> realTokens = new ArrayList<>(); List<String> realTokens = new ArrayList<String>();
for (String token : tokens) { for (String token : tokens) {
String realToken = getRealToken(token); String realToken = getRealToken(token);
realTokens.add(realToken); realTokens.add(realToken);
@ -325,7 +324,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo {
* @return content * @return content
*/ */
protected String getContent(String comment, Properties properties, List<String> escapeTokens) { protected String getContent(String comment, Properties properties, List<String> escapeTokens) {
List<String> names = new ArrayList<>(properties.stringPropertyNames()); List<String> names = new ArrayList<String>(properties.stringPropertyNames());
Collections.sort(names); Collections.sort(names);
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
if (!StringUtils.isBlank(comment)) { if (!StringUtils.isBlank(comment)) {
@ -353,7 +352,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo {
throws MojoExecutionException { throws MojoExecutionException {
try { try {
String content = getContent(comment, properties, escapeTokens); String content = getContent(comment, properties, escapeTokens);
FileUtils.writeStringToFile(file, content, StandardCharsets.UTF_8); FileUtils.writeStringToFile(file, content, ENCODING_UTF8);
} catch (IOException e) { } catch (IOException e) {
throw new MojoExecutionException("Error creating properties file", e); throw new MojoExecutionException("Error creating properties file", e);
} }
@ -400,9 +399,9 @@ public class WritePredefinedProjectProperties extends AbstractMojo {
*/ */
protected static final List<String> getListFromCSV(String csv) { protected static final List<String> getListFromCSV(String csv) {
if (StringUtils.isBlank(csv)) { if (StringUtils.isBlank(csv)) {
return new ArrayList<>(); return new ArrayList<String>();
} }
List<String> list = new ArrayList<>(); List<String> list = new ArrayList<String>();
String[] tokens = StringUtils.split(csv, ","); String[] tokens = StringUtils.split(csv, ",");
for (String token : tokens) { for (String token : tokens) {
list.add(token.trim()); list.add(token.trim());


@ -9,18 +9,18 @@ import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
/** @author mhorst, claudio.atzori */ /** @author mhorst, claudio.atzori */
class GenerateOoziePropertiesMojoTest { public class GenerateOoziePropertiesMojoTest {
private final GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo(); private final GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo();
@BeforeEach @BeforeEach
void clearSystemProperties() { public void clearSystemProperties() {
System.clearProperty(PROPERTY_NAME_SANDBOX_NAME); System.clearProperty(PROPERTY_NAME_SANDBOX_NAME);
System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR); System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR);
} }
@Test @Test
void testExecuteEmpty() throws Exception { public void testExecuteEmpty() throws Exception {
// execute // execute
mojo.execute(); mojo.execute();
@ -29,7 +29,7 @@ class GenerateOoziePropertiesMojoTest {
} }
@Test @Test
void testExecuteSandboxNameAlreadySet() throws Exception { public void testExecuteSandboxNameAlreadySet() throws Exception {
// given // given
String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers"; String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers";
String sandboxName = "originalSandboxName"; String sandboxName = "originalSandboxName";
@ -44,7 +44,7 @@ class GenerateOoziePropertiesMojoTest {
} }
@Test @Test
void testExecuteEmptyWorkflowSourceDir() throws Exception { public void testExecuteEmptyWorkflowSourceDir() throws Exception {
// given // given
String workflowSourceDir = ""; String workflowSourceDir = "";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
@ -57,7 +57,7 @@ class GenerateOoziePropertiesMojoTest {
} }
@Test @Test
void testExecuteNullSandboxNameGenerated() throws Exception { public void testExecuteNullSandboxNameGenerated() throws Exception {
// given // given
String workflowSourceDir = "eu/dnetlib/dhp/"; String workflowSourceDir = "eu/dnetlib/dhp/";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
@ -70,7 +70,7 @@ class GenerateOoziePropertiesMojoTest {
} }
@Test @Test
void testExecute() throws Exception { public void testExecute() throws Exception {
// given // given
String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers"; String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
@ -83,7 +83,7 @@ class GenerateOoziePropertiesMojoTest {
} }
@Test @Test
void testExecuteWithoutRoot() throws Exception { public void testExecuteWithoutRoot() throws Exception {
// given // given
String workflowSourceDir = "wf/transformers"; String workflowSourceDir = "wf/transformers";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir); System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);


@ -20,7 +20,7 @@ import org.mockito.junit.jupiter.MockitoExtension;
/** @author mhorst, claudio.atzori */ /** @author mhorst, claudio.atzori */
@ExtendWith(MockitoExtension.class) @ExtendWith(MockitoExtension.class)
class WritePredefinedProjectPropertiesTest { public class WritePredefinedProjectPropertiesTest {
@Mock @Mock
private MavenProject mavenProject; private MavenProject mavenProject;
@ -39,7 +39,7 @@ class WritePredefinedProjectPropertiesTest {
// ----------------------------------- TESTS --------------------------------------------- // ----------------------------------- TESTS ---------------------------------------------
@Test @Test
void testExecuteEmpty() throws Exception { public void testExecuteEmpty() throws Exception {
// execute // execute
mojo.execute(); mojo.execute();
@ -50,7 +50,7 @@ class WritePredefinedProjectPropertiesTest {
} }
@Test @Test
void testExecuteWithProjectProperties() throws Exception { public void testExecuteWithProjectProperties() throws Exception {
// given // given
String key = "projectPropertyKey"; String key = "projectPropertyKey";
String value = "projectPropertyValue"; String value = "projectPropertyValue";
@ -70,7 +70,7 @@ class WritePredefinedProjectPropertiesTest {
} }
@Test() @Test()
void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) { public void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) {
// given // given
String key = "projectPropertyKey"; String key = "projectPropertyKey";
String value = "projectPropertyValue"; String value = "projectPropertyValue";
@ -84,7 +84,7 @@ class WritePredefinedProjectPropertiesTest {
} }
@Test @Test
void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception { public void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception {
// given // given
String key = "projectPropertyKey"; String key = "projectPropertyKey";
String value = "projectPropertyValue"; String value = "projectPropertyValue";
@ -108,7 +108,7 @@ class WritePredefinedProjectPropertiesTest {
} }
@Test @Test
void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception { public void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception {
// given // given
String key = "projectPropertyKey"; String key = "projectPropertyKey";
String value = "projectPropertyValue"; String value = "projectPropertyValue";
@ -132,7 +132,7 @@ class WritePredefinedProjectPropertiesTest {
} }
@Test @Test
void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception { public void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception {
// given // given
String key = "projectPropertyKey"; String key = "projectPropertyKey";
String value = "projectPropertyValue"; String value = "projectPropertyValue";
@ -164,7 +164,7 @@ class WritePredefinedProjectPropertiesTest {
} }
@Test @Test
void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder) public void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder)
throws Exception { throws Exception {
// given // given
String key = "projectPropertyKey"; String key = "projectPropertyKey";
@ -194,7 +194,7 @@ class WritePredefinedProjectPropertiesTest {
} }
@Test @Test
void testExecuteIncludingPropertyKeysFromBlankLocation() { public void testExecuteIncludingPropertyKeysFromBlankLocation() {
// given // given
String key = "projectPropertyKey"; String key = "projectPropertyKey";
String value = "projectPropertyValue"; String value = "projectPropertyValue";
@ -214,7 +214,7 @@ class WritePredefinedProjectPropertiesTest {
} }
@Test @Test
void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder) public void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder)
throws Exception { throws Exception {
// given // given
String key = "projectPropertyKey"; String key = "projectPropertyKey";
@ -247,7 +247,7 @@ class WritePredefinedProjectPropertiesTest {
} }
@Test @Test
void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder) public void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder)
throws Exception { throws Exception {
// given // given
String key = "projectPropertyKey"; String key = "projectPropertyKey";
@ -273,7 +273,7 @@ class WritePredefinedProjectPropertiesTest {
} }
@Test @Test
void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception { public void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception {
// given // given
mojo.setQuiet(true); mojo.setQuiet(true);
mojo.setIncludePropertyKeysFromFiles(new String[] { mojo.setIncludePropertyKeysFromFiles(new String[] {
@ -290,7 +290,7 @@ class WritePredefinedProjectPropertiesTest {
} }
@Test @Test
void testExecuteIncludingPropertyKeysFromInvalidFile() { public void testExecuteIncludingPropertyKeysFromInvalidFile() {
// given // given
mojo.setIncludePropertyKeysFromFiles(new String[] { mojo.setIncludePropertyKeysFromFiles(new String[] {
"invalid location" "invalid location"
@ -301,7 +301,7 @@ class WritePredefinedProjectPropertiesTest {
} }
@Test @Test
void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception { public void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception {
// given // given
mojo.setIncludeEnvironmentVariables(true); mojo.setIncludeEnvironmentVariables(true);
@ -318,7 +318,7 @@ class WritePredefinedProjectPropertiesTest {
} }
@Test @Test
void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception { public void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception {
// given // given
String key = "systemPropertyKey"; String key = "systemPropertyKey";
String value = "systemPropertyValue"; String value = "systemPropertyValue";
@ -337,7 +337,7 @@ class WritePredefinedProjectPropertiesTest {
} }
@Test @Test
void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder) public void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder)
throws Exception { throws Exception {
// given // given
String key = "systemPropertyKey "; String key = "systemPropertyKey ";


@ -22,20 +22,9 @@
<id>dnet45-releases</id> <id>dnet45-releases</id>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url> <url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
</repository> </repository>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-build/dhp-code-style</url>
</site>
</distributionManagement> </distributionManagement>
<build> <build>
<extensions>
<extension>
<groupId>org.apache.maven.wagon</groupId>
<artifactId>wagon-ssh</artifactId>
<version>2.10</version>
</extension>
</extensions>
<pluginManagement> <pluginManagement>
<plugins> <plugins>
<plugin> <plugin>
@ -46,7 +35,7 @@
<plugin> <plugin>
<groupId>org.apache.maven.plugins</groupId> <groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId> <artifactId>maven-site-plugin</artifactId>
<version>3.9.1</version> <version>3.7.1</version>
</plugin> </plugin>
</plugins> </plugins>
</pluginManagement> </pluginManagement>
@ -54,7 +43,6 @@
<properties> <properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
</properties> </properties>
</project> </project>


@ -1,21 +0,0 @@
style = defaultWithAlign
align.openParenCallSite = false
align.openParenDefnSite = false
align.tokens = [{code = "->"}, {code = "<-"}, {code = "=>", owner = "Case"}]
continuationIndent.callSite = 2
continuationIndent.defnSite = 2
danglingParentheses = true
indentOperator = spray
maxColumn = 120
newlines.alwaysBeforeTopLevelStatements = true
project.excludeFilters = [".*\\.sbt"]
rewrite.rules = [AvoidInfix]
rewrite.rules = [ExpandImportSelectors]
rewrite.rules = [RedundantBraces]
rewrite.rules = [RedundantParens]
rewrite.rules = [SortImports]
rewrite.rules = [SortModifiers]
rewrite.rules = [PreferCurlyFors]
spaces.inImportCurlyBraces = false
unindentTopLevelOperators = true


@ -1,21 +0,0 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu ref="modules" />
<menu ref="reports"/>
</body>
</project>


@ -10,9 +10,6 @@
<packaging>pom</packaging> <packaging>pom</packaging>
<description>This module is a container for the build tools used in dnet-hadoop</description> <description>This module is a container for the build tools used in dnet-hadoop</description>
<properties>
<maven.javadoc.skip>true</maven.javadoc.skip>
</properties>
<modules> <modules>
<module>dhp-code-style</module> <module>dhp-code-style</module>
@ -20,12 +17,4 @@
<module>dhp-build-properties-maven-plugin</module> <module>dhp-build-properties-maven-plugin</module>
</modules> </modules>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-build/</url>
</site>
</distributionManagement>
</project> </project>


@ -1,22 +0,0 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu ref="modules" />
<menu ref="reports"/>
</body>
</project>


@ -13,51 +13,7 @@
<artifactId>dhp-common</artifactId> <artifactId>dhp-common</artifactId>
<packaging>jar</packaging> <packaging>jar</packaging>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-common</url>
</site>
</distributionManagement>
<description>This module contains common utilities meant to be used across the dnet-hadoop submodules</description> <description>This module contains common utilities meant to be used across the dnet-hadoop submodules</description>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>${net.alchim31.maven.version}</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
<execution>
<id>scala-doc</id>
<phase>process-resources</phase> <!-- or wherever -->
<goals>
<goal>doc</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies> <dependencies>
@ -161,11 +117,6 @@
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId> <artifactId>dhp-schemas</artifactId>
</dependency> </dependency>
<dependency>
<groupId>com.opencsv</groupId>
<artifactId>opencsv</artifactId>
</dependency>
</dependencies> </dependencies>
</project> </project>


@ -0,0 +1,14 @@
package eu.dnetlib.dhp.application;
import java.io.*;
import java.util.Map;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import com.google.common.collect.Maps;
public class ApplicationUtils {
}


@ -56,13 +56,13 @@ public class ArgumentApplicationParser implements Serializable {
final StringWriter stringWriter = new StringWriter(); final StringWriter stringWriter = new StringWriter();
IOUtils.copy(gis, stringWriter); IOUtils.copy(gis, stringWriter);
return stringWriter.toString(); return stringWriter.toString();
} catch (IOException e) { } catch (Throwable e) {
log.error("Wrong value to decompress: {}", abstractCompressed); log.error("Wrong value to decompress:" + abstractCompressed);
throw new IllegalArgumentException(e); throw new RuntimeException(e);
} }
} }
public static String compressArgument(final String value) throws IOException { public static String compressArgument(final String value) throws Exception {
ByteArrayOutputStream out = new ByteArrayOutputStream(); ByteArrayOutputStream out = new ByteArrayOutputStream();
GZIPOutputStream gzip = new GZIPOutputStream(out); GZIPOutputStream gzip = new GZIPOutputStream(out);
gzip.write(value.getBytes()); gzip.write(value.getBytes());


@ -9,6 +9,9 @@ public class OptionsParameter {
private boolean paramRequired; private boolean paramRequired;
private boolean compressed; private boolean compressed;
public OptionsParameter() {
}
public String getParamName() { public String getParamName() {
return paramName; return paramName;
} }


@ -34,7 +34,7 @@ public class ApiDescriptor {
return params; return params;
} }
public void setParams(final Map<String, String> params) { public void setParams(final HashMap<String, String> params) {
this.params = params; this.params = params;
} }


@ -12,9 +12,6 @@ public class Constants {
public static String COAR_ACCESS_RIGHT_SCHEMA = "http://vocabularies.coar-repositories.org/documentation/access_rights/"; public static String COAR_ACCESS_RIGHT_SCHEMA = "http://vocabularies.coar-repositories.org/documentation/access_rights/";
private Constants() {
}
static { static {
accessRightsCoarMap.put("OPEN", "c_abf2"); accessRightsCoarMap.put("OPEN", "c_abf2");
accessRightsCoarMap.put("RESTRICTED", "c_16ec"); accessRightsCoarMap.put("RESTRICTED", "c_16ec");
@ -52,10 +49,4 @@ public class Constants {
public static final String CONTENT_INVALIDRECORDS = "InvalidRecords"; public static final String CONTENT_INVALIDRECORDS = "InvalidRecords";
public static final String CONTENT_TRANSFORMEDRECORDS = "transformedItems"; public static final String CONTENT_TRANSFORMEDRECORDS = "transformedItems";
// IETF Draft and used by Repositories like ZENODO , not included in APACHE HTTP java packages
// see https://ietf-wg-httpapi.github.io/ratelimit-headers/draft-ietf-httpapi-ratelimit-headers.html
public static final String HTTPHEADER_IETF_DRAFT_RATELIMIT_LIMIT = "X-RateLimit-Limit";
public static final String HTTPHEADER_IETF_DRAFT_RATELIMIT_REMAINING = "X-RateLimit-Remaining";
public static final String HTTPHEADER_IETF_DRAFT_RATELIMIT_RESET = "X-RateLimit-Reset";
} }


@ -0,0 +1,412 @@
package eu.dnetlib.dhp.common;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.dump.oaf.*;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityInstance;
import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class GraphResultMapper implements Serializable {
public static <E extends eu.dnetlib.dhp.schema.oaf.OafEntity> Result map(
E in) {
CommunityResult out = new CommunityResult();
eu.dnetlib.dhp.schema.oaf.Result input = (eu.dnetlib.dhp.schema.oaf.Result) in;
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> ort = Optional.ofNullable(input.getResulttype());
if (ort.isPresent()) {
switch (ort.get().getClassid()) {
case "publication":
Optional<Journal> journal = Optional
.ofNullable(((eu.dnetlib.dhp.schema.oaf.Publication) input).getJournal());
if (journal.isPresent()) {
Journal j = journal.get();
Container c = new Container();
c.setConferencedate(j.getConferencedate());
c.setConferenceplace(j.getConferenceplace());
c.setEdition(j.getEdition());
c.setEp(j.getEp());
c.setIss(j.getIss());
c.setIssnLinking(j.getIssnLinking());
c.setIssnOnline(j.getIssnOnline());
c.setIssnPrinted(j.getIssnPrinted());
c.setName(j.getName());
c.setSp(j.getSp());
c.setVol(j.getVol());
out.setContainer(c);
out.setType(ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE.getClassname());
}
break;
case "dataset":
eu.dnetlib.dhp.schema.oaf.Dataset id = (eu.dnetlib.dhp.schema.oaf.Dataset) input;
Optional.ofNullable(id.getSize()).ifPresent(v -> out.setSize(v.getValue()));
Optional.ofNullable(id.getVersion()).ifPresent(v -> out.setVersion(v.getValue()));
out
.setGeolocation(
Optional
.ofNullable(id.getGeolocation())
.map(
igl -> igl
.stream()
.filter(Objects::nonNull)
.map(gli -> {
GeoLocation gl = new GeoLocation();
gl.setBox(gli.getBox());
gl.setPlace(gli.getPlace());
gl.setPoint(gli.getPoint());
return gl;
})
.collect(Collectors.toList()))
.orElse(null));
out.setType(ModelConstants.DATASET_DEFAULT_RESULTTYPE.getClassname());
break;
case "software":
eu.dnetlib.dhp.schema.oaf.Software is = (eu.dnetlib.dhp.schema.oaf.Software) input;
Optional
.ofNullable(is.getCodeRepositoryUrl())
.ifPresent(value -> out.setCodeRepositoryUrl(value.getValue()));
Optional
.ofNullable(is.getDocumentationUrl())
.ifPresent(
value -> out
.setDocumentationUrl(
value
.stream()
.map(v -> v.getValue())
.collect(Collectors.toList())));
Optional
.ofNullable(is.getProgrammingLanguage())
.ifPresent(value -> out.setProgrammingLanguage(value.getClassid()));
out.setType(ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE.getClassname());
break;
case "other":
eu.dnetlib.dhp.schema.oaf.OtherResearchProduct ir = (eu.dnetlib.dhp.schema.oaf.OtherResearchProduct) input;
out
.setContactgroup(
Optional
.ofNullable(ir.getContactgroup())
.map(value -> value.stream().map(cg -> cg.getValue()).collect(Collectors.toList()))
.orElse(null));
out
.setContactperson(
Optional
.ofNullable(ir.getContactperson())
.map(value -> value.stream().map(cp -> cp.getValue()).collect(Collectors.toList()))
.orElse(null));
out
.setTool(
Optional
.ofNullable(ir.getTool())
.map(value -> value.stream().map(t -> t.getValue()).collect(Collectors.toList()))
.orElse(null));
out.setType(ModelConstants.ORP_DEFAULT_RESULTTYPE.getClassname());
break;
}
Optional
.ofNullable(input.getAuthor())
.ifPresent(ats -> out.setAuthor(ats.stream().map(at -> getAuthor(at)).collect(Collectors.toList())));
// I do not map Access Right UNKNOWN or OTHER
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oar = Optional.ofNullable(input.getBestaccessright());
if (oar.isPresent()) {
if (Constants.accessRightsCoarMap.containsKey(oar.get().getClassid())) {
String code = Constants.accessRightsCoarMap.get(oar.get().getClassid());
out
.setBestaccessright(
AccessRight
.newInstance(
code,
Constants.coarCodeLabelMap.get(code),
Constants.COAR_ACCESS_RIGHT_SCHEMA));
}
}
final List<String> contributorList = new ArrayList<>();
Optional
.ofNullable(input.getContributor())
.ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue())));
out.setContributor(contributorList);
Optional
.ofNullable(input.getCountry())
.ifPresent(
value -> out
.setCountry(
value
.stream()
.map(
c -> {
if (c.getClassid().equals((ModelConstants.UNKNOWN))) {
return null;
}
Country country = new Country();
country.setCode(c.getClassid());
country.setLabel(c.getClassname());
Optional
.ofNullable(c.getDataInfo())
.ifPresent(
provenance -> country
.setProvenance(
Provenance
.newInstance(
provenance
.getProvenanceaction()
.getClassname(),
c.getDataInfo().getTrust())));
return country;
})
.filter(Objects::nonNull)
.collect(Collectors.toList())));
final List<String> coverageList = new ArrayList<>();
Optional
.ofNullable(input.getCoverage())
.ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue())));
out.setCoverage(coverageList);
out.setDateofcollection(input.getDateofcollection());
final List<String> descriptionList = new ArrayList<>();
Optional
.ofNullable(input.getDescription())
.ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue())));
out.setDescription(descriptionList);
Optional<Field<String>> oStr = Optional.ofNullable(input.getEmbargoenddate());
if (oStr.isPresent()) {
out.setEmbargoenddate(oStr.get().getValue());
}
final List<String> formatList = new ArrayList<>();
Optional
.ofNullable(input.getFormat())
.ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue())));
out.setFormat(formatList);
out.setId(input.getId());
out.setOriginalId(input.getOriginalId());
Optional<List<eu.dnetlib.dhp.schema.oaf.Instance>> oInst = Optional
.ofNullable(input.getInstance());
if (oInst.isPresent()) {
out
.setInstance(
oInst.get().stream().map(i -> getInstance(i)).collect(Collectors.toList()));
}
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> oL = Optional.ofNullable(input.getLanguage());
if (oL.isPresent()) {
eu.dnetlib.dhp.schema.oaf.Qualifier language = oL.get();
out.setLanguage(Qualifier.newInstance(language.getClassid(), language.getClassname()));
}
Optional<Long> oLong = Optional.ofNullable(input.getLastupdatetimestamp());
if (oLong.isPresent()) {
out.setLastupdatetimestamp(oLong.get());
}
Optional<List<StructuredProperty>> otitle = Optional.ofNullable(input.getTitle());
if (otitle.isPresent()) {
List<StructuredProperty> iTitle = otitle
.get()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
.collect(Collectors.toList());
if (iTitle.size() > 0) {
out.setMaintitle(iTitle.get(0).getValue());
}
iTitle = otitle
.get()
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
.collect(Collectors.toList());
if (iTitle.size() > 0) {
out.setSubtitle(iTitle.get(0).getValue());
}
}
List<ControlledField> pids = new ArrayList<>();
Optional
.ofNullable(input.getPid())
.ifPresent(
value -> value
.stream()
.forEach(
p -> pids
.add(
ControlledField
.newInstance(p.getQualifier().getClassid(), p.getValue()))));
out.setPid(pids);
oStr = Optional.ofNullable(input.getDateofacceptance());
if (oStr.isPresent()) {
out.setPublicationdate(oStr.get().getValue());
}
oStr = Optional.ofNullable(input.getPublisher());
if (oStr.isPresent()) {
out.setPublisher(oStr.get().getValue());
}
List<String> sourceList = new ArrayList<>();
Optional
.ofNullable(input.getSource())
.ifPresent(value -> value.stream().forEach(s -> sourceList.add(s.getValue())));
// out.setSource(input.getSource().stream().map(s -> s.getValue()).collect(Collectors.toList()));
List<Subject> subjectList = new ArrayList<>();
Optional
.ofNullable(input.getSubject())
.ifPresent(
value -> value
.forEach(s -> subjectList.add(getSubject(s))));
out.setSubjects(subjectList);
out.setType(input.getResulttype().getClassid());
}
out
.setCollectedfrom(
input
.getCollectedfrom()
.stream()
.map(cf -> KeyValue.newInstance(cf.getKey(), cf.getValue()))
.collect(Collectors.toList()));
return out;
}
private static CommunityInstance getInstance(eu.dnetlib.dhp.schema.oaf.Instance i) {
CommunityInstance instance = new CommunityInstance();
setCommonValue(i, instance);
instance
.setCollectedfrom(
KeyValue
.newInstance(i.getCollectedfrom().getKey(), i.getCollectedfrom().getValue()));
instance
.setHostedby(
KeyValue.newInstance(i.getHostedby().getKey(), i.getHostedby().getValue()));
return instance;
}
private static <I extends Instance> void setCommonValue(eu.dnetlib.dhp.schema.oaf.Instance i, I instance) {
Optional<eu.dnetlib.dhp.schema.oaf.Qualifier> opAr = Optional
.ofNullable(i.getAccessright());
if (opAr.isPresent()) {
if (Constants.accessRightsCoarMap.containsKey(opAr.get().getClassid())) {
String code = Constants.accessRightsCoarMap.get(opAr.get().getClassid());
instance
.setAccessright(
AccessRight
.newInstance(
code,
Constants.coarCodeLabelMap.get(code),
Constants.COAR_ACCESS_RIGHT_SCHEMA));
}
}
Optional
.ofNullable(i.getLicense())
.ifPresent(value -> instance.setLicense(value.getValue()));
Optional
.ofNullable(i.getDateofacceptance())
.ifPresent(value -> instance.setPublicationdate(value.getValue()));
Optional
.ofNullable(i.getRefereed())
.ifPresent(value -> instance.setRefereed(value.getClassname()));
Optional
.ofNullable(i.getInstancetype())
.ifPresent(value -> instance.setType(value.getClassname()));
Optional.ofNullable(i.getUrl()).ifPresent(value -> instance.setUrl(value));
}
private static Subject getSubject(StructuredProperty s) {
Subject subject = new Subject();
subject.setSubject(ControlledField.newInstance(s.getQualifier().getClassid(), s.getValue()));
Optional<DataInfo> di = Optional.ofNullable(s.getDataInfo());
if (di.isPresent()) {
Provenance p = new Provenance();
p.setProvenance(di.get().getProvenanceaction().getClassname());
p.setTrust(di.get().getTrust());
subject.setProvenance(p);
}
return subject;
}
private static Author getAuthor(eu.dnetlib.dhp.schema.oaf.Author oa) {
Author a = new Author();
a.setFullname(oa.getFullname());
a.setName(oa.getName());
a.setSurname(oa.getSurname());
a.setRank(oa.getRank());
Optional<List<StructuredProperty>> oPids = Optional
.ofNullable(oa.getPid());
if (oPids.isPresent()) {
Pid pid = getOrcid(oPids.get());
if (pid != null) {
a.setPid(pid);
}
}
return a;
}
private static Pid getOrcid(List<StructuredProperty> p) {
for (StructuredProperty pid : p) {
if (pid.getQualifier().getClassid().equals(ModelConstants.ORCID)) {
Optional<DataInfo> di = Optional.ofNullable(pid.getDataInfo());
if (di.isPresent()) {
return Pid
.newInstance(
ControlledField
.newInstance(
pid.getQualifier().getClassid(),
pid.getValue()),
Provenance
.newInstance(
di.get().getProvenanceaction().getClassname(),
di.get().getTrust()));
} else {
return Pid
.newInstance(
ControlledField
.newInstance(
pid.getQualifier().getClassid(),
pid.getValue())
);
}
}
}
return null;
}
}


@ -28,7 +28,7 @@ public class HdfsSupport {
* @param configuration Configuration of hadoop env * @param configuration Configuration of hadoop env
*/ */
public static boolean exists(String path, Configuration configuration) { public static boolean exists(String path, Configuration configuration) {
logger.info("Checking existence for path: {}", path); logger.info("Removing path: {}", path);
return rethrowAsRuntimeException( return rethrowAsRuntimeException(
() -> { () -> {
Path f = new Path(path); Path f = new Path(path);


@ -14,33 +14,38 @@ public class MakeTarArchive implements Serializable {
private static TarArchiveOutputStream getTar(FileSystem fileSystem, String outputPath) throws IOException { private static TarArchiveOutputStream getTar(FileSystem fileSystem, String outputPath) throws IOException {
Path hdfsWritePath = new Path(outputPath); Path hdfsWritePath = new Path(outputPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) { if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, true); fileSystem.delete(hdfsWritePath, true);
} }
return new TarArchiveOutputStream(fileSystem.create(hdfsWritePath).getWrappedStream()); fsDataOutputStream = fileSystem.create(hdfsWritePath);
return new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream());
} }
private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name) private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name)
throws IOException { throws IOException {
Path hdfsWritePath = new Path(outputPath); Path hdfsWritePath = new Path(outputPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) { if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, true); fileSystem.delete(hdfsWritePath, true);
} }
try (TarArchiveOutputStream ar = new TarArchiveOutputStream( fsDataOutputStream = fileSystem.create(hdfsWritePath);
fileSystem.create(hdfsWritePath).getWrappedStream())) {
RemoteIterator<LocatedFileStatus> iterator = fileSystem TarArchiveOutputStream ar = new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream());
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
.listFiles( .listFiles(
new Path(inputPath), true); new Path(inputPath), true);
while (iterator.hasNext()) { while (fileStatusListIterator.hasNext()) {
writeCurrentFile(fileSystem, dir_name, iterator, ar, 0); writeCurrentFile(fileSystem, dir_name, fileStatusListIterator, ar, 0);
} }
} ar.close();
} }
public static void tarMaxSize(FileSystem fileSystem, String inputPath, String outputPath, String dir_name, public static void tarMaxSize(FileSystem fileSystem, String inputPath, String outputPath, String dir_name,


@ -10,6 +10,8 @@ import java.util.Optional;
import java.util.stream.StreamSupport; import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.bson.Document; import org.bson.Document;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -19,7 +21,6 @@ import com.mongodb.BasicDBObject;
import com.mongodb.MongoClient; import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI; import com.mongodb.MongoClientURI;
import com.mongodb.QueryBuilder; import com.mongodb.QueryBuilder;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase; import com.mongodb.client.MongoDatabase;
@ -45,7 +46,7 @@ public class MdstoreClient implements Closeable {
final String currentId = Optional final String currentId = Optional
.ofNullable(getColl(db, COLL_METADATA_MANAGER, true).find(query)) .ofNullable(getColl(db, COLL_METADATA_MANAGER, true).find(query))
.map(FindIterable::first) .map(r -> r.first())
.map(d -> d.getString("currentId")) .map(d -> d.getString("currentId"))
.orElseThrow(() -> new IllegalArgumentException("cannot find current mdstore id for: " + mdId)); .orElseThrow(() -> new IllegalArgumentException("cannot find current mdstore id for: " + mdId));
@ -83,7 +84,7 @@ public class MdstoreClient implements Closeable {
if (!Iterables.contains(client.listDatabaseNames(), dbName)) { if (!Iterables.contains(client.listDatabaseNames(), dbName)) {
final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress()); final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress());
log.warn(err); log.warn(err);
throw new IllegalArgumentException(err); throw new RuntimeException(err);
} }
return client.getDatabase(dbName); return client.getDatabase(dbName);
} }
@ -96,7 +97,7 @@ public class MdstoreClient implements Closeable {
String.format("Missing collection '%s' in database '%s'", collName, db.getName())); String.format("Missing collection '%s' in database '%s'", collName, db.getName()));
log.warn(err); log.warn(err);
if (abortIfMissing) { if (abortIfMissing) {
throw new IllegalArgumentException(err); throw new RuntimeException(err);
} else { } else {
return null; return null;
} }


@ -24,6 +24,7 @@ import com.google.common.hash.Hashing;
*/ */
public class PacePerson { public class PacePerson {
private static final String UTF8 = "UTF-8";
private List<String> name = Lists.newArrayList(); private List<String> name = Lists.newArrayList();
private List<String> surname = Lists.newArrayList(); private List<String> surname = Lists.newArrayList();
private List<String> fullname = Lists.newArrayList(); private List<String> fullname = Lists.newArrayList();


@ -5,9 +5,6 @@ import java.io.*;
import java.io.IOException; import java.io.IOException;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import org.apache.http.HttpHeaders;
import org.apache.http.entity.ContentType;
import com.google.gson.Gson; import com.google.gson.Gson;
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModel; import eu.dnetlib.dhp.common.api.zenodo.ZenodoModel;
@ -46,7 +43,7 @@ public class ZenodoAPIClient implements Serializable {
this.deposition_id = deposition_id; this.deposition_id = deposition_id;
} }
public ZenodoAPIClient(String urlString, String access_token) { public ZenodoAPIClient(String urlString, String access_token) throws IOException {
this.urlString = urlString; this.urlString = urlString;
this.access_token = access_token; this.access_token = access_token;
@ -66,8 +63,8 @@ public class ZenodoAPIClient implements Serializable {
Request request = new Request.Builder() Request request = new Request.Builder()
.url(urlString) .url(urlString)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers .addHeader("Content-Type", "application/json") // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token) .addHeader("Authorization", "Bearer " + access_token)
.post(body) .post(body)
.build(); .build();
@ -106,8 +103,8 @@ public class ZenodoAPIClient implements Serializable {
Request request = new Request.Builder() Request request = new Request.Builder()
.url(bucket + "/" + file_name) .url(bucket + "/" + file_name)
.addHeader(HttpHeaders.CONTENT_TYPE, "application/zip") // add request headers .addHeader("Content-Type", "application/zip") // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token) .addHeader("Authorization", "Bearer " + access_token)
.put(InputStreamRequestBody.create(MEDIA_TYPE_ZIP, is, len)) .put(InputStreamRequestBody.create(MEDIA_TYPE_ZIP, is, len))
.build(); .build();
@ -133,8 +130,8 @@ public class ZenodoAPIClient implements Serializable {
Request request = new Request.Builder() Request request = new Request.Builder()
.url(urlString + "/" + deposition_id) .url(urlString + "/" + deposition_id)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers .addHeader("Content-Type", "application/json") // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token) .addHeader("Authorization", "Bearer " + access_token)
.put(body) .put(body)
.build(); .build();
@ -200,7 +197,7 @@ public class ZenodoAPIClient implements Serializable {
Request request = new Request.Builder() Request request = new Request.Builder()
.url(urlString + "/" + deposition_id + "/actions/newversion") .url(urlString + "/" + deposition_id + "/actions/newversion")
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token) .addHeader("Authorization", "Bearer " + access_token)
.post(body) .post(body)
.build(); .build();
@ -273,8 +270,8 @@ public class ZenodoAPIClient implements Serializable {
Request request = new Request.Builder() Request request = new Request.Builder()
.url(urlString) .url(urlString)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers .addHeader("Content-Type", "application/json") // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token) .addHeader("Authorization", "Bearer " + access_token)
.get() .get()
.build(); .build();
@ -296,8 +293,8 @@ public class ZenodoAPIClient implements Serializable {
Request request = new Request.Builder() Request request = new Request.Builder()
.url(url) .url(url)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers .addHeader("Content-Type", "application/json") // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token) .addHeader("Authorization", "Bearer " + access_token)
.get() .get()
.build(); .build();


@ -32,13 +32,13 @@ public class Creator {
public static Creator newInstance(String name, String affiliation, String orcid) { public static Creator newInstance(String name, String affiliation, String orcid) {
Creator c = new Creator(); Creator c = new Creator();
if (name != null) { if (!(name == null)) {
c.name = name; c.name = name;
} }
if (affiliation != null) { if (!(affiliation == null)) {
c.affiliation = affiliation; c.affiliation = affiliation;
} }
if (orcid != null) { if (!(orcid == null)) {
c.orcid = orcid; c.orcid = orcid;
} }


@ -3,12 +3,17 @@ package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable; import java.io.Serializable;
import net.minidev.json.annotate.JsonIgnore;
public class File implements Serializable { public class File implements Serializable {
private String checksum; private String checksum;
private String filename; private String filename;
private long filesize; private long filesize;
private String id; private String id;
@JsonIgnore
// private Links links;
public String getChecksum() { public String getChecksum() {
return checksum; return checksum;
} }
@ -41,4 +46,13 @@ public class File implements Serializable {
this.id = id; this.id = id;
} }
// @JsonIgnore
// public Links getLinks() {
// return links;
// }
//
// @JsonIgnore
// public void setLinks(Links links) {
// this.links = links;
// }
} }


@ -1,56 +0,0 @@
package eu.dnetlib.dhp.common.collection;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.opencsv.bean.CsvToBeanBuilder;
public class GetCSV {
public static final char DEFAULT_DELIMITER = ',';
private GetCSV() {
}
public static void getCsv(FileSystem fileSystem, BufferedReader reader, String hdfsPath,
String modelClass) throws IOException, ClassNotFoundException {
getCsv(fileSystem, reader, hdfsPath, modelClass, DEFAULT_DELIMITER);
}
public static void getCsv(FileSystem fileSystem, Reader reader, String hdfsPath,
String modelClass, char delimiter) throws IOException, ClassNotFoundException {
Path hdfsWritePath = new Path(hdfsPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, false);
}
fsDataOutputStream = fileSystem.create(hdfsWritePath);
try (BufferedWriter writer = new BufferedWriter(
new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8))) {
final ObjectMapper mapper = new ObjectMapper();
@SuppressWarnings("unchecked")
final List lines = new CsvToBeanBuilder(reader)
.withType(Class.forName(modelClass))
.withSeparator(delimiter)
.build()
.parse();
for (Object line : lines) {
writer.write(mapper.writeValueAsString(line));
writer.newLine();
}
}
}
}


@ -1,11 +1,11 @@
package eu.dnetlib.dhp.common.rest; package eu.dnetlib.dhp.common.rest;
import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost; import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest; import org.apache.http.client.methods.HttpUriRequest;
@ -23,20 +23,17 @@ public class DNetRestClient {
private static final ObjectMapper mapper = new ObjectMapper(); private static final ObjectMapper mapper = new ObjectMapper();
private DNetRestClient() {
}
public static <T> T doGET(final String url, Class<T> clazz) throws Exception { public static <T> T doGET(final String url, Class<T> clazz) throws Exception {
final HttpGet httpGet = new HttpGet(url); final HttpGet httpGet = new HttpGet(url);
return doHTTPRequest(httpGet, clazz); return doHTTPRequest(httpGet, clazz);
} }
public static String doGET(final String url) throws IOException { public static String doGET(final String url) throws Exception {
final HttpGet httpGet = new HttpGet(url); final HttpGet httpGet = new HttpGet(url);
return doHTTPRequest(httpGet); return doHTTPRequest(httpGet);
} }
public static <V> String doPOST(final String url, V objParam) throws IOException { public static <V> String doPOST(final String url, V objParam) throws Exception {
final HttpPost httpPost = new HttpPost(url); final HttpPost httpPost = new HttpPost(url);
if (objParam != null) { if (objParam != null) {
@ -48,12 +45,12 @@ public class DNetRestClient {
return doHTTPRequest(httpPost); return doHTTPRequest(httpPost);
} }
public static <T, V> T doPOST(final String url, V objParam, Class<T> clazz) throws IOException { public static <T, V> T doPOST(final String url, V objParam, Class<T> clazz) throws Exception {
return mapper.readValue(doPOST(url, objParam), clazz); return mapper.readValue(doPOST(url, objParam), clazz);
} }
private static String doHTTPRequest(final HttpUriRequest r) throws IOException { private static String doHTTPRequest(final HttpUriRequest r) throws Exception {
try (CloseableHttpClient client = HttpClients.createDefault()) { CloseableHttpClient client = HttpClients.createDefault();
log.info("performing HTTP request, method {} on URI {}", r.getMethod(), r.getURI().toString()); log.info("performing HTTP request, method {} on URI {}", r.getMethod(), r.getURI().toString());
log log
@ -65,8 +62,8 @@ public class DNetRestClient {
.map(h -> h.getName() + ":" + h.getValue()) .map(h -> h.getName() + ":" + h.getValue())
.collect(Collectors.joining(","))); .collect(Collectors.joining(",")));
return IOUtils.toString(client.execute(r).getEntity().getContent()); CloseableHttpResponse response = client.execute(r);
} return IOUtils.toString(response.getEntity().getContent());
} }
private static <T> T doHTTPRequest(final HttpUriRequest r, Class<T> clazz) throws Exception { private static <T> T doHTTPRequest(final HttpUriRequest r, Class<T> clazz) throws Exception {


@ -46,7 +46,7 @@ public class Vocabulary implements Serializable {
} }
public VocabularyTerm getTerm(final String id) { public VocabularyTerm getTerm(final String id) {
return Optional.ofNullable(id).map(String::toLowerCase).map(terms::get).orElse(null); return Optional.ofNullable(id).map(s -> s.toLowerCase()).map(s -> terms.get(s)).orElse(null);
} }
protected void addTerm(final String id, final String name) { protected void addTerm(final String id, final String name) {
@ -81,6 +81,7 @@ public class Vocabulary implements Serializable {
.ofNullable(getTermBySynonym(syn)) .ofNullable(getTermBySynonym(syn))
.map(term -> getTermAsQualifier(term.getId())) .map(term -> getTermAsQualifier(term.getId()))
.orElse(null); .orElse(null);
// .orElse(OafMapperUtils.unknown(getId(), getName()));
} }
} }


@ -46,6 +46,7 @@ public class VocabularyGroup implements Serializable {
} }
vocs.addTerm(vocId, termId, termName); vocs.addTerm(vocId, termId, termName);
// vocs.addSynonyms(vocId, termId, termId);
} }
} }
@ -57,17 +58,10 @@ public class VocabularyGroup implements Serializable {
final String syn = arr[2].trim(); final String syn = arr[2].trim();
vocs.addSynonyms(vocId, termId, syn); vocs.addSynonyms(vocId, termId, syn);
// vocs.addSynonyms(vocId, termId, termId);
} }
} }
// add the term names as synonyms
vocs.vocs.values().forEach(voc -> {
voc.getTerms().values().forEach(term -> {
voc.addSynonym(term.getName().toLowerCase(), term.getId());
});
});
return vocs; return vocs;
} }
@ -104,7 +98,7 @@ public class VocabularyGroup implements Serializable {
.getTerms() .getTerms()
.values() .values()
.stream() .stream()
.map(VocabularyTerm::getId) .map(t -> t.getId())
.collect(Collectors.toCollection(HashSet::new)); .collect(Collectors.toCollection(HashSet::new));
} }
@ -160,19 +154,16 @@ public class VocabularyGroup implements Serializable {
return Optional return Optional
.ofNullable(vocId) .ofNullable(vocId)
.map(String::toLowerCase) .map(String::toLowerCase)
.map(vocs::containsKey) .map(id -> vocs.containsKey(id))
.orElse(false); .orElse(false);
} }
private void addSynonyms(final String vocId, final String termId, final String syn) { private void addSynonyms(final String vocId, final String termId, final String syn) {
String id = Optional String id = Optional
.ofNullable(vocId) .ofNullable(vocId)
.map(String::toLowerCase) .map(s -> s.toLowerCase())
.orElseThrow( .orElseThrow(
() -> new IllegalArgumentException( () -> new IllegalArgumentException(String.format("empty vocabulary id for [term:%s, synonym:%s]")));
String
.format(
"empty vocabulary id for [term:%s, synonym:%s]", termId, syn)));
Optional Optional
.ofNullable(vocs.get(id)) .ofNullable(vocs.get(id))
.orElseThrow(() -> new IllegalArgumentException("missing vocabulary id: " + vocId)) .orElseThrow(() -> new IllegalArgumentException("missing vocabulary id: " + vocId))


@ -2,6 +2,7 @@
package eu.dnetlib.dhp.message; package eu.dnetlib.dhp.message;
import java.io.Serializable; import java.io.Serializable;
import java.util.HashMap;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.Map; import java.util.Map;
@ -9,8 +10,8 @@ public class Message implements Serializable {
private static final long serialVersionUID = 401753881204524893L; private static final long serialVersionUID = 401753881204524893L;
public static final String CURRENT_PARAM = "current"; public static String CURRENT_PARAM = "current";
public static final String TOTAL_PARAM = "total"; public static String TOTAL_PARAM = "total";
private MessageType messageType; private MessageType messageType;


@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.merge;
import java.text.Normalizer; import java.text.Normalizer;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -18,9 +19,6 @@ public class AuthorMerger {
private static final Double THRESHOLD = 0.95; private static final Double THRESHOLD = 0.95;
private AuthorMerger() {
}
public static List<Author> merge(List<List<Author>> authors) { public static List<Author> merge(List<List<Author>> authors) {
authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2))); authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
@ -38,8 +36,7 @@ public class AuthorMerger {
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) { public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
int pa = countAuthorsPids(a); int pa = countAuthorsPids(a);
int pb = countAuthorsPids(b); int pb = countAuthorsPids(b);
List<Author> base; List<Author> base, enrich;
List<Author> enrich;
int sa = authorsSize(a); int sa = authorsSize(a);
int sb = authorsSize(b); int sb = authorsSize(b);
@ -65,7 +62,7 @@ public class AuthorMerger {
// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list) // <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
final Map<String, Author> basePidAuthorMap = base final Map<String, Author> basePidAuthorMap = base
.stream() .stream()
.filter(a -> a.getPid() != null && !a.getPid().isEmpty()) .filter(a -> a.getPid() != null && a.getPid().size() > 0)
.flatMap( .flatMap(
a -> a a -> a
.getPid() .getPid()
@ -77,7 +74,7 @@ public class AuthorMerger {
// <pid, Author> (list of pid that are missing in the other list) // <pid, Author> (list of pid that are missing in the other list)
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
.stream() .stream()
.filter(a -> a.getPid() != null && !a.getPid().isEmpty()) .filter(a -> a.getPid() != null && a.getPid().size() > 0)
.flatMap( .flatMap(
a -> a a -> a
.getPid() .getPid()
@ -120,9 +117,9 @@ public class AuthorMerger {
} }
public static String pidToComparableString(StructuredProperty pid) { public static String pidToComparableString(StructuredProperty pid) {
final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() return (pid.getQualifier() != null
: ""; ? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
return (pid.getQualifier() != null ? classid : "") : "")
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : ""); + (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
} }

View File

@ -12,9 +12,6 @@ import com.ximpleware.VTDNav;
/** Created by sandro on 9/29/16. */ /** Created by sandro on 9/29/16. */
public class VtdUtilityParser { public class VtdUtilityParser {
private VtdUtilityParser() {
}
public static List<Node> getTextValuesWithAttributes( public static List<Node> getTextValuesWithAttributes(
final AutoPilot ap, final VTDNav vn, final String xpath, final List<String> attributes) final AutoPilot ap, final VTDNav vn, final String xpath, final List<String> attributes)
throws VtdException { throws VtdException {

View File

@ -16,8 +16,6 @@ import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
@ -29,11 +27,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
public static final int ORCID_LEN = 19; public static final int ORCID_LEN = 19;
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*"; public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]";
public static final String TITLE_TEST = "test"; public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10;
public static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST);
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
public static <T extends Oaf> T fixVocabularyNames(T value) { public static <T extends Oaf> T fixVocabularyNames(T value) {
if (value instanceof Datasource) { if (value instanceof Datasource) {
@ -88,22 +83,6 @@ public class GraphCleaningFunctions extends CleaningFunctions {
} }
public static <T extends Oaf> boolean filter(T value) { public static <T extends Oaf> boolean filter(T value) {
if (Boolean.TRUE
.equals(
Optional
.ofNullable(value)
.map(
o -> Optional
.ofNullable(o.getDataInfo())
.map(
d -> Optional
.ofNullable(d.getInvisible())
.orElse(true))
.orElse(true))
.orElse(true))) {
return true;
}
if (value instanceof Datasource) { if (value instanceof Datasource) {
// nothing to evaluate here // nothing to evaluate here
} else if (value instanceof Project) { } else if (value instanceof Project) {
@ -133,7 +112,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
return true; return true;
} }
public static <T extends Oaf> T cleanup(T value, VocabularyGroup vocs) { public static <T extends Oaf> T cleanup(T value) {
if (value instanceof Datasource) { if (value instanceof Datasource) {
// nothing to clean here // nothing to clean here
} else if (value instanceof Project) { } else if (value instanceof Project) {
@ -216,29 +195,14 @@ public class GraphCleaningFunctions extends CleaningFunctions {
final String title = sp final String title = sp
.getValue() .getValue()
.toLowerCase(); .toLowerCase();
final String decoded = Unidecode.decode(title); final String residual = Unidecode
.decode(title)
if (StringUtils.contains(decoded, TITLE_TEST)) { .replaceAll(TITLE_FILTER_REGEX, "");
return decoded return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH;
.replaceAll(TITLE_FILTER_REGEX, "")
.length() > TITLE_FILTER_RESIDUAL_LENGTH;
}
return !decoded
.replaceAll("\\W|\\d", "")
.isEmpty();
}) })
.map(GraphCleaningFunctions::cleanValue) .map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
if (Objects.nonNull(r.getFormat())) {
r
.setFormat(
r
.getFormat()
.stream()
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getDescription())) { if (Objects.nonNull(r.getDescription())) {
r r
.setDescription( .setDescription(
@ -261,38 +225,6 @@ public class GraphCleaningFunctions extends CleaningFunctions {
if (Objects.nonNull(r.getInstance())) { if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) { for (Instance i : r.getInstance()) {
if (!vocs.termExists(ModelConstants.DNET_PUBLICATION_RESOURCE, i.getInstancetype().getClassid())) {
if (r instanceof Publication) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0038", "Other literature type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
} else if (r instanceof Dataset) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0039", "Other dataset type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
} else if (r instanceof Software) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0040", "Other software type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
} else if (r instanceof OtherResearchProduct) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0020", "Other ORP type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
}
}
if (Objects.nonNull(i.getPid())) { if (Objects.nonNull(i.getPid())) {
i.setPid(processPidCleaning(i.getPid())); i.setPid(processPidCleaning(i.getPid()));
} }
@ -352,7 +284,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
r r
.getAuthor() .getAuthor()
.stream() .stream()
.filter(Objects::nonNull) .filter(a -> Objects.nonNull(a))
.filter(a -> StringUtils.isNotBlank(a.getFullname())) .filter(a -> StringUtils.isNotBlank(a.getFullname()))
.filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", ""))) .filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", "")))
.collect(Collectors.toList())); .collect(Collectors.toList()));

View File

@ -17,16 +17,13 @@ import eu.dnetlib.dhp.schema.oaf.*;
public class OafMapperUtils { public class OafMapperUtils {
private OafMapperUtils() {
}
public static Oaf merge(final Oaf left, final Oaf right) { public static Oaf merge(final Oaf left, final Oaf right) {
if (ModelSupport.isSubClass(left, OafEntity.class)) { if (ModelSupport.isSubClass(left, OafEntity.class)) {
return mergeEntities((OafEntity) left, (OafEntity) right); return mergeEntities((OafEntity) left, (OafEntity) right);
} else if (ModelSupport.isSubClass(left, Relation.class)) { } else if (ModelSupport.isSubClass(left, Relation.class)) {
((Relation) left).mergeFrom((Relation) right); ((Relation) left).mergeFrom((Relation) right);
} else { } else {
throw new IllegalArgumentException("invalid Oaf type:" + left.getClass().getCanonicalName()); throw new RuntimeException("invalid Oaf type:" + left.getClass().getCanonicalName());
} }
return left; return left;
} }
@ -41,7 +38,7 @@ public class OafMapperUtils {
} else if (ModelSupport.isSubClass(left, Project.class)) { } else if (ModelSupport.isSubClass(left, Project.class)) {
left.mergeFrom(right); left.mergeFrom(right);
} else { } else {
throw new IllegalArgumentException("invalid OafEntity subtype:" + left.getClass().getCanonicalName()); throw new RuntimeException("invalid OafEntity subtype:" + left.getClass().getCanonicalName());
} }
return left; return left;
} }
@ -65,7 +62,7 @@ public class OafMapperUtils {
public static List<KeyValue> listKeyValues(final String... s) { public static List<KeyValue> listKeyValues(final String... s) {
if (s.length % 2 > 0) { if (s.length % 2 > 0) {
throw new IllegalArgumentException("Invalid number of parameters (k,v,k,v,....)"); throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)");
} }
final List<KeyValue> list = new ArrayList<>(); final List<KeyValue> list = new ArrayList<>();
@ -91,7 +88,7 @@ public class OafMapperUtils {
.stream(values) .stream(values)
.map(v -> field(v, info)) .map(v -> field(v, info))
.filter(Objects::nonNull) .filter(Objects::nonNull)
.filter(distinctByKey(Field::getValue)) .filter(distinctByKey(f -> f.getValue()))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
@ -100,7 +97,7 @@ public class OafMapperUtils {
.stream() .stream()
.map(v -> field(v, info)) .map(v -> field(v, info))
.filter(Objects::nonNull) .filter(Objects::nonNull)
.filter(distinctByKey(Field::getValue)) .filter(distinctByKey(f -> f.getValue()))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
@ -345,10 +342,10 @@ public class OafMapperUtils {
if (instanceList != null) { if (instanceList != null) {
final Optional<AccessRight> min = instanceList final Optional<AccessRight> min = instanceList
.stream() .stream()
.map(Instance::getAccessright) .map(i -> i.getAccessright())
.min(new AccessRightComparator<>()); .min(new AccessRightComparator<>());
final Qualifier rights = min.map(OafMapperUtils::qualifier).orElseGet(Qualifier::new); final Qualifier rights = min.isPresent() ? qualifier(min.get()) : new Qualifier();
if (StringUtils.isBlank(rights.getClassid())) { if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN); rights.setClassid(UNKNOWN);

View File

@ -4,19 +4,19 @@ package eu.dnetlib.dhp.utils;
import java.io.*; import java.io.*;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.security.MessageDigest; import java.security.MessageDigest;
import java.util.*; import java.util.List;
import java.util.stream.Collectors; import java.util.Map;
import java.util.Properties;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.codec.binary.Base64OutputStream;
import org.apache.commons.codec.binary.Hex; import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SaveMode;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -26,8 +26,6 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import com.jayway.jsonpath.JsonPath; import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import net.minidev.json.JSONArray; import net.minidev.json.JSONArray;
import scala.collection.JavaConverters; import scala.collection.JavaConverters;
import scala.collection.Seq; import scala.collection.Seq;
@ -36,9 +34,6 @@ public class DHPUtils {
private static final Logger log = LoggerFactory.getLogger(DHPUtils.class); private static final Logger log = LoggerFactory.getLogger(DHPUtils.class);
private DHPUtils() {
}
public static Seq<String> toSeq(List<String> list) { public static Seq<String> toSeq(List<String> list) {
return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq(); return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq();
} }
@ -49,59 +44,40 @@ public class DHPUtils {
md.update(s.getBytes(StandardCharsets.UTF_8)); md.update(s.getBytes(StandardCharsets.UTF_8));
return new String(Hex.encodeHex(md.digest())); return new String(Hex.encodeHex(md.digest()));
} catch (final Exception e) { } catch (final Exception e) {
log.error("Error creating id from {}", s); System.err.println("Error creating id");
return null; return null;
} }
} }
/**
* Retrieves from the metadata store manager application the list of paths associated with mdstores characterized
* by the given format, layout and interpretation
* @param mdstoreManagerUrl the URL of the mdstore manager service
* @param format the mdstore format
* @param layout the mdstore layout
* @param interpretation the mdstore interpretation
* @param includeEmpty whether to include empty mdstores
* @return the set of hdfs paths
* @throws IOException in case of HTTP communication issues
*/
public static Set<String> mdstorePaths(final String mdstoreManagerUrl,
final String format,
final String layout,
final String interpretation,
boolean includeEmpty) throws IOException {
final String url = mdstoreManagerUrl + "/mdstores/";
final ObjectMapper objectMapper = new ObjectMapper();
final HttpGet req = new HttpGet(url);
try (final CloseableHttpClient client = HttpClients.createDefault()) {
try (final CloseableHttpResponse response = client.execute(req)) {
final String json = IOUtils.toString(response.getEntity().getContent());
final MDStoreWithInfo[] mdstores = objectMapper.readValue(json, MDStoreWithInfo[].class);
return Arrays
.stream(mdstores)
.filter(md -> md.getFormat().equalsIgnoreCase(format))
.filter(md -> md.getLayout().equalsIgnoreCase(layout))
.filter(md -> md.getInterpretation().equalsIgnoreCase(interpretation))
.filter(md -> StringUtils.isNotBlank(md.getHdfsPath()))
.filter(md -> StringUtils.isNotBlank(md.getCurrentVersion()))
.filter(md -> includeEmpty || md.getSize() > 0)
.map(md -> md.getHdfsPath() + "/" + md.getCurrentVersion() + "/store")
.collect(Collectors.toSet());
}
}
}
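For reference, a minimal usage sketch (in Scala) of the mdstorePaths helper documented above; the manager URL and the format/layout/interpretation values below are hypothetical placeholders, not part of this change-set:

import eu.dnetlib.dhp.utils.DHPUtils

// Hypothetical parameters: point these at a real mdstore manager deployment.
val paths: java.util.Set[String] = DHPUtils.mdstorePaths(
  "http://mdstore-manager.example.org", // mdstoreManagerUrl (hypothetical)
  "ODF",                                // format
  "store",                              // layout
  "cleaned",                            // interpretation
  false                                 // includeEmpty: skip empty mdstores
)
// each returned entry has the shape <hdfsPath>/<currentVersion>/store
paths.forEach(p => println(p))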
public static String generateIdentifier(final String originalId, final String nsPrefix) { public static String generateIdentifier(final String originalId, final String nsPrefix) {
return String.format("%s::%s", nsPrefix, DHPUtils.md5(originalId)); return String.format("%s::%s", nsPrefix, DHPUtils.md5(originalId));
} }
public static String generateUnresolvedIdentifier(final String pid, final String pidType) { public static String compressString(final String input) {
try (ByteArrayOutputStream out = new ByteArrayOutputStream();
Base64OutputStream b64os = new Base64OutputStream(out)) {
GZIPOutputStream gzip = new GZIPOutputStream(b64os);
gzip.write(input.getBytes(StandardCharsets.UTF_8));
gzip.close();
return out.toString();
} catch (Throwable e) {
return null;
}
}
final String cleanedPid = CleaningFunctions.normalizePidValue(pidType, pid); public static String decompressString(final String input) {
byte[] byteArray = Base64.decodeBase64(input.getBytes());
return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim()); int len;
try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream((byteArray)));
ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length)) {
byte[] buffer = new byte[1024];
while ((len = gis.read(buffer)) != -1) {
bos.write(buffer, 0, len);
}
return bos.toString();
} catch (Exception e) {
return null;
}
} }
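Since the new generateUnresolvedIdentifier helper is interleaved above with the removed compress/decompress methods, here is a short Scala sketch of what it produces; the pid value is made up, and the exact normalisation is delegated to CleaningFunctions.normalizePidValue:

import eu.dnetlib.dhp.utils.DHPUtils

// Produces a string of the form unresolved::<normalised pid>::<pid type>
val unresolvedId = DHPUtils.generateUnresolvedIdentifier("10.1000/xyz123", "doi")
println(unresolvedId) // e.g. unresolved::10.1000/xyz123::doi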
public static String getJPathString(final String jsonPath, final String json) { public static String getJPathString(final String jsonPath, final String json) {

View File

@ -18,16 +18,13 @@ public class ISLookupClientFactory {
private static final int requestTimeout = 60000 * 10; private static final int requestTimeout = 60000 * 10;
private static final int connectTimeout = 60000 * 10; private static final int connectTimeout = 60000 * 10;
private ISLookupClientFactory() {
}
public static ISLookUpService getLookUpService(final String isLookupUrl) { public static ISLookUpService getLookUpService(final String isLookupUrl) {
return getServiceStub(ISLookUpService.class, isLookupUrl); return getServiceStub(ISLookUpService.class, isLookupUrl);
} }
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
private static <T> T getServiceStub(final Class<T> clazz, final String endpoint) { private static <T> T getServiceStub(final Class<T> clazz, final String endpoint) {
log.info("creating {} stub from {}", clazz.getName(), endpoint); log.info(String.format("creating %s stub from %s", clazz.getName(), endpoint));
final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean(); final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean();
jaxWsProxyFactory.setServiceClass(clazz); jaxWsProxyFactory.setServiceClass(clazz);
jaxWsProxyFactory.setAddress(endpoint); jaxWsProxyFactory.setAddress(endpoint);
@ -41,10 +38,12 @@ public class ISLookupClientFactory {
log log
.info( .info(
"setting connectTimeout to {}, requestTimeout to {} for service {}", String
.format(
"setting connectTimeout to %s, requestTimeout to %s for service %s",
connectTimeout, connectTimeout,
requestTimeout, requestTimeout,
clazz.getCanonicalName()); clazz.getCanonicalName()));
policy.setConnectionTimeout(connectTimeout); policy.setConnectionTimeout(connectTimeout);
policy.setReceiveTimeout(requestTimeout); policy.setReceiveTimeout(requestTimeout);

View File

@ -10,7 +10,7 @@ import net.sf.saxon.trans.XPathException;
public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinition { public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinition {
public static final String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension"; public static String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension";
public abstract String getName(); public abstract String getName();

View File

@ -26,7 +26,7 @@ public class ExtractYear extends AbstractExtensionFunction {
@Override @Override
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
if (arguments == null || arguments.length == 0) { if (arguments == null | arguments.length == 0) {
return new StringValue(""); return new StringValue("");
} }
final Item item = arguments[0].head(); final Item item = arguments[0].head();
@ -63,7 +63,8 @@ public class ExtractYear extends AbstractExtensionFunction {
for (String format : dateFormats) { for (String format : dateFormats) {
try { try {
c.setTime(new SimpleDateFormat(format).parse(s)); c.setTime(new SimpleDateFormat(format).parse(s));
return String.valueOf(c.get(Calendar.YEAR)); String year = String.valueOf(c.get(Calendar.YEAR));
return year;
} catch (ParseException e) { } catch (ParseException e) {
} }
} }

View File

@ -30,7 +30,7 @@ public class NormalizeDate extends AbstractExtensionFunction {
@Override @Override
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException { public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
if (arguments == null || arguments.length == 0) { if (arguments == null | arguments.length == 0) {
return new StringValue(BLANK); return new StringValue(BLANK);
} }
String s = arguments[0].head().getStringValue(); String s = arguments[0].head().getStringValue();

View File

@ -1,8 +1,6 @@
package eu.dnetlib.dhp.utils.saxon; package eu.dnetlib.dhp.utils.saxon;
import static org.apache.commons.lang3.StringUtils.isNotBlank;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import net.sf.saxon.expr.XPathContext; import net.sf.saxon.expr.XPathContext;
@ -28,8 +26,7 @@ public class PickFirst extends AbstractExtensionFunction {
final String s1 = getValue(arguments[0]); final String s1 = getValue(arguments[0]);
final String s2 = getValue(arguments[1]); final String s2 = getValue(arguments[1]);
final String value = isNotBlank(s1) ? s1 : isNotBlank(s2) ? s2 : ""; return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : "");
return new StringValue(value);
} }
private String getValue(final Sequence arg) throws XPathException { private String getValue(final Sequence arg) throws XPathException {

View File

@ -12,9 +12,6 @@ import net.sf.saxon.TransformerFactoryImpl;
public class SaxonTransformerFactory { public class SaxonTransformerFactory {
private SaxonTransformerFactory() {
}
/** /**
* Creates the index record transformer from the given XSLT * Creates the index record transformer from the given XSLT
* *

View File

@ -1,73 +0,0 @@
package eu.dnetlib.dhp.application
import scala.io.Source
/** This is the main Interface SparkApplication
* from which all the Spark Scala classes should inherit
*/
trait SparkScalaApplication {
/** This is the path in the classpath of the json
* that describes all the arguments needed to run
*/
val propertyPath: String
/** Utility to parse the arguments using the
* property json in the classpath identified by
* the variable propertyPath
*
* @param args the list of arguments
*/
def parseArguments(args: Array[String]): ArgumentApplicationParser = {
val parser = new ArgumentApplicationParser(
Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString
)
parser.parseArgument(args)
parser
}
/** Here all the Spark applications run this method
* where the whole logic of the spark node is defined
*/
def run(): Unit
}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.slf4j.Logger
abstract class AbstractScalaApplication(
val propertyPath: String,
val args: Array[String],
log: Logger
) extends SparkScalaApplication {
var parser: ArgumentApplicationParser = null
var spark: SparkSession = null
def initialize(): SparkScalaApplication = {
parser = parseArguments(args)
spark = createSparkSession()
this
}
/** Utility for creating a Spark session starting from the parser
*
* @return a spark Session
*/
private def createSparkSession(): SparkSession = {
require(parser != null)
val conf: SparkConf = new SparkConf()
val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master")
SparkSession
.builder()
.config(conf)
.appName(getClass.getSimpleName)
.master(master)
.getOrCreate()
}
}
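To make the contract of the removed SparkScalaApplication / AbstractScalaApplication scaffolding concrete, here is a hypothetical subclass sketch; the class name, the property path and the sourcePath argument are invented for illustration, and the referenced parameters JSON would also need to declare the master argument read by createSparkSession:

import eu.dnetlib.dhp.application.AbstractScalaApplication
import org.slf4j.{Logger, LoggerFactory}

// Hypothetical job extending the scaffolding shown above.
class ExampleGraphJob(args: Array[String], log: Logger)
  extends AbstractScalaApplication("/eu/dnetlib/dhp/example_job_parameters.json", args, log) {

  override def run(): Unit = {
    val sourcePath = parser.get("sourcePath") // assumed argument, declared in the parameters JSON
    log.info(s"reading input from $sourcePath")
    spark.read.textFile(sourcePath).show(10)
  }
}

object ExampleGraphJob {
  def main(args: Array[String]): Unit = {
    val log: Logger = LoggerFactory.getLogger(classOf[ExampleGraphJob])
    new ExampleGraphJob(args, log).initialize().run()
  }
}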

View File

@ -1,442 +0,0 @@
package eu.dnetlib.dhp.sx.graph.scholix
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Result, StructuredProperty}
import eu.dnetlib.dhp.schema.sx.scholix._
import eu.dnetlib.dhp.schema.sx.summary.{CollectedFromType, SchemeValue, ScholixSummary, Typology}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.sql.{Encoder, Encoders}
import org.json4s
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
import scala.collection.JavaConverters._
import scala.io.Source
object ScholixUtils extends Serializable {
val DNET_IDENTIFIER_SCHEMA: String = "DNET Identifier"
val DATE_RELATION_KEY: String = "RelationDate"
case class RelationVocabulary(original: String, inverse: String) {}
case class RelatedEntities(id: String, relatedDataset: Long, relatedPublication: Long) {}
val relations: Map[String, RelationVocabulary] = {
val input = Source
.fromInputStream(
getClass.getResourceAsStream("/eu/dnetlib/scholexplorer/relation/relations.json")
)
.mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: json4s.JValue = parse(input)
json.extract[Map[String, RelationVocabulary]]
}
def extractRelationDate(relation: Relation): String = {
if (relation.getProperties == null || relation.getProperties.isEmpty)
null
else {
val date = relation.getProperties.asScala
.find(p => DATE_RELATION_KEY.equalsIgnoreCase(p.getKey))
.map(p => p.getValue)
if (date.isDefined)
date.get
else
null
}
}
def extractRelationDate(summary: ScholixSummary): String = {
if (summary.getDate == null || summary.getDate.isEmpty)
null
else {
summary.getDate.get(0)
}
}
def inverseRelationShip(rel: ScholixRelationship): ScholixRelationship = {
new ScholixRelationship(rel.getInverse, rel.getSchema, rel.getName)
}
def generateScholixResourceFromResult(r: Result): ScholixResource = {
generateScholixResourceFromSummary(ScholixUtils.resultToSummary(r))
}
val statsAggregator: Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] =
new Aggregator[(String, String, Long), RelatedEntities, RelatedEntities] with Serializable {
override def zero: RelatedEntities = null
override def reduce(b: RelatedEntities, a: (String, String, Long)): RelatedEntities = {
val relatedDataset = if ("dataset".equalsIgnoreCase(a._2)) a._3 else 0
val relatedPublication = if ("publication".equalsIgnoreCase(a._2)) a._3 else 0
if (b == null)
RelatedEntities(a._1, relatedDataset, relatedPublication)
else
RelatedEntities(
a._1,
b.relatedDataset + relatedDataset,
b.relatedPublication + relatedPublication
)
}
override def merge(b1: RelatedEntities, b2: RelatedEntities): RelatedEntities = {
if (b1 != null && b2 != null)
RelatedEntities(
b1.id,
b1.relatedDataset + b2.relatedDataset,
b1.relatedPublication + b2.relatedPublication
)
else if (b1 != null)
b1
else
b2
}
override def finish(reduction: RelatedEntities): RelatedEntities = reduction
override def bufferEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
override def outputEncoder: Encoder[RelatedEntities] = Encoders.bean(classOf[RelatedEntities])
}
val scholixAggregator: Aggregator[(String, Scholix), Scholix, Scholix] =
new Aggregator[(String, Scholix), Scholix, Scholix] with Serializable {
override def zero: Scholix = null
def scholix_complete(s: Scholix): Boolean = {
if (s == null || s.getIdentifier == null) {
false
} else if (s.getSource == null || s.getTarget == null) {
false
} else if (s.getLinkprovider == null || s.getLinkprovider.isEmpty)
false
else
true
}
override def reduce(b: Scholix, a: (String, Scholix)): Scholix = {
if (scholix_complete(b)) b else a._2
}
override def merge(b1: Scholix, b2: Scholix): Scholix = {
if (scholix_complete(b1)) b1 else b2
}
override def finish(reduction: Scholix): Scholix = reduction
override def bufferEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
override def outputEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
}
def createInverseScholixRelation(scholix: Scholix): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
s.setPublisher(scholix.getPublisher)
s.setLinkprovider(scholix.getLinkprovider)
s.setRelationship(inverseRelationShip(scholix.getRelationship))
s.setSource(scholix.getTarget)
s.setTarget(scholix.getSource)
s.setIdentifier(
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s
}
def extractCollectedFrom(summary: ScholixResource): List[ScholixEntityId] = {
if (summary.getCollectedFrom != null && !summary.getCollectedFrom.isEmpty) {
val l: List[ScholixEntityId] = summary.getCollectedFrom.asScala.map { d =>
new ScholixEntityId(d.getProvider.getName, d.getProvider.getIdentifiers)
}(collection.breakOut)
l
} else List()
}
def extractCollectedFrom(summary: ScholixSummary): List[ScholixEntityId] = {
if (summary.getDatasources != null && !summary.getDatasources.isEmpty) {
val l: List[ScholixEntityId] = summary.getDatasources.asScala.map { d =>
new ScholixEntityId(
d.getDatasourceName,
List(new ScholixIdentifier(d.getDatasourceId, "DNET Identifier", null)).asJava
)
}(collection.breakOut)
l
} else List()
}
def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = {
if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) {
val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { c =>
new ScholixEntityId(
c.getValue,
List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava
)
}.toList
l
} else List()
}
def generateCompleteScholix(scholix: Scholix, target: ScholixSummary): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
s.setPublisher(scholix.getPublisher)
s.setLinkprovider(scholix.getLinkprovider)
s.setRelationship(scholix.getRelationship)
s.setSource(scholix.getSource)
s.setTarget(generateScholixResourceFromSummary(target))
s.setIdentifier(
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s
}
def generateCompleteScholix(scholix: Scholix, target: ScholixResource): Scholix = {
val s = new Scholix
s.setPublicationDate(scholix.getPublicationDate)
s.setPublisher(scholix.getPublisher)
s.setLinkprovider(scholix.getLinkprovider)
s.setRelationship(scholix.getRelationship)
s.setSource(scholix.getSource)
s.setTarget(target)
s.setIdentifier(
DHPUtils.md5(
s"${s.getSource.getIdentifier}::${s.getRelationship.getName}::${s.getTarget.getIdentifier}"
)
)
s
}
def generateScholixResourceFromSummary(summaryObject: ScholixSummary): ScholixResource = {
val r = new ScholixResource
r.setIdentifier(summaryObject.getLocalIdentifier)
r.setDnetIdentifier(summaryObject.getId)
r.setObjectType(summaryObject.getTypology.toString)
r.setObjectSubType(summaryObject.getSubType)
if (summaryObject.getTitle != null && !summaryObject.getTitle.isEmpty)
r.setTitle(summaryObject.getTitle.get(0))
if (summaryObject.getAuthor != null && !summaryObject.getAuthor.isEmpty) {
val l: List[ScholixEntityId] =
summaryObject.getAuthor.asScala.map(a => new ScholixEntityId(a, null)).toList
if (l.nonEmpty)
r.setCreator(l.asJava)
}
if (summaryObject.getDate != null && !summaryObject.getDate.isEmpty)
r.setPublicationDate(summaryObject.getDate.get(0))
if (summaryObject.getPublisher != null && !summaryObject.getPublisher.isEmpty) {
val plist: List[ScholixEntityId] =
summaryObject.getPublisher.asScala.map(p => new ScholixEntityId(p, null)).toList
if (plist.nonEmpty)
r.setPublisher(plist.asJava)
}
if (summaryObject.getDatasources != null && !summaryObject.getDatasources.isEmpty) {
val l: List[ScholixCollectedFrom] = summaryObject.getDatasources.asScala
.map(c =>
new ScholixCollectedFrom(
new ScholixEntityId(
c.getDatasourceName,
List(new ScholixIdentifier(c.getDatasourceId, DNET_IDENTIFIER_SCHEMA, null)).asJava
),
"collected",
"complete"
)
)
.toList
if (l.nonEmpty)
r.setCollectedFrom(l.asJava)
}
r
}
def scholixFromSource(relation: Relation, source: ScholixResource): Scholix = {
if (relation == null || source == null)
return null
val s = new Scholix
var l: List[ScholixEntityId] = extractCollectedFrom(relation)
if (l.isEmpty)
l = extractCollectedFrom(source)
if (l.isEmpty)
return null
s.setLinkprovider(l.asJava)
var d = extractRelationDate(relation)
if (d == null)
d = source.getPublicationDate
s.setPublicationDate(d)
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
s.setPublisher(source.getPublisher)
}
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
if (semanticRelation == null)
return null
s.setRelationship(
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
)
s.setSource(source)
s
}
def scholixFromSource(relation: Relation, source: ScholixSummary): Scholix = {
if (relation == null || source == null)
return null
val s = new Scholix
var l: List[ScholixEntityId] = extractCollectedFrom(relation)
if (l.isEmpty)
l = extractCollectedFrom(source)
if (l.isEmpty)
return null
s.setLinkprovider(l.asJava)
var d = extractRelationDate(relation)
if (d == null)
d = extractRelationDate(source)
s.setPublicationDate(d)
if (source.getPublisher != null && !source.getPublisher.isEmpty) {
val l: List[ScholixEntityId] = source.getPublisher.asScala
.map { p =>
new ScholixEntityId(p, null)
}(collection.breakOut)
if (l.nonEmpty)
s.setPublisher(l.asJava)
}
val semanticRelation = relations.getOrElse(relation.getRelClass.toLowerCase, null)
if (semanticRelation == null)
return null
s.setRelationship(
new ScholixRelationship(semanticRelation.original, "datacite", semanticRelation.inverse)
)
s.setSource(generateScholixResourceFromSummary(source))
s
}
def findURLForPID(
pidValue: List[StructuredProperty],
urls: List[String]
): List[(StructuredProperty, String)] = {
pidValue.map { p =>
val pv = p.getValue
val r = urls.find(u => u.toLowerCase.contains(pv.toLowerCase))
(p, r.orNull)
}
}
def extractTypedIdentifierFromInstance(r: Result): List[ScholixIdentifier] = {
if (r.getInstance() == null || r.getInstance().isEmpty)
return List()
r.getInstance()
.asScala
.filter(i => i.getUrl != null && !i.getUrl.isEmpty)
.filter(i => i.getPid != null && i.getUrl != null)
.flatMap(i => findURLForPID(i.getPid.asScala.toList, i.getUrl.asScala.toList))
.map(i => new ScholixIdentifier(i._1.getValue, i._1.getQualifier.getClassid, i._2))
.distinct
.toList
}
def resultToSummary(r: Result): ScholixSummary = {
val s = new ScholixSummary
s.setId(r.getId)
if (r.getPid == null || r.getPid.isEmpty)
return null
val persistentIdentifiers: List[ScholixIdentifier] = extractTypedIdentifierFromInstance(r)
if (persistentIdentifiers.isEmpty)
return null
s.setLocalIdentifier(persistentIdentifiers.asJava)
if (r.isInstanceOf[Publication])
s.setTypology(Typology.publication)
else
s.setTypology(Typology.dataset)
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)
if (r.getTitle != null && r.getTitle.asScala.nonEmpty) {
val titles: List[String] = r.getTitle.asScala.map(t => t.getValue).toList
if (titles.nonEmpty)
s.setTitle(titles.asJava)
else
return null
}
if (r.getAuthor != null && !r.getAuthor.isEmpty) {
val authors: List[String] = r.getAuthor.asScala.map(a => a.getFullname).toList
if (authors.nonEmpty)
s.setAuthor(authors.asJava)
}
if (r.getInstance() != null) {
val dt: List[String] = r
.getInstance()
.asScala
.filter(i => i.getDateofacceptance != null)
.map(i => i.getDateofacceptance.getValue)
.toList
if (dt.nonEmpty)
s.setDate(dt.distinct.asJava)
}
if (r.getDescription != null && !r.getDescription.isEmpty) {
val d = r.getDescription.asScala.find(f => f != null && f.getValue != null)
if (d.isDefined)
s.setDescription(d.get.getValue)
}
if (r.getSubject != null && !r.getSubject.isEmpty) {
val subjects: List[SchemeValue] = r.getSubject.asScala
.map(s => new SchemeValue(s.getQualifier.getClassname, s.getValue))
.toList
if (subjects.nonEmpty)
s.setSubject(subjects.asJava)
}
if (r.getPublisher != null)
s.setPublisher(List(r.getPublisher.getValue).asJava)
if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) {
val cf: List[CollectedFromType] = r.getCollectedfrom.asScala
.map(c => new CollectedFromType(c.getValue, c.getKey, "complete"))
.toList
if (cf.nonEmpty)
s.setDatasources(cf.distinct.asJava)
}
s.setRelatedDatasets(0)
s.setRelatedPublications(0)
s.setRelatedUnknown(0)
s
}
}
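For orientation, a sketch (in Scala) of how the typed aggregators bundled in the removed ScholixUtils object are meant to be wired into a Spark Dataset; the method and dataset names are illustrative, and an active SparkSession is assumed:

import eu.dnetlib.dhp.schema.sx.scholix.Scholix
import eu.dnetlib.dhp.sx.graph.scholix.ScholixUtils
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}

// Collapses (identifier, Scholix) pairs to one Scholix per identifier,
// preferring the most complete record as defined by scholixAggregator.
def mergeScholix(spark: SparkSession, keyed: Dataset[(String, Scholix)]): Dataset[Scholix] = {
  import spark.implicits._
  implicit val scholixEncoder: Encoder[Scholix] = Encoders.kryo[Scholix]
  keyed
    .groupByKey(_._1)
    .agg(ScholixUtils.scholixAggregator.toColumn)
    .map(_._2)
}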

View File

@ -7,10 +7,10 @@ import static org.junit.jupiter.api.Assertions.assertNotNull;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
class ArgumentApplicationParserTest { public class ArgumentApplicationParserTest {
@Test @Test
void testParseParameter() throws Exception { public void testParseParameter() throws Exception {
final String jsonConfiguration = IOUtils final String jsonConfiguration = IOUtils
.toString( .toString(
this.getClass().getResourceAsStream("/eu/dnetlib/application/parameters.json")); this.getClass().getResourceAsStream("/eu/dnetlib/application/parameters.json"));

View File

@ -21,13 +21,13 @@ public class HdfsSupportTest {
class Remove { class Remove {
@Test @Test
void shouldThrowARuntimeExceptionOnError() { public void shouldThrowARuntimeExceptionOnError() {
// when // when
assertThrows(RuntimeException.class, () -> HdfsSupport.remove(null, new Configuration())); assertThrows(RuntimeException.class, () -> HdfsSupport.remove(null, new Configuration()));
} }
@Test @Test
void shouldRemoveADirFromHDFS(@TempDir Path tempDir) { public void shouldRemoveADirFromHDFS(@TempDir Path tempDir) {
// when // when
HdfsSupport.remove(tempDir.toString(), new Configuration()); HdfsSupport.remove(tempDir.toString(), new Configuration());
@ -36,7 +36,7 @@ public class HdfsSupportTest {
} }
@Test @Test
void shouldRemoveAFileFromHDFS(@TempDir Path tempDir) throws IOException { public void shouldRemoveAFileFromHDFS(@TempDir Path tempDir) throws IOException {
// given // given
Path file = Files.createTempFile(tempDir, "p", "s"); Path file = Files.createTempFile(tempDir, "p", "s");
@ -52,13 +52,13 @@ public class HdfsSupportTest {
class ListFiles { class ListFiles {
@Test @Test
void shouldThrowARuntimeExceptionOnError() { public void shouldThrowARuntimeExceptionOnError() {
// when // when
assertThrows(RuntimeException.class, () -> HdfsSupport.listFiles(null, new Configuration())); assertThrows(RuntimeException.class, () -> HdfsSupport.listFiles(null, new Configuration()));
} }
@Test @Test
void shouldListFilesLocatedInPath(@TempDir Path tempDir) throws IOException { public void shouldListFilesLocatedInPath(@TempDir Path tempDir) throws IOException {
Path subDir1 = Files.createTempDirectory(tempDir, "list_me"); Path subDir1 = Files.createTempDirectory(tempDir, "list_me");
Path subDir2 = Files.createTempDirectory(tempDir, "list_me"); Path subDir2 = Files.createTempDirectory(tempDir, "list_me");

View File

@ -5,10 +5,10 @@ import static org.junit.jupiter.api.Assertions.*;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
class PacePersonTest { public class PacePersonTest {
@Test @Test
void pacePersonTest1() { public void pacePersonTest1() {
PacePerson p = new PacePerson("Artini, Michele", false); PacePerson p = new PacePerson("Artini, Michele", false);
assertEquals("Artini", p.getSurnameString()); assertEquals("Artini", p.getSurnameString());
@ -17,7 +17,7 @@ class PacePersonTest {
} }
@Test @Test
void pacePersonTest2() { public void pacePersonTest2() {
PacePerson p = new PacePerson("Michele G. Artini", false); PacePerson p = new PacePerson("Michele G. Artini", false);
assertEquals("Artini, Michele G.", p.getNormalisedFullname()); assertEquals("Artini, Michele G.", p.getNormalisedFullname());
assertEquals("Michele G", p.getNameString()); assertEquals("Michele G", p.getNameString());

View File

@ -18,8 +18,7 @@ public class SparkSessionSupportTest {
class RunWithSparkSession { class RunWithSparkSession {
@Test @Test
@SuppressWarnings("unchecked") public void shouldExecuteFunctionAndNotStopSparkSessionWhenSparkSessionIsNotManaged()
void shouldExecuteFunctionAndNotStopSparkSessionWhenSparkSessionIsNotManaged()
throws Exception { throws Exception {
// given // given
SparkSession spark = mock(SparkSession.class); SparkSession spark = mock(SparkSession.class);
@ -38,8 +37,7 @@ public class SparkSessionSupportTest {
} }
@Test @Test
@SuppressWarnings("unchecked") public void shouldExecuteFunctionAndStopSparkSessionWhenSparkSessionIsManaged()
void shouldExecuteFunctionAndStopSparkSessionWhenSparkSessionIsManaged()
throws Exception { throws Exception {
// given // given
SparkSession spark = mock(SparkSession.class); SparkSession spark = mock(SparkSession.class);

View File

@ -12,7 +12,7 @@ import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@Disabled @Disabled
class ZenodoAPIClientTest { public class ZenodoAPIClientTest {
private final String URL_STRING = "https://sandbox.zenodo.org/api/deposit/depositions"; private final String URL_STRING = "https://sandbox.zenodo.org/api/deposit/depositions";
private final String ACCESS_TOKEN = ""; private final String ACCESS_TOKEN = "";
@ -22,7 +22,7 @@ class ZenodoAPIClientTest {
private final String depositionId = "674915"; private final String depositionId = "674915";
@Test @Test
void testUploadOldDeposition() throws IOException, MissingConceptDoiException { public void testUploadOldDeposition() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING, ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN); ACCESS_TOKEN);
Assertions.assertEquals(200, client.uploadOpenDeposition(depositionId)); Assertions.assertEquals(200, client.uploadOpenDeposition(depositionId));
@ -44,7 +44,7 @@ class ZenodoAPIClientTest {
} }
@Test @Test
void testNewDeposition() throws IOException { public void testNewDeposition() throws IOException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING, ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN); ACCESS_TOKEN);
@ -67,7 +67,7 @@ class ZenodoAPIClientTest {
} }
@Test @Test
void testNewVersionNewName() throws IOException, MissingConceptDoiException { public void testNewVersionNewName() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING, ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN); ACCESS_TOKEN);
@ -87,7 +87,7 @@ class ZenodoAPIClientTest {
} }
@Test @Test
void testNewVersionOldName() throws IOException, MissingConceptDoiException { public void testNewVersionOldName() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING, ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN); ACCESS_TOKEN);

View File

@ -21,7 +21,7 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.util.MapDocumentUtil; import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2; import scala.Tuple2;
class AuthorMergerTest { public class AuthorMergerTest {
private String publicationsBasePath; private String publicationsBasePath;
@ -43,7 +43,7 @@ class AuthorMergerTest {
} }
@Test @Test
void mergeTest() { // used in the dedup: threshold set to 0.95 public void mergeTest() { // used in the dedup: threshold set to 0.95
for (List<Author> authors1 : authors) { for (List<Author> authors1 : authors) {
System.out.println("List " + (authors.indexOf(authors1) + 1)); System.out.println("List " + (authors.indexOf(authors1) + 1));

View File

@ -21,7 +21,7 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import me.xuender.unidecode.Unidecode; import me.xuender.unidecode.Unidecode;
class OafMapperUtilsTest { public class OafMapperUtilsTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
@ -42,7 +42,7 @@ class OafMapperUtilsTest {
} }
@Test @Test
void testDateValidation() { public void testDateValidation() {
assertTrue(GraphCleaningFunctions.doCleanDate("2016-05-07T12:41:19.202Z ").isPresent()); assertTrue(GraphCleaningFunctions.doCleanDate("2016-05-07T12:41:19.202Z ").isPresent());
assertTrue(GraphCleaningFunctions.doCleanDate("2020-09-10 11:08:52 ").isPresent()); assertTrue(GraphCleaningFunctions.doCleanDate("2020-09-10 11:08:52 ").isPresent());
@ -107,7 +107,7 @@ class OafMapperUtilsTest {
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get()); assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get());
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get()); assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get());
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get()); assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get());
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get()); assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get()); assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get());
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get()); assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get()); assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get());
@ -147,46 +147,44 @@ class OafMapperUtilsTest {
} }
@Test @Test
void testDate() { public void testDate() {
final String date = GraphCleaningFunctions.cleanDate("23-FEB-1998"); System.out.println(GraphCleaningFunctions.cleanDate("23-FEB-1998"));
assertNotNull(date);
System.out.println(date);
} }
@Test @Test
void testMergePubs() throws IOException { public void testMergePubs() throws IOException {
Publication p1 = read("publication_1.json", Publication.class); Publication p1 = read("publication_1.json", Publication.class);
Publication p2 = read("publication_2.json", Publication.class); Publication p2 = read("publication_2.json", Publication.class);
Dataset d1 = read("dataset_1.json", Dataset.class); Dataset d1 = read("dataset_1.json", Dataset.class);
Dataset d2 = read("dataset_2.json", Dataset.class); Dataset d2 = read("dataset_2.json", Dataset.class);
assertEquals(1, p1.getCollectedfrom().size()); assertEquals(p1.getCollectedfrom().size(), 1);
assertEquals(ModelConstants.CROSSREF_ID, p1.getCollectedfrom().get(0).getKey()); assertEquals(p1.getCollectedfrom().get(0).getKey(), ModelConstants.CROSSREF_ID);
assertEquals(1, d2.getCollectedfrom().size()); assertEquals(d2.getCollectedfrom().size(), 1);
assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals( assertTrue(
ModelConstants.PUBLICATION_RESULTTYPE_CLASSID,
OafMapperUtils OafMapperUtils
.mergeResults(p1, d2) .mergeResults(p1, d2)
.getResulttype() .getResulttype()
.getClassid()); .getClassid()
.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID));
assertEquals(1, p2.getCollectedfrom().size()); assertEquals(p2.getCollectedfrom().size(), 1);
assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals(1, d1.getCollectedfrom().size()); assertEquals(d1.getCollectedfrom().size(), 1);
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals( assertTrue(
ModelConstants.DATASET_RESULTTYPE_CLASSID,
OafMapperUtils OafMapperUtils
.mergeResults(p2, d1) .mergeResults(p2, d1)
.getResulttype() .getResulttype()
.getClassid()); .getClassid()
.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID));
} }
protected HashSet<String> cfId(List<KeyValue> collectedfrom) { protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
return collectedfrom.stream().map(KeyValue::getKey).collect(Collectors.toCollection(HashSet::new)); return collectedfrom.stream().map(c -> c.getKey()).collect(Collectors.toCollection(HashSet::new));
} }
protected <T extends Result> T read(String filename, Class<T> clazz) throws IOException { protected <T extends Result> T read(String filename, Class<T> clazz) throws IOException {

View File

@ -3,10 +3,10 @@ package eu.dnetlib.scholexplorer.relation;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
class RelationMapperTest { public class RelationMapperTest {
@Test @Test
void testLoadRels() throws Exception { public void testLoadRels() throws Exception {
RelationMapper relationMapper = RelationMapper.load(); RelationMapper relationMapper = RelationMapper.load();
relationMapper.keySet().forEach(System.out::println); relationMapper.keySet().forEach(System.out::println);

View File

@ -3,37 +3,40 @@ package eu.dnetlib.dhp.actionmanager;
import java.io.Serializable; import java.io.Serializable;
import java.io.StringReader; import java.io.StringReader;
import java.util.List; import java.util.*;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Triple; import org.apache.commons.lang3.tuple.Triple;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader; import org.dom4j.io.SAXReader;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import com.google.common.base.Joiner; import com.google.common.base.Joiner;
import com.google.common.base.Splitter; import com.google.common.base.Splitter;
import com.google.common.collect.Iterables; import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.actionmanager.rmi.ActionManagerException; import eu.dnetlib.actionmanager.rmi.ActionManagerException;
import eu.dnetlib.actionmanager.set.ActionManagerSet;
import eu.dnetlib.actionmanager.set.ActionManagerSet.ImpactTypes;
import eu.dnetlib.dhp.actionmanager.partition.PartitionActionSetsByPayloadTypeJob;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import scala.Tuple2;
public class ISClient implements Serializable { public class ISClient implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ISClient.class); private static final Logger log = LoggerFactory.getLogger(PartitionActionSetsByPayloadTypeJob.class);
private static final String INPUT_ACTION_SET_ID_SEPARATOR = ","; private static final String INPUT_ACTION_SET_ID_SEPARATOR = ",";
private final transient ISLookUpService isLookup; private final ISLookUpService isLookup;
public ISClient(String isLookupUrl) { public ISClient(String isLookupUrl) {
isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl); isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
@ -60,7 +63,7 @@ public class ISClient implements Serializable {
.map( .map(
sets -> sets sets -> sets
.stream() .stream()
.map(ISClient::parseSetInfo) .map(set -> parseSetInfo(set))
.filter(t -> ids.contains(t.getLeft())) .filter(t -> ids.contains(t.getLeft()))
.map(t -> buildDirectory(basePath, t)) .map(t -> buildDirectory(basePath, t))
.collect(Collectors.toList())) .collect(Collectors.toList()))
@ -70,17 +73,15 @@ public class ISClient implements Serializable {
} }
} }
private static Triple<String, String, String> parseSetInfo(String set) { private Triple<String, String, String> parseSetInfo(String set) {
try { try {
final SAXReader reader = new SAXReader(); Document doc = new SAXReader().read(new StringReader(set));
reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
Document doc = reader.read(new StringReader(set));
return Triple return Triple
.of( .of(
doc.valueOf("//SET/@id"), doc.valueOf("//SET/@id"),
doc.valueOf("//SET/@directory"), doc.valueOf("//SET/@directory"),
doc.valueOf("//SET/@latest")); doc.valueOf("//SET/@latest"));
} catch (DocumentException | SAXException e) { } catch (DocumentException e) {
throw new IllegalStateException(e); throw new IllegalStateException(e);
} }
} }
@ -98,7 +99,7 @@ public class ISClient implements Serializable {
final String q = "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='" final String q = "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='"
+ propertyName + propertyName
+ "']/@value/string()"; + "']/@value/string()";
log.debug("quering for service property: {}", q); log.debug("quering for service property: " + q);
try { try {
final List<String> value = isLookup.quickSearchProfile(q); final List<String> value = isLookup.quickSearchProfile(q);
return Iterables.getOnlyElement(value); return Iterables.getOnlyElement(value);

View File

@ -62,7 +62,6 @@ public class MergeAndGet {
x.getClass().getCanonicalName(), y.getClass().getCanonicalName())); x.getClass().getCanonicalName(), y.getClass().getCanonicalName()));
} }
@SuppressWarnings("unchecked")
private static <G extends Oaf, A extends Oaf> G selectNewerAndGet(G x, A y) { private static <G extends Oaf, A extends Oaf> G selectNewerAndGet(G x, A y) {
if (x.getClass().equals(y.getClass()) if (x.getClass().equals(y.getClass())
&& x.getLastupdatetimestamp() > y.getLastupdatetimestamp()) { && x.getLastupdatetimestamp() > y.getLastupdatetimestamp()) {

View File

@ -74,9 +74,7 @@ public class PromoteActionPayloadForGraphTableJob {
.orElse(true); .orElse(true);
logger.info("shouldGroupById: {}", shouldGroupById); logger.info("shouldGroupById: {}", shouldGroupById);
@SuppressWarnings("unchecked")
Class<? extends Oaf> rowClazz = (Class<? extends Oaf>) Class.forName(graphTableClassName); Class<? extends Oaf> rowClazz = (Class<? extends Oaf>) Class.forName(graphTableClassName);
@SuppressWarnings("unchecked")
Class<? extends Oaf> actionPayloadClazz = (Class<? extends Oaf>) Class.forName(actionPayloadClassName); Class<? extends Oaf> actionPayloadClazz = (Class<? extends Oaf>) Class.forName(actionPayloadClassName);
throwIfGraphTableClassIsNotSubClassOfActionPayloadClass(rowClazz, actionPayloadClazz); throwIfGraphTableClassIsNotSubClassOfActionPayloadClass(rowClazz, actionPayloadClazz);
@ -154,7 +152,7 @@ public class PromoteActionPayloadForGraphTableJob {
return spark return spark
.read() .read()
.parquet(path) .parquet(path)
.map((MapFunction<Row, String>) PromoteActionPayloadForGraphTableJob::extractPayload, Encoders.STRING()) .map((MapFunction<Row, String>) value -> extractPayload(value), Encoders.STRING())
.map( .map(
(MapFunction<String, A>) value -> decodePayload(actionPayloadClazz, value), (MapFunction<String, A>) value -> decodePayload(actionPayloadClazz, value),
Encoders.bean(actionPayloadClazz)); Encoders.bean(actionPayloadClazz));

View File

@ -107,7 +107,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000 --conf spark.sql.shuffle.partitions=2560
</spark-opts> </spark-opts>
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg> <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
@ -159,7 +159,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000 --conf spark.sql.shuffle.partitions=2560
</spark-opts> </spark-opts>
<arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg> <arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg> <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>

View File

@ -99,7 +99,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=5000 --conf spark.sql.shuffle.partitions=2560
</spark-opts> </spark-opts>
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg> <arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg> <arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>

View File

@ -80,7 +80,7 @@ public class PartitionActionSetsByPayloadTypeJobTest {
private ISClient isClient; private ISClient isClient;
@Test @Test
void shouldPartitionActionSetsByPayloadType(@TempDir Path workingDir) throws Exception { public void shouldPartitionActionSetsByPayloadType(@TempDir Path workingDir) throws Exception {
// given // given
Path inputActionSetsBaseDir = workingDir.resolve("input").resolve("action_sets"); Path inputActionSetsBaseDir = workingDir.resolve("input").resolve("action_sets");
Path outputDir = workingDir.resolve("output"); Path outputDir = workingDir.resolve("output");

View File

@ -20,7 +20,7 @@ public class MergeAndGetTest {
class MergeFromAndGetStrategy { class MergeFromAndGetStrategy {
@Test @Test
void shouldThrowForOafAndOaf() { public void shouldThrowForOafAndOaf() {
// given // given
Oaf a = mock(Oaf.class); Oaf a = mock(Oaf.class);
Oaf b = mock(Oaf.class); Oaf b = mock(Oaf.class);
@ -33,7 +33,7 @@ public class MergeAndGetTest {
} }
@Test @Test
void shouldThrowForOafAndRelation() { public void shouldThrowForOafAndRelation() {
// given // given
Oaf a = mock(Oaf.class); Oaf a = mock(Oaf.class);
Relation b = mock(Relation.class); Relation b = mock(Relation.class);
@ -46,7 +46,7 @@ public class MergeAndGetTest {
} }
@Test @Test
void shouldThrowForOafAndOafEntity() { public void shouldThrowForOafAndOafEntity() {
// given // given
Oaf a = mock(Oaf.class); Oaf a = mock(Oaf.class);
OafEntity b = mock(OafEntity.class); OafEntity b = mock(OafEntity.class);
@ -59,7 +59,7 @@ public class MergeAndGetTest {
} }
@Test @Test
void shouldThrowForRelationAndOaf() { public void shouldThrowForRelationAndOaf() {
// given // given
Relation a = mock(Relation.class); Relation a = mock(Relation.class);
Oaf b = mock(Oaf.class); Oaf b = mock(Oaf.class);
@ -72,7 +72,7 @@ public class MergeAndGetTest {
} }
@Test @Test
void shouldThrowForRelationAndOafEntity() { public void shouldThrowForRelationAndOafEntity() {
// given // given
Relation a = mock(Relation.class); Relation a = mock(Relation.class);
OafEntity b = mock(OafEntity.class); OafEntity b = mock(OafEntity.class);
@ -85,7 +85,7 @@ public class MergeAndGetTest {
} }
@Test @Test
void shouldBehaveProperlyForRelationAndRelation() { public void shouldBehaveProperlyForRelationAndRelation() {
// given // given
Relation a = mock(Relation.class); Relation a = mock(Relation.class);
Relation b = mock(Relation.class); Relation b = mock(Relation.class);
@ -101,7 +101,7 @@ public class MergeAndGetTest {
} }
@Test @Test
void shouldThrowForOafEntityAndOaf() { public void shouldThrowForOafEntityAndOaf() {
// given // given
OafEntity a = mock(OafEntity.class); OafEntity a = mock(OafEntity.class);
Oaf b = mock(Oaf.class); Oaf b = mock(Oaf.class);
@ -114,7 +114,7 @@ public class MergeAndGetTest {
} }
@Test @Test
void shouldThrowForOafEntityAndRelation() { public void shouldThrowForOafEntityAndRelation() {
// given // given
OafEntity a = mock(OafEntity.class); OafEntity a = mock(OafEntity.class);
Relation b = mock(Relation.class); Relation b = mock(Relation.class);
@ -127,7 +127,7 @@ public class MergeAndGetTest {
} }
@Test @Test
void shouldThrowForOafEntityAndOafEntityButNotSubclasses() { public void shouldThrowForOafEntityAndOafEntityButNotSubclasses() {
// given // given
class OafEntitySub1 extends OafEntity { class OafEntitySub1 extends OafEntity {
} }
@ -145,7 +145,7 @@ public class MergeAndGetTest {
} }
@Test @Test
void shouldBehaveProperlyForOafEntityAndOafEntity() { public void shouldBehaveProperlyForOafEntityAndOafEntity() {
// given // given
OafEntity a = mock(OafEntity.class); OafEntity a = mock(OafEntity.class);
OafEntity b = mock(OafEntity.class); OafEntity b = mock(OafEntity.class);
@ -165,7 +165,7 @@ public class MergeAndGetTest {
class SelectNewerAndGetStrategy { class SelectNewerAndGetStrategy {
@Test @Test
void shouldThrowForOafEntityAndRelation() { public void shouldThrowForOafEntityAndRelation() {
// given // given
OafEntity a = mock(OafEntity.class); OafEntity a = mock(OafEntity.class);
Relation b = mock(Relation.class); Relation b = mock(Relation.class);
@ -178,7 +178,7 @@ public class MergeAndGetTest {
} }
@Test @Test
void shouldThrowForRelationAndOafEntity() { public void shouldThrowForRelationAndOafEntity() {
// given // given
Relation a = mock(Relation.class); Relation a = mock(Relation.class);
OafEntity b = mock(OafEntity.class); OafEntity b = mock(OafEntity.class);
@ -191,7 +191,7 @@ public class MergeAndGetTest {
} }
@Test @Test
void shouldThrowForOafEntityAndResult() { public void shouldThrowForOafEntityAndResult() {
// given // given
OafEntity a = mock(OafEntity.class); OafEntity a = mock(OafEntity.class);
Result b = mock(Result.class); Result b = mock(Result.class);
@ -204,7 +204,7 @@ public class MergeAndGetTest {
} }
@Test @Test
void shouldThrowWhenSuperTypeIsNewerForResultAndOafEntity() { public void shouldThrowWhenSuperTypeIsNewerForResultAndOafEntity() {
// given // given
// real types must be used because subclass-superclass resolution does not work for // real types must be used because subclass-superclass resolution does not work for
// mocks // mocks
@ -221,7 +221,7 @@ public class MergeAndGetTest {
} }
@Test @Test
void shouldShouldReturnLeftForOafEntityAndOafEntity() { public void shouldShouldReturnLeftForOafEntityAndOafEntity() {
// given // given
OafEntity a = mock(OafEntity.class); OafEntity a = mock(OafEntity.class);
when(a.getLastupdatetimestamp()).thenReturn(1L); when(a.getLastupdatetimestamp()).thenReturn(1L);
@ -238,7 +238,7 @@ public class MergeAndGetTest {
} }
@Test @Test
void shouldShouldReturnRightForOafEntityAndOafEntity() { public void shouldShouldReturnRightForOafEntityAndOafEntity() {
// given // given
OafEntity a = mock(OafEntity.class); OafEntity a = mock(OafEntity.class);
when(a.getLastupdatetimestamp()).thenReturn(2L); when(a.getLastupdatetimestamp()).thenReturn(2L);

View File

@ -77,7 +77,7 @@ public class PromoteActionPayloadForGraphTableJobTest {
class Main { class Main {
@Test @Test
void shouldThrowWhenGraphTableClassIsNotASubClassOfActionPayloadClass() { public void shouldThrowWhenGraphTableClassIsNotASubClassOfActionPayloadClass() {
// given // given
Class<Relation> rowClazz = Relation.class; Class<Relation> rowClazz = Relation.class;
Class<OafEntity> actionPayloadClazz = OafEntity.class; Class<OafEntity> actionPayloadClazz = OafEntity.class;
@ -116,7 +116,7 @@ public class PromoteActionPayloadForGraphTableJobTest {
@ParameterizedTest(name = "strategy: {0}, graph table: {1}, action payload: {2}") @ParameterizedTest(name = "strategy: {0}, graph table: {1}, action payload: {2}")
@MethodSource("eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest#promoteJobTestParams") @MethodSource("eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest#promoteJobTestParams")
void shouldPromoteActionPayloadForGraphTable( public void shouldPromoteActionPayloadForGraphTable(
MergeAndGet.Strategy strategy, MergeAndGet.Strategy strategy,
Class<? extends Oaf> rowClazz, Class<? extends Oaf> rowClazz,
Class<? extends Oaf> actionPayloadClazz) Class<? extends Oaf> actionPayloadClazz)

View File

@ -44,7 +44,7 @@ public class PromoteActionPayloadFunctionsTest {
class JoinTableWithActionPayloadAndMerge { class JoinTableWithActionPayloadAndMerge {
@Test @Test
void shouldThrowWhenTableTypeIsNotSubtypeOfActionPayloadType() { public void shouldThrowWhenTableTypeIsNotSubtypeOfActionPayloadType() {
// given // given
class OafImpl extends Oaf { class OafImpl extends Oaf {
} }
@ -58,7 +58,7 @@ public class PromoteActionPayloadFunctionsTest {
} }
@Test @Test
void shouldRunProperlyWhenActionPayloadTypeAndTableTypeAreTheSame() { public void shouldRunProperlyWhenActionPayloadTypeAndTableTypeAreTheSame() {
// given // given
String id0 = "id0"; String id0 = "id0";
String id1 = "id1"; String id1 = "id1";
@ -138,7 +138,7 @@ public class PromoteActionPayloadFunctionsTest {
} }
@Test @Test
void shouldRunProperlyWhenActionPayloadTypeIsSuperTypeOfTableType() { public void shouldRunProperlyWhenActionPayloadTypeIsSuperTypeOfTableType() {
// given // given
String id0 = "id0"; String id0 = "id0";
String id1 = "id1"; String id1 = "id1";
@ -218,7 +218,7 @@ public class PromoteActionPayloadFunctionsTest {
class GroupTableByIdAndMerge { class GroupTableByIdAndMerge {
@Test @Test
void shouldRunProperly() { public void shouldRunProperly() {
// given // given
String id1 = "id1"; String id1 = "id1";
String id2 = "id2"; String id2 = "id2";

View File

@ -29,13 +29,6 @@
<goal>testCompile</goal> <goal>testCompile</goal>
</goals> </goals>
</execution> </execution>
<execution>
<id>scala-doc</id>
<phase>process-resources</phase> <!-- or wherever -->
<goals>
<goal>doc</goal>
</goals>
</execution>
</executions> </executions>
<configuration> <configuration>
<scalaVersion>${scala.version}</scalaVersion> <scalaVersion>${scala.version}</scalaVersion>
@ -91,6 +84,14 @@
<artifactId>json</artifactId> <artifactId>json</artifactId>
</dependency> </dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
<version>1.8</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml --> <!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency> <dependency>
<groupId>org.apache.poi</groupId> <groupId>org.apache.poi</groupId>

View File

@ -1,92 +0,0 @@
package eu.dnetlib.dhp.actionmanager;
import java.util.Optional;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class Constants {
public static final String DOI = "doi";
public static final String DOI_CLASSNAME = "Digital Object Identifier";
public static final String DEFAULT_DELIMITER = ",";
public static final String UPDATE_DATA_INFO_TYPE = "update";
public static final String UPDATE_SUBJECT_FOS_CLASS_ID = "subject:fos";
public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE";
public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip";
public static final String UPDATE_SUBJECT_SDG_CLASS_ID = "subject:sdg";
public static final String FOS_CLASS_ID = "FOS";
public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification";
public static final String SDG_CLASS_ID = "SDG";
public static final String SDG_CLASS_NAME = "Sustainable Development Goals";
public static final String NULL = "NULL";
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private Constants() {
}
public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) {
return Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
}
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
public static StructuredProperty getSubject(String sbj, String classid, String classname,
String diqualifierclassid) {
if (sbj.equals(NULL))
return null;
StructuredProperty sp = new StructuredProperty();
sp.setValue(sbj);
sp
.setQualifier(
OafMapperUtils
.qualifier(
classid,
classname,
ModelConstants.DNET_SUBJECT_TYPOLOGIES,
ModelConstants.DNET_SUBJECT_TYPOLOGIES));
sp
.setDataInfo(
OafMapperUtils
.dataInfo(
false,
UPDATE_DATA_INFO_TYPE,
true,
false,
OafMapperUtils
.qualifier(
diqualifierclassid,
UPDATE_CLASS_NAME,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
""));
return sp;
}
}
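For context, a minimal usage sketch of the getSubject helper above (not part of the diff): it shows how an inferred FOS subject might be attached to a result. The subject label and the result variable are hypothetical.
// getSubject returns null when the label is the "NULL" marker, so guard before adding it.
StructuredProperty fosSubject = Constants
    .getSubject("natural sciences", Constants.FOS_CLASS_ID, Constants.FOS_CLASS_NAME, Constants.UPDATE_SUBJECT_FOS_CLASS_ID);
if (fosSubject != null) {
    result.getSubject().add(fosSubject); // 'result' is an eu.dnetlib.dhp.schema.oaf.Result prepared elsewhere
}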

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.actionmanager.bipmodel; package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable; import java.io.Serializable;
import java.util.ArrayList; import java.util.ArrayList;

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.actionmanager.bipmodel; package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable; import java.io.Serializable;
import java.util.List; import java.util.List;

View File

@ -0,0 +1,84 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.oaf.Result;
/**
* Collects all the atomic actions produced for the different result types and saves them in the
* output path for the ActionSet.
*/
public class CollectAndSave implements Serializable {
private static final Logger log = LoggerFactory.getLogger(CollectAndSave.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static <I extends Result> void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
CollectAndSave.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath");
log.info("inputPath {}: ", inputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
collectAndSave(spark, inputPath, outputPath);
});
}
private static void collectAndSave(SparkSession spark, String inputPath, String outputPath) {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
sc
.sequenceFile(inputPath + "/publication", Text.class, Text.class)
.union(sc.sequenceFile(inputPath + "/dataset", Text.class, Text.class))
.union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class))
.union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
}
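A hedged local-run sketch for the class above; the long option names are assumed to mirror the keys read via parser.get(...) and are defined in input_actionset_parameter.json, so they may differ in the actual parameter file. Paths are illustrative.
// Collect the per-type atomic actions under inputPath/{publication,dataset,otherresearchproduct,software}
// into a single sequence file for the ActionSet.
CollectAndSave.main(new String[] {
    "--isSparkSessionManaged", Boolean.FALSE.toString(),
    "--inputPath", "/tmp/bipfinder/atomic-actions",
    "--outputPath", "/tmp/bipfinder/actionset"
});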

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.actionmanager.bipmodel; package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable; import java.io.Serializable;

View File

@ -0,0 +1,28 @@
package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable;
/**
* Subset of the information from the generic results that is needed to create the atomic action.
*/
public class PreparedResult implements Serializable {
private String id; // openaire id
private String value; // doi
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getValue() {
return value;
}
public void setValue(String value) {
this.value = value;
}
}
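An illustrative instance of the bean above (both values are made up), matching the id/value pairing produced by the SQL in SparkAtomicActionScoreJob:
PreparedResult pr = new PreparedResult();
pr.setId("50|doi_________::0123456789abcdef0123456789abcdef"); // hypothetical OpenAIRE result id
pr.setValue("10.1234/example.doi");                            // hypothetical DOI taken from the pid list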

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.actionmanager.bipmodel; package eu.dnetlib.dhp.actionmanager.bipfinder;
import java.io.Serializable; import java.io.Serializable;
import java.util.List; import java.util.List;

View File

@ -1,7 +1,6 @@
package eu.dnetlib.dhp.actionmanager.bipfinder; package eu.dnetlib.dhp.actionmanager.bipfinder;
import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable; import java.io.Serializable;
@ -16,6 +15,7 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
@ -24,15 +24,11 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize;
import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import scala.Tuple2; import scala.Tuple2;
/** /**
@ -50,7 +46,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
.toString( .toString(
SparkAtomicActionScoreJob.class SparkAtomicActionScoreJob.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json")); "/eu/dnetlib/dhp/actionmanager/bipfinder/input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
@ -69,6 +65,14 @@ public class SparkAtomicActionScoreJob implements Serializable {
final String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath); log.info("outputPath {}: ", outputPath);
final String bipScorePath = parser.get("bipScorePath");
log.info("bipScorePath: {}", bipScorePath);
final String resultClassName = parser.get("resultTableName");
log.info("resultTableName: {}", resultClassName);
Class<I> inputClazz = (Class<I>) Class.forName(resultClassName);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
runWithSparkSession( runWithSparkSession(
@ -76,13 +80,14 @@ public class SparkAtomicActionScoreJob implements Serializable {
isSparkSessionManaged, isSparkSessionManaged,
spark -> { spark -> {
removeOutputDir(spark, outputPath); removeOutputDir(spark, outputPath);
prepareResults(spark, inputPath, outputPath); prepareResults(spark, inputPath, outputPath, bipScorePath, inputClazz);
}); });
} }
private static <I extends Result> void prepareResults(SparkSession spark, String bipScorePath, String outputPath) { private static <I extends Result> void prepareResults(SparkSession spark, String inputPath, String outputPath,
String bipScorePath, Class<I> inputClazz) {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
.textFile(bipScorePath) .textFile(bipScorePath)
@ -96,19 +101,43 @@ public class SparkAtomicActionScoreJob implements Serializable {
return bs; return bs;
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class)); }).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class));
System.out.println(bipScores.count());
Dataset<I> results = readPath(spark, inputPath, inputClazz);
results.createOrReplaceTempView("result");
Dataset<PreparedResult> preparedResult = spark
.sql(
"select pIde.value value, id " +
"from result " +
"lateral view explode (pid) p as pIde " +
"where dataInfo.deletedbyinference = false and pIde.qualifier.classid = '" + DOI + "'")
.as(Encoders.bean(PreparedResult.class));
bipScores bipScores
.joinWith(
.map((MapFunction<BipScore, Result>) bs -> { preparedResult, bipScores.col("id").equalTo(preparedResult.col("value")),
"inner")
.map((MapFunction<Tuple2<BipScore, PreparedResult>, BipScore>) value -> {
BipScore ret = value._1();
ret.setId(value._2().getId());
return ret;
}, Encoders.bean(BipScore.class))
.groupByKey((MapFunction<BipScore, String>) value -> value.getId(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, BipScore, Result>) (k, it) -> {
Result ret = new Result(); Result ret = new Result();
ret.setDataInfo(getDataInfo());
BipScore first = it.next();
ret.setId(first.getId());
ret.setId(bs.getId()); ret.setMeasures(getMeasure(first));
it.forEachRemaining(value -> ret.getMeasures().addAll(getMeasure(value)));
ret.setMeasures(getMeasure(bs));
return ret; return ret;
}, Encoders.bean(Result.class)) }, Encoders.bean(Result.class))
.toJavaRDD() .toJavaRDD()
.map(p -> new AtomicAction(Result.class, p)) .map(p -> new AtomicAction(inputClazz, p))
.mapToPair( .mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa)))) new Text(OBJECT_MAPPER.writeValueAsString(aa))))
@ -132,21 +161,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
KeyValue kv = new KeyValue(); KeyValue kv = new KeyValue();
kv.setValue(unit.getValue()); kv.setValue(unit.getValue());
kv.setKey(unit.getKey()); kv.setKey(unit.getKey());
kv kv.setDataInfo(getDataInfo());
.setDataInfo(
OafMapperUtils
.dataInfo(
false,
UPDATE_DATA_INFO_TYPE,
true,
false,
OafMapperUtils
.qualifier(
UPDATE_MEASURE_BIP_CLASS_ID,
UPDATE_CLASS_NAME,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
""));
return kv; return kv;
}) })
.collect(Collectors.toList())); .collect(Collectors.toList()));
@ -155,6 +170,21 @@ public class SparkAtomicActionScoreJob implements Serializable {
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
private static DataInfo getDataInfo() {
DataInfo di = new DataInfo();
di.setInferred(false);
di.setInvisible(false);
di.setDeletedbyinference(false);
di.setTrust("");
Qualifier qualifier = new Qualifier();
qualifier.setClassid("sysimport:actionset");
qualifier.setClassname("Harvested");
qualifier.setSchemename("dnet:provenanceActions");
qualifier.setSchemeid("dnet:provenanceActions");
di.setProvenanceaction(qualifier);
return di;
}
private static void removeOutputDir(SparkSession spark, String path) { private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
} }

View File

@ -1,91 +0,0 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class GetFOSSparkJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(GetFOSSparkJob.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
GetFOSSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
// the path where the original FOS CSV file is stored
final String sourcePath = parser.get("sourcePath");
log.info("sourcePath {}", sourcePath);
// the path where the converted JSON file will be written
final String outputPath = parser.get("outputPath");
log.info("outputPath {}", outputPath);
final String delimiter = Optional
.ofNullable(parser.get("delimiter"))
.orElse(DEFAULT_DELIMITER);
SparkConf sconf = new SparkConf();
runWithSparkSession(
sconf,
isSparkSessionManaged,
spark -> {
getFOS(
spark,
sourcePath,
outputPath,
delimiter);
});
}
private static void getFOS(SparkSession spark, String sourcePath, String outputPath, String delimiter) {
Dataset<Row> fosData = spark
.read()
.format("csv")
.option("sep", delimiter)
.option("inferSchema", "true")
.option("header", "true")
.option("quotes", "\"")
.load(sourcePath);
fosData.map((MapFunction<Row, FOSDataModel>) r -> {
FOSDataModel fosDataModel = new FOSDataModel();
fosDataModel.setDoi(r.getString(0).toLowerCase());
fosDataModel.setLevel1(r.getString(1));
fosDataModel.setLevel2(r.getString(2));
fosDataModel.setLevel3(r.getString(3));
return fosDataModel;
}, Encoders.bean(FOSDataModel.class))
.write()
.mode(SaveMode.Overwrite)
.json(outputPath);
}
}
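To make the CSV-to-bean mapping above concrete, a small hedged example (the row values are invented) that mirrors the column order used in getFOS:
// CSV row:  10.1234/ABCD,natural sciences,biological sciences,cell biology
FOSDataModel fos = new FOSDataModel();
fos.setDoi("10.1234/ABCD".toLowerCase()); // DOIs are lower-cased, as in getFOS
fos.setLevel1("natural sciences");
fos.setLevel2("biological sciences");
fos.setLevel3("cell biology");
// each such bean is then written out as one JSON line by the job above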

View File

@ -1,91 +0,0 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import static eu.dnetlib.dhp.actionmanager.Constants.DEFAULT_DELIMITER;
import static eu.dnetlib.dhp.actionmanager.Constants.isSparkSessionManaged;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.SDGDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class GetSDGSparkJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(GetSDGSparkJob.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
GetSDGSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
// the path where the original SDG CSV file is stored
final String sourcePath = parser.get("sourcePath");
log.info("sourcePath {}", sourcePath);
// the path where the converted JSON file will be written
final String outputPath = parser.get("outputPath");
log.info("outputPath {}", outputPath);
final String delimiter = Optional
.ofNullable(parser.get("delimiter"))
.orElse(DEFAULT_DELIMITER);
SparkConf sconf = new SparkConf();
runWithSparkSession(
sconf,
isSparkSessionManaged,
spark -> {
getSDG(
spark,
sourcePath,
outputPath,
delimiter);
});
}
private static void getSDG(SparkSession spark, String sourcePath, String outputPath, String delimiter) {
Dataset<Row> sdgData = spark
.read()
.format("csv")
.option("sep", delimiter)
.option("inferSchema", "true")
.option("header", "true")
.option("quotes", "\"")
.load(sourcePath);
sdgData.map((MapFunction<Row, SDGDataModel>) r -> {
SDGDataModel sdgDataModel = new SDGDataModel();
sdgDataModel.setDoi(r.getString(0).toLowerCase());
sdgDataModel.setSbj(r.getString(1));
return sdgDataModel;
}, Encoders.bean(SDGDataModel.class))
.filter((FilterFunction<SDGDataModel>) sdg -> sdg.getSbj() != null)
.write()
.mode(SaveMode.Overwrite)
.json(outputPath);
}
}

View File

@ -1,178 +0,0 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.actionmanager.Constants.UPDATE_CLASS_NAME;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize;
import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Measure;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.dhp.utils.DHPUtils;
public class PrepareBipFinder implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PrepareBipFinder.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareBipFinder.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String sourcePath = parser.get("sourcePath");
log.info("sourcePath {}: ", sourcePath);
final String outputPath = parser.get("outputPath");
log.info("outputPath {}: ", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration());
prepareResults(spark, sourcePath, outputPath);
});
}
private static void prepareResults(SparkSession spark, String inputPath, String outputPath) {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
.textFile(inputPath)
.map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class));
spark
.createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> {
BipScore bs = new BipScore();
bs.setId(key);
bs.setScoreList(entry.get(key));
return bs;
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class))
.map((MapFunction<BipScore, Result>) v -> {
Result r = new Result();
final String cleanedPid = CleaningFunctions.normalizePidValue(DOI, v.getId());
r.setId(DHPUtils.generateUnresolvedIdentifier(v.getId(), DOI));
Instance inst = new Instance();
inst.setMeasures(getMeasure(v));
inst
.setPid(
Arrays
.asList(
OafMapperUtils
.structuredProperty(
cleanedPid,
OafMapperUtils
.qualifier(
DOI, DOI_CLASSNAME,
ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES),
null)));
r.setInstance(Arrays.asList(inst));
r
.setDataInfo(
OafMapperUtils
.dataInfo(
false, null, true,
false,
OafMapperUtils
.qualifier(
ModelConstants.PROVENANCE_ENRICH,
null,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
null));
return r;
}, Encoders.bean(Result.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "/bip");
}
private static List<Measure> getMeasure(BipScore value) {
return value
.getScoreList()
.stream()
.map(score -> {
Measure m = new Measure();
m.setId(score.getId());
m
.setUnit(
score
.getUnit()
.stream()
.map(unit -> {
KeyValue kv = new KeyValue();
kv.setValue(unit.getValue());
kv.setKey(unit.getKey());
kv
.setDataInfo(
OafMapperUtils
.dataInfo(
false,
UPDATE_DATA_INFO_TYPE,
true,
false,
OafMapperUtils
.qualifier(
UPDATE_MEASURE_BIP_CLASS_ID,
UPDATE_CLASS_NAME,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
""));
return kv;
})
.collect(Collectors.toList()));
return m;
})
.collect(Collectors.toList());
}
}

View File

@ -1,99 +0,0 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.FOSDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
public class PrepareFOSSparkJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PrepareFOSSparkJob.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareFOSSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String sourcePath = parser.get("sourcePath");
log.info("sourcePath: {}", sourcePath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
distributeFOSdois(
spark,
sourcePath,
outputPath);
});
}
private static void distributeFOSdois(SparkSession spark, String sourcePath, String outputPath) {
Dataset<FOSDataModel> fosDataset = readPath(spark, sourcePath, FOSDataModel.class);
fosDataset
.groupByKey((MapFunction<FOSDataModel, String>) v -> v.getDoi().toLowerCase(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, FOSDataModel, Result>) (k, it) -> {
Result r = new Result();
FOSDataModel first = it.next();
r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
HashSet<String> level1 = new HashSet<>();
HashSet<String> level2 = new HashSet<>();
HashSet<String> level3 = new HashSet<>();
addLevels(level1, level2, level3, first);
it.forEachRemaining(v -> addLevels(level1, level2, level3, v));
List<StructuredProperty> sbjs = new ArrayList<>();
level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID)));
r.setSubject(sbjs);
return r;
}, Encoders.bean(Result.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "/fos");
}
private static void addLevels(HashSet<String> level1, HashSet<String> level2, HashSet<String> level3,
FOSDataModel first) {
level1.add(first.getLevel1());
level2.add(first.getLevel2());
level3.add(first.getLevel3());
}
}
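A short sketch of the grouping step above, under the assumption that two FOS rows share the same DOI (identifiers and labels are hypothetical):
// Both rows map to the same unresolved id, so their levels end up as subjects of one Result.
FOSDataModel row1 = FOSDataModel.newInstance("10.1234/abcd", "natural sciences", "biological sciences", "cell biology");
FOSDataModel row2 = FOSDataModel.newInstance("10.1234/abcd", "natural sciences", "computer and information sciences", "artificial intelligence");
// after groupByKey/mapGroups: a single Result with id = DHPUtils.generateUnresolvedIdentifier("10.1234/abcd", DOI)
// whose subject list contains the distinct level1/level2/level3 values, each built via getSubject(...)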

View File

@ -1,89 +0,0 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.actionmanager.createunresolvedentities.model.SDGDataModel;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
public class PrepareSDGSparkJob implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PrepareSDGSparkJob.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareSDGSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/prepare_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String sourcePath = parser.get("sourcePath");
log.info("sourcePath: {}", sourcePath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
doPrepare(
spark,
sourcePath,
outputPath);
});
}
private static void doPrepare(SparkSession spark, String sourcePath, String outputPath) {
Dataset<SDGDataModel> sdgDataset = readPath(spark, sourcePath, SDGDataModel.class);
sdgDataset
.groupByKey((MapFunction<SDGDataModel, String>) r -> r.getDoi().toLowerCase(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, SDGDataModel, Result>) (k, it) -> {
Result r = new Result();
r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI));
SDGDataModel first = it.next();
List<StructuredProperty> sbjs = new ArrayList<>();
sbjs.add(getSubject(first.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID));
it
.forEachRemaining(
s -> sbjs
.add(getSubject(s.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)));
r.setSubject(sbjs);
return r;
}, Encoders.bean(Result.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "/sdg");
}
}

View File

@ -1,79 +0,0 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import static eu.dnetlib.dhp.actionmanager.Constants.*;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Result;
public class SparkSaveUnresolved implements Serializable {
private static final Logger log = LoggerFactory.getLogger(SparkSaveUnresolved.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
PrepareFOSSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/produce_unresolved_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String sourcePath = parser.get("sourcePath");
log.info("sourcePath: {}", sourcePath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
saveUnresolved(
spark,
sourcePath,
outputPath);
});
}
private static void saveUnresolved(SparkSession spark, String sourcePath, String outputPath) {
spark
.read()
.textFile(sourcePath + "/*")
.map(
(MapFunction<String, Result>) l -> OBJECT_MAPPER.readValue(l, Result.class),
Encoders.bean(Result.class))
.groupByKey((MapFunction<Result, String>) Result::getId, Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Result, Result>) (k, it) -> {
Result ret = it.next();
it.forEachRemaining(r -> ret.mergeFrom(r));
return ret;
}, Encoders.bean(Result.class))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
}
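A minimal sketch of what the groupByKey/mapGroups merge above does for a single unresolved id, assuming one partial record from the bip branch and one from the fos branch (the id format shown is an assumption for illustration only):
Result fromBip = new Result();
fromBip.setId("unresolved::10.1234/abcd::doi"); // exact unresolved-id format is assumed here
Result fromFos = new Result();
fromFos.setId(fromBip.getId());
// saveUnresolved groups the records by id and folds them into one:
fromBip.mergeFrom(fromFos); // fromBip now carries the union of the two partial records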

View File

@ -1,71 +0,0 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
import java.io.Serializable;
import com.opencsv.bean.CsvBindByPosition;
public class FOSDataModel implements Serializable {
@CsvBindByPosition(position = 0)
// @CsvBindByName(column = "doi")
private String doi;
@CsvBindByPosition(position = 1)
// @CsvBindByName(column = "level1")
private String level1;
@CsvBindByPosition(position = 2)
// @CsvBindByName(column = "level2")
private String level2;
@CsvBindByPosition(position = 3)
// @CsvBindByName(column = "level3")
private String level3;
public FOSDataModel() {
}
public FOSDataModel(String doi, String level1, String level2, String level3) {
this.doi = doi;
this.level1 = level1;
this.level2 = level2;
this.level3 = level3;
}
public static FOSDataModel newInstance(String d, String level1, String level2, String level3) {
return new FOSDataModel(d, level1, level2, level3);
}
public String getDoi() {
return doi;
}
public void setDoi(String doi) {
this.doi = doi;
}
public String getLevel1() {
return level1;
}
public void setLevel1(String level1) {
this.level1 = level1;
}
public String getLevel2() {
return level2;
}
public void setLevel2(String level2) {
this.level2 = level2;
}
public String getLevel3() {
return level3;
}
public void setLevel3(String level3) {
this.level3 = level3;
}
}

View File

@ -1,47 +0,0 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities.model;
import java.io.Serializable;
import com.opencsv.bean.CsvBindByPosition;
public class SDGDataModel implements Serializable {
@CsvBindByPosition(position = 0)
// @CsvBindByName(column = "doi")
private String doi;
@CsvBindByPosition(position = 1)
// @CsvBindByName(column = "sdg")
private String sbj;
public SDGDataModel() {
}
public SDGDataModel(String doi, String sbj) {
this.doi = doi;
this.sbj = sbj;
}
public static SDGDataModel newInstance(String d, String sbj) {
return new SDGDataModel(d, sbj);
}
public String getDoi() {
return doi;
}
public void setDoi(String doi) {
this.doi = doi;
}
public String getSbj() {
return sbj;
}
public void setSbj(String sbj) {
this.sbj = sbj;
}
}

View File

@ -1,10 +1,12 @@
package eu.dnetlib.dhp.datacite package eu.dnetlib.dhp.actionmanager.datacite
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.http.client.config.RequestConfig import org.apache.http.client.config.RequestConfig
import org.apache.http.client.methods.{HttpGet, HttpPost, HttpUriRequest} import org.apache.http.client.methods.{HttpGet, HttpPost, HttpRequestBase, HttpUriRequest}
import org.apache.http.entity.StringEntity import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.HttpClientBuilder import org.apache.http.impl.client.{HttpClientBuilder, HttpClients}
import java.io.IOException
abstract class AbstractRestClient extends Iterator[String]{ abstract class AbstractRestClient extends Iterator[String]{
@ -15,10 +17,12 @@ abstract class AbstractRestClient extends Iterator[String] {
var complete:Boolean = false var complete:Boolean = false
def extractInfo(input: String): Unit def extractInfo(input: String): Unit
protected def getBufferData(): Unit protected def getBufferData(): Unit
def doHTTPGETRequest(url:String): String = { def doHTTPGETRequest(url:String): String = {
val httpGet = new HttpGet(url) val httpGet = new HttpGet(url)
doHTTPRequest(httpGet) doHTTPRequest(httpGet)
@ -40,6 +44,7 @@ abstract class AbstractRestClient extends Iterator[String] {
buffer.nonEmpty && current_index < buffer.size buffer.nonEmpty && current_index < buffer.size
} }
override def next(): String = { override def next(): String = {
val next_item:String = buffer(current_index) val next_item:String = buffer(current_index)
current_index = current_index + 1 current_index = current_index + 1
@ -48,16 +53,16 @@ abstract class AbstractRestClient extends Iterator[String] {
next_item next_item
} }
private def doHTTPRequest[A <: HttpUriRequest](r: A) :String ={ private def doHTTPRequest[A <: HttpUriRequest](r: A) :String ={
val timeout = 60; // seconds val timeout = 60; // seconds
val config = RequestConfig val config = RequestConfig.custom()
.custom()
.setConnectTimeout(timeout * 1000) .setConnectTimeout(timeout * 1000)
.setConnectionRequestTimeout(timeout * 1000) .setConnectionRequestTimeout(timeout * 1000)
.setSocketTimeout(timeout * 1000) .setSocketTimeout(timeout * 1000).build()
.build()
val client =HttpClientBuilder.create().setDefaultRequestConfig(config).build() val client =HttpClientBuilder.create().setDefaultRequestConfig(config).build()
try {
var tries = 4 var tries = 4
while (tries > 0) { while (tries > 0) {
println(s"requesting ${r.getURI}") println(s"requesting ${r.getURI}")
@ -66,7 +71,8 @@ abstract class AbstractRestClient extends Iterator[String] {
println(s"get response with status${response.getStatusLine.getStatusCode}") println(s"get response with status${response.getStatusLine.getStatusCode}")
if (response.getStatusLine.getStatusCode > 400) { if (response.getStatusLine.getStatusCode > 400) {
tries -= 1 tries -= 1
} else }
else
return IOUtils.toString(response.getEntity.getContent) return IOUtils.toString(response.getEntity.getContent)
} catch { } catch {
case e: Throwable => case e: Throwable =>
@ -76,11 +82,6 @@ abstract class AbstractRestClient extends Iterator[String] {
} }
} }
"" ""
} finally {
if (client != null)
client.close()
} }
}
getBufferData() getBufferData()
} }

View File

@ -1,7 +1,7 @@
package eu.dnetlib.dhp.datacite package eu.dnetlib.dhp.actionmanager.datacite
import org.json4s.jackson.JsonMethods.{compact, parse, render}
import org.json4s.{DefaultFormats, JValue} import org.json4s.{DefaultFormats, JValue}
import org.json4s.jackson.JsonMethods.{compact, parse, render}
class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient { class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until:Long = -1) extends AbstractRestClient {
@ -24,9 +24,7 @@ class DataciteAPIImporter(timestamp: Long = 0, blocks: Long = 10, until: Long =
override def getBufferData(): Unit = { override def getBufferData(): Unit = {
if (!complete) { if (!complete) {
val response = val response = if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get) else doHTTPGETRequest(get_url())
if (scroll_value.isDefined) doHTTPGETRequest(scroll_value.get)
else doHTTPGETRequest(get_url())
extractInfo(response) extractInfo(response)
} }
} }

View File

@ -0,0 +1,607 @@
package eu.dnetlib.dhp.actionmanager.datacite
import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.action.AtomicAction
import eu.dnetlib.dhp.schema.common.ModelConstants
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
import eu.dnetlib.dhp.schema.oaf.{AccessRight, Author, DataInfo, Instance, KeyValue, Oaf, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
import eu.dnetlib.dhp.utils.DHPUtils
import org.apache.commons.lang3.StringUtils
import org.json4s.DefaultFormats
import org.json4s.JsonAST.{JField, JObject, JString}
import org.json4s.jackson.JsonMethods.parse
import java.nio.charset.CodingErrorAction
import java.text.SimpleDateFormat
import java.time.LocalDate
import java.time.chrono.ThaiBuddhistDate
import java.time.format.DateTimeFormatter
import java.util.regex.Pattern
import java.util.{Date, Locale}
import scala.collection.JavaConverters._
import scala.io.{Codec, Source}
import scala.language.postfixOps
case class DataciteType(doi: String, timestamp: Long, isActive: Boolean, json: String) {}
case class RelatedIdentifierType(relationType: String, relatedIdentifier: String, relatedIdentifierType: String) {}
case class NameIdentifiersType(nameIdentifierScheme: Option[String], schemeUri: Option[String], nameIdentifier: Option[String]) {}
case class CreatorType(nameType: Option[String], nameIdentifiers: Option[List[NameIdentifiersType]], name: Option[String], familyName: Option[String], givenName: Option[String], affiliation: Option[List[String]]) {}
case class TitleType(title: Option[String], titleType: Option[String], lang: Option[String]) {}
case class SubjectType(subject: Option[String], subjectScheme: Option[String]) {}
case class DescriptionType(descriptionType: Option[String], description: Option[String]) {}
case class FundingReferenceType(funderIdentifierType: Option[String], awardTitle: Option[String], awardUri: Option[String], funderName: Option[String], funderIdentifier: Option[String], awardNumber: Option[String]) {}
case class DateType(date: Option[String], dateType: Option[String]) {}
case class HostedByMapType(openaire_id: String, datacite_name: String, official_name: String, similarity: Option[Float]) {}
object DataciteToOAFTransformation {
val REL_TYPE_VALUE:String = "resultResult"
val DATE_RELATION_KEY = "RelationDate"
val subRelTypeMapping: Map[String,(String,String)] = Map(
"References" ->("IsReferencedBy","relationship"),
"IsSupplementTo" ->("IsSupplementedBy","supplement"),
"IsPartOf" ->("HasPart","part"),
"HasPart" ->("IsPartOf","part"),
"IsVersionOf" ->("HasVersion","version"),
"HasVersion" ->("IsVersionOf","version"),
"IsIdenticalTo" ->("IsIdenticalTo","relationship"),
"IsPreviousVersionOf" ->("IsNewVersionOf","version"),
"IsContinuedBy" ->("Continues","relationship"),
"Continues" ->("IsContinuedBy","relationship"),
"IsNewVersionOf" ->("IsPreviousVersionOf","version"),
"IsSupplementedBy" ->("IsSupplementTo","supplement"),
"IsDocumentedBy" ->("Documents","relationship"),
"IsSourceOf" ->("IsDerivedFrom","relationship"),
"Cites" ->("IsCitedBy","citation"),
"IsCitedBy" ->("Cites","citation"),
"IsDerivedFrom" ->("IsSourceOf","relationship"),
"IsVariantFormOf" ->("IsDerivedFrom","version"),
"IsReferencedBy" ->("References","relationship"),
"IsObsoletedBy" ->("IsNewVersionOf","version"),
"Reviews" ->("IsReviewedBy","review"),
"Documents" ->("IsDocumentedBy","relationship"),
"IsCompiledBy" ->("Compiles","relationship"),
"Compiles" ->("IsCompiledBy","relationship"),
"IsReviewedBy" ->("Reviews","review")
)
implicit val codec: Codec = Codec("UTF-8")
codec.onMalformedInput(CodingErrorAction.REPLACE)
codec.onUnmappableCharacter(CodingErrorAction.REPLACE)
val DOI_CLASS = "doi"
val SUBJ_CLASS = "keywords"
val j_filter: List[String] = {
val s = Source.fromInputStream(getClass.getResourceAsStream("datacite_filter")).mkString
s.lines.toList
}
val mapper = new ObjectMapper()
val unknown_repository: HostedByMapType = HostedByMapType(ModelConstants.UNKNOWN_REPOSITORY_ORIGINALID, ModelConstants.UNKNOWN_REPOSITORY.getValue, ModelConstants.UNKNOWN_REPOSITORY.getValue, Some(1.0F))
val dataInfo: DataInfo = generateDataInfo("0.9")
val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, "Datacite")
val hostedByMap: Map[String, HostedByMapType] = {
val s = Source.fromInputStream(getClass.getResourceAsStream("hostedBy_map.json")).mkString
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(s)
json.extract[Map[String, HostedByMapType]]
}
val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern("[MM-dd-yyyy][MM/dd/yyyy][dd-MM-yy][dd-MMM-yyyy][dd/MMM/yyyy][dd-MMM-yy][dd/MMM/yy][dd-MM-yy][dd/MM/yy][dd-MM-yyyy][dd/MM/yyyy][yyyy-MM-dd][yyyy/MM/dd]", Locale.ENGLISH)
val df_it: DateTimeFormatter = DateTimeFormatter.ofPattern("[dd-MM-yyyy][dd/MM/yyyy]", Locale.ITALIAN)
val funder_regex: List[(Pattern, String)] = List(
(Pattern.compile("(info:eu-repo/grantagreement/ec/h2020/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda__h2020::"),
(Pattern.compile("(info:eu-repo/grantagreement/ec/fp7/)(\\d\\d\\d\\d\\d\\d)(.*)", Pattern.MULTILINE | Pattern.CASE_INSENSITIVE), "40|corda_______::")
)
val Date_regex: List[Pattern] = List(
//Y-M-D
Pattern.compile("(18|19|20)\\d\\d([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])", Pattern.MULTILINE),
//M-D-Y
Pattern.compile("((0[1-9]|1[012])|([1-9]))([- /.])(0[1-9]|[12][0-9]|3[01])([- /.])(18|19|20)?\\d\\d", Pattern.MULTILINE),
//D-M-Y
Pattern.compile("(?:(?:31(/|-|\\.)(?:0?[13578]|1[02]|(?:Jan|Mar|May|Jul|Aug|Oct|Dec)))\\1|(?:(?:29|30)(/|-|\\.)(?:0?[1,3-9]|1[0-2]|(?:Jan|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec))\\2))(?:(?:1[6-9]|[2-9]\\d)?\\d{2})|(?:29(/|-|\\.)(?:0?2|(?:Feb))\\3(?:(?:(?:1[6-9]|[2-9]\\d)?(?:0[48]|[2468][048]|[13579][26])|(?:(?:16|[2468][048]|[3579][26])00))))|(?:0?[1-9]|1\\d|2[0-8])(/|-|\\.)(?:(?:0?[1-9]|(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep))|(?:1[0-2]|(?:Oct|Nov|Dec)))\\4(?:(?:1[6-9]|[2-9]\\d)?\\d{2})", Pattern.MULTILINE),
//Y
Pattern.compile("(19|20)\\d\\d", Pattern.MULTILINE)
)
def filter_json(json: String): Boolean = {
j_filter.exists(f => json.contains(f))
}
def toActionSet(item: Oaf): (String, String) = {
val mapper = new ObjectMapper()
item match {
case dataset: OafDataset =>
val a: AtomicAction[OafDataset] = new AtomicAction[OafDataset]
a.setClazz(classOf[OafDataset])
a.setPayload(dataset)
(dataset.getClass.getCanonicalName, mapper.writeValueAsString(a))
case publication: Publication =>
val a: AtomicAction[Publication] = new AtomicAction[Publication]
a.setClazz(classOf[Publication])
a.setPayload(publication)
(publication.getClass.getCanonicalName, mapper.writeValueAsString(a))
case software: Software =>
val a: AtomicAction[Software] = new AtomicAction[Software]
a.setClazz(classOf[Software])
a.setPayload(software)
(software.getClass.getCanonicalName, mapper.writeValueAsString(a))
case orp: OtherResearchProduct =>
val a: AtomicAction[OtherResearchProduct] = new AtomicAction[OtherResearchProduct]
a.setClazz(classOf[OtherResearchProduct])
a.setPayload(orp)
(orp.getClass.getCanonicalName, mapper.writeValueAsString(a))
case relation: Relation =>
val a: AtomicAction[Relation] = new AtomicAction[Relation]
a.setClazz(classOf[Relation])
a.setPayload(relation)
(relation.getClass.getCanonicalName, mapper.writeValueAsString(a))
case _ =>
null
}
}
def embargo_end(embargo_end_date: String): Boolean = {
val dt = LocalDate.parse(embargo_end_date, DateTimeFormatter.ofPattern("[yyyy-MM-dd]"))
val td = LocalDate.now()
td.isAfter(dt)
}
def extract_date(input: String): Option[String] = {
val d = Date_regex.map(pattern => {
val matcher = pattern.matcher(input)
if (matcher.find())
matcher.group(0)
else
null
}
).find(s => s != null)
if (d.isDefined) {
val a_date = if (d.get.length == 4) s"01-01-${d.get}" else d.get
try {
return Some(LocalDate.parse(a_date, df_en).toString)
} catch {
case _: Throwable => try {
return Some(LocalDate.parse(a_date, df_it).toString)
} catch {
case _: Throwable =>
return None
}
}
}
d
}
def fix_thai_date(input:String, format:String) :String = {
try {
val a_date = LocalDate.parse(input,DateTimeFormatter.ofPattern(format))
val d = ThaiBuddhistDate.of(a_date.getYear, a_date.getMonth.getValue, a_date.getDayOfMonth)
LocalDate.from(d).toString
} catch {
case _: Throwable => ""
}
}
def getTypeQualifier(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): (Qualifier, Qualifier) = {
if (resourceType != null && resourceType.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceType)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
}
if (schemaOrg != null && schemaOrg.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, schemaOrg)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
}
if (resourceTypeGeneral != null && resourceTypeGeneral.nonEmpty) {
val typeQualifier = vocabularies.getSynonymAsQualifier(ModelConstants.DNET_PUBLICATION_RESOURCE, resourceTypeGeneral)
if (typeQualifier != null)
return (typeQualifier, vocabularies.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, typeQualifier.getClassid))
}
null
}
def getResult(resourceType: String, resourceTypeGeneral: String, schemaOrg: String, vocabularies: VocabularyGroup): Result = {
val typeQualifiers: (Qualifier, Qualifier) = getTypeQualifier(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
if (typeQualifiers == null)
return null
val i = new Instance
i.setInstancetype(typeQualifiers._1)
typeQualifiers._2.getClassname match {
case "dataset" =>
val r = new OafDataset
r.setInstance(List(i).asJava)
return r
case "publication" =>
val r = new Publication
r.setInstance(List(i).asJava)
return r
case "software" =>
val r = new Software
r.setInstance(List(i).asJava)
return r
case "other" =>
val r = new OtherResearchProduct
r.setInstance(List(i).asJava)
return r
}
null
}
def available_date(input: String): Boolean = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
val l: List[String] = for {
JObject(dates) <- json \\ "dates"
JField("dateType", JString(dateTypes)) <- dates
} yield dateTypes
l.exists(p => p.equalsIgnoreCase("available"))
}
/**
* As described in ticket #6377:
* when the result comes from Figshare we need to remove its subjects
* and set the access rights to OPEN.
*
* @param r the result to fix in place
*/
def fix_figshare(r: Result): Unit = {
if (r.getInstance() != null) {
val hosted_by_figshare = r.getInstance().asScala.exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue))
if (hosted_by_figshare) {
r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT()))
val l: List[StructuredProperty] = List()
r.setSubject(l.asJava)
}
}
}
def createDNetTargetIdentifier(pid: String, pidType: String, idPrefix: String): String = {
val f_part = s"$idPrefix|${pidType.toLowerCase}".padTo(15, '_')
s"$f_part::${IdentifierFactory.md5(pid.toLowerCase)}"
}
def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = {
OafMapperUtils.structuredProperty(dt, q, null)
}
def generateRelation(sourceId: String, targetId: String, relClass: String, cf: KeyValue, di: DataInfo): Relation = {
val r = new Relation
r.setSource(sourceId)
r.setTarget(targetId)
r.setRelType(ModelConstants.RESULT_PROJECT)
r.setRelClass(relClass)
r.setSubRelType(ModelConstants.OUTCOME)
r.setCollectedfrom(List(cf).asJava)
r.setDataInfo(di)
r
}
def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = {
val match_pattern = funder_regex.find(s => s._1.matcher(awardUri).find())
if (match_pattern.isDefined) {
val m = match_pattern.get._1
val p = match_pattern.get._2
val grantId = m.matcher(awardUri).replaceAll("$2")
val targetId = s"$p${DHPUtils.md5(grantId)}"
List(
generateRelation(sourceId, targetId, "isProducedBy", DATACITE_COLLECTED_FROM, dataInfo),
generateRelation(targetId, sourceId, "produces", DATACITE_COLLECTED_FROM, dataInfo)
)
}
else
List()
}
def generateOAF(input: String, ts: Long, dateOfCollection: Long, vocabularies: VocabularyGroup, exportLinks: Boolean): List[Oaf] = {
if (filter_json(input))
return List()
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json = parse(input)
val resourceType = (json \ "attributes" \ "types" \ "resourceType").extractOrElse[String](null)
val resourceTypeGeneral = (json \ "attributes" \ "types" \ "resourceTypeGeneral").extractOrElse[String](null)
val schemaOrg = (json \ "attributes" \ "types" \ "schemaOrg").extractOrElse[String](null)
val doi = (json \ "attributes" \ "doi").extract[String]
if (doi.isEmpty)
return List()
//Mapping type based on vocabularies dnet:publication_resource and dnet:result_typologies
val result = getResult(resourceType, resourceTypeGeneral, schemaOrg, vocabularies)
if (result == null)
return List()
val doi_q = OafMapperUtils.qualifier("doi", "doi", ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES)
val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo)
result.setPid(List(pid).asJava)
result.setId(OafMapperUtils.createOpenaireId(50, s"datacite____::$doi", true))
result.setOriginalId(List(doi).asJava)
val d = new Date(dateOfCollection * 1000)
val ISO8601FORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.US)
result.setDateofcollection(ISO8601FORMAT.format(d))
result.setDateoftransformation(ISO8601FORMAT.format(d))
result.setDataInfo(dataInfo)
val creators = (json \\ "creators").extractOrElse[List[CreatorType]](List())
val authors = creators.zipWithIndex.map { case (c, idx) =>
val a = new Author
a.setFullname(c.name.orNull)
a.setName(c.givenName.orNull)
a.setSurname(c.familyName.orNull)
if (c.nameIdentifiers != null && c.nameIdentifiers.isDefined && c.nameIdentifiers.get != null) {
a.setPid(c.nameIdentifiers.get.map(ni => {
val q = if (ni.nameIdentifierScheme.isDefined) vocabularies.getTermAsQualifier(ModelConstants.DNET_PID_TYPES, ni.nameIdentifierScheme.get.toLowerCase()) else null
if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) {
OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo)
}
else
null
}
)
.asJava)
}
if (c.affiliation.isDefined)
a.setAffiliation(c.affiliation.get.filter(af => af.nonEmpty).map(af => OafMapperUtils.field(af, dataInfo)).asJava)
a.setRank(idx + 1)
a
}
val titles: List[TitleType] = (json \\ "titles").extractOrElse[List[TitleType]](List())
result.setTitle(titles.filter(t => t.title.nonEmpty).map(t => {
if (t.titleType.isEmpty) {
OafMapperUtils.structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null)
} else {
OafMapperUtils.structuredProperty(t.title.get, t.titleType.get, t.titleType.get, ModelConstants.DNET_DATACITE_TITLE, ModelConstants.DNET_DATACITE_TITLE, null)
}
}).asJava)
if (authors == null || authors.isEmpty || !authors.exists(a => a != null))
return List()
result.setAuthor(authors.asJava)
val dates = (json \\ "dates").extract[List[DateType]]
val publication_year = (json \\ "publicationYear").extractOrElse[String](null)
val i_date = dates
.filter(d => d.date.isDefined && d.dateType.isDefined)
.find(d => d.dateType.get.equalsIgnoreCase("issued"))
.map(d => extract_date(d.date.get))
val a_date: Option[String] = dates
.filter(d => d.date.isDefined && d.dateType.isDefined && d.dateType.get.equalsIgnoreCase("available"))
.map(d => extract_date(d.date.get))
.find(d => d != null && d.isDefined)
.map(d => d.get)
if (a_date.isDefined) {
if(doi.startsWith("10.14457"))
result.setEmbargoenddate(OafMapperUtils.field(fix_thai_date(a_date.get,"[yyyy-MM-dd]"), null))
else
result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null))
}
if (i_date.isDefined && i_date.get.isDefined) {
if(doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(i_date.get.get,"[yyyy-MM-dd]"), null))
}
else {
result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null))
}
}
else if (publication_year != null) {
if(doi.startsWith("10.14457")) {
result.setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year","[dd-MM-yyyy]"), null))
} else {
result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null))
}
}
result.setRelevantdate(dates.filter(d => d.date.isDefined && d.dateType.isDefined)
.map(d => (extract_date(d.date.get), d.dateType.get))
.filter(d => d._1.isDefined)
.map(d => (d._1.get, vocabularies.getTermAsQualifier(ModelConstants.DNET_DATACITE_DATE, d._2.toLowerCase())))
.filter(d => d._2 != null)
.map(d => generateOAFDate(d._1, d._2)).asJava)
val subjects = (json \\ "subjects").extract[List[SubjectType]]
result.setSubject(subjects.filter(s => s.subject.nonEmpty)
.map(s =>
OafMapperUtils.structuredProperty(s.subject.get, SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES, null)
).asJava)
result.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
val descriptions = (json \\ "descriptions").extract[List[DescriptionType]]
result.setDescription(
descriptions
.filter(d => d.description.isDefined).
map(d =>
OafMapperUtils.field(d.description.get, null)
).filter(s => s != null).asJava)
val publisher = (json \\ "publisher").extractOrElse[String](null)
if (publisher != null)
result.setPublisher(OafMapperUtils.field(publisher, null))
val language: String = (json \\ "language").extractOrElse[String](null)
if (language != null)
result.setLanguage(vocabularies.getSynonymAsQualifier(ModelConstants.DNET_LANGUAGES, language))
val instance = result.getInstance().get(0)
val client = (json \ "relationships" \ "client" \\ "id").extractOpt[String]
val accessRights: List[String] = for {
JObject(rightsList) <- json \\ "rightsList"
JField("rightsUri", JString(rightsUri)) <- rightsList
} yield rightsUri
val aRights: Option[AccessRight] = accessRights.map(r => {
vocabularies.getSynonymAsQualifier(ModelConstants.DNET_ACCESS_MODES, r)
}).find(q => q != null).map(q => {
val a = new AccessRight
a.setClassid(q.getClassid)
a.setClassname(q.getClassname)
a.setSchemeid(q.getSchemeid)
a.setSchemename(q.getSchemename)
a
})
val access_rights_qualifier = if (aRights.isDefined) aRights.get else OafMapperUtils.accessRight(ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, ModelConstants.DNET_ACCESS_MODES, ModelConstants.DNET_ACCESS_MODES)
if (client.isDefined) {
val hb = hostedByMap.getOrElse(client.get.toUpperCase(), unknown_repository)
instance.setHostedby(OafMapperUtils.keyValue(generateDSId(hb.openaire_id), hb.official_name))
instance.setCollectedfrom(DATACITE_COLLECTED_FROM)
instance.setUrl(List(s"https://dx.doi.org/$doi").asJava)
instance.setAccessright(access_rights_qualifier)
instance.setPid(result.getPid)
val license = accessRights
.find(r => r.startsWith("http") && r.matches(".*(/licenses|/publicdomain|unlicense\\.org/|/legal-and-data-protection-notices|/download/license|/open-government-licence).*"))
if (license.isDefined)
instance.setLicense(OafMapperUtils.field(license.get, null))
}
val awardUris: List[String] = for {
JObject(fundingReferences) <- json \\ "fundingReferences"
JField("awardUri", JString(awardUri)) <- fundingReferences
} yield awardUri
result.setId(IdentifierFactory.createIdentifier(result))
var relations: List[Relation] = awardUris.flatMap(a => get_projectRelation(a, result.getId)).filter(r => r != null)
fix_figshare(result)
if (result.getId == null)
return List()
if (exportLinks) {
val rels: List[RelatedIdentifierType] = for {
JObject(relIdentifier) <- json \\ "relatedIdentifiers"
JField("relationType", JString(relationType)) <- relIdentifier
JField("relatedIdentifierType", JString(relatedIdentifierType)) <- relIdentifier
JField("relatedIdentifier", JString(relatedIdentifier)) <- relIdentifier
} yield RelatedIdentifierType(relationType, relatedIdentifier, relatedIdentifierType)
relations = relations ::: generateRelations(rels,result.getId, if (i_date.isDefined && i_date.get.isDefined) i_date.get.get else null)
}
if (relations != null && relations.nonEmpty) {
List(result) ::: relations
}
else
List(result)
}
private def generateRelations(rels: List[RelatedIdentifierType], id:String, date:String):List[Relation] = {
rels
.filter(r =>
subRelTypeMapping.contains(r.relationType) && (
r.relatedIdentifierType.equalsIgnoreCase("doi") ||
r.relatedIdentifierType.equalsIgnoreCase("pmid") ||
r.relatedIdentifierType.equalsIgnoreCase("arxiv"))
)
.map(r => {
val rel = new Relation
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.setDataInfo(dataInfo)
val subRelType = subRelTypeMapping(r.relationType)._2
rel.setRelType(REL_TYPE_VALUE)
rel.setSubRelType(subRelType)
rel.setRelClass(r.relationType)
val dateProps:KeyValue = OafMapperUtils.keyValue(DATE_RELATION_KEY, date)
rel.setProperties(List(dateProps).asJava)
rel.setSource(id)
rel.setTarget(s"unresolved::${r.relatedIdentifier}::${r.relatedIdentifierType}")
rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava)
rel.getCollectedfrom.asScala.map(c => c.getValue)(collection.breakOut)
rel
})(collection breakOut)
}
def generateDataInfo(trust: String): DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
di.setTrust(trust)
di.setProvenanceaction(ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER)
di
}
def generateDSId(input: String): String = {
val b = StringUtils.substringBefore(input, "::")
val a = StringUtils.substringAfter(input, "::")
s"10|$b::${DHPUtils.md5(a)}"
}
}
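A minimal, hypothetical sketch (inputs invented for illustration, not part of the diff) of what the date-normalisation helpers defined above are expected to return; it assumes the enclosing object is DataciteToOAFTransformation, as the jobs below import it under that name.
// Hypothetical usage sketch: shows how extract_date is expected to normalise
// the layouts matched by Date_regex into ISO-8601 strings.
object ExtractDateSketch {
  def main(args: Array[String]): Unit = {
    // Y-M-D layout is matched directly and parsed with df_en.
    println(DataciteToOAFTransformation.extract_date("deposited 2021-03-15T10:00:00Z")) // expected: Some(2021-03-15)
    // A bare year is padded to January 1st before parsing.
    println(DataciteToOAFTransformation.extract_date("Published online 2019"))          // expected: Some(2019-01-01)
    // Strings with no recognisable date yield None.
    println(DataciteToOAFTransformation.extract_date("no date here"))                   // expected: None
  }
}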

View File

@ -0,0 +1,41 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.Oaf
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object ExportActionSetJobNode {
val log: Logger = LoggerFactory.getLogger(ExportActionSetJobNode.getClass)
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/exportDataset_parameters.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val spark: SparkSession = SparkSession.builder().config(conf)
.appName(ExportActionSetJobNode.getClass.getSimpleName)
.master(master)
.getOrCreate()
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val tEncoder:Encoder[(String,String)] = Encoders.tuple(Encoders.STRING,Encoders.STRING)
spark.read.load(sourcePath).as[Oaf]
.map(o =>DataciteToOAFTransformation.toActionSet(o))
.filter(o => o!= null)
.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$targetPath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
}
}

View File

@ -0,0 +1,46 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord
import eu.dnetlib.dhp.schema.oaf.{Oaf, Result}
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object FilterCrossrefEntitiesSpark {
val log: Logger = LoggerFactory.getLogger(getClass.getClass)
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/filter_crossref_param.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
log.info("sourcePath: {}", sourcePath)
val targetPath = parser.get("targetPath")
log.info("targetPath: {}", targetPath)
val spark: SparkSession = SparkSession.builder().config(conf)
.appName(getClass.getSimpleName)
.master(master)
.getOrCreate()
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val resEncoder: Encoder[Result] = Encoders.kryo[Result]
val d:Dataset[Oaf]= spark.read.load(sourcePath).as[Oaf]
d.filter(r => r.isInstanceOf[Result]).map(r => r.asInstanceOf[Result]).write.mode(SaveMode.Overwrite).save(targetPath)
}
}

View File

@ -0,0 +1,48 @@
package eu.dnetlib.dhp.actionmanager.datacite
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
import eu.dnetlib.dhp.schema.mdstore.MetadataRecord
import eu.dnetlib.dhp.schema.oaf.Oaf
import eu.dnetlib.dhp.utils.ISLookupClientFactory
import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory}
import scala.io.Source
object GenerateDataciteDatasetSpark {
val log: Logger = LoggerFactory.getLogger(GenerateDataciteDatasetSpark.getClass)
def main(args: Array[String]): Unit = {
val conf = new SparkConf
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/generate_dataset_params.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
val sourcePath = parser.get("sourcePath")
val targetPath = parser.get("targetPath")
val exportLinks = "true".equalsIgnoreCase(parser.get("exportLinks"))
val isLookupUrl: String = parser.get("isLookupUrl")
log.info("isLookupUrl: {}", isLookupUrl)
val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl)
val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService)
val spark: SparkSession = SparkSession.builder().config(conf)
.appName(GenerateDataciteDatasetSpark.getClass.getSimpleName)
.master(master)
.getOrCreate()
implicit val mrEncoder: Encoder[MetadataRecord] = Encoders.kryo[MetadataRecord]
implicit val resEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
import spark.implicits._
spark.read.load(sourcePath).as[DataciteType]
.filter(d => d.isActive)
.flatMap(d => DataciteToOAFTransformation.generateOAF(d.json, d.timestamp, d.timestamp, vocabularies, exportLinks))
.filter(d => d != null)
.write.mode(SaveMode.Overwrite).save(targetPath)
}
}

View File

@ -1,5 +1,6 @@
- package eu.dnetlib.dhp.datacite
+ package eu.dnetlib.dhp.actionmanager.datacite
+ import eu.dnetlib.dhp.actionmanager.datacite.DataciteToOAFTransformation.df_it
import eu.dnetlib.dhp.application.ArgumentApplicationParser
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, LocalFileSystem, Path}
@ -8,20 +9,21 @@ import org.apache.hadoop.io.{IntWritable, SequenceFile, Text}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.expressions.Aggregator
- import org.apache.spark.sql.functions.max
import org.apache.spark.sql.{Dataset, Encoder, SaveMode, SparkSession}
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods.parse
+ import org.apache.spark.sql.functions.max
import org.slf4j.{Logger, LoggerFactory}
- import java.time.format.DateTimeFormatter.ISO_DATE_TIME
+ import java.time.format.DateTimeFormatter._
- import java.time.{LocalDateTime, ZoneOffset}
+ import java.time.{LocalDate, LocalDateTime, ZoneOffset}
import scala.io.Source
object ImportDatacite {
val log: Logger = LoggerFactory.getLogger(ImportDatacite.getClass)
def convertAPIStringToDataciteItem(input: String): DataciteType = {
implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
lazy val json: org.json4s.JValue = parse(input)
@ -31,26 +33,14 @@ object ImportDatacite {
val timestamp_string = (json \ "attributes" \ "updated").extract[String]
val dt = LocalDateTime.parse(timestamp_string, ISO_DATE_TIME)
- DataciteType(
- doi = doi,
- timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000,
- isActive = isActive,
- json = input
- )
+ DataciteType(doi = doi, timestamp = dt.toInstant(ZoneOffset.UTC).toEpochMilli / 1000, isActive = isActive, json = input)
}
def main(args: Array[String]): Unit = {
- val parser = new ArgumentApplicationParser(
- Source
- .fromInputStream(
- getClass.getResourceAsStream(
- "/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json"
- )
- )
- .mkString
- )
+ val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/datacite/import_from_api.json")).mkString)
parser.parseArgument(args)
val master = parser.get("master")
@ -71,8 +61,7 @@ object ImportDatacite {
val spkipImport = parser.get("skipImport")
log.info(s"skipImport is $spkipImport")
- val spark: SparkSession = SparkSession
- .builder()
+ val spark: SparkSession = SparkSession.builder()
.appName(ImportDatacite.getClass.getSimpleName)
.master(master)
.getOrCreate()
@ -90,8 +79,8 @@ object ImportDatacite {
import spark.implicits._
- val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] =
- new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
+ val dataciteAggregator: Aggregator[DataciteType, DataciteType, DataciteType] = new Aggregator[DataciteType, DataciteType, DataciteType] with Serializable {
override def zero: DataciteType = null
@ -122,16 +111,13 @@ object ImportDatacite {
println(s"last Timestamp is $ts")
- val cnt =
- if ("true".equalsIgnoreCase(spkipImport)) 1
- else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
+ val cnt = if ("true".equalsIgnoreCase(spkipImport)) 1 else writeSequenceFile(hdfsTargetPath, ts, conf, bs)
println(s"Imported from Datacite API $cnt documents")
if (cnt > 0) {
- val inputRdd: RDD[DataciteType] = sc
- .sequenceFile(targetPath, classOf[Int], classOf[Text])
+ val inputRdd: RDD[DataciteType] = sc.sequenceFile(targetPath, classOf[Int], classOf[Text])
.map(s => s._2.toString)
.map(s => convertAPIStringToDataciteItem(s))
spark.createDataset(inputRdd).write.mode(SaveMode.Overwrite).save(s"${targetPath}_dataset")
@ -144,9 +130,7 @@ object ImportDatacite {
.agg(dataciteAggregator.toColumn)
.map(s => s._2)
.repartition(4000)
- .write
- .mode(SaveMode.Overwrite)
- .save(s"${dataciteDump}_updated")
+ .write.mode(SaveMode.Overwrite).save(s"${dataciteDump}_updated")
val fs = FileSystem.get(sc.hadoopConfiguration)
fs.delete(new Path(s"$dataciteDump"), true)
@ -154,24 +138,14 @@ object ImportDatacite {
}
}
- private def writeSequenceFile(
- hdfsTargetPath: Path,
- timestamp: Long,
- conf: Configuration,
- bs: Int
- ): Long = {
+ private def writeSequenceFile(hdfsTargetPath: Path, timestamp: Long, conf: Configuration, bs:Int): Long = {
var from:Long = timestamp * 1000
val delta:Long = 100000000L
var client: DataciteAPIImporter = null
val now :Long =System.currentTimeMillis()
var i = 0
try {
- val writer = SequenceFile.createWriter(
- conf,
- SequenceFile.Writer.file(hdfsTargetPath),
- SequenceFile.Writer.keyClass(classOf[IntWritable]),
- SequenceFile.Writer.valueClass(classOf[Text])
- )
+ val writer = SequenceFile.createWriter(conf, SequenceFile.Writer.file(hdfsTargetPath), SequenceFile.Writer.keyClass(classOf[IntWritable]), SequenceFile.Writer.valueClass(classOf[Text]))
try {
var start: Long = System.currentTimeMillis
while (from < now) {
@ -180,16 +154,16 @@ object ImportDatacite {
val key: IntWritable = new IntWritable(i)
val value: Text = new Text
while (client.hasNext) {
- key.set {
+ key.set({
i += 1;
i - 1
- }
+ })
value.set(client.next())
writer.append(key, value)
writer.hflush()
if (i % 1000 == 0) {
end = System.currentTimeMillis
- val time = (end - start) / 1000.0f
+ val time = (end - start) / 1000.0F
println(s"Imported $i in $time seconds")
start = System.currentTimeMillis
}
@ -201,7 +175,8 @@ object ImportDatacite {
case e: Throwable =>
println("Error", e)
} finally if (writer != null) writer.close()
- } catch {
+ }
+ catch {
case e: Throwable =>
log.error("Error", e)
}

View File

@ -1,181 +0,0 @@
package eu.dnetlib.dhp.actionmanager.opencitations;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.io.Serializable;
import java.util.*;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import scala.Tuple2;
public class CreateActionSetSparkJob implements Serializable {
public static final String OPENCITATIONS_CLASSID = "sysimport:crosswalk:opencitations";
public static final String OPENCITATIONS_CLASSNAME = "Imported from OpenCitations";
private static final String ID_PREFIX = "50|doi_________::";
private static final String TRUST = "0.91";
private static final Logger log = LoggerFactory.getLogger(CreateActionSetSparkJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(final String[] args) throws IOException, ParseException {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
Objects
.requireNonNull(
CreateActionSetSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/opencitations/as_parameters.json"))));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath");
log.info("inputPath {}", inputPath.toString());
final String outputPath = parser.get("outputPath");
log.info("outputPath {}", outputPath);
final boolean shouldDuplicateRels = Optional
.ofNullable(parser.get("shouldDuplicateRels"))
.map(Boolean::valueOf)
.orElse(Boolean.FALSE);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
extractContent(spark, inputPath, outputPath, shouldDuplicateRels);
});
}
private static void extractContent(SparkSession spark, String inputPath, String outputPath,
boolean shouldDuplicateRels) {
spark
.sqlContext()
.createDataset(spark.sparkContext().textFile(inputPath + "/*", 6000), Encoders.STRING())
.flatMap(
(FlatMapFunction<String, Relation>) value -> createRelation(value, shouldDuplicateRels).iterator(),
Encoders.bean(Relation.class))
.filter((FilterFunction<Relation>) value -> value != null)
.toJavaRDD()
.map(p -> new AtomicAction(p.getClass(), p))
.mapToPair(
aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()),
new Text(OBJECT_MAPPER.writeValueAsString(aa))))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
}
private static List<Relation> createRelation(String value, boolean duplicate) {
String[] line = value.split(",");
if (!line[1].startsWith("10.")) {
return new ArrayList<>();
}
List<Relation> relationList = new ArrayList<>();
String citing = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[1]));
final String cited = ID_PREFIX + IdentifierFactory.md5(CleaningFunctions.normalizePidValue("doi", line[2]));
relationList
.addAll(
getRelations(
citing,
cited));
if (duplicate && line[1].endsWith(".refs")) {
citing = ID_PREFIX + IdentifierFactory
.md5(CleaningFunctions.normalizePidValue("doi", line[1].substring(0, line[1].indexOf(".refs"))));
relationList.addAll(getRelations(citing, cited));
}
return relationList;
}
private static Collection<Relation> getRelations(String citing, String cited) {
return Arrays
.asList(
getRelation(citing, cited, ModelConstants.CITES),
getRelation(cited, citing, ModelConstants.IS_CITED_BY));
}
public static Relation getRelation(
String source,
String target,
String relclass) {
Relation r = new Relation();
r.setCollectedfrom(getCollectedFrom());
r.setSource(source);
r.setTarget(target);
r.setRelClass(relclass);
r.setRelType(ModelConstants.RESULT_RESULT);
r.setSubRelType(ModelConstants.CITATION);
r
.setDataInfo(
getDataInfo());
return r;
}
public static List<KeyValue> getCollectedFrom() {
KeyValue kv = new KeyValue();
kv.setKey(ModelConstants.OPENOCITATIONS_ID);
kv.setValue(ModelConstants.OPENOCITATIONS_NAME);
return Arrays.asList(kv);
}
public static DataInfo getDataInfo() {
DataInfo di = new DataInfo();
di.setInferred(false);
di.setDeletedbyinference(false);
di.setTrust(TRUST);
di
.setProvenanceaction(
getQualifier(OPENCITATIONS_CLASSID, OPENCITATIONS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS));
return di;
}
public static Qualifier getQualifier(String class_id, String class_name,
String qualifierSchema) {
Qualifier pa = new Qualifier();
pa.setClassid(class_id);
pa.setClassname(class_name);
pa.setSchemeid(qualifierSchema);
pa.setSchemename(qualifierSchema);
return pa;
}
}

View File

@ -1,93 +0,0 @@
package eu.dnetlib.dhp.actionmanager.opencitations;
import java.io.*;
import java.io.Serializable;
import java.util.Objects;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.commons.cli.ParseException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class GetOpenCitationsRefs implements Serializable {
private static final Logger log = LoggerFactory.getLogger(GetOpenCitationsRefs.class);
public static void main(final String[] args) throws IOException, ParseException {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
Objects
.requireNonNull(
GetOpenCitationsRefs.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/opencitations/input_parameters.json"))));
parser.parseArgument(args);
final String[] inputFile = parser.get("inputFile").split(";");
log.info("inputFile {}", inputFile.toString());
final String workingPath = parser.get("workingPath");
log.info("workingPath {}", workingPath);
final String hdfsNameNode = parser.get("hdfsNameNode");
log.info("hdfsNameNode {}", hdfsNameNode);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
GetOpenCitationsRefs ocr = new GetOpenCitationsRefs();
for (String file : inputFile) {
ocr.doExtract(workingPath + "/Original/" + file, workingPath, fileSystem);
}
}
private void doExtract(String inputFile, String workingPath, FileSystem fileSystem)
throws IOException {
final Path path = new Path(inputFile);
FSDataInputStream oc_zip = fileSystem.open(path);
int count = 1;
try (ZipInputStream zis = new ZipInputStream(oc_zip)) {
ZipEntry entry = null;
while ((entry = zis.getNextEntry()) != null) {
if (!entry.isDirectory()) {
String fileName = entry.getName();
fileName = fileName.substring(0, fileName.indexOf("T")) + "_" + count;
count++;
try (
FSDataOutputStream out = fileSystem
.create(new Path(workingPath + "/COCI/" + fileName + ".gz"));
GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
IOUtils.copy(zis, gzipOs);
}
}
}
}
}
}

View File

@ -20,7 +20,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
- import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProgramme;
+ import eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import scala.Tuple2;
@ -171,23 +171,26 @@ public class PrepareProgramme {
}
private static CSVProgramme groupProgrammeByCode(CSVProgramme a, CSVProgramme b) {
- if (!a.getLanguage().equals("en") && b.getLanguage().equalsIgnoreCase("en")) {
+ if (!a.getLanguage().equals("en")) {
+ if (b.getLanguage().equalsIgnoreCase("en")) {
a.setTitle(b.getTitle());
a.setLanguage(b.getLanguage());
}
- if (StringUtils.isEmpty(a.getShortTitle()) && !StringUtils.isEmpty(b.getShortTitle())) {
+ }
+ if (StringUtils.isEmpty(a.getShortTitle())) {
+ if (!StringUtils.isEmpty(b.getShortTitle())) {
a.setShortTitle(b.getShortTitle());
}
+ }
return a;
}
- @SuppressWarnings("unchecked")
private static List<CSVProgramme> prepareClassification(JavaRDD<CSVProgramme> h2020Programmes) {
Object[] codedescription = h2020Programmes
.map(
value -> new Tuple2<>(value.getCode(),
- new Tuple2<>(value.getTitle(), value.getShortTitle())))
+ new Tuple2<String, String>(value.getTitle(), value.getShortTitle())))
.collect()
.toArray();
@ -213,7 +216,7 @@ public class PrepareProgramme {
String[] tmp = ent.split("\\.");
if (tmp.length <= 2) {
if (StringUtils.isEmpty(entry._2()._2())) {
- map.put(entry._1(), new Tuple2<>(entry._2()._1(), entry._2()._1()));
+ map.put(entry._1(), new Tuple2<String, String>(entry._2()._1(), entry._2()._1()));
} else {
map.put(entry._1(), entry._2());
}

View File

@ -18,7 +18,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
- import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject;
+ import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import scala.Tuple2;
@ -29,7 +29,7 @@ import scala.Tuple2;
*/
public class PrepareProjects {
- private static final Logger log = LoggerFactory.getLogger(PrepareProjects.class);
+ private static final Logger log = LoggerFactory.getLogger(PrepareProgramme.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {

View File

@ -31,16 +31,15 @@ import eu.dnetlib.dhp.common.DbClient;
*/
public class ReadProjectsFromDB implements Closeable {
- private static final Log log = LogFactory.getLog(ReadProjectsFromDB.class);
- private static final String query = "SELECT code " +
- "from projects where id like 'corda__h2020%' ";
private final DbClient dbClient;
+ private static final Log log = LogFactory.getLog(ReadProjectsFromDB.class);
private final Configuration conf;
private final BufferedWriter writer;
private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+ private final static String query = "SELECT code " +
+ "from projects where id like 'corda__h2020%' ";
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
@ -66,9 +65,9 @@ public class ReadProjectsFromDB implements Closeable {
}
}
- public void execute(final String sql, final Function<ResultSet, List<ProjectSubset>> producer) {
+ public void execute(final String sql, final Function<ResultSet, List<ProjectSubset>> producer) throws Exception {
- final Consumer<ResultSet> consumer = rs -> producer.apply(rs).forEach(this::writeProject);
+ final Consumer<ResultSet> consumer = rs -> producer.apply(rs).forEach(r -> writeProject(r));
dbClient.processResults(sql, consumer);
}
@ -95,20 +94,20 @@ public class ReadProjectsFromDB implements Closeable {
public ReadProjectsFromDB(
final String hdfsPath, String hdfsNameNode, final String dbUrl, final String dbUser, final String dbPassword)
- throws IOException {
+ throws Exception {
this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);
this.conf = new Configuration();
this.conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(this.conf);
Path hdfsWritePath = new Path(hdfsPath);
+ FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, false);
}
- FSDataOutputStream fos = fileSystem.create(hdfsWritePath);
+ fsDataOutputStream = fileSystem.create(hdfsWritePath);
- this.writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8));
+ this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
}
@Override

View File

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.actionmanager.project;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Arrays;
+ import java.util.HashMap;
import java.util.Objects;
import java.util.Optional;
@ -21,16 +22,15 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
- import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProgramme;
+ import eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme;
- import eu.dnetlib.dhp.actionmanager.project.utils.model.CSVProject;
+ import eu.dnetlib.dhp.actionmanager.project.utils.CSVProject;
- import eu.dnetlib.dhp.actionmanager.project.utils.model.EXCELTopic;
+ import eu.dnetlib.dhp.actionmanager.project.utils.EXCELTopic;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.H2020Classification;
import eu.dnetlib.dhp.schema.oaf.H2020Programme;
- import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.utils.DHPUtils;
import scala.Tuple2;
@ -47,10 +47,13 @@ import scala.Tuple2;
*
* To produce one single entry for each project code a step of grouping is needed: each project can be associated to more
* than one programme.
+ *
+ *
*/
public class SparkAtomicActionJob {
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+ private static final HashMap<String, String> programmeMap = new HashMap<>();
public static void main(String[] args) throws Exception {
@ -134,6 +137,7 @@ public class SparkAtomicActionJob {
h2020classification.setClassification(csvProgramme.getClassification());
h2020classification.setH2020Programme(pm);
setLevelsandProgramme(h2020classification, csvProgramme.getClassification_short());
+ // setProgramme(h2020classification, ocsvProgramme.get().getClassification());
pp.setH2020classification(Arrays.asList(h2020classification));
return pp;
@ -148,16 +152,20 @@ public class SparkAtomicActionJob {
.map((MapFunction<Tuple2<Project, EXCELTopic>, Project>) p -> {
Optional<EXCELTopic> op = Optional.ofNullable(p._2());
Project rp = p._1();
- op.ifPresent(excelTopic -> rp.setH2020topicdescription(excelTopic.getTitle()));
+ if (op.isPresent()) {
+ rp.setH2020topicdescription(op.get().getTitle());
+ }
return rp;
}, Encoders.bean(Project.class))
.filter(Objects::nonNull)
.groupByKey(
- (MapFunction<Project, String>) OafEntity::getId,
+ (MapFunction<Project, String>) p -> p.getId(),
Encoders.STRING())
.mapGroups((MapGroupsFunction<String, Project, Project>) (s, it) -> {
Project first = it.next();
- it.forEachRemaining(first::mergeFrom);
+ it.forEachRemaining(p -> {
+ first.mergeFrom(p);
+ });
return first;
}, Encoders.bean(Project.class))
.toJavaRDD()
@ -181,6 +189,12 @@ public class SparkAtomicActionJob {
h2020Classification.getH2020Programme().setDescription(tmp[tmp.length - 1]);
}
+ // private static void setProgramme(H2020Classification h2020Classification, String classification) {
+ // String[] tmp = classification.split(" \\| ");
+ //
+ // h2020Classification.getH2020Programme().setDescription(tmp[tmp.length - 1]);
+ // }
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark

View File

@ -0,0 +1,40 @@
package eu.dnetlib.dhp.actionmanager.project.utils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.lang.reflect.FieldUtils;
/**
* Reads a generic csv and maps it into classes that mirror its schema
*/
public class CSVParser {
public <R> List<R> parse(String csvFile, String classForName)
throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException {
final CSVFormat format = CSVFormat.EXCEL
.withHeader()
.withDelimiter(';')
.withQuote('"')
.withTrim();
List<R> ret = new ArrayList<>();
final org.apache.commons.csv.CSVParser parser = org.apache.commons.csv.CSVParser.parse(csvFile, format);
final Set<String> headers = parser.getHeaderMap().keySet();
Class<?> clazz = Class.forName(classForName);
for (CSVRecord csvRecord : parser.getRecords()) {
final Object cc = clazz.newInstance();
for (String header : headers) {
FieldUtils.writeField(cc, header, csvRecord.get(header), true);
}
ret.add((R) cc);
}
return ret;
}
}
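A small, hypothetical sketch (CSV content and values invented for illustration, not part of the diff) of how this reflection-based parser could be driven from Scala; it assumes the dump-side package eu.dnetlib.dhp.actionmanager.project.utils and relies on the header names matching the target bean's field names, since each header is written with FieldUtils.writeField.
// Hypothetical usage sketch: feeds an in-memory, ';'-delimited CSV (the format
// hard-coded above) into CSVParser and maps each record onto CSVProgramme.
import eu.dnetlib.dhp.actionmanager.project.utils.{CSVParser, CSVProgramme}
import scala.collection.JavaConverters._
object CSVParserSketch {
  def main(args: Array[String]): Unit = {
    val csv =
      "rcn;code;title;shortTitle;language\n" +
        "664;H2020-EU.1.1.;Excellent science;Excellent Science;en"
    val parser = new CSVParser()
    // The target class is named as a string and instantiated reflectively.
    val programmes = parser.parse[CSVProgramme](csv, "eu.dnetlib.dhp.actionmanager.project.utils.CSVProgramme")
    programmes.asScala.foreach(p => println(s"${p.getCode} -> ${p.getTitle}"))
  }
}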

View File

@ -1,32 +1,20 @@
- package eu.dnetlib.dhp.actionmanager.project.utils.model;
+ package eu.dnetlib.dhp.actionmanager.project.utils;
import java.io.Serializable;
- import com.opencsv.bean.CsvBindByName;
- import com.opencsv.bean.CsvIgnore;
/**
* The model for the programme csv file
*/
public class CSVProgramme implements Serializable {
- @CsvBindByName(column = "code")
+ private String rcn;
private String code;
- @CsvBindByName(column = "title")
private String title;
- @CsvBindByName(column = "shortTitle")
private String shortTitle;
- @CsvBindByName(column = "language")
private String language;
- @CsvIgnore
private String classification;
- @CsvIgnore
private String classification_short;
public String getClassification_short() {
@ -45,6 +33,14 @@ public class CSVProgramme implements Serializable {
this.classification = classification;
}
+ public String getRcn() {
+ return rcn;
+ }
+ public void setRcn(String rcn) {
+ this.rcn = rcn;
+ }
public String getCode() {
return code;
}
@ -77,4 +73,5 @@ public class CSVProgramme implements Serializable {
this.language = language;
}
+ //
}

View File

@ -0,0 +1,200 @@
package eu.dnetlib.dhp.actionmanager.project.utils;
import java.io.Serializable;
/**
* the model for the projects csv file
*/
public class CSVProject implements Serializable {
private String rcn;
private String id;
private String acronym;
private String status;
private String programme;
private String topics;
private String frameworkProgramme;
private String title;
private String startDate;
private String endDate;
private String projectUrl;
private String objective;
private String totalCost;
private String ecMaxContribution;
private String call;
private String fundingScheme;
private String coordinator;
private String coordinatorCountry;
private String participants;
private String participantCountries;
private String subjects;
public String getRcn() {
return rcn;
}
public void setRcn(String rcn) {
this.rcn = rcn;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getAcronym() {
return acronym;
}
public void setAcronym(String acronym) {
this.acronym = acronym;
}
public String getStatus() {
return status;
}
public void setStatus(String status) {
this.status = status;
}
public String getProgramme() {
return programme;
}
public void setProgramme(String programme) {
this.programme = programme;
}
public String getTopics() {
return topics;
}
public void setTopics(String topics) {
this.topics = topics;
}
public String getFrameworkProgramme() {
return frameworkProgramme;
}
public void setFrameworkProgramme(String frameworkProgramme) {
this.frameworkProgramme = frameworkProgramme;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getStartDate() {
return startDate;
}
public void setStartDate(String startDate) {
this.startDate = startDate;
}
public String getEndDate() {
return endDate;
}
public void setEndDate(String endDate) {
this.endDate = endDate;
}
public String getProjectUrl() {
return projectUrl;
}
public void setProjectUrl(String projectUrl) {
this.projectUrl = projectUrl;
}
public String getObjective() {
return objective;
}
public void setObjective(String objective) {
this.objective = objective;
}
public String getTotalCost() {
return totalCost;
}
public void setTotalCost(String totalCost) {
this.totalCost = totalCost;
}
public String getEcMaxContribution() {
return ecMaxContribution;
}
public void setEcMaxContribution(String ecMaxContribution) {
this.ecMaxContribution = ecMaxContribution;
}
public String getCall() {
return call;
}
public void setCall(String call) {
this.call = call;
}
public String getFundingScheme() {
return fundingScheme;
}
public void setFundingScheme(String fundingScheme) {
this.fundingScheme = fundingScheme;
}
public String getCoordinator() {
return coordinator;
}
public void setCoordinator(String coordinator) {
this.coordinator = coordinator;
}
public String getCoordinatorCountry() {
return coordinatorCountry;
}
public void setCoordinatorCountry(String coordinatorCountry) {
this.coordinatorCountry = coordinatorCountry;
}
public String getParticipants() {
return participants;
}
public void setParticipants(String participants) {
this.participants = participants;
}
public String getParticipantCountries() {
return participantCountries;
}
public void setParticipantCountries(String participantCountries) {
this.participantCountries = participantCountries;
}
public String getSubjects() {
return subjects;
}
public void setSubjects(String subjects) {
this.subjects = subjects;
}
}

View File

@ -17,8 +17,6 @@ import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
- import eu.dnetlib.dhp.actionmanager.project.utils.model.EXCELTopic;
/**
* Reads a generic excel file and maps it into classes that mirror its schema
*/
@ -28,12 +26,13 @@ public class EXCELParser {
throws ClassNotFoundException, IOException, IllegalAccessException, InstantiationException,
InvalidFormatException {
- try (OPCPackage pkg = OPCPackage.open(file); XSSFWorkbook wb = new XSSFWorkbook(pkg)) {
+ OPCPackage pkg = OPCPackage.open(file);
+ XSSFWorkbook wb = new XSSFWorkbook(pkg);
XSSFSheet sheet = wb.getSheet(sheetName);
- if (sheet == null) {
+ if (sheetName == null) {
- throw new IllegalArgumentException("Sheet name " + sheetName + " not present in current file");
+ throw new RuntimeException("Sheet name " + sheetName + " not present in current file");
}
List<R> ret = new ArrayList<>();
@ -74,6 +73,5 @@ public class EXCELParser {
return ret;
}
- }
}

View File

@ -1,5 +1,5 @@
- package eu.dnetlib.dhp.actionmanager.project.utils.model;
+ package eu.dnetlib.dhp.actionmanager.project.utils;
import java.io.Serializable;

View File

@@ -1,21 +1,34 @@
 package eu.dnetlib.dhp.actionmanager.project.utils;
-import java.io.*;
-import java.util.Optional;
+import java.io.BufferedWriter;
+import java.io.Closeable;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.nio.charset.StandardCharsets;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.collection.GetCSV;
-import eu.dnetlib.dhp.common.collection.HttpConnector2;
+import eu.dnetlib.dhp.collection.HttpConnector2;
 /**
  * Applies the parsing of a csv file and writes the Serialization of it in hdfs
  */
-public class ReadCSV {
+public class ReadCSV implements Closeable {
+	private static final Log log = LogFactory.getLog(ReadCSV.class);
+	private final Configuration conf;
+	private final BufferedWriter writer;
+	private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+	private final String csvFile;
 	public static void main(final String[] args) throws Exception {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
@@ -31,22 +44,56 @@ public class ReadCSV {
 		final String hdfsPath = parser.get("hdfsPath");
 		final String hdfsNameNode = parser.get("hdfsNameNode");
 		final String classForName = parser.get("classForName");
-		Optional<String> delimiter = Optional.ofNullable(parser.get("delimiter"));
-		char del = ';';
-		if (delimiter.isPresent())
-			del = delimiter.get().charAt(0);
-		Configuration conf = new Configuration();
-		conf.set("fs.defaultFS", hdfsNameNode);
-		FileSystem fileSystem = FileSystem.get(conf);
-		BufferedReader reader = new BufferedReader(
-			new InputStreamReader(new HttpConnector2().getInputSourceAsStream(fileURL)));
-		GetCSV.getCsv(fileSystem, reader, hdfsPath, classForName, del);
-		reader.close();
+		try (final ReadCSV readCSV = new ReadCSV(hdfsPath, hdfsNameNode, fileURL)) {
+			log.info("Getting CSV file...");
+			readCSV.execute(classForName);
+		}
+	}
+	public void execute(final String classForName) throws Exception {
+		CSVParser csvParser = new CSVParser();
+		csvParser
+			.parse(csvFile, classForName)
+			.stream()
+			.forEach(p -> write(p));
 	}
+	@Override
+	public void close() throws IOException {
+		writer.close();
+	}
+	public ReadCSV(
+		final String hdfsPath,
+		final String hdfsNameNode,
+		final String fileURL)
+		throws Exception {
+		this.conf = new Configuration();
+		this.conf.set("fs.defaultFS", hdfsNameNode);
+		HttpConnector2 httpConnector = new HttpConnector2();
+		FileSystem fileSystem = FileSystem.get(this.conf);
+		Path hdfsWritePath = new Path(hdfsPath);
+		FSDataOutputStream fsDataOutputStream = null;
+		if (fileSystem.exists(hdfsWritePath)) {
+			fileSystem.delete(hdfsWritePath, false);
+		}
+		fsDataOutputStream = fileSystem.create(hdfsWritePath);
+		this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
+		this.csvFile = httpConnector.getInputSource(fileURL);
+	}
+	protected void write(final Object p) {
+		try {
+			writer.write(OBJECT_MAPPER.writeValueAsString(p));
+			writer.newLine();
+		} catch (final Exception e) {
+			throw new RuntimeException(e);
+		}
+	}
 }
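The rewritten ReadCSV above writes one JSON document per parsed bean to HDFS through a BufferedWriter layered over the stream returned by FileSystem.create. A minimal sketch of that write path, assuming only Hadoop and Jackson on the classpath; the namenode URI, output path, and sample payload are placeholders:

import java.io.BufferedWriter;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.fasterxml.jackson.databind.ObjectMapper;

public class HdfsJsonLinesWriter {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		conf.set("fs.defaultFS", "hdfs://localhost:8020"); // placeholder namenode
		FileSystem fileSystem = FileSystem.get(conf);

		Path hdfsWritePath = new Path("/tmp/projects.json"); // placeholder output path
		if (fileSystem.exists(hdfsWritePath)) {
			fileSystem.delete(hdfsWritePath, false); // start from an empty file, as the constructor above does
		}

		ObjectMapper mapper = new ObjectMapper();
		try (BufferedWriter writer = new BufferedWriter(
			new OutputStreamWriter(fileSystem.create(hdfsWritePath), StandardCharsets.UTF_8))) {
			// One JSON document per line, mirroring the write(Object) helper in the diff.
			writer.write(mapper.writeValueAsString(new String[] { "id", "programme", "topics" }));
			writer.newLine();
		}
	}
}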

View File

@@ -11,20 +11,18 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.collection.CollectorException;
-import eu.dnetlib.dhp.common.collection.HttpConnector2;
+import eu.dnetlib.dhp.collection.HttpConnector2;
 /**
  * Applies the parsing of an excel file and writes the Serialization of it in hdfs
  */
 public class ReadExcel implements Closeable {
-	private static final Log log = LogFactory.getLog(ReadExcel.class);
+	private static final Log log = LogFactory.getLog(ReadCSV.class);
+	private final Configuration conf;
 	private final BufferedWriter writer;
 	private final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
 	private final InputStream excelFile;
@@ -33,7 +31,7 @@ public class ReadExcel implements Closeable {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
-					ReadExcel.class
+					ReadCSV.class
 						.getResourceAsStream(
 							"/eu/dnetlib/dhp/actionmanager/project/parameters.json")));
@@ -53,15 +51,13 @@ public class ReadExcel implements Closeable {
 		}
 	}
-	public void execute(final String classForName, final String sheetName)
-		throws IOException, ClassNotFoundException, InvalidFormatException, IllegalAccessException,
-		InstantiationException {
+	public void execute(final String classForName, final String sheetName) throws Exception {
 		EXCELParser excelParser = new EXCELParser();
 		excelParser
 			.parse(excelFile, classForName, sheetName)
 			.stream()
-			.forEach(this::write);
+			.forEach(p -> write(p));
 	}
 	@Override
@@ -72,20 +68,20 @@ public class ReadExcel implements Closeable {
 	public ReadExcel(
 		final String hdfsPath,
 		final String hdfsNameNode,
-		final String fileURL) throws CollectorException, IOException {
-		final Configuration conf = new Configuration();
-		conf.set("fs.defaultFS", hdfsNameNode);
+		final String fileURL)
+		throws Exception {
+		this.conf = new Configuration();
+		this.conf.set("fs.defaultFS", hdfsNameNode);
 		HttpConnector2 httpConnector = new HttpConnector2();
-		FileSystem fileSystem = FileSystem.get(conf);
+		FileSystem fileSystem = FileSystem.get(this.conf);
 		Path hdfsWritePath = new Path(hdfsPath);
+		FSDataOutputStream fsDataOutputStream = null;
 		if (fileSystem.exists(hdfsWritePath)) {
 			fileSystem.delete(hdfsWritePath, false);
 		}
-		FSDataOutputStream fos = fileSystem.create(hdfsWritePath);
-		this.writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8));
+		fsDataOutputStream = fileSystem.create(hdfsWritePath);
+		this.writer = new BufferedWriter(new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8));
 		this.excelFile = httpConnector.getInputSourceAsStream(fileURL);
 	}

View File

@@ -1,46 +0,0 @@
-package eu.dnetlib.dhp.actionmanager.project.utils.model;
-import java.io.Serializable;
-import com.opencsv.bean.CsvBindByName;
-/**
- * the model for the projects csv file
- */
-public class CSVProject implements Serializable {
-	@CsvBindByName(column = "id")
-	private String id;
-	@CsvBindByName(column = "programme")
-	private String programme;
-	@CsvBindByName(column = "topics")
-	private String topics;
-	public String getId() {
-		return id;
-	}
-	public void setId(String id) {
-		this.id = id;
-	}
-	public String getProgramme() {
-		return programme;
-	}
-	public void setProgramme(String programme) {
-		this.programme = programme;
-	}
-	public String getTopics() {
-		return topics;
-	}
-	public void setTopics(String topics) {
-		this.topics = topics;
-	}
-}
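The deleted CSVProject bean above relies on opencsv's @CsvBindByName to map header names to fields. A hedged sketch of how such a bean is typically populated, assuming opencsv 5.x on the classpath; the nested ProjectRow bean and the inline semicolon-separated sample are illustrative stand-ins, not code from this repository:

import java.io.StringReader;
import java.util.List;

import com.opencsv.bean.CsvBindByName;
import com.opencsv.bean.CsvToBeanBuilder;

public class CsvBeanDemo {

	// Illustrative stand-in for the deleted CSVProject bean.
	public static class ProjectRow {
		@CsvBindByName(column = "id")
		private String id;
		@CsvBindByName(column = "programme")
		private String programme;
		@CsvBindByName(column = "topics")
		private String topics;

		public String getId() { return id; }
		public void setId(String id) { this.id = id; }
		public String getProgramme() { return programme; }
		public void setProgramme(String programme) { this.programme = programme; }
		public String getTopics() { return topics; }
		public void setTopics(String topics) { this.topics = topics; }
	}

	public static void main(String[] args) {
		// Made-up sample row; the header names match the @CsvBindByName columns.
		String csv = "id;programme;topics\n123456;H2020-EU.1.1.;ERC-2017-STG\n";

		List<ProjectRow> rows = new CsvToBeanBuilder<ProjectRow>(new StringReader(csv))
			.withType(ProjectRow.class)
			.withSeparator(';')
			.build()
			.parse();

		rows.forEach(r -> System.out.println(r.getId() + " -> " + r.getProgramme()));
	}
}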

View File

@@ -9,7 +9,6 @@ import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listKeyValues;
 import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier;
 import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty;
-import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -75,7 +74,7 @@ public class GenerateRorActionSetJob {
 		final String jsonConfiguration = IOUtils
 			.toString(
-				GenerateRorActionSetJob.class
+				SparkAtomicActionJob.class
 					.getResourceAsStream("/eu/dnetlib/dhp/actionmanager/ror/action_set_parameters.json"));
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
@@ -109,7 +108,7 @@ public class GenerateRorActionSetJob {
 	private static void processRorOrganizations(final SparkSession spark,
 		final String inputPath,
-		final String outputPath) throws IOException {
+		final String outputPath) throws Exception {
 		readInputPath(spark, inputPath)
 			.map(
@@ -204,7 +203,7 @@ public class GenerateRorActionSetJob {
 	private static Dataset<RorOrganization> readInputPath(
 		final SparkSession spark,
-		final String path) throws IOException {
+		final String path) throws Exception {
 		try (final FileSystem fileSystem = FileSystem.get(new Configuration());
 			final InputStream is = fileSystem.open(new Path(path))) {
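readInputPath above opens the dump file inside a try-with-resources over both the Hadoop FileSystem and the InputStream. A small sketch of the same resource handling that reads a whole file into a String, assuming commons-io is available; the default path is a placeholder:

import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ReadWholeHdfsFile {

	public static void main(String[] args) throws Exception {
		String path = args.length > 0 ? args[0] : "/tmp/ror_dump.json"; // placeholder path

		// Same shape as readInputPath above: both the FileSystem and the stream
		// are released when the block exits, on success or failure.
		try (FileSystem fileSystem = FileSystem.get(new Configuration());
			InputStream is = fileSystem.open(new Path(path))) {
			String content = IOUtils.toString(is, StandardCharsets.UTF_8);
			System.out.println("Read " + content.length() + " characters");
		}
	}
}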

View File

@@ -7,8 +7,6 @@ import com.fasterxml.jackson.annotation.JsonProperty;
 public class Address implements Serializable {
-	private static final long serialVersionUID = 2444635485253443195L;
 	@JsonProperty("lat")
 	private Float lat;
@@ -39,6 +37,8 @@ public class Address implements Serializable {
 	@JsonProperty("line")
 	private String line;
+	private final static long serialVersionUID = 2444635485253443195L;
 	public Float getLat() {
 		return lat;
 	}

View File

@@ -7,14 +7,14 @@ import com.fasterxml.jackson.annotation.JsonProperty;
 public class Country implements Serializable {
-	private static final long serialVersionUID = 4357848706229493627L;
 	@JsonProperty("country_code")
 	private String countryCode;
 	@JsonProperty("country_name")
 	private String countryName;
+	private final static long serialVersionUID = 4357848706229493627L;
 	public String getCountryCode() {
 		return countryCode;
 	}
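Address and Country above are plain Jackson-annotated models; the two branches only move the serialVersionUID declaration. A brief sketch of how such a snake_case JSON object is bound through @JsonProperty, using a stand-in bean and a made-up sample record:

import java.io.Serializable;

import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;

public class CountryJsonDemo {

	// Illustrative stand-in for the Country model; the serialVersionUID value is arbitrary here.
	public static class CountryBean implements Serializable {
		private static final long serialVersionUID = 1L;

		@JsonProperty("country_code")
		private String countryCode;

		@JsonProperty("country_name")
		private String countryName;

		public String getCountryCode() { return countryCode; }
		public String getCountryName() { return countryName; }
	}

	public static void main(String[] args) throws Exception {
		String json = "{\"country_code\":\"GR\",\"country_name\":\"Greece\"}"; // made-up sample
		CountryBean c = new ObjectMapper().readValue(json, CountryBean.class);
		System.out.println(c.getCountryCode() + " = " + c.getCountryName());
	}
}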

Some files were not shown because too many files have changed in this diff.