Merge remote-tracking branch 'upstream/master' into usage-stats-export-wf-v2

This commit is contained in:
dimitrispie 2021-11-29 08:21:32 +02:00
commit 195111119e
1129 changed files with 84119 additions and 25058 deletions

10
.gitignore vendored
View File

@ -7,6 +7,8 @@
*.iws
*~
.vscode
.metals
.bloop
.classpath
/*/.classpath
/*/*/.classpath
@ -23,4 +25,10 @@
/build
spark-warehouse
/**/job-override.properties
/**/*.log
<<<<<<< HEAD
/**/*.log
=======
/**/*.log
/**/.factorypath
>>>>>>> upstream/master

View File

@ -1,2 +1,2 @@
# dnet-hadoop
Dnet-hadoop is a tool for
Dnet-hadoop is the project that defined all the OOZIE workflows for the OpenAIRE Graph construction, processing, provisioning.

View File

@ -8,8 +8,6 @@ import java.util.List;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.maven.plugin.AbstractMojo;
import org.apache.maven.plugin.MojoExecutionException;
import org.apache.maven.plugin.MojoFailureException;
/**
* Generates oozie properties which were not provided from commandline.
@ -27,7 +25,7 @@ public class GenerateOoziePropertiesMojo extends AbstractMojo {
};
@Override
public void execute() throws MojoExecutionException, MojoFailureException {
public void execute() {
if (System.getProperties().containsKey(PROPERTY_NAME_WF_SOURCE_DIR)
&& !System.getProperties().containsKey(PROPERTY_NAME_SANDBOX_NAME)) {
String generatedSandboxName = generateSandboxName(
@ -46,24 +44,24 @@ public class GenerateOoziePropertiesMojo extends AbstractMojo {
/**
* Generates sandbox name from workflow source directory.
*
* @param wfSourceDir
* @param wfSourceDir workflow source directory
* @return generated sandbox name
*/
private String generateSandboxName(String wfSourceDir) {
// utilize all dir names until finding one of the limiters
List<String> sandboxNameParts = new ArrayList<String>();
List<String> sandboxNameParts = new ArrayList<>();
String[] tokens = StringUtils.split(wfSourceDir, File.separatorChar);
ArrayUtils.reverse(tokens);
if (tokens.length > 0) {
for (String token : tokens) {
for (String limiter : limiters) {
if (limiter.equals(token)) {
return sandboxNameParts.size() > 0
return !sandboxNameParts.isEmpty()
? StringUtils.join(sandboxNameParts.toArray())
: null;
}
}
if (sandboxNameParts.size() > 0) {
if (!sandboxNameParts.isEmpty()) {
sandboxNameParts.add(0, File.separator);
}
sandboxNameParts.add(0, token);

View File

@ -16,6 +16,7 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@ -289,7 +290,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo {
*/
protected List<String> getEscapeChars(String escapeChars) {
List<String> tokens = getListFromCSV(escapeChars);
List<String> realTokens = new ArrayList<String>();
List<String> realTokens = new ArrayList<>();
for (String token : tokens) {
String realToken = getRealToken(token);
realTokens.add(realToken);
@ -324,7 +325,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo {
* @return content
*/
protected String getContent(String comment, Properties properties, List<String> escapeTokens) {
List<String> names = new ArrayList<String>(properties.stringPropertyNames());
List<String> names = new ArrayList<>(properties.stringPropertyNames());
Collections.sort(names);
StringBuilder sb = new StringBuilder();
if (!StringUtils.isBlank(comment)) {
@ -352,7 +353,7 @@ public class WritePredefinedProjectProperties extends AbstractMojo {
throws MojoExecutionException {
try {
String content = getContent(comment, properties, escapeTokens);
FileUtils.writeStringToFile(file, content, ENCODING_UTF8);
FileUtils.writeStringToFile(file, content, StandardCharsets.UTF_8);
} catch (IOException e) {
throw new MojoExecutionException("Error creating properties file", e);
}
@ -399,9 +400,9 @@ public class WritePredefinedProjectProperties extends AbstractMojo {
*/
protected static final List<String> getListFromCSV(String csv) {
if (StringUtils.isBlank(csv)) {
return new ArrayList<String>();
return new ArrayList<>();
}
List<String> list = new ArrayList<String>();
List<String> list = new ArrayList<>();
String[] tokens = StringUtils.split(csv, ",");
for (String token : tokens) {
list.add(token.trim());

View File

@ -9,18 +9,18 @@ import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
/** @author mhorst, claudio.atzori */
public class GenerateOoziePropertiesMojoTest {
class GenerateOoziePropertiesMojoTest {
private final GenerateOoziePropertiesMojo mojo = new GenerateOoziePropertiesMojo();
@BeforeEach
public void clearSystemProperties() {
void clearSystemProperties() {
System.clearProperty(PROPERTY_NAME_SANDBOX_NAME);
System.clearProperty(PROPERTY_NAME_WF_SOURCE_DIR);
}
@Test
public void testExecuteEmpty() throws Exception {
void testExecuteEmpty() throws Exception {
// execute
mojo.execute();
@ -29,7 +29,7 @@ public class GenerateOoziePropertiesMojoTest {
}
@Test
public void testExecuteSandboxNameAlreadySet() throws Exception {
void testExecuteSandboxNameAlreadySet() throws Exception {
// given
String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers";
String sandboxName = "originalSandboxName";
@ -44,7 +44,7 @@ public class GenerateOoziePropertiesMojoTest {
}
@Test
public void testExecuteEmptyWorkflowSourceDir() throws Exception {
void testExecuteEmptyWorkflowSourceDir() throws Exception {
// given
String workflowSourceDir = "";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
@ -57,7 +57,7 @@ public class GenerateOoziePropertiesMojoTest {
}
@Test
public void testExecuteNullSandboxNameGenerated() throws Exception {
void testExecuteNullSandboxNameGenerated() throws Exception {
// given
String workflowSourceDir = "eu/dnetlib/dhp/";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
@ -70,7 +70,7 @@ public class GenerateOoziePropertiesMojoTest {
}
@Test
public void testExecute() throws Exception {
void testExecute() throws Exception {
// given
String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
@ -83,7 +83,7 @@ public class GenerateOoziePropertiesMojoTest {
}
@Test
public void testExecuteWithoutRoot() throws Exception {
void testExecuteWithoutRoot() throws Exception {
// given
String workflowSourceDir = "wf/transformers";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);

View File

@ -20,7 +20,7 @@ import org.mockito.junit.jupiter.MockitoExtension;
/** @author mhorst, claudio.atzori */
@ExtendWith(MockitoExtension.class)
public class WritePredefinedProjectPropertiesTest {
class WritePredefinedProjectPropertiesTest {
@Mock
private MavenProject mavenProject;
@ -39,7 +39,7 @@ public class WritePredefinedProjectPropertiesTest {
// ----------------------------------- TESTS ---------------------------------------------
@Test
public void testExecuteEmpty() throws Exception {
void testExecuteEmpty() throws Exception {
// execute
mojo.execute();
@ -50,7 +50,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteWithProjectProperties() throws Exception {
void testExecuteWithProjectProperties() throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -70,7 +70,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test()
public void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) {
void testExecuteWithProjectPropertiesAndInvalidOutputFile(@TempDir File testFolder) {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -84,7 +84,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception {
void testExecuteWithProjectPropertiesExclusion(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -108,7 +108,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception {
void testExecuteWithProjectPropertiesInclusion(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -132,7 +132,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception {
void testExecuteIncludingPropertyKeysFromFile(@TempDir File testFolder) throws Exception {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -164,7 +164,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder)
void testExecuteIncludingPropertyKeysFromClasspathResource(@TempDir File testFolder)
throws Exception {
// given
String key = "projectPropertyKey";
@ -194,7 +194,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteIncludingPropertyKeysFromBlankLocation() {
void testExecuteIncludingPropertyKeysFromBlankLocation() {
// given
String key = "projectPropertyKey";
String value = "projectPropertyValue";
@ -214,7 +214,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder)
void testExecuteIncludingPropertyKeysFromXmlFile(@TempDir File testFolder)
throws Exception {
// given
String key = "projectPropertyKey";
@ -247,7 +247,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder)
void testExecuteIncludingPropertyKeysFromInvalidXmlFile(@TempDir File testFolder)
throws Exception {
// given
String key = "projectPropertyKey";
@ -273,7 +273,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception {
void testExecuteWithQuietModeOn(@TempDir File testFolder) throws Exception {
// given
mojo.setQuiet(true);
mojo.setIncludePropertyKeysFromFiles(new String[] {
@ -290,7 +290,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteIncludingPropertyKeysFromInvalidFile() {
void testExecuteIncludingPropertyKeysFromInvalidFile() {
// given
mojo.setIncludePropertyKeysFromFiles(new String[] {
"invalid location"
@ -301,7 +301,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception {
void testExecuteWithEnvironmentProperties(@TempDir File testFolder) throws Exception {
// given
mojo.setIncludeEnvironmentVariables(true);
@ -318,7 +318,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception {
void testExecuteWithSystemProperties(@TempDir File testFolder) throws Exception {
// given
String key = "systemPropertyKey";
String value = "systemPropertyValue";
@ -337,7 +337,7 @@ public class WritePredefinedProjectPropertiesTest {
}
@Test
public void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder)
void testExecuteWithSystemPropertiesAndEscapeChars(@TempDir File testFolder)
throws Exception {
// given
String key = "systemPropertyKey ";

View File

@ -22,9 +22,20 @@
<id>dnet45-releases</id>
<url>https://maven.d4science.org/nexus/content/repositories/dnet45-releases</url>
</repository>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-build/dhp-code-style</url>
</site>
</distributionManagement>
<build>
<extensions>
<extension>
<groupId>org.apache.maven.wagon</groupId>
<artifactId>wagon-ssh</artifactId>
<version>2.10</version>
</extension>
</extensions>
<pluginManagement>
<plugins>
<plugin>
@ -35,7 +46,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-site-plugin</artifactId>
<version>3.7.1</version>
<version>3.9.1</version>
</plugin>
</plugins>
</pluginManagement>
@ -43,6 +54,7 @@
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<dhp.site.stage.path>sftp://dnet-hadoop@static-web.d4science.org/dnet-hadoop</dhp.site.stage.path>
</properties>
</project>

View File

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu ref="modules" />
<menu ref="reports"/>
</body>
</project>

View File

@ -10,6 +10,9 @@
<packaging>pom</packaging>
<description>This module is a container for the build tools used in dnet-hadoop</description>
<properties>
<maven.javadoc.skip>true</maven.javadoc.skip>
</properties>
<modules>
<module>dhp-code-style</module>
@ -17,4 +20,12 @@
<module>dhp-build-properties-maven-plugin</module>
</modules>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-build/</url>
</site>
</distributionManagement>
</project>

View File

@ -0,0 +1,22 @@
<?xml version="1.0" encoding="ISO-8859-1"?>
<project xmlns="http://maven.apache.org/DECORATION/1.8.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/DECORATION/1.8.0 https://maven.apache.org/xsd/decoration-1.8.0.xsd"
name="DHP-Aggregation">
<skin>
<groupId>org.apache.maven.skins</groupId>
<artifactId>maven-fluido-skin</artifactId>
<version>1.8</version>
</skin>
<poweredBy>
<logo name="OpenAIRE Research Graph" href="https://graph.openaire.eu/"
img="https://graph.openaire.eu/assets/common-assets/logo-large-graph.png"/>
</poweredBy>
<body>
<links>
<item name="Code" href="https://code-repo.d4science.org/" />
</links>
<menu ref="modules" />
<menu ref="reports"/>
</body>
</project>

View File

@ -7,12 +7,57 @@
<artifactId>dhp</artifactId>
<version>1.2.4-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>dhp-common</artifactId>
<packaging>jar</packaging>
<distributionManagement>
<site>
<id>DHPSite</id>
<url>${dhp.site.stage.path}/dhp-common</url>
</site>
</distributionManagement>
<description>This module contains common utilities meant to be used across the dnet-hadoop submodules</description>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>${net.alchim31.maven.version}</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
<execution>
<id>scala-doc</id>
<phase>process-resources</phase> <!-- or wherever -->
<goals>
<goal>doc</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
@ -20,6 +65,15 @@
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</dependency>
<dependency>
<groupId>com.github.sisyphsu</groupId>
<artifactId>dateparser</artifactId>
</dependency>
<dependency>
<groupId>me.xuender</groupId>
<artifactId>unidecode</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
@ -53,11 +107,6 @@
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/com.rabbitmq/amqp-client -->
<dependency>
<groupId>com.rabbitmq</groupId>
<artifactId>amqp-client</artifactId>
</dependency>
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
@ -98,10 +147,25 @@
<artifactId>dnet-pace-core</artifactId>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
</dependency>
<dependency>
<groupId>com.opencsv</groupId>
<artifactId>opencsv</artifactId>
</dependency>
</dependencies>
</project>

View File

@ -1,10 +1,7 @@
package eu.dnetlib.dhp.application;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Serializable;
import java.io.StringWriter;
import java.io.*;
import java.util.*;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
@ -12,17 +9,21 @@ import java.util.zip.GZIPOutputStream;
import org.apache.commons.cli.*;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
public class ArgumentApplicationParser implements Serializable {
private static final Logger log = LoggerFactory.getLogger(ArgumentApplicationParser.class);
private final Options options = new Options();
private final Map<String, String> objectMap = new HashMap<>();
private final List<String> compressedValues = new ArrayList<>();
public ArgumentApplicationParser(final String json_configuration) throws Exception {
public ArgumentApplicationParser(final String json_configuration) throws IOException {
final ObjectMapper mapper = new ObjectMapper();
final OptionsParameter[] configuration = mapper.readValue(json_configuration, OptionsParameter[].class);
createOptionMap(configuration);
@ -33,7 +34,6 @@ public class ArgumentApplicationParser implements Serializable {
}
private void createOptionMap(final OptionsParameter[] configuration) {
Arrays
.stream(configuration)
.map(
@ -47,10 +47,6 @@ public class ArgumentApplicationParser implements Serializable {
return o;
})
.forEach(options::addOption);
// HelpFormatter formatter = new HelpFormatter();
// formatter.printHelp("myapp", null, options, null, true);
}
public static String decompressValue(final String abstractCompressed) {
@ -60,13 +56,13 @@ public class ArgumentApplicationParser implements Serializable {
final StringWriter stringWriter = new StringWriter();
IOUtils.copy(gis, stringWriter);
return stringWriter.toString();
} catch (Throwable e) {
System.out.println("Wrong value to decompress:" + abstractCompressed);
throw new RuntimeException(e);
} catch (IOException e) {
log.error("Wrong value to decompress: {}", abstractCompressed);
throw new IllegalArgumentException(e);
}
}
public static String compressArgument(final String value) throws Exception {
public static String compressArgument(final String value) throws IOException {
ByteArrayOutputStream out = new ByteArrayOutputStream();
GZIPOutputStream gzip = new GZIPOutputStream(out);
gzip.write(value.getBytes());
@ -74,7 +70,7 @@ public class ArgumentApplicationParser implements Serializable {
return java.util.Base64.getEncoder().encodeToString(out.toByteArray());
}
public void parseArgument(final String[] args) throws Exception {
public void parseArgument(final String[] args) throws ParseException {
CommandLineParser parser = new BasicParser();
CommandLine cmd = parser.parse(options, args);
Arrays

View File

@ -9,9 +9,6 @@ public class OptionsParameter {
private boolean paramRequired;
private boolean compressed;
public OptionsParameter() {
}
public String getParamName() {
return paramName;
}

View File

@ -0,0 +1,72 @@
package eu.dnetlib.dhp.application
import scala.io.Source
/**
* This is the main Interface SparkApplication
* where all the Spark Scala class should inherit
*
*/
trait SparkScalaApplication {
/**
* This is the path in the classpath of the json
* describes all the argument needed to run
*/
val propertyPath: String
/**
* Utility to parse the arguments using the
* property json in the classpath identified from
* the variable propertyPath
*
* @param args the list of arguments
*/
def parseArguments(args: Array[String]): ArgumentApplicationParser = {
val parser = new ArgumentApplicationParser(Source.fromInputStream(getClass.getResourceAsStream(propertyPath)).mkString)
parser.parseArgument(args)
parser
}
/**
* Here all the spark applications runs this method
* where the whole logic of the spark node is defined
*/
def run(): Unit
}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.slf4j.Logger
abstract class AbstractScalaApplication (val propertyPath:String, val args:Array[String], log:Logger) extends SparkScalaApplication {
var parser: ArgumentApplicationParser = null
var spark:SparkSession = null
def initialize():SparkScalaApplication = {
parser = parseArguments(args)
spark = createSparkSession()
this
}
/**
* Utility for creating a spark session starting from parser
*
* @return a spark Session
*/
private def createSparkSession():SparkSession = {
require(parser!= null)
val conf:SparkConf = new SparkConf()
val master = parser.get("master")
log.info(s"Creating Spark session: Master: $master")
SparkSession.builder().config(conf)
.appName(getClass.getSimpleName)
.master(master)
.getOrCreate()
}
}

View File

@ -1,5 +1,5 @@
package eu.dnetlib.collector.worker.model;
package eu.dnetlib.dhp.collection;
import java.util.HashMap;
import java.util.Map;
@ -34,7 +34,7 @@ public class ApiDescriptor {
return params;
}
public void setParams(final HashMap<String, String> params) {
public void setParams(final Map<String, String> params) {
this.params = params;
}

View File

@ -12,6 +12,9 @@ public class Constants {
public static String COAR_ACCESS_RIGHT_SCHEMA = "http://vocabularies.coar-repositories.org/documentation/access_rights/";
private Constants() {
}
static {
accessRightsCoarMap.put("OPEN", "c_abf2");
accessRightsCoarMap.put("RESTRICTED", "c_16ec");
@ -27,4 +30,32 @@ public class Constants {
coarCodeLabelMap.put("c_f1cf", "EMBARGO");
}
public static final String SEQUENCE_FILE_NAME = "/sequence_file";
public static final String REPORT_FILE_NAME = "/report";
public static final String MDSTORE_DATA_PATH = "/store";
public static final String MDSTORE_SIZE_PATH = "/size";
public static final String COLLECTION_MODE = "collectionMode";
public static final String METADATA_ENCODING = "metadataEncoding";
public static final String OOZIE_WF_PATH = "oozieWfPath";
public static final String DNET_MESSAGE_MGR_URL = "dnetMessageManagerURL";
public static final String MAX_NUMBER_OF_RETRY = "maxNumberOfRetry";
public static final String REQUEST_DELAY = "requestDelay";
public static final String RETRY_DELAY = "retryDelay";
public static final String CONNECT_TIMEOUT = "connectTimeOut";
public static final String READ_TIMEOUT = "readTimeOut";
public static final String FROM_DATE_OVERRIDE = "fromDateOverride";
public static final String UNTIL_DATE_OVERRIDE = "untilDateOverride";
public static final String CONTENT_TOTALITEMS = "TotalItems";
public static final String CONTENT_INVALIDRECORDS = "InvalidRecords";
public static final String CONTENT_TRANSFORMEDRECORDS = "transformedItems";
// IETF Draft and used by Repositories like ZENODO , not included in APACHE HTTP java packages
// see https://ietf-wg-httpapi.github.io/ratelimit-headers/draft-ietf-httpapi-ratelimit-headers.html
public static final String HTTPHEADER_IETF_DRAFT_RATELIMIT_LIMIT = "X-RateLimit-Limit";
public static final String HTTPHEADER_IETF_DRAFT_RATELIMIT_REMAINING = "X-RateLimit-Remaining";
public static final String HTTPHEADER_IETF_DRAFT_RATELIMIT_RESET = "X-RateLimit-Reset";
}

View File

@ -14,7 +14,7 @@ public class DbClient implements Closeable {
private static final Log log = LogFactory.getLog(DbClient.class);
private Connection connection;
private final Connection connection;
public DbClient(final String address, final String login, final String password) {

View File

@ -84,7 +84,7 @@ public class GraphResultMapper implements Serializable {
.setDocumentationUrl(
value
.stream()
.map(v -> v.getValue())
.map(Field::getValue)
.collect(Collectors.toList())));
Optional
@ -100,20 +100,20 @@ public class GraphResultMapper implements Serializable {
.setContactgroup(
Optional
.ofNullable(ir.getContactgroup())
.map(value -> value.stream().map(cg -> cg.getValue()).collect(Collectors.toList()))
.map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
.orElse(null));
out
.setContactperson(
Optional
.ofNullable(ir.getContactperson())
.map(value -> value.stream().map(cp -> cp.getValue()).collect(Collectors.toList()))
.map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
.orElse(null));
out
.setTool(
Optional
.ofNullable(ir.getTool())
.map(value -> value.stream().map(t -> t.getValue()).collect(Collectors.toList()))
.map(value -> value.stream().map(Field::getValue).collect(Collectors.toList()))
.orElse(null));
out.setType(ModelConstants.ORP_DEFAULT_RESULTTYPE.getClassname());
@ -123,7 +123,8 @@ public class GraphResultMapper implements Serializable {
Optional
.ofNullable(input.getAuthor())
.ifPresent(ats -> out.setAuthor(ats.stream().map(at -> getAuthor(at)).collect(Collectors.toList())));
.ifPresent(
ats -> out.setAuthor(ats.stream().map(GraphResultMapper::getAuthor).collect(Collectors.toList())));
// I do not map Access Right UNKNOWN or OTHER
@ -210,7 +211,7 @@ public class GraphResultMapper implements Serializable {
if (oInst.isPresent()) {
out
.setInstance(
oInst.get().stream().map(i -> getInstance(i)).collect(Collectors.toList()));
oInst.get().stream().map(GraphResultMapper::getInstance).collect(Collectors.toList()));
}
@ -230,7 +231,7 @@ public class GraphResultMapper implements Serializable {
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("main title"))
.collect(Collectors.toList());
if (iTitle.size() > 0) {
if (!iTitle.isEmpty()) {
out.setMaintitle(iTitle.get(0).getValue());
}
@ -239,7 +240,7 @@ public class GraphResultMapper implements Serializable {
.stream()
.filter(t -> t.getQualifier().getClassid().equalsIgnoreCase("subtitle"))
.collect(Collectors.toList());
if (iTitle.size() > 0) {
if (!iTitle.isEmpty()) {
out.setSubtitle(iTitle.get(0).getValue());
}

View File

@ -28,7 +28,7 @@ public class HdfsSupport {
* @param configuration Configuration of hadoop env
*/
public static boolean exists(String path, Configuration configuration) {
logger.info("Removing path: {}", path);
logger.info("Checking existence for path: {}", path);
return rethrowAsRuntimeException(
() -> {
Path f = new Path(path);

View File

@ -14,38 +14,33 @@ public class MakeTarArchive implements Serializable {
private static TarArchiveOutputStream getTar(FileSystem fileSystem, String outputPath) throws IOException {
Path hdfsWritePath = new Path(outputPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, true);
}
fsDataOutputStream = fileSystem.create(hdfsWritePath);
return new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream());
return new TarArchiveOutputStream(fileSystem.create(hdfsWritePath).getWrappedStream());
}
private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name)
throws IOException {
Path hdfsWritePath = new Path(outputPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, true);
}
fsDataOutputStream = fileSystem.create(hdfsWritePath);
try (TarArchiveOutputStream ar = new TarArchiveOutputStream(
fileSystem.create(hdfsWritePath).getWrappedStream())) {
TarArchiveOutputStream ar = new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream());
RemoteIterator<LocatedFileStatus> iterator = fileSystem
.listFiles(
new Path(inputPath), true);
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
.listFiles(
new Path(inputPath), true);
while (iterator.hasNext()) {
writeCurrentFile(fileSystem, dir_name, iterator, ar, 0);
}
while (fileStatusListIterator.hasNext()) {
writeCurrentFile(fileSystem, dir_name, fileStatusListIterator, ar, 0);
}
ar.close();
}
public static void tarMaxSize(FileSystem fileSystem, String inputPath, String outputPath, String dir_name,
@ -90,6 +85,13 @@ public class MakeTarArchive implements Serializable {
String p_string = p.toString();
if (!p_string.endsWith("_SUCCESS")) {
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
if (name.startsWith("part-") & name.length() > 10) {
String tmp = name.substring(0, 10);
if (name.contains(".")) {
tmp += name.substring(name.indexOf("."));
}
name = tmp;
}
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name);
entry.setSize(fileStatus.getLen());
current_size += fileStatus.getLen();
@ -100,7 +102,7 @@ public class MakeTarArchive implements Serializable {
BufferedInputStream bis = new BufferedInputStream(is);
int count;
byte data[] = new byte[1024];
byte[] data = new byte[1024];
while ((count = bis.read(data, 0, data.length)) != -1) {
ar.write(data, 0, count);
}

View File

@ -1,39 +1,59 @@
package eu.dnetlib.dhp.oa.graph.raw.common;
package eu.dnetlib.dhp.common;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.stream.StreamSupport;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.bson.Document;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Iterables;
import com.mongodb.BasicDBObject;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;
import com.mongodb.QueryBuilder;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoDatabase;
public class MdstoreClient implements Closeable {
private static final Logger log = LoggerFactory.getLogger(MdstoreClient.class);
private final MongoClient client;
private final MongoDatabase db;
private static final String COLL_METADATA = "metadata";
private static final String COLL_METADATA_MANAGER = "metadataManager";
private static final Log log = LogFactory.getLog(MdstoreClient.class);
public MdstoreClient(final String baseUrl, final String dbName) {
this.client = new MongoClient(new MongoClientURI(baseUrl));
this.db = getDb(client, dbName);
}
public MongoCollection<Document> mdStore(final String mdId) {
BasicDBObject query = (BasicDBObject) QueryBuilder.start("mdId").is(mdId).get();
log.info("querying current mdId: {}", query.toJson());
final String currentId = Optional
.ofNullable(getColl(db, COLL_METADATA_MANAGER, true).find(query))
.map(FindIterable::first)
.map(d -> d.getString("currentId"))
.orElseThrow(() -> new IllegalArgumentException("cannot find current mdstore id for: " + mdId));
log.info("currentId: {}", currentId);
return getColl(db, currentId, true);
}
public Map<String, String> validCollections(
final String mdFormat, final String mdLayout, final String mdInterpretation) {
@ -63,7 +83,7 @@ public class MdstoreClient implements Closeable {
if (!Iterables.contains(client.listDatabaseNames(), dbName)) {
final String err = String.format("Database '%s' not found in %s", dbName, client.getAddress());
log.warn(err);
throw new RuntimeException(err);
throw new IllegalArgumentException(err);
}
return client.getDatabase(dbName);
}
@ -76,7 +96,7 @@ public class MdstoreClient implements Closeable {
String.format("Missing collection '%s' in database '%s'", collName, db.getName()));
log.warn(err);
if (abortIfMissing) {
throw new RuntimeException(err);
throw new IllegalArgumentException(err);
} else {
return null;
}

View File

@ -24,7 +24,6 @@ import com.google.common.hash.Hashing;
*/
public class PacePerson {
private static final String UTF8 = "UTF-8";
private List<String> name = Lists.newArrayList();
private List<String> surname = Lists.newArrayList();
private List<String> fullname = Lists.newArrayList();

View File

@ -0,0 +1,45 @@
package eu.dnetlib.dhp.common.aggregation;
import java.io.Closeable;
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Objects;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.message.MessageSender;
import eu.dnetlib.dhp.utils.DHPUtils;
public class AggregatorReport extends LinkedHashMap<String, String> implements Closeable {
private static final Logger log = LoggerFactory.getLogger(AggregatorReport.class);
private transient MessageSender messageSender;
public AggregatorReport() {
}
public AggregatorReport(MessageSender messageSender) {
this.messageSender = messageSender;
}
public void ongoing(Long current, Long total) {
messageSender.sendMessage(current, total);
}
@Override
public void close() throws IOException {
if (Objects.nonNull(messageSender)) {
log.info("closing report: ");
this.forEach((k, v) -> log.info("{} - {}", k, v));
Map<String, String> m = new HashMap<>();
m.put(getClass().getSimpleName().toLowerCase(), DHPUtils.MAPPER.writeValueAsString(values()));
messageSender.sendReport(m);
}
}
}

View File

@ -13,9 +13,9 @@ import okio.Source;
public class InputStreamRequestBody extends RequestBody {
private InputStream inputStream;
private MediaType mediaType;
private long lenght;
private final InputStream inputStream;
private final MediaType mediaType;
private final long lenght;
public static RequestBody create(final MediaType mediaType, final InputStream inputStream, final long len) {

View File

@ -5,6 +5,9 @@ import java.io.*;
import java.io.IOException;
import java.util.concurrent.TimeUnit;
import org.apache.http.HttpHeaders;
import org.apache.http.entity.ContentType;
import com.google.gson.Gson;
import eu.dnetlib.dhp.common.api.zenodo.ZenodoModel;
@ -43,7 +46,7 @@ public class ZenodoAPIClient implements Serializable {
this.deposition_id = deposition_id;
}
public ZenodoAPIClient(String urlString, String access_token) throws IOException {
public ZenodoAPIClient(String urlString, String access_token) {
this.urlString = urlString;
this.access_token = access_token;
@ -63,8 +66,8 @@ public class ZenodoAPIClient implements Serializable {
Request request = new Request.Builder()
.url(urlString)
.addHeader("Content-Type", "application/json") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.post(body)
.build();
@ -103,8 +106,8 @@ public class ZenodoAPIClient implements Serializable {
Request request = new Request.Builder()
.url(bucket + "/" + file_name)
.addHeader("Content-Type", "application/zip") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
.addHeader(HttpHeaders.CONTENT_TYPE, "application/zip") // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.put(InputStreamRequestBody.create(MEDIA_TYPE_ZIP, is, len))
.build();
@ -130,8 +133,8 @@ public class ZenodoAPIClient implements Serializable {
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id)
.addHeader("Content-Type", "application/json") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.put(body)
.build();
@ -197,7 +200,7 @@ public class ZenodoAPIClient implements Serializable {
Request request = new Request.Builder()
.url(urlString + "/" + deposition_id + "/actions/newversion")
.addHeader("Authorization", "Bearer " + access_token)
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.post(body)
.build();
@ -270,8 +273,8 @@ public class ZenodoAPIClient implements Serializable {
Request request = new Request.Builder()
.url(urlString)
.addHeader("Content-Type", "application/json") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.get()
.build();
@ -293,8 +296,8 @@ public class ZenodoAPIClient implements Serializable {
Request request = new Request.Builder()
.url(url)
.addHeader("Content-Type", "application/json") // add request headers
.addHeader("Authorization", "Bearer " + access_token)
.addHeader(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.toString()) // add request headers
.addHeader(HttpHeaders.AUTHORIZATION, "Bearer " + access_token)
.get()
.build();

View File

@ -32,13 +32,13 @@ public class Creator {
public static Creator newInstance(String name, String affiliation, String orcid) {
Creator c = new Creator();
if (!(name == null)) {
if (name != null) {
c.name = name;
}
if (!(affiliation == null)) {
if (affiliation != null) {
c.affiliation = affiliation;
}
if (!(orcid == null)) {
if (orcid != null) {
c.orcid = orcid;
}

View File

@ -3,17 +3,12 @@ package eu.dnetlib.dhp.common.api.zenodo;
import java.io.Serializable;
import net.minidev.json.annotate.JsonIgnore;
public class File implements Serializable {
private String checksum;
private String filename;
private long filesize;
private String id;
@JsonIgnore
// private Links links;
public String getChecksum() {
return checksum;
}
@ -46,13 +41,4 @@ public class File implements Serializable {
this.id = id;
}
// @JsonIgnore
// public Links getLinks() {
// return links;
// }
//
// @JsonIgnore
// public void setLinks(Links links) {
// this.links = links;
// }
}

View File

@ -1,16 +1,16 @@
package eu.dnetlib.dhp.collection.worker;
package eu.dnetlib.dhp.common.collection;
public class DnetCollectorException extends Exception {
public class CollectorException extends Exception {
/** */
private static final long serialVersionUID = -290723075076039757L;
public DnetCollectorException() {
public CollectorException() {
super();
}
public DnetCollectorException(
public CollectorException(
final String message,
final Throwable cause,
final boolean enableSuppression,
@ -18,15 +18,15 @@ public class DnetCollectorException extends Exception {
super(message, cause, enableSuppression, writableStackTrace);
}
public DnetCollectorException(final String message, final Throwable cause) {
public CollectorException(final String message, final Throwable cause) {
super(message, cause);
}
public DnetCollectorException(final String message) {
public CollectorException(final String message) {
super(message);
}
public DnetCollectorException(final Throwable cause) {
public CollectorException(final Throwable cause) {
super(cause);
}
}

View File

@ -0,0 +1,56 @@
package eu.dnetlib.dhp.common.collection;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.List;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.opencsv.bean.CsvToBeanBuilder;
public class GetCSV {
public static final char DEFAULT_DELIMITER = ',';
private GetCSV() {
}
public static void getCsv(FileSystem fileSystem, BufferedReader reader, String hdfsPath,
String modelClass) throws IOException, ClassNotFoundException {
getCsv(fileSystem, reader, hdfsPath, modelClass, DEFAULT_DELIMITER);
}
public static void getCsv(FileSystem fileSystem, Reader reader, String hdfsPath,
String modelClass, char delimiter) throws IOException, ClassNotFoundException {
Path hdfsWritePath = new Path(hdfsPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, false);
}
fsDataOutputStream = fileSystem.create(hdfsWritePath);
try (BufferedWriter writer = new BufferedWriter(
new OutputStreamWriter(fsDataOutputStream, StandardCharsets.UTF_8))) {
final ObjectMapper mapper = new ObjectMapper();
@SuppressWarnings("unchecked")
final List lines = new CsvToBeanBuilder(reader)
.withType(Class.forName(modelClass))
.withSeparator(delimiter)
.build()
.parse();
for (Object line : lines) {
writer.write(mapper.writeValueAsString(line));
writer.newLine();
}
}
}
}

View File

@ -0,0 +1,94 @@
package eu.dnetlib.dhp.common.collection;
/**
* Bundles the http connection parameters driving the client behaviour.
*/
public class HttpClientParams {
// Defaults
public static int _maxNumberOfRetry = 3;
public static int _requestDelay = 0; // milliseconds
public static int _retryDelay = 10; // seconds
public static int _connectTimeOut = 10; // seconds
public static int _readTimeOut = 30; // seconds
/**
* Maximum number of allowed retires before failing
*/
private int maxNumberOfRetry;
/**
* Delay between request (Milliseconds)
*/
private int requestDelay;
/**
* Time to wait after a failure before retrying (Seconds)
*/
private int retryDelay;
/**
* Connect timeout (Seconds)
*/
private int connectTimeOut;
/**
* Read timeout (Seconds)
*/
private int readTimeOut;
public HttpClientParams() {
this(_maxNumberOfRetry, _requestDelay, _retryDelay, _connectTimeOut, _readTimeOut);
}
public HttpClientParams(int maxNumberOfRetry, int requestDelay, int retryDelay, int connectTimeOut,
int readTimeOut) {
this.maxNumberOfRetry = maxNumberOfRetry;
this.requestDelay = requestDelay;
this.retryDelay = retryDelay;
this.connectTimeOut = connectTimeOut;
this.readTimeOut = readTimeOut;
}
public int getMaxNumberOfRetry() {
return maxNumberOfRetry;
}
public void setMaxNumberOfRetry(int maxNumberOfRetry) {
this.maxNumberOfRetry = maxNumberOfRetry;
}
public int getRequestDelay() {
return requestDelay;
}
public void setRequestDelay(int requestDelay) {
this.requestDelay = requestDelay;
}
public int getRetryDelay() {
return retryDelay;
}
public void setRetryDelay(int retryDelay) {
this.retryDelay = retryDelay;
}
public void setConnectTimeOut(int connectTimeOut) {
this.connectTimeOut = connectTimeOut;
}
public int getConnectTimeOut() {
return connectTimeOut;
}
public int getReadTimeOut() {
return readTimeOut;
}
public void setReadTimeOut(int readTimeOut) {
this.readTimeOut = readTimeOut;
}
}

View File

@ -0,0 +1,263 @@
package eu.dnetlib.dhp.common.collection;
import static eu.dnetlib.dhp.utils.DHPUtils.*;
import java.io.IOException;
import java.io.InputStream;
import java.net.*;
import java.util.List;
import java.util.Map;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.http.HttpHeaders;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.common.Constants;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
/**
* Migrated from https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-modular-collector-service/trunk/src/main/java/eu/dnetlib/data/collector/plugins/HttpConnector.java
*
* @author jochen, michele, andrea, alessia, claudio, andreas
*/
public class HttpConnector2 {
private static final Logger log = LoggerFactory.getLogger(HttpConnector2.class);
private static final String REPORT_PREFIX = "http:";
private HttpClientParams clientParams;
private String responseType = null;
private static final String userAgent = "Mozilla/5.0 (compatible; OAI; +http://www.openaire.eu)";
public HttpConnector2() {
this(new HttpClientParams());
}
public HttpConnector2(HttpClientParams clientParams) {
this.clientParams = clientParams;
CookieHandler.setDefault(new CookieManager(null, CookiePolicy.ACCEPT_ALL));
}
/**
* @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport)
*/
public InputStream getInputSourceAsStream(final String requestUrl) throws CollectorException {
return IOUtils.toInputStream(getInputSource(requestUrl));
}
/**
* @see HttpConnector2#getInputSource(java.lang.String, AggregatorReport)
*/
public String getInputSource(final String requestUrl) throws CollectorException {
return attemptDownloadAsString(requestUrl, 1, new AggregatorReport());
}
/**
* Given the URL returns the content via HTTP GET
*
* @param requestUrl the URL
* @param report the list of errors
* @return the content of the downloaded resource
* @throws CollectorException when retrying more than maxNumberOfRetry times
*/
public String getInputSource(final String requestUrl, AggregatorReport report)
throws CollectorException {
return attemptDownloadAsString(requestUrl, 1, report);
}
private String attemptDownloadAsString(final String requestUrl, final int retryNumber,
final AggregatorReport report) throws CollectorException {
try (InputStream s = attemptDownload(requestUrl, retryNumber, report)) {
return IOUtils.toString(s);
} catch (IOException e) {
log.error(e.getMessage(), e);
throw new CollectorException(e);
}
}
private InputStream attemptDownload(final String requestUrl, final int retryNumber,
final AggregatorReport report) throws CollectorException, IOException {
if (retryNumber > getClientParams().getMaxNumberOfRetry()) {
final String msg = String
.format(
"Max number of retries (%s/%s) exceeded, failing.",
retryNumber, getClientParams().getMaxNumberOfRetry());
log.error(msg);
throw new CollectorException(msg);
}
log.info("Request attempt {} [{}]", retryNumber, requestUrl);
InputStream input = null;
try {
if (getClientParams().getRequestDelay() > 0) {
backoffAndSleep(getClientParams().getRequestDelay());
}
final HttpURLConnection urlConn = (HttpURLConnection) new URL(requestUrl).openConnection();
urlConn.setInstanceFollowRedirects(false);
urlConn.setReadTimeout(getClientParams().getReadTimeOut() * 1000);
urlConn.setConnectTimeout(getClientParams().getConnectTimeOut() * 1000);
urlConn.addRequestProperty(HttpHeaders.USER_AGENT, userAgent);
if (log.isDebugEnabled()) {
logHeaderFields(urlConn);
}
int retryAfter = obtainRetryAfter(urlConn.getHeaderFields());
String rateLimit = urlConn.getHeaderField(Constants.HTTPHEADER_IETF_DRAFT_RATELIMIT_LIMIT);
String rateRemaining = urlConn.getHeaderField(Constants.HTTPHEADER_IETF_DRAFT_RATELIMIT_REMAINING);
if ((rateLimit != null) && (rateRemaining != null) && (Integer.parseInt(rateRemaining) < 2)) {
if (retryAfter > 0) {
backoffAndSleep(retryAfter);
} else {
backoffAndSleep(1000);
}
}
if (is2xx(urlConn.getResponseCode())) {
input = urlConn.getInputStream();
responseType = urlConn.getContentType();
return input;
}
if (is3xx(urlConn.getResponseCode())) {
// REDIRECTS
final String newUrl = obtainNewLocation(urlConn.getHeaderFields());
log.info("The requested url has been moved to {}", newUrl);
report
.put(
REPORT_PREFIX + urlConn.getResponseCode(),
String.format("Moved to: %s", newUrl));
urlConn.disconnect();
if (retryAfter > 0) {
backoffAndSleep(retryAfter);
}
return attemptDownload(newUrl, retryNumber + 1, report);
}
if (is4xx(urlConn.getResponseCode()) || is5xx(urlConn.getResponseCode())) {
switch (urlConn.getResponseCode()) {
case HttpURLConnection.HTTP_NOT_FOUND:
case HttpURLConnection.HTTP_BAD_GATEWAY:
case HttpURLConnection.HTTP_UNAVAILABLE:
case HttpURLConnection.HTTP_GATEWAY_TIMEOUT:
if (retryAfter > 0) {
log
.warn(
"{} - waiting and repeating request after suggested retry-after {} sec.",
requestUrl, retryAfter);
backoffAndSleep(retryAfter * 1000);
} else {
log
.warn(
"{} - waiting and repeating request after default delay of {} sec.",
requestUrl, getClientParams().getRetryDelay());
backoffAndSleep(retryNumber * getClientParams().getRetryDelay() * 1000);
}
report.put(REPORT_PREFIX + urlConn.getResponseCode(), requestUrl);
urlConn.disconnect();
return attemptDownload(requestUrl, retryNumber + 1, report);
default:
report
.put(
REPORT_PREFIX + urlConn.getResponseCode(),
String
.format(
"%s Error: %s", requestUrl, urlConn.getResponseMessage()));
throw new CollectorException(urlConn.getResponseCode() + " error " + report);
}
}
throw new CollectorException(
String
.format(
"Unexpected status code: %s errors: %s", urlConn.getResponseCode(),
MAPPER.writeValueAsString(report)));
} catch (MalformedURLException | UnknownHostException e) {
log.error(e.getMessage(), e);
report.put(e.getClass().getName(), e.getMessage());
throw new CollectorException(e.getMessage(), e);
} catch (SocketTimeoutException | SocketException e) {
log.error(e.getMessage(), e);
report.put(e.getClass().getName(), e.getMessage());
backoffAndSleep(getClientParams().getRetryDelay() * retryNumber * 1000);
return attemptDownload(requestUrl, retryNumber + 1, report);
}
}
private void logHeaderFields(final HttpURLConnection urlConn) throws IOException {
log.debug("StatusCode: {}", urlConn.getResponseMessage());
for (Map.Entry<String, List<String>> e : urlConn.getHeaderFields().entrySet()) {
if (e.getKey() != null) {
for (String v : e.getValue()) {
log.debug(" key: {} - value: {}", e.getKey(), v);
}
}
}
}
private void backoffAndSleep(int sleepTimeMs) throws CollectorException {
log.info("I'm going to sleep for {}ms", sleepTimeMs);
try {
Thread.sleep(sleepTimeMs);
} catch (InterruptedException e) {
log.error(e.getMessage(), e);
throw new CollectorException(e);
}
}
private int obtainRetryAfter(final Map<String, List<String>> headerMap) {
for (String key : headerMap.keySet()) {
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.RETRY_AFTER) && (!headerMap.get(key).isEmpty())
&& NumberUtils.isCreatable(headerMap.get(key).get(0))) {
return Integer.parseInt(headerMap.get(key).get(0)) + 10;
}
}
return -1;
}
private String obtainNewLocation(final Map<String, List<String>> headerMap) throws CollectorException {
for (String key : headerMap.keySet()) {
if ((key != null) && key.equalsIgnoreCase(HttpHeaders.LOCATION) && (headerMap.get(key).size() > 0)) {
return headerMap.get(key).get(0);
}
}
throw new CollectorException("The requested url has been MOVED, but 'location' param is MISSING");
}
private boolean is2xx(final int statusCode) {
return statusCode >= 200 && statusCode <= 299;
}
private boolean is4xx(final int statusCode) {
return statusCode >= 400 && statusCode <= 499;
}
private boolean is3xx(final int statusCode) {
return statusCode >= 300 && statusCode <= 399;
}
private boolean is5xx(final int statusCode) {
return statusCode >= 500 && statusCode <= 599;
}
public String getResponseType() {
return responseType;
}
public HttpClientParams getClientParams() {
return clientParams;
}
public void setClientParams(HttpClientParams clientParams) {
this.clientParams = clientParams;
}
}

View File

@ -0,0 +1,75 @@
package eu.dnetlib.dhp.common.rest;
import java.io.IOException;
import java.util.Arrays;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
public class DNetRestClient {
private static final Logger log = LoggerFactory.getLogger(DNetRestClient.class);
private static final ObjectMapper mapper = new ObjectMapper();
private DNetRestClient() {
}
public static <T> T doGET(final String url, Class<T> clazz) throws Exception {
final HttpGet httpGet = new HttpGet(url);
return doHTTPRequest(httpGet, clazz);
}
public static String doGET(final String url) throws IOException {
final HttpGet httpGet = new HttpGet(url);
return doHTTPRequest(httpGet);
}
public static <V> String doPOST(final String url, V objParam) throws IOException {
final HttpPost httpPost = new HttpPost(url);
if (objParam != null) {
final StringEntity entity = new StringEntity(mapper.writeValueAsString(objParam));
httpPost.setEntity(entity);
httpPost.setHeader("Accept", "application/json");
httpPost.setHeader("Content-type", "application/json");
}
return doHTTPRequest(httpPost);
}
public static <T, V> T doPOST(final String url, V objParam, Class<T> clazz) throws IOException {
return mapper.readValue(doPOST(url, objParam), clazz);
}
private static String doHTTPRequest(final HttpUriRequest r) throws IOException {
try (CloseableHttpClient client = HttpClients.createDefault()) {
log.info("performing HTTP request, method {} on URI {}", r.getMethod(), r.getURI().toString());
log
.info(
"request headers: {}",
Arrays
.asList(r.getAllHeaders())
.stream()
.map(h -> h.getName() + ":" + h.getValue())
.collect(Collectors.joining(",")));
return IOUtils.toString(client.execute(r).getEntity().getContent());
}
}
private static <T> T doHTTPRequest(final HttpUriRequest r, Class<T> clazz) throws Exception {
return mapper.readValue(doHTTPRequest(r), clazz);
}
}

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.raw.common;
package eu.dnetlib.dhp.common.vocabulary;
import java.io.Serializable;
import java.util.HashMap;
@ -10,8 +10,8 @@ import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
public class Vocabulary implements Serializable {
@ -46,7 +46,7 @@ public class Vocabulary implements Serializable {
}
public VocabularyTerm getTerm(final String id) {
return Optional.ofNullable(id).map(s -> s.toLowerCase()).map(s -> terms.get(s)).orElse(null);
return Optional.ofNullable(id).map(String::toLowerCase).map(terms::get).orElse(null);
}
protected void addTerm(final String id, final String name) {
@ -81,7 +81,6 @@ public class Vocabulary implements Serializable {
.ofNullable(getTermBySynonym(syn))
.map(term -> getTermAsQualifier(term.getId()))
.orElse(null);
// .orElse(OafMapperUtils.unknown(getId(), getName()));
}
}

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.raw.common;
package eu.dnetlib.dhp.common.vocabulary;
import java.io.Serializable;
import java.util.*;
@ -7,8 +7,8 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -46,7 +46,6 @@ public class VocabularyGroup implements Serializable {
}
vocs.addTerm(vocId, termId, termName);
// vocs.addSynonyms(vocId, termId, termId);
}
}
@ -58,7 +57,6 @@ public class VocabularyGroup implements Serializable {
final String syn = arr[2].trim();
vocs.addSynonyms(vocId, termId, syn);
// vocs.addSynonyms(vocId, termId, termId);
}
}
@ -67,6 +65,10 @@ public class VocabularyGroup implements Serializable {
private final Map<String, Vocabulary> vocs = new HashMap<>();
public Set<String> vocabularyNames() {
return vocs.keySet();
}
public void addVocabulary(final String id, final String name) {
vocs.put(id.toLowerCase(), new Vocabulary(id, name));
}
@ -94,7 +96,7 @@ public class VocabularyGroup implements Serializable {
.getTerms()
.values()
.stream()
.map(t -> t.getId())
.map(VocabularyTerm::getId)
.collect(Collectors.toCollection(HashSet::new));
}
@ -118,7 +120,31 @@ public class VocabularyGroup implements Serializable {
return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn);
}
/**
* getSynonymAsQualifierCaseSensitive
*
* refelects the situation to check caseSensitive vocabulary
*/
public Qualifier getSynonymAsQualifierCaseSensitive(final String vocId, final String syn) {
if (StringUtils.isBlank(vocId)) {
return OafMapperUtils.unknown("", "");
}
return vocs.get(vocId).getSynonymAsQualifier(syn);
}
/**
* termExists
*
* two methods: without and with caseSensitive check
*/
public boolean termExists(final String vocId, final String id) {
return termExists(vocId, id, Boolean.FALSE);
}
public boolean termExists(final String vocId, final String id, final Boolean caseSensitive) {
if (Boolean.TRUE.equals(caseSensitive)) {
return vocabularyExists(vocId) && vocs.get(vocId).termExists(id);
}
return vocabularyExists(vocId) && vocs.get(vocId.toLowerCase()).termExists(id);
}
@ -126,16 +152,19 @@ public class VocabularyGroup implements Serializable {
return Optional
.ofNullable(vocId)
.map(String::toLowerCase)
.map(id -> vocs.containsKey(id))
.map(vocs::containsKey)
.orElse(false);
}
private void addSynonyms(final String vocId, final String termId, final String syn) {
String id = Optional
.ofNullable(vocId)
.map(s -> s.toLowerCase())
.map(String::toLowerCase)
.orElseThrow(
() -> new IllegalArgumentException(String.format("empty vocabulary id for [term:%s, synonym:%s]")));
() -> new IllegalArgumentException(
String
.format(
"empty vocabulary id for [term:%s, synonym:%s]", termId, syn)));
Optional
.ofNullable(vocs.get(id))
.orElseThrow(() -> new IllegalArgumentException("missing vocabulary id: " + vocId))

View File

@ -1,5 +1,5 @@
package eu.dnetlib.dhp.oa.graph.raw.common;
package eu.dnetlib.dhp.common.vocabulary;
import java.io.Serializable;

View File

@ -0,0 +1,63 @@
package eu.dnetlib.dhp.message;
import java.io.Serializable;
import java.util.LinkedHashMap;
import java.util.Map;
public class Message implements Serializable {
private static final long serialVersionUID = 401753881204524893L;
public static final String CURRENT_PARAM = "current";
public static final String TOTAL_PARAM = "total";
private MessageType messageType;
private String workflowId;
private Map<String, String> body;
public Message() {
}
public Message(final MessageType messageType, final String workflowId) {
this(messageType, workflowId, new LinkedHashMap<>());
}
public Message(final MessageType messageType, final String workflowId, final Map<String, String> body) {
this.messageType = messageType;
this.workflowId = workflowId;
this.body = body;
}
public MessageType getMessageType() {
return messageType;
}
public void setMessageType(MessageType messageType) {
this.messageType = messageType;
}
public String getWorkflowId() {
return workflowId;
}
public void setWorkflowId(final String workflowId) {
this.workflowId = workflowId;
}
public Map<String, String> getBody() {
return body;
}
public void setBody(final Map<String, String> body) {
this.body = body;
}
@Override
public String toString() {
return String.format("Message [type=%s, workflowId=%s, body=%s]", messageType, workflowId, body);
}
}

View File

@ -0,0 +1,94 @@
package eu.dnetlib.dhp.message;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPut;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
public class MessageSender {
private static final Logger log = LoggerFactory.getLogger(MessageSender.class);
private static final int SOCKET_TIMEOUT_MS = 2000;
private static final int CONNECTION_REQUEST_TIMEOUT_MS = 2000;
private static final int CONNTECTION_TIMEOUT_MS = 2000;
private final ObjectMapper objectMapper = new ObjectMapper();
private final String dnetMessageEndpoint;
private final String workflowId;
private final ExecutorService executorService = Executors.newCachedThreadPool();
public MessageSender(final String dnetMessageEndpoint, final String workflowId) {
this.workflowId = workflowId;
this.dnetMessageEndpoint = dnetMessageEndpoint;
}
public void sendMessage(final Message message) {
executorService.submit(() -> _sendMessage(message));
}
public void sendMessage(final Long current, final Long total) {
sendMessage(createOngoingMessage(current, total));
}
public void sendReport(final Map<String, String> report) {
sendMessage(new Message(MessageType.REPORT, workflowId, report));
}
private Message createOngoingMessage(final Long current, final Long total) {
final Message m = new Message(MessageType.ONGOING, workflowId);
m.getBody().put(Message.CURRENT_PARAM, current.toString());
if (total != null) {
m.getBody().put(Message.TOTAL_PARAM, total.toString());
}
return m;
}
private void _sendMessage(final Message message) {
try {
final String json = objectMapper.writeValueAsString(message);
final HttpPut req = new HttpPut(dnetMessageEndpoint);
req.setEntity(new StringEntity(json, ContentType.APPLICATION_JSON));
final RequestConfig requestConfig = RequestConfig
.custom()
.setConnectTimeout(CONNTECTION_TIMEOUT_MS)
.setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MS)
.setSocketTimeout(SOCKET_TIMEOUT_MS)
.build();
try (final CloseableHttpClient client = HttpClients
.custom()
.setDefaultRequestConfig(requestConfig)
.build();
final CloseableHttpResponse response = client.execute(req)) {
log.debug("Sent Message to " + dnetMessageEndpoint);
log.debug("MESSAGE:" + message);
} catch (final Throwable e) {
log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e);
}
} catch (final JsonProcessingException e) {
log.error("Error sending message to " + dnetMessageEndpoint + ", message content: " + message, e);
}
}
}

View File

@ -0,0 +1,21 @@
package eu.dnetlib.dhp.message;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.lang3.StringUtils;
public enum MessageType implements Serializable {
ONGOING, REPORT;
public MessageType from(String value) {
return Optional
.ofNullable(value)
.map(StringUtils::upperCase)
.map(MessageType::valueOf)
.orElseThrow(() -> new IllegalArgumentException("unknown message type: " + value));
}
}

View File

@ -1,121 +0,0 @@
package eu.dnetlib.dhp.model.mdstore;
import java.io.Serializable;
import eu.dnetlib.dhp.utils.DHPUtils;
/** This class models a record inside the new Metadata store collection on HDFS * */
public class MetadataRecord implements Serializable {
/** The D-Net Identifier associated to the record */
private String id;
/** The original Identifier of the record */
private String originalId;
/** The encoding of the record, should be JSON or XML */
private String encoding;
/**
* The information about the provenance of the record see @{@link Provenance} for the model of this information
*/
private Provenance provenance;
/** The content of the metadata */
private String body;
/** the date when the record has been stored */
private long dateOfCollection;
/** the date when the record has been stored */
private long dateOfTransformation;
public MetadataRecord() {
this.dateOfCollection = System.currentTimeMillis();
}
public MetadataRecord(
String originalId,
String encoding,
Provenance provenance,
String body,
long dateOfCollection) {
this.originalId = originalId;
this.encoding = encoding;
this.provenance = provenance;
this.body = body;
this.dateOfCollection = dateOfCollection;
this.id = DHPUtils.generateIdentifier(originalId, this.provenance.getNsPrefix());
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public String getOriginalId() {
return originalId;
}
public void setOriginalId(String originalId) {
this.originalId = originalId;
}
public String getEncoding() {
return encoding;
}
public void setEncoding(String encoding) {
this.encoding = encoding;
}
public Provenance getProvenance() {
return provenance;
}
public void setProvenance(Provenance provenance) {
this.provenance = provenance;
}
public String getBody() {
return body;
}
public void setBody(String body) {
this.body = body;
}
public long getDateOfCollection() {
return dateOfCollection;
}
public void setDateOfCollection(long dateOfCollection) {
this.dateOfCollection = dateOfCollection;
}
public long getDateOfTransformation() {
return dateOfTransformation;
}
public void setDateOfTransformation(long dateOfTransformation) {
this.dateOfTransformation = dateOfTransformation;
}
@Override
public boolean equals(Object o) {
if (!(o instanceof MetadataRecord)) {
return false;
}
return ((MetadataRecord) o).getId().equalsIgnoreCase(id);
}
@Override
public int hashCode() {
return id.hashCode();
}
}

View File

@ -1,52 +0,0 @@
package eu.dnetlib.dhp.model.mdstore;
import java.io.Serializable;
/**
* @author Sandro La Bruzzo
* <p>
* Provenace class models the provenance of the record in the metadataStore It contains the identifier and the
* name of the datasource that gives the record
*/
public class Provenance implements Serializable {
private String datasourceId;
private String datasourceName;
private String nsPrefix;
public Provenance() {
}
public Provenance(String datasourceId, String datasourceName, String nsPrefix) {
this.datasourceId = datasourceId;
this.datasourceName = datasourceName;
this.nsPrefix = nsPrefix;
}
public String getDatasourceId() {
return datasourceId;
}
public void setDatasourceId(String datasourceId) {
this.datasourceId = datasourceId;
}
public String getDatasourceName() {
return datasourceName;
}
public void setDatasourceName(String datasourceName) {
this.datasourceName = datasourceName;
}
public String getNsPrefix() {
return nsPrefix;
}
public void setNsPrefix(String nsPrefix) {
this.nsPrefix = nsPrefix;
}
}

View File

@ -4,7 +4,6 @@ package eu.dnetlib.dhp.oa.merge;
import java.text.Normalizer;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
@ -19,6 +18,9 @@ public class AuthorMerger {
private static final Double THRESHOLD = 0.95;
private AuthorMerger() {
}
public static List<Author> merge(List<List<Author>> authors) {
authors.sort((o1, o2) -> -Integer.compare(countAuthorsPids(o1), countAuthorsPids(o2)));
@ -36,7 +38,8 @@ public class AuthorMerger {
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
int pa = countAuthorsPids(a);
int pb = countAuthorsPids(b);
List<Author> base, enrich;
List<Author> base;
List<Author> enrich;
int sa = authorsSize(a);
int sb = authorsSize(b);
@ -62,22 +65,24 @@ public class AuthorMerger {
// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
final Map<String, Author> basePidAuthorMap = base
.stream()
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
.flatMap(
a -> a
.getPid()
.stream()
.filter(Objects::nonNull)
.map(p -> new Tuple2<>(pidToComparableString(p), a)))
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
// <pid, Author> (list of pid that are missing in the other list)
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
.stream()
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
.filter(a -> a.getPid() != null && !a.getPid().isEmpty())
.flatMap(
a -> a
.getPid()
.stream()
.filter(Objects::nonNull)
.filter(p -> !basePidAuthorMap.containsKey(pidToComparableString(p)))
.map(p -> new Tuple2<>(p, a)))
.collect(Collectors.toList());
@ -115,11 +120,17 @@ public class AuthorMerger {
}
public static String pidToComparableString(StructuredProperty pid) {
<<<<<<< HEAD
if (pid == null)
return "";
return (pid.getQualifier() != null
? pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase() : ""
: "")
=======
final String classid = pid.getQualifier().getClassid() != null ? pid.getQualifier().getClassid().toLowerCase()
: "";
return (pid.getQualifier() != null ? classid : "")
>>>>>>> upstream/master
+ (pid.getValue() != null ? pid.getValue().toLowerCase() : "");
}
@ -152,7 +163,7 @@ public class AuthorMerger {
}
private static boolean hasPid(Author a) {
if (a == null || a.getPid() == null || a.getPid().size() == 0)
if (a == null || a.getPid() == null || a.getPid().isEmpty())
return false;
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
}
@ -161,7 +172,10 @@ public class AuthorMerger {
if (StringUtils.isNotBlank(author.getSurname())) {
return new Person(author.getSurname() + ", " + author.getName(), false);
} else {
return new Person(author.getFullname(), false);
if (StringUtils.isNotBlank(author.getFullname()))
return new Person(author.getFullname(), false);
else
return new Person("", false);
}
}

View File

@ -12,6 +12,9 @@ import com.ximpleware.VTDNav;
/** Created by sandro on 9/29/16. */
public class VtdUtilityParser {
private VtdUtilityParser() {
}
public static List<Node> getTextValuesWithAttributes(
final AutoPilot ap, final VTDNav vn, final String xpath, final List<String> attributes)
throws VtdException {

View File

@ -1,49 +0,0 @@
package eu.dnetlib.dhp.schema.oaf;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class ResultTypeComparator implements Comparator<Result> {
@Override
public int compare(Result left, Result right) {
if (left == null && right == null)
return 0;
if (left == null)
return 1;
if (right == null)
return -1;
String lClass = left.getResulttype().getClassid();
String rClass = right.getResulttype().getClassid();
if (lClass.equals(rClass))
return 0;
if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
return 1;
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
}
}

View File

@ -1,33 +1,37 @@
package eu.dnetlib.dhp.oa.graph.clean;
package eu.dnetlib.dhp.schema.oaf.utils;
import java.time.LocalDate;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.StringUtils;
import com.clearspring.analytics.util.Lists;
import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import me.xuender.unidecode.Unidecode;
public class CleaningFunctions {
public static final String DOI_PREFIX_REGEX = "^10\\.";
public class GraphCleaningFunctions extends CleaningFunctions {
public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
public static final int ORCID_LEN = 19;
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
public static final Set<String> PID_BLACKLIST = new HashSet<>();
public static final String TITLE_TEST = "test";
public static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST);
static {
PID_BLACKLIST.add("none");
PID_BLACKLIST.add("na");
}
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
public static <T extends Oaf> T fixVocabularyNames(T value) {
if (value instanceof Datasource) {
@ -59,23 +63,17 @@ public class CleaningFunctions {
}
}
if (Objects.nonNull(r.getAuthor())) {
r
.getAuthor()
.stream()
.filter(Objects::nonNull)
.forEach(a -> {
if (Objects.nonNull(a.getPid())) {
a
.getPid()
.stream()
.filter(Objects::nonNull)
.forEach(p -> fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES));
}
});
r.getAuthor().stream().filter(Objects::nonNull).forEach(a -> {
if (Objects.nonNull(a.getPid())) {
a.getPid().stream().filter(Objects::nonNull).forEach(p -> {
fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES);
});
}
});
}
if (value instanceof Publication) {
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
} else if (value instanceof Dataset) {
} else if (value instanceof OtherResearchProduct) {
@ -87,7 +85,37 @@ public class CleaningFunctions {
return value;
}
public static <T extends Oaf> T fixDefaults(T value) {
public static <T extends Oaf> boolean filter(T value) {
if (value instanceof Datasource) {
// nothing to evaluate here
} else if (value instanceof Project) {
// nothing to evaluate here
} else if (value instanceof Organization) {
// nothing to evaluate here
} else if (value instanceof Relation) {
// nothing to clean here
} else if (value instanceof Result) {
Result r = (Result) value;
if (Objects.isNull(r.getTitle()) || r.getTitle().isEmpty()) {
return false;
}
if (value instanceof Publication) {
} else if (value instanceof Dataset) {
} else if (value instanceof OtherResearchProduct) {
} else if (value instanceof Software) {
}
}
return true;
}
public static <T extends Oaf> T cleanup(T value) {
if (value instanceof Datasource) {
// nothing to clean here
} else if (value instanceof Project) {
@ -98,10 +126,44 @@ public class CleaningFunctions {
o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
}
} else if (value instanceof Relation) {
// nothing to clean here
Relation r = (Relation) value;
Optional<String> validationDate = doCleanDate(r.getValidationDate());
if (validationDate.isPresent()) {
r.setValidationDate(validationDate.get());
r.setValidated(true);
} else {
r.setValidationDate(null);
r.setValidated(false);
}
} else if (value instanceof Result) {
Result r = (Result) value;
if (Objects.nonNull(r.getDateofacceptance())) {
Optional<String> date = cleanDateField(r.getDateofacceptance());
if (date.isPresent()) {
r.getDateofacceptance().setValue(date.get());
} else {
r.setDateofacceptance(null);
}
}
if (Objects.nonNull(r.getRelevantdate())) {
r
.setRelevantdate(
r
.getRelevantdate()
.stream()
.filter(Objects::nonNull)
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(sp -> {
sp.setValue(GraphCleaningFunctions.cleanDate(sp.getValue()));
return sp;
})
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) {
r.setPublisher(null);
}
@ -110,16 +172,6 @@ public class CleaningFunctions {
.setLanguage(
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
}
if (Objects.nonNull(r.getCountry())) {
r
.setCountry(
r
.getCountry()
.stream()
.filter(Objects::nonNull)
.filter(c -> StringUtils.isNotBlank(c.getClassid()))
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getSubject())) {
r
.setSubject(
@ -130,7 +182,7 @@ public class CleaningFunctions {
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(CleaningFunctions::cleanValue)
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getTitle())) {
@ -141,7 +193,23 @@ public class CleaningFunctions {
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.map(CleaningFunctions::cleanValue)
.filter(
sp -> {
final String title = sp
.getValue()
.toLowerCase();
final String decoded = Unidecode.decode(title);
if (StringUtils.contains(decoded, TITLE_TEST)) {
return decoded
.replaceAll(TITLE_FILTER_REGEX, "")
.length() > TITLE_FILTER_RESIDUAL_LENGTH;
}
return !decoded
.replaceAll("\\W|\\d", "")
.isEmpty();
})
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getDescription())) {
@ -152,22 +220,11 @@ public class CleaningFunctions {
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
.map(CleaningFunctions::cleanValue)
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
}
if (Objects.nonNull(r.getPid())) {
r
.setPid(
r
.getPid()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(CleaningFunctions::normalizePidValue)
.collect(Collectors.toList()));
r.setPid(processPidCleaning(r.getPid()));
}
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
r
@ -175,11 +232,32 @@ public class CleaningFunctions {
qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
}
if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) {
if (Objects.nonNull(i.getPid())) {
i.setPid(processPidCleaning(i.getPid()));
}
if (Objects.nonNull(i.getAlternateIdentifier())) {
i.setAlternateIdentifier(processPidCleaning(i.getAlternateIdentifier()));
}
Optional
.ofNullable(i.getPid())
.ifPresent(pid -> {
final Set<StructuredProperty> pids = Sets.newHashSet(pid);
Optional
.ofNullable(i.getAlternateIdentifier())
.ifPresent(altId -> {
final Set<StructuredProperty> altIds = Sets.newHashSet(altId);
i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
});
});
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
i
.setAccessright(
qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
accessRight(
ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,
ModelConstants.DNET_ACCESS_MODES));
}
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
@ -187,14 +265,24 @@ public class CleaningFunctions {
if (Objects.isNull(i.getRefereed())) {
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
}
if (Objects.nonNull(i.getDateofacceptance())) {
Optional<String> date = cleanDateField(i.getDateofacceptance());
if (date.isPresent()) {
i.getDateofacceptance().setValue(date.get());
} else {
i.setDateofacceptance(null);
}
}
}
}
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
Qualifier bestaccessrights = AbstractMdRecordToOafMapper.createBestAccessRights(r.getInstance());
Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance());
if (Objects.isNull(bestaccessrights)) {
r
.setBestaccessright(
qualifier(ModelConstants.UNKNOWN, "not available", ModelConstants.DNET_ACCESS_MODES));
qualifier(
ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,
ModelConstants.DNET_ACCESS_MODES));
} else {
r.setBestaccessright(bestaccessrights);
}
@ -205,7 +293,7 @@ public class CleaningFunctions {
r
.getAuthor()
.stream()
.filter(a -> Objects.nonNull(a))
.filter(Objects::nonNull)
.filter(a -> StringUtils.isNotBlank(a.getFullname()))
.filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", "")))
.collect(Collectors.toList()));
@ -280,11 +368,10 @@ public class CleaningFunctions {
.collect(Collectors.toList()));
}
}
}
if (value instanceof Publication) {
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
} else if (value instanceof Dataset) {
} else if (value instanceof OtherResearchProduct) {
@ -296,6 +383,79 @@ public class CleaningFunctions {
return value;
}
private static Optional<String> cleanDateField(Field<String> dateofacceptance) {
return Optional
.ofNullable(dateofacceptance)
.map(Field::getValue)
.map(GraphCleaningFunctions::cleanDate)
.filter(Objects::nonNull);
}
protected static Optional<String> doCleanDate(String date) {
return Optional.ofNullable(cleanDate(date));
}
public static String cleanDate(final String inputDate) {
if (StringUtils.isBlank(inputDate)) {
return null;
}
try {
final LocalDate date = DateParserUtils
.parseDate(inputDate.trim())
.toInstant()
.atZone(ZoneId.systemDefault())
.toLocalDate();
return DateTimeFormatter.ofPattern(ModelSupport.DATE_FORMAT).format(date);
} catch (DateTimeParseException e) {
return null;
}
}
// HELPERS
private static boolean isValidAuthorName(Author a) {
return !Stream
.of(a.getFullname(), a.getName(), a.getSurname())
.filter(s -> s != null && !s.isEmpty())
.collect(Collectors.joining(""))
.toLowerCase()
.matches(INVALID_AUTHOR_REGEX);
}
private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) {
return pids
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(CleaningFunctions::normalizePidValue)
.filter(CleaningFunctions::pidFilter)
.collect(Collectors.toList());
}
private static void fixVocabName(Qualifier q, String vocabularyName) {
if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) {
q.setSchemeid(vocabularyName);
q.setSchemename(vocabularyName);
}
}
private static AccessRight accessRight(String classid, String classname, String scheme) {
return OafMapperUtils
.accessRight(
classid, classname, scheme, scheme);
}
private static Qualifier qualifier(String classid, String classname, String scheme) {
return OafMapperUtils
.qualifier(
classid, classname, scheme, scheme);
}
protected static StructuredProperty cleanValue(StructuredProperty s) {
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
return s;
@ -306,39 +466,4 @@ public class CleaningFunctions {
return s;
}
// HELPERS
private static void fixVocabName(Qualifier q, String vocabularyName) {
if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) {
q.setSchemeid(vocabularyName);
q.setSchemename(vocabularyName);
}
}
private static Qualifier qualifier(String classid, String classname, String scheme) {
return OafMapperUtils
.qualifier(
classid, classname, scheme, scheme);
}
/**
* Utility method that normalises PID values on a per-type basis.
* @param pid the PID whose value will be normalised.
* @return the PID containing the normalised value.
*/
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
String value = Optional
.ofNullable(pid.getValue())
.map(String::trim)
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
switch (pid.getQualifier().getClassid()) {
// TODO add cleaning for more PID types as needed
case "doi":
pid.setValue(value.toLowerCase().replaceAll(DOI_PREFIX_REGEX, "10."));
break;
}
return pid;
}
}

View File

@ -1,11 +1,9 @@
package eu.dnetlib.dhp.schema.oaf;
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.function.Predicate;
@ -13,42 +11,48 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.google.common.base.Joiner;
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.dhp.schema.oaf.*;
public class OafMapperUtils {
public static Oaf merge(final Oaf o1, final Oaf o2) {
if (ModelSupport.isSubClass(o1, OafEntity.class)) {
if (ModelSupport.isSubClass(o1, Result.class)) {
return mergeResults((Result) o1, (Result) o2);
} else if (ModelSupport.isSubClass(o1, Datasource.class)) {
((Datasource) o1).mergeFrom((Datasource) o2);
} else if (ModelSupport.isSubClass(o1, Organization.class)) {
((Organization) o1).mergeFrom((Organization) o2);
} else if (ModelSupport.isSubClass(o1, Project.class)) {
((Project) o1).mergeFrom((Project) o2);
} else {
throw new RuntimeException("invalid OafEntity subtype:" + o1.getClass().getCanonicalName());
}
} else if (ModelSupport.isSubClass(o1, Relation.class)) {
((Relation) o1).mergeFrom((Relation) o2);
} else {
throw new RuntimeException("invalid Oaf type:" + o1.getClass().getCanonicalName());
}
return o1;
private OafMapperUtils() {
}
public static Result mergeResults(Result r1, Result r2) {
if (new ResultTypeComparator().compare(r1, r2) < 0) {
r1.mergeFrom(r2);
return r1;
public static Oaf merge(final Oaf left, final Oaf right) {
if (ModelSupport.isSubClass(left, OafEntity.class)) {
return mergeEntities((OafEntity) left, (OafEntity) right);
} else if (ModelSupport.isSubClass(left, Relation.class)) {
((Relation) left).mergeFrom((Relation) right);
} else {
r2.mergeFrom(r1);
return r2;
throw new IllegalArgumentException("invalid Oaf type:" + left.getClass().getCanonicalName());
}
return left;
}
public static OafEntity mergeEntities(OafEntity left, OafEntity right) {
if (ModelSupport.isSubClass(left, Result.class)) {
return mergeResults((Result) left, (Result) right);
} else if (ModelSupport.isSubClass(left, Datasource.class)) {
left.mergeFrom(right);
} else if (ModelSupport.isSubClass(left, Organization.class)) {
left.mergeFrom(right);
} else if (ModelSupport.isSubClass(left, Project.class)) {
left.mergeFrom(right);
} else {
throw new IllegalArgumentException("invalid OafEntity subtype:" + left.getClass().getCanonicalName());
}
return left;
}
public static Result mergeResults(Result left, Result right) {
if (new ResultTypeComparator().compare(left, right) < 0) {
left.mergeFrom(right);
return left;
} else {
right.mergeFrom(left);
return right;
}
}
@ -61,7 +65,7 @@ public class OafMapperUtils {
public static List<KeyValue> listKeyValues(final String... s) {
if (s.length % 2 > 0) {
throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)");
throw new IllegalArgumentException("Invalid number of parameters (k,v,k,v,....)");
}
final List<KeyValue> list = new ArrayList<>();
@ -87,7 +91,7 @@ public class OafMapperUtils {
.stream(values)
.map(v -> field(v, info))
.filter(Objects::nonNull)
.filter(distinctByKey(f -> f.getValue()))
.filter(distinctByKey(Field::getValue))
.collect(Collectors.toList());
}
@ -96,7 +100,7 @@ public class OafMapperUtils {
.stream()
.map(v -> field(v, info))
.filter(Objects::nonNull)
.filter(distinctByKey(f -> f.getValue()))
.filter(distinctByKey(Field::getValue))
.collect(Collectors.toList());
}
@ -104,6 +108,29 @@ public class OafMapperUtils {
return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
}
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid,
final String schemename) {
return accessRight(classid, classname, schemeid, schemename, null);
}
public static AccessRight accessRight(
final String classid,
final String classname,
final String schemeid,
final String schemename,
final OpenAccessRoute openAccessRoute) {
final AccessRight accessRight = new AccessRight();
accessRight.setClassid(classid);
accessRight.setClassname(classname);
accessRight.setSchemeid(schemeid);
accessRight.setSchemename(schemename);
accessRight.setOpenAccessRoute(openAccessRoute);
return accessRight;
}
public static Qualifier qualifier(
final String classid,
final String classname,
@ -117,6 +144,15 @@ public class OafMapperUtils {
return q;
}
public static Qualifier qualifier(final Qualifier qualifier) {
final Qualifier q = new Qualifier();
q.setClassid(qualifier.getClassid());
q.setClassname(qualifier.getClassname());
q.setSchemeid(qualifier.getSchemeid());
q.setSchemename(qualifier.getSchemename());
return q;
}
public static StructuredProperty structuredProperty(
final String value,
final String classid,
@ -267,7 +303,7 @@ public class OafMapperUtils {
} else if (to_md5) {
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
final String rest = StringUtils.substringAfter(originalId, "::");
return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
return String.format("%s|%s::%s", prefix, nsPrefix, IdentifierFactory.md5(rest));
} else {
return String.format("%s|%s", prefix, originalId);
}
@ -300,4 +336,36 @@ public class OafMapperUtils {
final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
}
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
return getBestAccessRights(instanceList);
}
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
if (instanceList != null) {
final Optional<AccessRight> min = instanceList
.stream()
.map(Instance::getAccessright)
.min(new AccessRightComparator<>());
final Qualifier rights = min.map(OafMapperUtils::qualifier).orElseGet(Qualifier::new);
if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN);
}
if (StringUtils.isBlank(rights.getClassname())
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
rights.setClassname(NOT_AVAILABLE);
}
if (StringUtils.isBlank(rights.getSchemeid())) {
rights.setSchemeid(DNET_ACCESS_MODES);
}
if (StringUtils.isBlank(rights.getSchemename())) {
rights.setSchemename(DNET_ACCESS_MODES);
}
return rights;
}
return null;
}
}

View File

@ -1,26 +1,44 @@
package eu.dnetlib.dhp.utils;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.util.List;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.codec.binary.Base64;
import org.apache.commons.codec.binary.Base64OutputStream;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.SaveMode;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import net.minidev.json.JSONArray;
import scala.collection.JavaConverters;
import scala.collection.Seq;
public class DHPUtils {
private static final Logger log = LoggerFactory.getLogger(DHPUtils.class);
private DHPUtils() {
}
public static Seq<String> toSeq(List<String> list) {
return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq();
}
@ -31,40 +49,59 @@ public class DHPUtils {
md.update(s.getBytes(StandardCharsets.UTF_8));
return new String(Hex.encodeHex(md.digest()));
} catch (final Exception e) {
System.err.println("Error creating id");
log.error("Error creating id from {}", s);
return null;
}
}
/**
* Retrieves from the metadata store manager application the list of paths associated with mdstores characterized
* by he given format, layout, interpretation
* @param mdstoreManagerUrl the URL of the mdstore manager service
* @param format the mdstore format
* @param layout the mdstore layout
* @param interpretation the mdstore interpretation
* @param includeEmpty include Empty mdstores
* @return the set of hdfs paths
* @throws IOException in case of HTTP communication issues
*/
public static Set<String> mdstorePaths(final String mdstoreManagerUrl,
final String format,
final String layout,
final String interpretation,
boolean includeEmpty) throws IOException {
final String url = mdstoreManagerUrl + "/mdstores/";
final ObjectMapper objectMapper = new ObjectMapper();
final HttpGet req = new HttpGet(url);
try (final CloseableHttpClient client = HttpClients.createDefault()) {
try (final CloseableHttpResponse response = client.execute(req)) {
final String json = IOUtils.toString(response.getEntity().getContent());
final MDStoreWithInfo[] mdstores = objectMapper.readValue(json, MDStoreWithInfo[].class);
return Arrays
.stream(mdstores)
.filter(md -> md.getFormat().equalsIgnoreCase(format))
.filter(md -> md.getLayout().equalsIgnoreCase(layout))
.filter(md -> md.getInterpretation().equalsIgnoreCase(interpretation))
.filter(md -> StringUtils.isNotBlank(md.getHdfsPath()))
.filter(md -> StringUtils.isNotBlank(md.getCurrentVersion()))
.filter(md -> includeEmpty || md.getSize() > 0)
.map(md -> md.getHdfsPath() + "/" + md.getCurrentVersion() + "/store")
.collect(Collectors.toSet());
}
}
}
public static String generateIdentifier(final String originalId, final String nsPrefix) {
return String.format("%s::%s", nsPrefix, DHPUtils.md5(originalId));
}
public static String compressString(final String input) {
try (ByteArrayOutputStream out = new ByteArrayOutputStream();
Base64OutputStream b64os = new Base64OutputStream(out)) {
GZIPOutputStream gzip = new GZIPOutputStream(b64os);
gzip.write(input.getBytes(StandardCharsets.UTF_8));
gzip.close();
return out.toString();
} catch (Throwable e) {
return null;
}
}
public static String generateUnresolvedIdentifier(final String pid, final String pidType) {
public static String decompressString(final String input) {
byte[] byteArray = Base64.decodeBase64(input.getBytes());
int len;
try (GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream((byteArray)));
ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length)) {
byte[] buffer = new byte[1024];
while ((len = gis.read(buffer)) != -1) {
bos.write(buffer, 0, len);
}
return bos.toString();
} catch (Exception e) {
return null;
}
final String cleanedPid = CleaningFunctions.normalizePidValue(pidType, pid);
return String.format("unresolved::%s::%s", cleanedPid, pidType.toLowerCase().trim());
}
public static String getJPathString(final String jsonPath, final String json) {
@ -79,4 +116,72 @@ public class DHPUtils {
return "";
}
}
public static final ObjectMapper MAPPER = new ObjectMapper();
public static void writeHdfsFile(final Configuration conf, final String content, final String path)
throws IOException {
log.info("writing file {}, size {}", path, content.length());
try (FileSystem fs = FileSystem.get(conf);
BufferedOutputStream os = new BufferedOutputStream(fs.create(new Path(path)))) {
os.write(content.getBytes(StandardCharsets.UTF_8));
os.flush();
}
}
public static String readHdfsFile(Configuration conf, String path) throws IOException {
log.info("reading file {}", path);
try (FileSystem fs = FileSystem.get(conf)) {
final Path p = new Path(path);
if (!fs.exists(p)) {
throw new FileNotFoundException(path);
}
return IOUtils.toString(fs.open(p));
}
}
public static <T> T readHdfsFileAs(Configuration conf, String path, Class<T> clazz) throws IOException {
return MAPPER.readValue(readHdfsFile(conf, path), clazz);
}
public static <T> void saveDataset(final Dataset<T> mdstore, final String targetPath) {
log.info("saving dataset in: {}", targetPath);
mdstore
.write()
.mode(SaveMode.Overwrite)
.format("parquet")
.save(targetPath);
}
public static Configuration getHadoopConfiguration(String nameNode) {
// ====== Init HDFS File System Object
Configuration conf = new Configuration();
// Set FileSystem URI
conf.set("fs.defaultFS", nameNode);
// Because of Maven
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
System.setProperty("hadoop.home.dir", "/");
return conf;
}
public static void populateOOZIEEnv(final Map<String, String> report) throws IOException {
File file = new File(System.getProperty("oozie.action.output.properties"));
Properties props = new Properties();
report.forEach((k, v) -> props.setProperty(k, v));
try (OutputStream os = new FileOutputStream(file)) {
props.store(os, "");
}
}
public static void populateOOZIEEnv(final String paramName, String value) throws IOException {
Map<String, String> report = Maps.newHashMap();
report.put(paramName, value);
populateOOZIEEnv(report);
}
}

View File

@ -15,8 +15,11 @@ public class ISLookupClientFactory {
private static final Logger log = LoggerFactory.getLogger(ISLookupClientFactory.class);
private static int requestTimeout = 60000 * 10;
private static int connectTimeout = 60000 * 10;
private static final int requestTimeout = 60000 * 10;
private static final int connectTimeout = 60000 * 10;
private ISLookupClientFactory() {
}
public static ISLookUpService getLookUpService(final String isLookupUrl) {
return getServiceStub(ISLookUpService.class, isLookupUrl);
@ -24,7 +27,7 @@ public class ISLookupClientFactory {
@SuppressWarnings("unchecked")
private static <T> T getServiceStub(final Class<T> clazz, final String endpoint) {
log.info(String.format("creating %s stub from %s", clazz.getName(), endpoint));
log.info("creating {} stub from {}", clazz.getName(), endpoint);
final JaxWsProxyFactoryBean jaxWsProxyFactory = new JaxWsProxyFactoryBean();
jaxWsProxyFactory.setServiceClass(clazz);
jaxWsProxyFactory.setAddress(endpoint);
@ -38,12 +41,10 @@ public class ISLookupClientFactory {
log
.info(
String
.format(
"setting connectTimeout to %s, requestTimeout to %s for service %s",
connectTimeout,
requestTimeout,
clazz.getCanonicalName()));
"setting connectTimeout to {}, requestTimeout to {} for service {}",
connectTimeout,
requestTimeout,
clazz.getCanonicalName());
policy.setConnectionTimeout(connectTimeout);
policy.setReceiveTimeout(requestTimeout);

View File

@ -10,7 +10,7 @@ import net.sf.saxon.trans.XPathException;
public abstract class AbstractExtensionFunction extends ExtensionFunctionDefinition {
public static String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension";
public static final String DEFAULT_SAXON_EXT_NS_URI = "http://www.d-net.research-infrastructures.eu/saxon-extension";
public abstract String getName();

View File

@ -26,7 +26,7 @@ public class ExtractYear extends AbstractExtensionFunction {
@Override
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
if (arguments == null | arguments.length == 0) {
if (arguments == null || arguments.length == 0) {
return new StringValue("");
}
final Item item = arguments[0].head();
@ -63,8 +63,7 @@ public class ExtractYear extends AbstractExtensionFunction {
for (String format : dateFormats) {
try {
c.setTime(new SimpleDateFormat(format).parse(s));
String year = String.valueOf(c.get(Calendar.YEAR));
return year;
return String.valueOf(c.get(Calendar.YEAR));
} catch (ParseException e) {
}
}

View File

@ -30,7 +30,7 @@ public class NormalizeDate extends AbstractExtensionFunction {
@Override
public Sequence doCall(XPathContext context, Sequence[] arguments) throws XPathException {
if (arguments == null | arguments.length == 0) {
if (arguments == null || arguments.length == 0) {
return new StringValue(BLANK);
}
String s = arguments[0].head().getStringValue();

View File

@ -1,6 +1,8 @@
package eu.dnetlib.dhp.utils.saxon;
import static org.apache.commons.lang3.StringUtils.isNotBlank;
import org.apache.commons.lang3.StringUtils;
import net.sf.saxon.expr.XPathContext;
@ -26,7 +28,8 @@ public class PickFirst extends AbstractExtensionFunction {
final String s1 = getValue(arguments[0]);
final String s2 = getValue(arguments[1]);
return new StringValue(StringUtils.isNotBlank(s1) ? s1 : StringUtils.isNotBlank(s2) ? s2 : "");
final String value = isNotBlank(s1) ? s1 : isNotBlank(s2) ? s2 : "";
return new StringValue(value);
}
private String getValue(final Sequence arg) throws XPathException {

View File

@ -12,6 +12,9 @@ import net.sf.saxon.TransformerFactoryImpl;
public class SaxonTransformerFactory {
private SaxonTransformerFactory() {
}
/**
* Creates the index record transformer from the given XSLT
*

View File

@ -1,76 +0,0 @@
package eu.dnetlib.message;
import java.io.IOException;
import java.util.Map;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
public class Message {
private String workflowId;
private String jobName;
private MessageType type;
private Map<String, String> body;
public static Message fromJson(final String json) throws IOException {
final ObjectMapper jsonMapper = new ObjectMapper();
return jsonMapper.readValue(json, Message.class);
}
public Message() {
}
public Message(String workflowId, String jobName, MessageType type, Map<String, String> body) {
this.workflowId = workflowId;
this.jobName = jobName;
this.type = type;
this.body = body;
}
public String getWorkflowId() {
return workflowId;
}
public void setWorkflowId(String workflowId) {
this.workflowId = workflowId;
}
public String getJobName() {
return jobName;
}
public void setJobName(String jobName) {
this.jobName = jobName;
}
public MessageType getType() {
return type;
}
public void setType(MessageType type) {
this.type = type;
}
public Map<String, String> getBody() {
return body;
}
public void setBody(Map<String, String> body) {
this.body = body;
}
@Override
public String toString() {
final ObjectMapper jsonMapper = new ObjectMapper();
try {
return jsonMapper.writeValueAsString(this);
} catch (JsonProcessingException e) {
return null;
}
}
}

View File

@ -1,47 +0,0 @@
package eu.dnetlib.message;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.concurrent.LinkedBlockingQueue;
import com.rabbitmq.client.AMQP;
import com.rabbitmq.client.Channel;
import com.rabbitmq.client.DefaultConsumer;
import com.rabbitmq.client.Envelope;
public class MessageConsumer extends DefaultConsumer {
final LinkedBlockingQueue<Message> queueMessages;
/**
* Constructs a new instance and records its association to the passed-in channel.
*
* @param channel the channel to which this consumer is attached
* @param queueMessages
*/
public MessageConsumer(Channel channel, LinkedBlockingQueue<Message> queueMessages) {
super(channel);
this.queueMessages = queueMessages;
}
@Override
public void handleDelivery(
String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body)
throws IOException {
final String json = new String(body, StandardCharsets.UTF_8);
Message message = Message.fromJson(json);
try {
this.queueMessages.put(message);
System.out.println("Receiving Message " + message);
} catch (InterruptedException e) {
if (message.getType() == MessageType.REPORT)
throw new RuntimeException("Error on sending message");
else {
// TODO LOGGING EXCEPTION
}
} finally {
getChannel().basicAck(envelope.getDeliveryTag(), false);
}
}
}

View File

@ -1,136 +0,0 @@
package eu.dnetlib.message;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeoutException;
import com.rabbitmq.client.Channel;
import com.rabbitmq.client.Connection;
import com.rabbitmq.client.ConnectionFactory;
public class MessageManager {
private final String messageHost;
private final String username;
private final String password;
private Connection connection;
private final Map<String, Channel> channels = new HashMap<>();
private boolean durable;
private boolean autodelete;
private final LinkedBlockingQueue<Message> queueMessages;
public MessageManager(
String messageHost,
String username,
String password,
final LinkedBlockingQueue<Message> queueMessages) {
this.queueMessages = queueMessages;
this.messageHost = messageHost;
this.username = username;
this.password = password;
}
public MessageManager(
String messageHost,
String username,
String password,
boolean durable,
boolean autodelete,
final LinkedBlockingQueue<Message> queueMessages) {
this.queueMessages = queueMessages;
this.messageHost = messageHost;
this.username = username;
this.password = password;
this.durable = durable;
this.autodelete = autodelete;
}
private Connection createConnection() throws IOException, TimeoutException {
ConnectionFactory factory = new ConnectionFactory();
factory.setHost(this.messageHost);
factory.setUsername(this.username);
factory.setPassword(this.password);
return factory.newConnection();
}
private Channel createChannel(
final Connection connection,
final String queueName,
final boolean durable,
final boolean autodelete)
throws Exception {
Map<String, Object> args = new HashMap<>();
args.put("x-message-ttl", 10000);
Channel channel = connection.createChannel();
channel.queueDeclare(queueName, durable, false, this.autodelete, args);
return channel;
}
private Channel getOrCreateChannel(final String queueName, boolean durable, boolean autodelete)
throws Exception {
if (channels.containsKey(queueName)) {
return channels.get(queueName);
}
if (this.connection == null) {
this.connection = createConnection();
}
channels.put(queueName, createChannel(this.connection, queueName, durable, autodelete));
return channels.get(queueName);
}
public void close() throws IOException {
channels
.values()
.forEach(
ch -> {
try {
ch.close();
} catch (Exception e) {
// TODO LOG
}
});
this.connection.close();
}
public boolean sendMessage(final Message message, String queueName) throws Exception {
try {
Channel channel = getOrCreateChannel(queueName, this.durable, this.autodelete);
channel.basicPublish("", queueName, null, message.toString().getBytes());
return true;
} catch (Throwable e) {
throw new RuntimeException(e);
}
}
public boolean sendMessage(
final Message message, String queueName, boolean durable_var, boolean autodelete_var)
throws Exception {
try {
Channel channel = getOrCreateChannel(queueName, durable_var, autodelete_var);
channel.basicPublish("", queueName, null, message.toString().getBytes());
return true;
} catch (Throwable e) {
throw new RuntimeException(e);
}
}
public void startConsumingMessage(
final String queueName, final boolean durable, final boolean autodelete) throws Exception {
Channel channel = createChannel(createConnection(), queueName, durable, autodelete);
channel.basicConsume(queueName, false, new MessageConsumer(channel, queueMessages));
}
}

View File

@ -1,6 +0,0 @@
package eu.dnetlib.message;
public enum MessageType {
ONGOING, REPORT
}

File diff suppressed because one or more lines are too long

View File

@ -7,10 +7,10 @@ import static org.junit.jupiter.api.Assertions.assertNotNull;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
public class ArgumentApplicationParserTest {
class ArgumentApplicationParserTest {
@Test
public void testParseParameter() throws Exception {
void testParseParameter() throws Exception {
final String jsonConfiguration = IOUtils
.toString(
this.getClass().getResourceAsStream("/eu/dnetlib/application/parameters.json"));

View File

@ -21,13 +21,13 @@ public class HdfsSupportTest {
class Remove {
@Test
public void shouldThrowARuntimeExceptionOnError() {
void shouldThrowARuntimeExceptionOnError() {
// when
assertThrows(RuntimeException.class, () -> HdfsSupport.remove(null, new Configuration()));
}
@Test
public void shouldRemoveADirFromHDFS(@TempDir Path tempDir) {
void shouldRemoveADirFromHDFS(@TempDir Path tempDir) {
// when
HdfsSupport.remove(tempDir.toString(), new Configuration());
@ -36,7 +36,7 @@ public class HdfsSupportTest {
}
@Test
public void shouldRemoveAFileFromHDFS(@TempDir Path tempDir) throws IOException {
void shouldRemoveAFileFromHDFS(@TempDir Path tempDir) throws IOException {
// given
Path file = Files.createTempFile(tempDir, "p", "s");
@ -52,13 +52,13 @@ public class HdfsSupportTest {
class ListFiles {
@Test
public void shouldThrowARuntimeExceptionOnError() {
void shouldThrowARuntimeExceptionOnError() {
// when
assertThrows(RuntimeException.class, () -> HdfsSupport.listFiles(null, new Configuration()));
}
@Test
public void shouldListFilesLocatedInPath(@TempDir Path tempDir) throws IOException {
void shouldListFilesLocatedInPath(@TempDir Path tempDir) throws IOException {
Path subDir1 = Files.createTempDirectory(tempDir, "list_me");
Path subDir2 = Files.createTempDirectory(tempDir, "list_me");

View File

@ -5,10 +5,10 @@ import static org.junit.jupiter.api.Assertions.*;
import org.junit.jupiter.api.Test;
public class PacePersonTest {
class PacePersonTest {
@Test
public void pacePersonTest1() {
void pacePersonTest1() {
PacePerson p = new PacePerson("Artini, Michele", false);
assertEquals("Artini", p.getSurnameString());
@ -17,7 +17,7 @@ public class PacePersonTest {
}
@Test
public void pacePersonTest2() {
void pacePersonTest2() {
PacePerson p = new PacePerson("Michele G. Artini", false);
assertEquals("Artini, Michele G.", p.getNormalisedFullname());
assertEquals("Michele G", p.getNameString());

View File

@ -18,7 +18,8 @@ public class SparkSessionSupportTest {
class RunWithSparkSession {
@Test
public void shouldExecuteFunctionAndNotStopSparkSessionWhenSparkSessionIsNotManaged()
@SuppressWarnings("unchecked")
void shouldExecuteFunctionAndNotStopSparkSessionWhenSparkSessionIsNotManaged()
throws Exception {
// given
SparkSession spark = mock(SparkSession.class);
@ -37,7 +38,8 @@ public class SparkSessionSupportTest {
}
@Test
public void shouldExecuteFunctionAndStopSparkSessionWhenSparkSessionIsManaged()
@SuppressWarnings("unchecked")
void shouldExecuteFunctionAndStopSparkSessionWhenSparkSessionIsManaged()
throws Exception {
// given
SparkSession spark = mock(SparkSession.class);

View File

@ -12,7 +12,7 @@ import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
@Disabled
public class ZenodoAPIClientTest {
class ZenodoAPIClientTest {
private final String URL_STRING = "https://sandbox.zenodo.org/api/deposit/depositions";
private final String ACCESS_TOKEN = "";
@ -22,7 +22,7 @@ public class ZenodoAPIClientTest {
private final String depositionId = "674915";
@Test
public void testUploadOldDeposition() throws IOException, MissingConceptDoiException {
void testUploadOldDeposition() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
Assertions.assertEquals(200, client.uploadOpenDeposition(depositionId));
@ -44,7 +44,7 @@ public class ZenodoAPIClientTest {
}
@Test
public void testNewDeposition() throws IOException {
void testNewDeposition() throws IOException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
@ -67,7 +67,7 @@ public class ZenodoAPIClientTest {
}
@Test
public void testNewVersionNewName() throws IOException, MissingConceptDoiException {
void testNewVersionNewName() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);
@ -87,7 +87,7 @@ public class ZenodoAPIClientTest {
}
@Test
public void testNewVersionOldName() throws IOException, MissingConceptDoiException {
void testNewVersionOldName() throws IOException, MissingConceptDoiException {
ZenodoAPIClient client = new ZenodoAPIClient(URL_STRING,
ACCESS_TOKEN);

View File

@ -1,16 +0,0 @@
package eu.dnetlib.dhp.model.mdstore;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.Test;
public class MetadataRecordTest {
@Test
public void getTimestamp() {
MetadataRecord r = new MetadataRecord();
assertTrue(r.getDateOfCollection() > 0);
}
}

View File

@ -21,7 +21,7 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;
public class AuthorMergerTest {
class AuthorMergerTest {
private String publicationsBasePath;
@ -43,7 +43,7 @@ public class AuthorMergerTest {
}
@Test
public void mergeTest() { // used in the dedup: threshold set to 0.95
void mergeTest() { // used in the dedup: threshold set to 0.95
for (List<Author> authors1 : authors) {
System.out.println("List " + (authors.indexOf(authors1) + 1));

View File

@ -0,0 +1,197 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
import me.xuender.unidecode.Unidecode;
class OafMapperUtilsTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
@Test
public void testUnidecode() {
assertEquals("Liu Ben Mu hiruzuSen tawa", Unidecode.decode("六本木ヒルズ森タワ"));
assertEquals("Nan Wu A Mi Tuo Fo", Unidecode.decode("南无阿弥陀佛"));
assertEquals("Yi Tiao Hui Zou Lu De Yu", Unidecode.decode("一条会走路的鱼"));
assertEquals("amidaniyorai", Unidecode.decode("あみだにょらい"));
assertEquals("T`owrk`iayi", Unidecode.decode("Թուրքիայի"));
assertEquals("Obzor tematiki", Unidecode.decode("Обзор тематики"));
assertEquals("GERMANSKIE IaZYKI", Unidecode.decode("ГЕРМАНСКИЕ ЯЗЫКИ"));
assertEquals("Diereunese tes ikanopoieses", Unidecode.decode("Διερεύνηση της ικανοποίησης"));
assertEquals("lqDy l'wly@", Unidecode.decode("القضايا الأولية"));
assertEquals("abc def ghi", Unidecode.decode("abc def ghi"));
}
@Test
void testDateValidation() {
assertTrue(GraphCleaningFunctions.doCleanDate("2016-05-07T12:41:19.202Z ").isPresent());
assertTrue(GraphCleaningFunctions.doCleanDate("2020-09-10 11:08:52 ").isPresent());
assertTrue(GraphCleaningFunctions.doCleanDate(" 2016-04-05").isPresent());
assertEquals("2016-04-05", GraphCleaningFunctions.doCleanDate("2016 Apr 05").get());
assertEquals("2009-05-08", GraphCleaningFunctions.doCleanDate("May 8, 2009 5:57:51 PM").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct 7, '70").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("oct. 7, 70").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 2 15:04:05 MST 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon Jan 02 15:04:05 -0700 2006").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Monday, 02-Jan-06 15:04:05 MST").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 MST").get());
assertEquals("2017-07-11", GraphCleaningFunctions.doCleanDate("Tue, 11 Jul 2017 16:28:13 +0200 (CEST)").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("Mon, 02 Jan 2006 15:04:05 -0700").get());
assertEquals("2018-01-04", GraphCleaningFunctions.doCleanDate("Thu, 4 Jan 2018 17:53:36 +0000").get());
assertEquals("2015-08-10", GraphCleaningFunctions.doCleanDate("Mon Aug 10 15:44:11 UTC+0100 2015").get());
assertEquals(
"2015-07-03",
GraphCleaningFunctions.doCleanDate("Fri Jul 03 2015 18:04:07 GMT+0100 (GMT Daylight Time)").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 10:09am").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012 at 10:09am PST-08").get());
assertEquals("2012-09-17", GraphCleaningFunctions.doCleanDate("September 17, 2012, 10:10:09").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7, 1970").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("October 7th, 1970").get());
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006, 19:17").get());
assertEquals("2006-02-12", GraphCleaningFunctions.doCleanDate("12 Feb 2006 19:17").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 70").get());
assertEquals("1970-10-07", GraphCleaningFunctions.doCleanDate("7 oct 1970").get());
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("03 February 2013").get());
assertEquals("2013-07-01", GraphCleaningFunctions.doCleanDate("1 July 2013").get());
assertEquals("2013-02-03", GraphCleaningFunctions.doCleanDate("2013-Feb-03").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3/31/2014").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03/31/2014").get());
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08/21/71").get());
assertEquals("1971-01-08", GraphCleaningFunctions.doCleanDate("8/1/71").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/2014 22:05").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("04/08/2014 22:05").get());
assertEquals("2014-08-04", GraphCleaningFunctions.doCleanDate("4/8/14 22:05").get());
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("04/2/2014 03:00:51").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00:00 AM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00:01 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 01:00 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 1:00 PM").get());
assertEquals("1965-08-08", GraphCleaningFunctions.doCleanDate("8/8/1965 12:00 AM").get());
assertEquals("2014-02-04", GraphCleaningFunctions.doCleanDate("4/02/2014 03:00:51").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("03/19/2012 10:11:59.3186369").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/3/31").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("2014/03/31").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/4/8 22:05").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014/04/08 22:05").get());
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/04/2 03:00:51").get());
assertEquals("2014-04-02", GraphCleaningFunctions.doCleanDate("2014/4/02 03:00:51").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("2012/03/19 10:11:59.3186369").get());
assertEquals("2014-04-08", GraphCleaningFunctions.doCleanDate("2014年04月08日").get());
assertEquals("2006-01-02", GraphCleaningFunctions.doCleanDate("2006-01-02T15:04:05+0000").get());
assertEquals("2009-08-13", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09-07:00").get());
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09").get());
assertEquals("2009-08-12", GraphCleaningFunctions.doCleanDate("2009-08-12T22:15:09Z").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.3186369").get());
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 17:24:37.123").get());
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43").get());
assertEquals("2013-04-01", GraphCleaningFunctions.doCleanDate("2013-04-01 22:43:22").get());
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 UTC").get());
assertEquals("2014-12-16", GraphCleaningFunctions.doCleanDate("2014-12-16 06:20:00 GMT").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 05:24:37 PM").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:43 +0800 +08").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26 13:13:44 +09:00").get());
assertEquals("2012-08-03", GraphCleaningFunctions.doCleanDate("2012-08-03 18:31:59.257000000 +0000 UTC").get());
assertEquals("2015-09-30", GraphCleaningFunctions.doCleanDate("2015-09-30 18:48:56.35272715 +0000 UTC").get());
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 GMT").get());
assertEquals("2015-02-18", GraphCleaningFunctions.doCleanDate("2015-02-18 00:12:00 +0000 UTC").get());
assertEquals(
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00 +0300 MSK m=+0.000000001").get());
assertEquals(
"2015-02-08", GraphCleaningFunctions.doCleanDate("2015-02-08 03:02:00.001 +0300 MSK m=+0.000000001").get());
assertEquals("2017-07-19", GraphCleaningFunctions.doCleanDate("2017-07-19 03:21:51+00:00").get());
assertEquals("2014-04-26", GraphCleaningFunctions.doCleanDate("2014-04-26").get());
assertEquals("2014-04-01", GraphCleaningFunctions.doCleanDate("2014-04").get());
assertEquals("2014-01-01", GraphCleaningFunctions.doCleanDate("2014").get());
assertEquals("2014-05-11", GraphCleaningFunctions.doCleanDate("2014-05-11 08:20:13,787").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("3.31.2014").get());
assertEquals("2014-03-31", GraphCleaningFunctions.doCleanDate("03.31.2014").get());
assertEquals("1971-08-21", GraphCleaningFunctions.doCleanDate("08.21.71").get());
assertEquals("2014-03-01", GraphCleaningFunctions.doCleanDate("2014.03").get());
assertEquals("2014-03-30", GraphCleaningFunctions.doCleanDate("2014.03.30").get());
assertEquals("2014-06-01", GraphCleaningFunctions.doCleanDate("20140601").get());
assertEquals("2014-07-22", GraphCleaningFunctions.doCleanDate("20140722105203").get());
assertEquals("2012-03-19", GraphCleaningFunctions.doCleanDate("1332151919").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367189").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222").get());
assertEquals("2013-11-12", GraphCleaningFunctions.doCleanDate("1384216367111222333").get());
}
@Test
void testDate() {
final String date = GraphCleaningFunctions.cleanDate("23-FEB-1998");
assertNotNull(date);
System.out.println(date);
}
@Test
void testMergePubs() throws IOException {
Publication p1 = read("publication_1.json", Publication.class);
Publication p2 = read("publication_2.json", Publication.class);
Dataset d1 = read("dataset_1.json", Dataset.class);
Dataset d2 = read("dataset_2.json", Dataset.class);
assertEquals(1, p1.getCollectedfrom().size());
assertEquals(ModelConstants.CROSSREF_ID, p1.getCollectedfrom().get(0).getKey());
assertEquals(1, d2.getCollectedfrom().size());
assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals(
ModelConstants.PUBLICATION_RESULTTYPE_CLASSID,
OafMapperUtils
.mergeResults(p1, d2)
.getResulttype()
.getClassid());
assertEquals(1, p2.getCollectedfrom().size());
assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals(1, d1.getCollectedfrom().size());
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
assertEquals(
ModelConstants.DATASET_RESULTTYPE_CLASSID,
OafMapperUtils
.mergeResults(p2, d1)
.getResulttype()
.getClassid());
}
protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
return collectedfrom.stream().map(KeyValue::getKey).collect(Collectors.toCollection(HashSet::new));
}
protected <T extends Result> T read(String filename, Class<T> clazz) throws IOException {
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
return OBJECT_MAPPER.readValue(json, clazz);
}
}

View File

@ -1,51 +0,0 @@
package eu.dnetlib.message;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.junit.jupiter.api.Test;
public class MessageTest {
@Test
public void fromJsonTest() throws IOException {
Message m = new Message();
m.setWorkflowId("wId");
m.setType(MessageType.ONGOING);
m.setJobName("Collection");
Map<String, String> body = new HashMap<>();
body.put("parsedItem", "300");
body.put("ExecutionTime", "30s");
m.setBody(body);
System.out.println("m = " + m);
Message m1 = Message.fromJson(m.toString());
assertEquals(m1.getWorkflowId(), m.getWorkflowId());
assertEquals(m1.getType(), m.getType());
assertEquals(m1.getJobName(), m.getJobName());
assertNotNull(m1.getBody());
m1.getBody().keySet().forEach(it -> assertEquals(m1.getBody().get(it), m.getBody().get(it)));
assertEquals(m1.getJobName(), m.getJobName());
}
@Test
public void toStringTest() {
final String expectedJson = "{\"workflowId\":\"wId\",\"jobName\":\"Collection\",\"type\":\"ONGOING\",\"body\":{\"ExecutionTime\":\"30s\",\"parsedItem\":\"300\"}}";
Message m = new Message();
m.setWorkflowId("wId");
m.setType(MessageType.ONGOING);
m.setJobName("Collection");
Map<String, String> body = new HashMap<>();
body.put("parsedItem", "300");
body.put("ExecutionTime", "30s");
m.setBody(body);
assertEquals(expectedJson, m.toString());
}
}

View File

@ -3,10 +3,10 @@ package eu.dnetlib.scholexplorer.relation;
import org.junit.jupiter.api.Test;
public class RelationMapperTest {
class RelationMapperTest {
@Test
public void testLoadRels() throws Exception {
void testLoadRels() throws Exception {
RelationMapper relationMapper = RelationMapper.load();
relationMapper.keySet().forEach(System.out::println);

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository B"} ]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository A"} ]}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -51,16 +51,6 @@
<artifactId>hadoop-distcp</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaire-data-protos</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-api</artifactId>

View File

@ -3,40 +3,37 @@ package eu.dnetlib.dhp.actionmanager;
import java.io.Serializable;
import java.io.StringReader;
import java.util.*;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.tuple.Triple;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.io.SAXReader;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.actionmanager.rmi.ActionManagerException;
import eu.dnetlib.actionmanager.set.ActionManagerSet;
import eu.dnetlib.actionmanager.set.ActionManagerSet.ImpactTypes;
import eu.dnetlib.dhp.actionmanager.partition.PartitionActionSetsByPayloadTypeJob;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import scala.Tuple2;
public class ISClient implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PartitionActionSetsByPayloadTypeJob.class);
private static final Logger log = LoggerFactory.getLogger(ISClient.class);
private static final String INPUT_ACTION_SET_ID_SEPARATOR = ",";
private final ISLookUpService isLookup;
private final transient ISLookUpService isLookup;
public ISClient(String isLookupUrl) {
isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
@ -63,7 +60,7 @@ public class ISClient implements Serializable {
.map(
sets -> sets
.stream()
.map(set -> parseSetInfo(set))
.map(ISClient::parseSetInfo)
.filter(t -> ids.contains(t.getLeft()))
.map(t -> buildDirectory(basePath, t))
.collect(Collectors.toList()))
@ -73,15 +70,17 @@ public class ISClient implements Serializable {
}
}
private Triple<String, String, String> parseSetInfo(String set) {
private static Triple<String, String, String> parseSetInfo(String set) {
try {
Document doc = new SAXReader().read(new StringReader(set));
final SAXReader reader = new SAXReader();
reader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
Document doc = reader.read(new StringReader(set));
return Triple
.of(
doc.valueOf("//SET/@id"),
doc.valueOf("//SET/@directory"),
doc.valueOf("//SET/@latest"));
} catch (DocumentException e) {
} catch (DocumentException | SAXException e) {
throw new IllegalStateException(e);
}
}
@ -99,7 +98,7 @@ public class ISClient implements Serializable {
final String q = "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='"
+ propertyName
+ "']/@value/string()";
log.debug("quering for service property: " + q);
log.debug("quering for service property: {}", q);
try {
final List<String> value = isLookup.quickSearchProfile(q);
return Iterables.getOnlyElement(value);

View File

@ -1,69 +0,0 @@
package eu.dnetlib.dhp.actionmanager.migration;
import java.util.Comparator;
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
public class LicenseComparator implements Comparator<Qualifier> {
@Override
public int compare(Qualifier left, Qualifier right) {
if (left == null && right == null)
return 0;
if (left == null)
return 1;
if (right == null)
return -1;
String lClass = left.getClassid();
String rClass = right.getClassid();
if (lClass.equals(rClass))
return 0;
if (lClass.equals("OPEN SOURCE"))
return -1;
if (rClass.equals("OPEN SOURCE"))
return 1;
if (lClass.equals("OPEN"))
return -1;
if (rClass.equals("OPEN"))
return 1;
if (lClass.equals("6MONTHS"))
return -1;
if (rClass.equals("6MONTHS"))
return 1;
if (lClass.equals("12MONTHS"))
return -1;
if (rClass.equals("12MONTHS"))
return 1;
if (lClass.equals("EMBARGO"))
return -1;
if (rClass.equals("EMBARGO"))
return 1;
if (lClass.equals("RESTRICTED"))
return -1;
if (rClass.equals("RESTRICTED"))
return 1;
if (lClass.equals("CLOSED"))
return -1;
if (rClass.equals("CLOSED"))
return 1;
if (lClass.equals("UNKNOWN"))
return -1;
if (rClass.equals("UNKNOWN"))
return 1;
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
}
}

View File

@ -1,196 +0,0 @@
package eu.dnetlib.dhp.actionmanager.migration;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class MigrateActionSet {
private static final Logger log = LoggerFactory.getLogger(MigrateActionSet.class);
private static final String SEPARATOR = "/";
private static final String TARGET_PATHS = "target_paths";
private static final String RAWSET_PREFIX = "rawset_";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
MigrateActionSet.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/migration/migrate_actionsets_parameters.json")));
parser.parseArgument(args);
new MigrateActionSet().run(parser);
}
private void run(ArgumentApplicationParser parser) throws Exception {
final String isLookupUrl = parser.get("isLookupUrl");
final String sourceNN = parser.get("sourceNameNode");
final String targetNN = parser.get("targetNameNode");
final String workDir = parser.get("workingDirectory");
final Integer distcp_num_maps = Integer.parseInt(parser.get("distcp_num_maps"));
final String distcp_memory_mb = parser.get("distcp_memory_mb");
final String distcp_task_timeout = parser.get("distcp_task_timeout");
final String transform_only_s = parser.get("transform_only");
log.info("transform only param: {}", transform_only_s);
final Boolean transformOnly = Boolean.valueOf(parser.get("transform_only"));
log.info("transform only: {}", transformOnly);
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
Configuration conf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
FileSystem targetFS = FileSystem.get(conf);
Configuration sourceConf = getConfiguration(distcp_task_timeout, distcp_memory_mb, distcp_num_maps);
sourceConf.set(FileSystem.FS_DEFAULT_NAME_KEY, sourceNN);
FileSystem sourceFS = FileSystem.get(sourceConf);
Properties props = new Properties();
List<Path> targetPaths = new ArrayList<>();
final List<Path> sourcePaths = getSourcePaths(sourceNN, isLookUp);
log
.info(
"paths to process:\n{}", sourcePaths
.stream()
.map(p -> p.toString())
.collect(Collectors.joining("\n")));
for (Path source : sourcePaths) {
if (!sourceFS.exists(source)) {
log.warn("skipping unexisting path: {}", source);
} else {
LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath()));
final String rawSet = pathQ.pollLast();
log.info("got RAWSET: {}", rawSet);
if (StringUtils.isNotBlank(rawSet) && rawSet.startsWith(RAWSET_PREFIX)) {
final String actionSetDirectory = pathQ.pollLast();
final Path targetPath = new Path(
targetNN + workDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawSet);
log.info("using TARGET PATH: {}", targetPath);
if (!transformOnly) {
if (targetFS.exists(targetPath)) {
targetFS.delete(targetPath, true);
}
runDistcp(
distcp_num_maps, distcp_memory_mb, distcp_task_timeout, conf, source, targetPath);
}
targetPaths.add(targetPath);
}
}
}
final String targetPathsCsv = targetPaths.stream().map(p -> p.toString()).collect(Collectors.joining(","));
props.setProperty(TARGET_PATHS, targetPathsCsv);
File file = new File(System.getProperty("oozie.action.output.properties"));
try (OutputStream os = new FileOutputStream(file)) {
props.store(os, "");
}
System.out.println(file.getAbsolutePath());
}
private void runDistcp(
Integer distcp_num_maps,
String distcp_memory_mb,
String distcp_task_timeout,
Configuration conf,
Path source,
Path targetPath)
throws Exception {
final DistCpOptions op = new DistCpOptions(source, targetPath);
op.setMaxMaps(distcp_num_maps);
op.preserve(DistCpOptions.FileAttribute.BLOCKSIZE);
op.preserve(DistCpOptions.FileAttribute.REPLICATION);
op.preserve(DistCpOptions.FileAttribute.CHECKSUMTYPE);
int res = ToolRunner
.run(
new DistCp(conf, op),
new String[] {
"-Dmapred.task.timeout=" + distcp_task_timeout,
"-Dmapreduce.map.memory.mb=" + distcp_memory_mb,
"-pb",
"-m " + distcp_num_maps,
source.toString(),
targetPath.toString()
});
if (res != 0) {
throw new RuntimeException(String.format("distcp exited with code %s", res));
}
}
private Configuration getConfiguration(
String distcp_task_timeout, String distcp_memory_mb, Integer distcp_num_maps) {
final Configuration conf = new Configuration();
conf.set("dfs.webhdfs.socket.connect-timeout", distcp_task_timeout);
conf.set("dfs.webhdfs.socket.read-timeout", distcp_task_timeout);
conf.set("dfs.http.client.retry.policy.enabled", "true");
conf.set("mapred.task.timeout", distcp_task_timeout);
conf.set("mapreduce.map.memory.mb", distcp_memory_mb);
conf.set("mapred.map.tasks", String.valueOf(distcp_num_maps));
return conf;
}
private List<Path> getSourcePaths(String sourceNN, ISLookUpService isLookUp)
throws ISLookUpException {
String XQUERY = "distinct-values(\n"
+ "let $basePath := collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()\n"
+ "for $x in collection('/db/DRIVER/ActionManagerSetDSResources/ActionManagerSetDSResourceType') \n"
+ "let $setDir := $x//SET/@directory/string()\n"
+ "let $rawSet := $x//RAW_SETS/LATEST/@id/string()\n"
+ "return concat($basePath, '/', $setDir, '/', $rawSet))";
log.info(String.format("running xquery:\n%s", XQUERY));
return isLookUp
.quickSearchProfile(XQUERY)
.stream()
.map(p -> sourceNN + p)
.map(Path::new)
.collect(Collectors.toList());
}
}

View File

@ -1,710 +0,0 @@
package eu.dnetlib.dhp.actionmanager.migration;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Lists;
import com.googlecode.protobuf.format.JsonFormat;
import eu.dnetlib.data.proto.*;
import eu.dnetlib.dhp.schema.oaf.*;
public class ProtoConverter implements Serializable {
public static Oaf convert(OafProtos.Oaf oaf) {
try {
switch (oaf.getKind()) {
case entity:
return convertEntity(oaf);
case relation:
return convertRelation(oaf);
default:
throw new IllegalArgumentException("invalid kind " + oaf.getKind());
}
} catch (Throwable e) {
throw new RuntimeException("error on getting " + JsonFormat.printToString(oaf), e);
}
}
private static Relation convertRelation(OafProtos.Oaf oaf) {
final OafProtos.OafRel r = oaf.getRel();
final Relation rel = new Relation();
rel.setDataInfo(mapDataInfo(oaf.getDataInfo()));
rel.setLastupdatetimestamp(oaf.getLastupdatetimestamp());
rel.setSource(r.getSource());
rel.setTarget(r.getTarget());
rel.setRelType(r.getRelType().toString());
rel.setSubRelType(r.getSubRelType().toString());
rel.setRelClass(r.getRelClass());
rel
.setCollectedfrom(
r.getCollectedfromCount() > 0
? r.getCollectedfromList().stream().map(kv -> mapKV(kv)).collect(Collectors.toList())
: null);
return rel;
}
private static OafEntity convertEntity(OafProtos.Oaf oaf) {
switch (oaf.getEntity().getType()) {
case result:
final Result r = convertResult(oaf);
r.setInstance(convertInstances(oaf));
r.setExternalReference(convertExternalRefs(oaf));
return r;
case project:
return convertProject(oaf);
case datasource:
return convertDataSource(oaf);
case organization:
return convertOrganization(oaf);
default:
throw new RuntimeException("received unknown type");
}
}
private static List<Instance> convertInstances(OafProtos.Oaf oaf) {
final ResultProtos.Result r = oaf.getEntity().getResult();
if (r.getInstanceCount() > 0) {
return r.getInstanceList().stream().map(i -> convertInstance(i)).collect(Collectors.toList());
}
return Lists.newArrayList();
}
private static Instance convertInstance(ResultProtos.Result.Instance ri) {
final Instance i = new Instance();
i.setAccessright(mapQualifier(ri.getAccessright()));
i.setCollectedfrom(mapKV(ri.getCollectedfrom()));
i.setDateofacceptance(mapStringField(ri.getDateofacceptance()));
i.setDistributionlocation(ri.getDistributionlocation());
i.setHostedby(mapKV(ri.getHostedby()));
i.setInstancetype(mapQualifier(ri.getInstancetype()));
i.setLicense(mapStringField(ri.getLicense()));
i
.setUrl(
ri.getUrlList() != null ? ri
.getUrlList()
.stream()
.distinct()
.collect(Collectors.toCollection(ArrayList::new)) : null);
i.setRefereed(mapRefereed(ri.getRefereed()));
i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount()));
i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency()));
return i;
}
private static Qualifier mapRefereed(FieldTypeProtos.StringField refereed) {
Qualifier q = new Qualifier();
q.setClassid(refereed.getValue());
q.setSchemename(refereed.getValue());
q.setSchemeid("dnet:review_levels");
q.setSchemename("dnet:review_levels");
return q;
}
private static List<ExternalReference> convertExternalRefs(OafProtos.Oaf oaf) {
ResultProtos.Result r = oaf.getEntity().getResult();
if (r.getExternalReferenceCount() > 0) {
return r
.getExternalReferenceList()
.stream()
.map(e -> convertExtRef(e))
.collect(Collectors.toList());
}
return Lists.newArrayList();
}
private static ExternalReference convertExtRef(ResultProtos.Result.ExternalReference e) {
ExternalReference ex = new ExternalReference();
ex.setUrl(e.getUrl());
ex.setSitename(e.getSitename());
ex.setRefidentifier(e.getRefidentifier());
ex.setQuery(e.getQuery());
ex.setQualifier(mapQualifier(e.getQualifier()));
ex.setLabel(e.getLabel());
ex.setDescription(e.getDescription());
ex.setDataInfo(ex.getDataInfo());
return ex;
}
private static Organization convertOrganization(OafProtos.Oaf oaf) {
final OrganizationProtos.Organization.Metadata m = oaf.getEntity().getOrganization().getMetadata();
final Organization org = setOaf(new Organization(), oaf);
setEntity(org, oaf);
org.setLegalshortname(mapStringField(m.getLegalshortname()));
org.setLegalname(mapStringField(m.getLegalname()));
org
.setAlternativeNames(
m
.getAlternativeNamesList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
org.setWebsiteurl(mapStringField(m.getWebsiteurl()));
org.setLogourl(mapStringField(m.getLogourl()));
org.setEclegalbody(mapStringField(m.getEclegalbody()));
org.setEclegalperson(mapStringField(m.getEclegalperson()));
org.setEcnonprofit(mapStringField(m.getEcnonprofit()));
org.setEcresearchorganization(mapStringField(m.getEcresearchorganization()));
org.setEchighereducation(mapStringField(m.getEchighereducation()));
org
.setEcinternationalorganizationeurinterests(
mapStringField(m.getEcinternationalorganizationeurinterests()));
org.setEcinternationalorganization(mapStringField(m.getEcinternationalorganization()));
org.setEcenterprise(mapStringField(m.getEcenterprise()));
org.setEcsmevalidated(mapStringField(m.getEcsmevalidated()));
org.setEcnutscode(mapStringField(m.getEcnutscode()));
org.setCountry(mapQualifier(m.getCountry()));
return org;
}
private static Datasource convertDataSource(OafProtos.Oaf oaf) {
final DatasourceProtos.Datasource.Metadata m = oaf.getEntity().getDatasource().getMetadata();
final Datasource datasource = setOaf(new Datasource(), oaf);
setEntity(datasource, oaf);
datasource
.setAccessinfopackage(
m
.getAccessinfopackageList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
datasource.setCertificates(mapStringField(m.getCertificates()));
datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
datasource.setContactemail(mapStringField(m.getContactemail()));
datasource.setDatabaseaccessrestriction(mapStringField(m.getDatabaseaccessrestriction()));
datasource.setDatabaseaccesstype(mapStringField(m.getDatabaseaccesstype()));
datasource.setDataprovider(mapBoolField(m.getDataprovider()));
datasource.setDatasourcetype(mapQualifier(m.getDatasourcetype()));
datasource.setDatauploadrestriction(mapStringField(m.getDatauploadrestriction()));
datasource.setCitationguidelineurl(mapStringField(m.getCitationguidelineurl()));
datasource.setDatauploadtype(mapStringField(m.getDatauploadtype()));
datasource.setDateofvalidation(mapStringField(m.getDateofvalidation()));
datasource.setDescription(mapStringField(m.getDescription()));
datasource.setEnglishname(mapStringField(m.getEnglishname()));
datasource.setLatitude(mapStringField(m.getLatitude()));
datasource.setLongitude(mapStringField(m.getLongitude()));
datasource.setLogourl(mapStringField(m.getLogourl()));
datasource.setMissionstatementurl(mapStringField(m.getMissionstatementurl()));
datasource.setNamespaceprefix(mapStringField(m.getNamespaceprefix()));
datasource
.setOdcontenttypes(
m
.getOdcontenttypesList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
datasource
.setOdlanguages(
m
.getOdlanguagesList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
datasource.setOdnumberofitems(mapStringField(m.getOdnumberofitems()));
datasource.setOdnumberofitemsdate(mapStringField(m.getOdnumberofitemsdate()));
datasource.setOdpolicies(mapStringField(m.getOdpolicies()));
datasource.setOfficialname(mapStringField(m.getOfficialname()));
datasource.setOpenairecompatibility(mapQualifier(m.getOpenairecompatibility()));
datasource.setPidsystems(mapStringField(m.getPidsystems()));
datasource
.setPolicies(
m.getPoliciesList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList()));
datasource.setQualitymanagementkind(mapStringField(m.getQualitymanagementkind()));
datasource.setReleaseenddate(mapStringField(m.getReleaseenddate()));
datasource.setServiceprovider(mapBoolField(m.getServiceprovider()));
datasource.setReleasestartdate(mapStringField(m.getReleasestartdate()));
datasource
.setSubjects(
m
.getSubjectsList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
datasource.setVersioning(mapBoolField(m.getVersioning()));
datasource.setWebsiteurl(mapStringField(m.getWebsiteurl()));
datasource.setJournal(mapJournal(m.getJournal()));
return datasource;
}
private static Project convertProject(OafProtos.Oaf oaf) {
final ProjectProtos.Project.Metadata m = oaf.getEntity().getProject().getMetadata();
final Project project = setOaf(new Project(), oaf);
setEntity(project, oaf);
project.setAcronym(mapStringField(m.getAcronym()));
project.setCallidentifier(mapStringField(m.getCallidentifier()));
project.setCode(mapStringField(m.getCode()));
project.setContactemail(mapStringField(m.getContactemail()));
project.setContactfax(mapStringField(m.getContactfax()));
project.setContactfullname(mapStringField(m.getContactfullname()));
project.setContactphone(mapStringField(m.getContactphone()));
project.setContracttype(mapQualifier(m.getContracttype()));
project.setCurrency(mapStringField(m.getCurrency()));
project.setDuration(mapStringField(m.getDuration()));
project.setEcarticle29_3(mapStringField(m.getEcarticle293()));
project.setEcsc39(mapStringField(m.getEcsc39()));
project.setOamandatepublications(mapStringField(m.getOamandatepublications()));
project.setStartdate(mapStringField(m.getStartdate()));
project.setEnddate(mapStringField(m.getEnddate()));
project.setFundedamount(m.getFundedamount());
project.setTotalcost(m.getTotalcost());
project.setKeywords(mapStringField(m.getKeywords()));
project
.setSubjects(
m
.getSubjectsList()
.stream()
.map(sp -> mapStructuredProperty(sp))
.collect(Collectors.toList()));
project.setTitle(mapStringField(m.getTitle()));
project.setWebsiteurl(mapStringField(m.getWebsiteurl()));
project
.setFundingtree(
m.getFundingtreeList().stream().map(f -> mapStringField(f)).collect(Collectors.toList()));
project.setJsonextrainfo(mapStringField(m.getJsonextrainfo()));
project.setSummary(mapStringField(m.getSummary()));
project.setOptional1(mapStringField(m.getOptional1()));
project.setOptional2(mapStringField(m.getOptional2()));
return project;
}
private static Result convertResult(OafProtos.Oaf oaf) {
switch (oaf.getEntity().getResult().getMetadata().getResulttype().getClassid()) {
case "dataset":
return createDataset(oaf);
case "publication":
return createPublication(oaf);
case "software":
return createSoftware(oaf);
case "other":
return createORP(oaf);
default:
Result result = setOaf(new Result(), oaf);
setEntity(result, oaf);
return setResult(result, oaf);
}
}
private static Software createSoftware(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
Software software = setOaf(new Software(), oaf);
setEntity(software, oaf);
setResult(software, oaf);
software
.setDocumentationUrl(
m
.getDocumentationUrlList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
software
.setLicense(
m
.getLicenseList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
software.setCodeRepositoryUrl(mapStringField(m.getCodeRepositoryUrl()));
software.setProgrammingLanguage(mapQualifier(m.getProgrammingLanguage()));
return software;
}
private static OtherResearchProduct createORP(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
OtherResearchProduct otherResearchProducts = setOaf(new OtherResearchProduct(), oaf);
setEntity(otherResearchProducts, oaf);
setResult(otherResearchProducts, oaf);
otherResearchProducts
.setContactperson(
m
.getContactpersonList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
otherResearchProducts
.setContactgroup(
m
.getContactgroupList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
otherResearchProducts
.setTool(
m.getToolList().stream().map(ProtoConverter::mapStringField).collect(Collectors.toList()));
return otherResearchProducts;
}
private static Publication createPublication(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
Publication publication = setOaf(new Publication(), oaf);
setEntity(publication, oaf);
setResult(publication, oaf);
publication.setJournal(mapJournal(m.getJournal()));
return publication;
}
private static Dataset createDataset(OafProtos.Oaf oaf) {
ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
Dataset dataset = setOaf(new Dataset(), oaf);
setEntity(dataset, oaf);
setResult(dataset, oaf);
dataset.setStoragedate(mapStringField(m.getStoragedate()));
dataset.setDevice(mapStringField(m.getDevice()));
dataset.setSize(mapStringField(m.getSize()));
dataset.setVersion(mapStringField(m.getVersion()));
dataset.setLastmetadataupdate(mapStringField(m.getLastmetadataupdate()));
dataset.setMetadataversionnumber(mapStringField(m.getMetadataversionnumber()));
dataset
.setGeolocation(
m
.getGeolocationList()
.stream()
.map(ProtoConverter::mapGeolocation)
.collect(Collectors.toList()));
return dataset;
}
public static <T extends Oaf> T setOaf(T oaf, OafProtos.Oaf o) {
oaf.setDataInfo(mapDataInfo(o.getDataInfo()));
oaf.setLastupdatetimestamp(o.getLastupdatetimestamp());
return oaf;
}
public static <T extends OafEntity> T setEntity(T entity, OafProtos.Oaf oaf) {
// setting Entity fields
final OafProtos.OafEntity e = oaf.getEntity();
entity.setId(e.getId());
entity.setOriginalId(e.getOriginalIdList());
entity
.setCollectedfrom(
e.getCollectedfromList().stream().map(ProtoConverter::mapKV).collect(Collectors.toList()));
entity
.setPid(
e
.getPidList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity.setDateofcollection(e.getDateofcollection());
entity.setDateoftransformation(e.getDateoftransformation());
entity
.setExtraInfo(
e
.getExtraInfoList()
.stream()
.map(ProtoConverter::mapExtraInfo)
.collect(Collectors.toList()));
return entity;
}
public static <T extends Result> T setResult(T entity, OafProtos.Oaf oaf) {
// setting Entity fields
final ResultProtos.Result.Metadata m = oaf.getEntity().getResult().getMetadata();
entity
.setAuthor(
m.getAuthorList().stream().map(ProtoConverter::mapAuthor).collect(Collectors.toList()));
entity.setResulttype(mapQualifier(m.getResulttype()));
entity.setLanguage(mapQualifier(m.getLanguage()));
entity
.setCountry(
m
.getCountryList()
.stream()
.map(ProtoConverter::mapQualifierAsCountry)
.collect(Collectors.toList()));
entity
.setSubject(
m
.getSubjectList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity
.setTitle(
m
.getTitleList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity
.setRelevantdate(
m
.getRelevantdateList()
.stream()
.map(ProtoConverter::mapStructuredProperty)
.collect(Collectors.toList()));
entity
.setDescription(
m
.getDescriptionList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setDateofacceptance(mapStringField(m.getDateofacceptance()));
entity.setPublisher(mapStringField(m.getPublisher()));
entity.setEmbargoenddate(mapStringField(m.getEmbargoenddate()));
entity
.setSource(
m
.getSourceList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity
.setFulltext(
m
.getFulltextList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity
.setFormat(
m
.getFormatList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity
.setContributor(
m
.getContributorList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity.setResourcetype(mapQualifier(m.getResourcetype()));
entity
.setCoverage(
m
.getCoverageList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
entity
.setContext(
m.getContextList().stream().map(ProtoConverter::mapContext).collect(Collectors.toList()));
entity.setBestaccessright(getBestAccessRights(oaf.getEntity().getResult().getInstanceList()));
return entity;
}
private static Qualifier getBestAccessRights(List<ResultProtos.Result.Instance> instanceList) {
if (instanceList != null) {
final Optional<FieldTypeProtos.Qualifier> min = instanceList
.stream()
.map(i -> i.getAccessright())
.min(new LicenseComparator());
final Qualifier rights = min.isPresent() ? mapQualifier(min.get()) : new Qualifier();
if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN);
}
if (StringUtils.isBlank(rights.getClassname())
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
rights.setClassname(NOT_AVAILABLE);
}
if (StringUtils.isBlank(rights.getSchemeid())) {
rights.setSchemeid(DNET_ACCESS_MODES);
}
if (StringUtils.isBlank(rights.getSchemename())) {
rights.setSchemename(DNET_ACCESS_MODES);
}
return rights;
}
return null;
}
private static Context mapContext(ResultProtos.Result.Context context) {
if (context == null || StringUtils.isBlank(context.getId())) {
return null;
}
final Context entity = new Context();
entity.setId(context.getId());
entity
.setDataInfo(
context
.getDataInfoList()
.stream()
.map(ProtoConverter::mapDataInfo)
.collect(Collectors.toList()));
return entity;
}
public static KeyValue mapKV(FieldTypeProtos.KeyValue kv) {
if (kv == null || StringUtils.isBlank(kv.getKey()) & StringUtils.isBlank(kv.getValue())) {
return null;
}
final KeyValue keyValue = new KeyValue();
keyValue.setKey(kv.getKey());
keyValue.setValue(kv.getValue());
keyValue.setDataInfo(mapDataInfo(kv.getDataInfo()));
return keyValue;
}
public static DataInfo mapDataInfo(FieldTypeProtos.DataInfo d) {
final DataInfo dataInfo = new DataInfo();
dataInfo.setDeletedbyinference(d.getDeletedbyinference());
dataInfo.setInferenceprovenance(d.getInferenceprovenance());
dataInfo.setInferred(d.getInferred());
dataInfo.setInvisible(d.getInvisible());
dataInfo.setProvenanceaction(mapQualifier(d.getProvenanceaction()));
dataInfo.setTrust(d.getTrust());
return dataInfo;
}
public static Qualifier mapQualifier(FieldTypeProtos.Qualifier q) {
final Qualifier qualifier = new Qualifier();
qualifier.setClassid(q.getClassid());
qualifier.setClassname(q.getClassname());
qualifier.setSchemeid(q.getSchemeid());
qualifier.setSchemename(q.getSchemename());
return qualifier;
}
public static Country mapQualifierAsCountry(FieldTypeProtos.Qualifier q) {
final Country c = new Country();
c.setClassid(q.getClassid());
c.setClassname(q.getClassname());
c.setSchemeid(q.getSchemeid());
c.setSchemename(q.getSchemename());
c.setDataInfo(mapDataInfo(q.getDataInfo()));
return c;
}
public static StructuredProperty mapStructuredProperty(FieldTypeProtos.StructuredProperty sp) {
if (sp == null | StringUtils.isBlank(sp.getValue())) {
return null;
}
final StructuredProperty structuredProperty = new StructuredProperty();
structuredProperty.setValue(sp.getValue());
structuredProperty.setQualifier(mapQualifier(sp.getQualifier()));
structuredProperty.setDataInfo(mapDataInfo(sp.getDataInfo()));
return structuredProperty;
}
public static ExtraInfo mapExtraInfo(FieldTypeProtos.ExtraInfo extraInfo) {
final ExtraInfo entity = new ExtraInfo();
entity.setName(extraInfo.getName());
entity.setTypology(extraInfo.getTypology());
entity.setProvenance(extraInfo.getProvenance());
entity.setTrust(extraInfo.getTrust());
entity.setValue(extraInfo.getValue());
return entity;
}
public static OAIProvenance mapOAIProvenance(FieldTypeProtos.OAIProvenance oaiProvenance) {
final OAIProvenance entity = new OAIProvenance();
entity.setOriginDescription(mapOriginalDescription(oaiProvenance.getOriginDescription()));
return entity;
}
public static OriginDescription mapOriginalDescription(
FieldTypeProtos.OAIProvenance.OriginDescription originDescription) {
final OriginDescription originDescriptionResult = new OriginDescription();
originDescriptionResult.setHarvestDate(originDescription.getHarvestDate());
originDescriptionResult.setAltered(originDescription.getAltered());
originDescriptionResult.setBaseURL(originDescription.getBaseURL());
originDescriptionResult.setIdentifier(originDescription.getIdentifier());
originDescriptionResult.setDatestamp(originDescription.getDatestamp());
originDescriptionResult.setMetadataNamespace(originDescription.getMetadataNamespace());
return originDescriptionResult;
}
public static Field<String> mapStringField(FieldTypeProtos.StringField s) {
if (s == null || StringUtils.isBlank(s.getValue())) {
return null;
}
final Field<String> stringField = new Field<>();
stringField.setValue(s.getValue());
stringField.setDataInfo(mapDataInfo(s.getDataInfo()));
return stringField;
}
public static Field<Boolean> mapBoolField(FieldTypeProtos.BoolField b) {
if (b == null) {
return null;
}
final Field<Boolean> booleanField = new Field<>();
booleanField.setValue(b.getValue());
booleanField.setDataInfo(mapDataInfo(b.getDataInfo()));
return booleanField;
}
public static Journal mapJournal(FieldTypeProtos.Journal j) {
final Journal journal = new Journal();
journal.setConferencedate(j.getConferencedate());
journal.setConferenceplace(j.getConferenceplace());
journal.setEdition(j.getEdition());
journal.setEp(j.getEp());
journal.setIss(j.getIss());
journal.setIssnLinking(j.getIssnLinking());
journal.setIssnOnline(j.getIssnOnline());
journal.setIssnPrinted(j.getIssnPrinted());
journal.setName(j.getName());
journal.setSp(j.getSp());
journal.setVol(j.getVol());
journal.setDataInfo(mapDataInfo(j.getDataInfo()));
return journal;
}
public static Author mapAuthor(FieldTypeProtos.Author author) {
final Author entity = new Author();
entity.setFullname(author.getFullname());
entity.setName(author.getName());
entity.setSurname(author.getSurname());
entity.setRank(author.getRank());
entity
.setPid(
author
.getPidList()
.stream()
.map(
kv -> {
final StructuredProperty sp = new StructuredProperty();
sp.setValue(kv.getValue());
final Qualifier q = new Qualifier();
q.setClassid(kv.getKey());
q.setClassname(kv.getKey());
sp.setQualifier(q);
return sp;
})
.collect(Collectors.toList()));
entity
.setAffiliation(
author
.getAffiliationList()
.stream()
.map(ProtoConverter::mapStringField)
.collect(Collectors.toList()));
return entity;
}
public static GeoLocation mapGeolocation(ResultProtos.Result.GeoLocation geoLocation) {
final GeoLocation entity = new GeoLocation();
entity.setPoint(geoLocation.getPoint());
entity.setBox(geoLocation.getBox());
entity.setPlace(geoLocation.getPlace());
return entity;
}
}

View File

@ -1,172 +0,0 @@
package eu.dnetlib.dhp.actionmanager.migration;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException;
import java.io.Serializable;
import java.util.LinkedList;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.protobuf.InvalidProtocolBufferException;
import eu.dnetlib.data.proto.OafProtos;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import scala.Tuple2;
public class TransformActions implements Serializable {
private static final Logger log = LoggerFactory.getLogger(TransformActions.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String SEPARATOR = "/";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
MigrateActionSet.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/migration/transform_actionsets_parameters.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final String inputPaths = parser.get("inputPaths");
if (StringUtils.isBlank(inputPaths)) {
throw new RuntimeException("empty inputPaths");
}
log.info("inputPaths: {}", inputPaths);
final String targetBaseDir = getTargetBaseDir(isLookupUrl);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf, isSparkSessionManaged, spark -> transformActions(inputPaths, targetBaseDir, spark));
}
private static void transformActions(String inputPaths, String targetBaseDir, SparkSession spark)
throws IOException {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
for (String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) {
LinkedList<String> pathQ = Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath));
final String rawset = pathQ.pollLast();
final String actionSetDirectory = pathQ.pollLast();
final Path targetDirectory = new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset);
if (fs.exists(targetDirectory)) {
log.info("found target directory '{}", targetDirectory);
fs.delete(targetDirectory, true);
log.info("deleted target directory '{}", targetDirectory);
}
log.info("transforming actions from '{}' to '{}'", sourcePath, targetDirectory);
sc
.sequenceFile(sourcePath, Text.class, Text.class)
.map(a -> eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON(a._2().toString()))
.map(TransformActions::doTransform)
.filter(Objects::nonNull)
.mapToPair(
a -> new Tuple2<>(a.getClazz().toString(), OBJECT_MAPPER.writeValueAsString(a)))
.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
.saveAsNewAPIHadoopFile(
targetDirectory.toString(),
Text.class,
Text.class,
SequenceFileOutputFormat.class,
sc.hadoopConfiguration());
}
}
private static AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa)
throws InvalidProtocolBufferException {
// dedup similarity relations had empty target value, don't migrate them
if (aa.getTargetValue().length == 0) {
return null;
}
final OafProtos.Oaf proto_oaf = OafProtos.Oaf.parseFrom(aa.getTargetValue());
final Oaf oaf = ProtoConverter.convert(proto_oaf);
switch (proto_oaf.getKind()) {
case entity:
switch (proto_oaf.getEntity().getType()) {
case datasource:
return new AtomicAction<>(Datasource.class, (Datasource) oaf);
case organization:
return new AtomicAction<>(Organization.class, (Organization) oaf);
case project:
return new AtomicAction<>(Project.class, (Project) oaf);
case result:
final String resulttypeid = proto_oaf
.getEntity()
.getResult()
.getMetadata()
.getResulttype()
.getClassid();
switch (resulttypeid) {
case "publication":
return new AtomicAction<>(Publication.class, (Publication) oaf);
case "software":
return new AtomicAction<>(Software.class, (Software) oaf);
case "other":
return new AtomicAction<>(OtherResearchProduct.class, (OtherResearchProduct) oaf);
case "dataset":
return new AtomicAction<>(Dataset.class, (Dataset) oaf);
default:
// can be an update, where the resulttype is not specified
return new AtomicAction<>(Result.class, (Result) oaf);
}
default:
throw new IllegalArgumentException(
"invalid entity type: " + proto_oaf.getEntity().getType());
}
case relation:
return new AtomicAction<>(Relation.class, (Relation) oaf);
default:
throw new IllegalArgumentException("invalid kind: " + proto_oaf.getKind());
}
}
private static String getTargetBaseDir(String isLookupUrl) throws ISLookUpException {
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
String XQUERY = "collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()";
return isLookUp.getResourceProfileByQuery(XQUERY);
}
}

View File

@ -62,6 +62,7 @@ public class MergeAndGet {
x.getClass().getCanonicalName(), y.getClass().getCanonicalName()));
}
@SuppressWarnings("unchecked")
private static <G extends Oaf, A extends Oaf> G selectNewerAndGet(G x, A y) {
if (x.getClass().equals(y.getClass())
&& x.getLastupdatetimestamp() > y.getLastupdatetimestamp()) {

View File

@ -5,12 +5,12 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass;
import java.io.IOException;
import java.util.Objects;
import java.util.Optional;
import java.util.function.BiFunction;
import java.util.function.Function;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
@ -68,7 +68,15 @@ public class PromoteActionPayloadForGraphTableJob {
MergeAndGet.Strategy strategy = MergeAndGet.Strategy.valueOf(parser.get("mergeAndGetStrategy").toUpperCase());
logger.info("strategy: {}", strategy);
Boolean shouldGroupById = Optional
.ofNullable(parser.get("shouldGroupById"))
.map(Boolean::valueOf)
.orElse(true);
logger.info("shouldGroupById: {}", shouldGroupById);
@SuppressWarnings("unchecked")
Class<? extends Oaf> rowClazz = (Class<? extends Oaf>) Class.forName(graphTableClassName);
@SuppressWarnings("unchecked")
Class<? extends Oaf> actionPayloadClazz = (Class<? extends Oaf>) Class.forName(actionPayloadClassName);
throwIfGraphTableClassIsNotSubClassOfActionPayloadClass(rowClazz, actionPayloadClazz);
@ -89,7 +97,8 @@ public class PromoteActionPayloadForGraphTableJob {
outputGraphTablePath,
strategy,
rowClazz,
actionPayloadClazz);
actionPayloadClazz,
shouldGroupById);
});
}
@ -115,12 +124,12 @@ public class PromoteActionPayloadForGraphTableJob {
String outputGraphTablePath,
MergeAndGet.Strategy strategy,
Class<G> rowClazz,
Class<A> actionPayloadClazz) {
Class<A> actionPayloadClazz, Boolean shouldGroupById) {
Dataset<G> rowDS = readGraphTable(spark, inputGraphTablePath, rowClazz);
Dataset<A> actionPayloadDS = readActionPayload(spark, inputActionPayloadPath, actionPayloadClazz);
Dataset<G> result = promoteActionPayloadForGraphTable(
rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz)
rowDS, actionPayloadDS, strategy, rowClazz, actionPayloadClazz, shouldGroupById)
.map((MapFunction<G, G>) value -> value, Encoders.bean(rowClazz));
saveGraphTable(result, outputGraphTablePath);
@ -145,7 +154,7 @@ public class PromoteActionPayloadForGraphTableJob {
return spark
.read()
.parquet(path)
.map((MapFunction<Row, String>) value -> extractPayload(value), Encoders.STRING())
.map((MapFunction<Row, String>) PromoteActionPayloadForGraphTableJob::extractPayload, Encoders.STRING())
.map(
(MapFunction<String, A>) value -> decodePayload(actionPayloadClazz, value),
Encoders.bean(actionPayloadClazz));
@ -153,9 +162,9 @@ public class PromoteActionPayloadForGraphTableJob {
private static String extractPayload(Row value) {
try {
return value.<String> getAs("payload");
return value.getAs("payload");
} catch (IllegalArgumentException | ClassCastException e) {
logger.error("cannot extract payload from action: {}", value.toString());
logger.error("cannot extract payload from action: {}", value);
throw e;
}
}
@ -174,7 +183,8 @@ public class PromoteActionPayloadForGraphTableJob {
Dataset<A> actionPayloadDS,
MergeAndGet.Strategy strategy,
Class<G> rowClazz,
Class<A> actionPayloadClazz) {
Class<A> actionPayloadClazz,
Boolean shouldGroupById) {
logger
.info(
"Promoting action payload for graph table: payload={}, table={}",
@ -186,7 +196,7 @@ public class PromoteActionPayloadForGraphTableJob {
SerializableSupplier<BiFunction<G, A, G>> mergeRowWithActionPayloadAndGetFn = MergeAndGet.functionFor(strategy);
SerializableSupplier<BiFunction<G, G, G>> mergeRowsAndGetFn = MergeAndGet.functionFor(strategy);
SerializableSupplier<G> zeroFn = zeroFn(rowClazz);
SerializableSupplier<Function<G, Boolean>> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSource;
SerializableSupplier<Function<G, Boolean>> isNotZeroFn = PromoteActionPayloadForGraphTableJob::isNotZeroFnUsingIdOrSourceAndTarget;
Dataset<G> joinedAndMerged = PromoteActionPayloadFunctions
.joinGraphTableWithActionPayloadAndMerge(
@ -198,9 +208,13 @@ public class PromoteActionPayloadForGraphTableJob {
rowClazz,
actionPayloadClazz);
return PromoteActionPayloadFunctions
.groupGraphTableByIdAndMerge(
joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz);
if (shouldGroupById) {
return PromoteActionPayloadFunctions
.groupGraphTableByIdAndMerge(
joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz);
} else {
return joinedAndMerged;
}
}
private static <T extends Oaf> SerializableSupplier<T> zeroFn(Class<T> clazz) {
@ -226,12 +240,13 @@ public class PromoteActionPayloadForGraphTableJob {
}
}
private static <T extends Oaf> Function<T, Boolean> isNotZeroFnUsingIdOrSource() {
private static <T extends Oaf> Function<T, Boolean> isNotZeroFnUsingIdOrSourceAndTarget() {
return t -> {
if (isSubClass(t, Relation.class)) {
return Objects.nonNull(((Relation) t).getSource());
final Relation rel = (Relation) t;
return StringUtils.isNotBlank(rel.getSource()) && StringUtils.isNotBlank(rel.getTarget());
}
return Objects.nonNull(((OafEntity) t).getId());
return StringUtils.isNotBlank(((OafEntity) t).getId());
};
}

View File

@ -112,6 +112,7 @@ public class PromoteActionPayloadFunctions {
Class<G> rowClazz) {
TypedColumn<G, G> aggregator = new TableAggregator<>(zeroFn, mergeAndGetFn, isNotZeroFn, rowClazz).toColumn();
return rowDS
.filter((FilterFunction<G>) o -> isNotZeroFn.get().apply(o))
.groupByKey((MapFunction<G, String>) x -> rowIdFn.get().apply(x), Encoders.STRING())
.agg(aggregator)
.map((MapFunction<Tuple2<String, G>, G>) Tuple2::_2, Encoders.kryo(rowClazz));

View File

@ -1,56 +0,0 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "is",
"paramLongName": "isLookupUrl",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": true
},
{
"paramName": "sn",
"paramLongName": "sourceNameNode",
"paramDescription": "nameNode of the source cluster",
"paramRequired": true
},
{
"paramName": "tn",
"paramLongName": "targetNameNode",
"paramDescription": "namoNode of the target cluster",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingDirectory",
"paramDescription": "working directory",
"paramRequired": true
},
{
"paramName": "nm",
"paramLongName": "distcp_num_maps",
"paramDescription": "maximum number of map tasks used in the distcp process",
"paramRequired": true
},
{
"paramName": "mm",
"paramLongName": "distcp_memory_mb",
"paramDescription": "memory for distcp action copying actionsets from remote cluster",
"paramRequired": true
},
{
"paramName": "tt",
"paramLongName": "distcp_task_timeout",
"paramDescription": "timeout for distcp copying actions from remote cluster",
"paramRequired": true
},
{
"paramName": "tr",
"paramLongName": "transform_only",
"paramDescription": "activate tranform-only mode. Only apply transformation step",
"paramRequired": true
}
]

View File

@ -40,5 +40,11 @@
"paramLongName": "mergeAndGetStrategy",
"paramDescription": "strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET",
"paramRequired": true
},
{
"paramName": "sgid",
"paramLongName": "shouldGroupById",
"paramDescription": "indicates whether the promotion operation should group objects in the graph by id or not",
"paramRequired": false
}
]

View File

@ -24,6 +24,10 @@
<name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property>
<property>
<name>shouldGroupById</name>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
@ -111,6 +115,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/dataset</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="DecisionPromoteResultActionPayloadForDatasetTable"/>
<error to="Kill"/>
@ -162,6 +167,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/dataset</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>

View File

@ -56,6 +56,11 @@
<name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property>
<property>
<name>shouldGroupById</name>
<value>false</value>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>

View File

@ -24,6 +24,10 @@
<name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property>
<property>
<name>shouldGroupById</name>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
@ -110,6 +114,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/otherresearchproduct</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="DecisionPromoteResultActionPayloadForOtherResearchProductTable"/>
<error to="Kill"/>
@ -161,6 +166,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/otherresearchproduct</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>

View File

@ -24,6 +24,10 @@
<name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property>
<property>
<name>shouldGroupById</name>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
@ -103,7 +107,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=2560
--conf spark.sql.shuffle.partitions=5000
</spark-opts>
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
@ -111,6 +115,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/publication</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="DecisionPromoteResultActionPayloadForPublicationTable"/>
<error to="Kill"/>
@ -154,7 +159,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=2560
--conf spark.sql.shuffle.partitions=5000
</spark-opts>
<arg>--inputGraphTablePath</arg><arg>${workingDir}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
@ -162,6 +167,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/publication</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>

View File

@ -99,7 +99,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=2560
--conf spark.sql.shuffle.partitions=5000
</spark-opts>
<arg>--inputGraphTablePath</arg><arg>${inputGraphRootPath}/relation</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>

View File

@ -24,6 +24,10 @@
<name>mergeAndGetStrategy</name>
<description>strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET</description>
</property>
<property>
<name>shouldGroupById</name>
<description>indicates whether the promotion operation should group objects in the graph by id or not</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
@ -110,6 +114,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputGraphTablePath</arg><arg>${workingDir}/software</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="DecisionPromoteResultActionPayloadForSoftwareTable"/>
<error to="Kill"/>
@ -161,6 +166,7 @@
<arg>--actionPayloadClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Result</arg>
<arg>--outputGraphTablePath</arg><arg>${outputGraphRootPath}/software</arg>
<arg>--mergeAndGetStrategy</arg><arg>${mergeAndGetStrategy}</arg>
<arg>--shouldGroupById</arg><arg>${shouldGroupById}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>

View File

@ -80,7 +80,7 @@ public class PartitionActionSetsByPayloadTypeJobTest {
private ISClient isClient;
@Test
public void shouldPartitionActionSetsByPayloadType(@TempDir Path workingDir) throws Exception {
void shouldPartitionActionSetsByPayloadType(@TempDir Path workingDir) throws Exception {
// given
Path inputActionSetsBaseDir = workingDir.resolve("input").resolve("action_sets");
Path outputDir = workingDir.resolve("output");

View File

@ -20,7 +20,7 @@ public class MergeAndGetTest {
class MergeFromAndGetStrategy {
@Test
public void shouldThrowForOafAndOaf() {
void shouldThrowForOafAndOaf() {
// given
Oaf a = mock(Oaf.class);
Oaf b = mock(Oaf.class);
@ -33,7 +33,7 @@ public class MergeAndGetTest {
}
@Test
public void shouldThrowForOafAndRelation() {
void shouldThrowForOafAndRelation() {
// given
Oaf a = mock(Oaf.class);
Relation b = mock(Relation.class);
@ -46,7 +46,7 @@ public class MergeAndGetTest {
}
@Test
public void shouldThrowForOafAndOafEntity() {
void shouldThrowForOafAndOafEntity() {
// given
Oaf a = mock(Oaf.class);
OafEntity b = mock(OafEntity.class);
@ -59,7 +59,7 @@ public class MergeAndGetTest {
}
@Test
public void shouldThrowForRelationAndOaf() {
void shouldThrowForRelationAndOaf() {
// given
Relation a = mock(Relation.class);
Oaf b = mock(Oaf.class);
@ -72,7 +72,7 @@ public class MergeAndGetTest {
}
@Test
public void shouldThrowForRelationAndOafEntity() {
void shouldThrowForRelationAndOafEntity() {
// given
Relation a = mock(Relation.class);
OafEntity b = mock(OafEntity.class);
@ -85,7 +85,7 @@ public class MergeAndGetTest {
}
@Test
public void shouldBehaveProperlyForRelationAndRelation() {
void shouldBehaveProperlyForRelationAndRelation() {
// given
Relation a = mock(Relation.class);
Relation b = mock(Relation.class);
@ -101,7 +101,7 @@ public class MergeAndGetTest {
}
@Test
public void shouldThrowForOafEntityAndOaf() {
void shouldThrowForOafEntityAndOaf() {
// given
OafEntity a = mock(OafEntity.class);
Oaf b = mock(Oaf.class);
@ -114,7 +114,7 @@ public class MergeAndGetTest {
}
@Test
public void shouldThrowForOafEntityAndRelation() {
void shouldThrowForOafEntityAndRelation() {
// given
OafEntity a = mock(OafEntity.class);
Relation b = mock(Relation.class);
@ -127,7 +127,7 @@ public class MergeAndGetTest {
}
@Test
public void shouldThrowForOafEntityAndOafEntityButNotSubclasses() {
void shouldThrowForOafEntityAndOafEntityButNotSubclasses() {
// given
class OafEntitySub1 extends OafEntity {
}
@ -145,7 +145,7 @@ public class MergeAndGetTest {
}
@Test
public void shouldBehaveProperlyForOafEntityAndOafEntity() {
void shouldBehaveProperlyForOafEntityAndOafEntity() {
// given
OafEntity a = mock(OafEntity.class);
OafEntity b = mock(OafEntity.class);
@ -165,7 +165,7 @@ public class MergeAndGetTest {
class SelectNewerAndGetStrategy {
@Test
public void shouldThrowForOafEntityAndRelation() {
void shouldThrowForOafEntityAndRelation() {
// given
OafEntity a = mock(OafEntity.class);
Relation b = mock(Relation.class);
@ -178,7 +178,7 @@ public class MergeAndGetTest {
}
@Test
public void shouldThrowForRelationAndOafEntity() {
void shouldThrowForRelationAndOafEntity() {
// given
Relation a = mock(Relation.class);
OafEntity b = mock(OafEntity.class);
@ -191,7 +191,7 @@ public class MergeAndGetTest {
}
@Test
public void shouldThrowForOafEntityAndResult() {
void shouldThrowForOafEntityAndResult() {
// given
OafEntity a = mock(OafEntity.class);
Result b = mock(Result.class);
@ -204,7 +204,7 @@ public class MergeAndGetTest {
}
@Test
public void shouldThrowWhenSuperTypeIsNewerForResultAndOafEntity() {
void shouldThrowWhenSuperTypeIsNewerForResultAndOafEntity() {
// given
// real types must be used because subclass-superclass resolution does not work for
// mocks
@ -221,7 +221,7 @@ public class MergeAndGetTest {
}
@Test
public void shouldShouldReturnLeftForOafEntityAndOafEntity() {
void shouldShouldReturnLeftForOafEntityAndOafEntity() {
// given
OafEntity a = mock(OafEntity.class);
when(a.getLastupdatetimestamp()).thenReturn(1L);
@ -238,7 +238,7 @@ public class MergeAndGetTest {
}
@Test
public void shouldShouldReturnRightForOafEntityAndOafEntity() {
void shouldShouldReturnRightForOafEntityAndOafEntity() {
// given
OafEntity a = mock(OafEntity.class);
when(a.getLastupdatetimestamp()).thenReturn(2L);

View File

@ -77,7 +77,7 @@ public class PromoteActionPayloadForGraphTableJobTest {
class Main {
@Test
public void shouldThrowWhenGraphTableClassIsNotASubClassOfActionPayloadClass() {
void shouldThrowWhenGraphTableClassIsNotASubClassOfActionPayloadClass() {
// given
Class<Relation> rowClazz = Relation.class;
Class<OafEntity> actionPayloadClazz = OafEntity.class;
@ -101,7 +101,9 @@ public class PromoteActionPayloadForGraphTableJobTest {
"-outputGraphTablePath",
"",
"-mergeAndGetStrategy",
MergeAndGet.Strategy.SELECT_NEWER_AND_GET.name()
MergeAndGet.Strategy.SELECT_NEWER_AND_GET.name(),
"--shouldGroupById",
"true"
}));
// then
@ -114,7 +116,7 @@ public class PromoteActionPayloadForGraphTableJobTest {
@ParameterizedTest(name = "strategy: {0}, graph table: {1}, action payload: {2}")
@MethodSource("eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest#promoteJobTestParams")
public void shouldPromoteActionPayloadForGraphTable(
void shouldPromoteActionPayloadForGraphTable(
MergeAndGet.Strategy strategy,
Class<? extends Oaf> rowClazz,
Class<? extends Oaf> actionPayloadClazz)
@ -141,7 +143,9 @@ public class PromoteActionPayloadForGraphTableJobTest {
"-outputGraphTablePath",
outputGraphTableDir.toString(),
"-mergeAndGetStrategy",
strategy.name()
strategy.name(),
"--shouldGroupById",
"true"
});
// then

View File

@ -44,7 +44,7 @@ public class PromoteActionPayloadFunctionsTest {
class JoinTableWithActionPayloadAndMerge {
@Test
public void shouldThrowWhenTableTypeIsNotSubtypeOfActionPayloadType() {
void shouldThrowWhenTableTypeIsNotSubtypeOfActionPayloadType() {
// given
class OafImpl extends Oaf {
}
@ -58,7 +58,7 @@ public class PromoteActionPayloadFunctionsTest {
}
@Test
public void shouldRunProperlyWhenActionPayloadTypeAndTableTypeAreTheSame() {
void shouldRunProperlyWhenActionPayloadTypeAndTableTypeAreTheSame() {
// given
String id0 = "id0";
String id1 = "id1";
@ -138,7 +138,7 @@ public class PromoteActionPayloadFunctionsTest {
}
@Test
public void shouldRunProperlyWhenActionPayloadTypeIsSuperTypeOfTableType() {
void shouldRunProperlyWhenActionPayloadTypeIsSuperTypeOfTableType() {
// given
String id0 = "id0";
String id1 = "id1";
@ -218,7 +218,7 @@ public class PromoteActionPayloadFunctionsTest {
class GroupTableByIdAndMerge {
@Test
public void shouldRunProperly() {
void shouldRunProperly() {
// given
String id1 = "id1";
String id2 = "id2";

View File

@ -1,29 +1,27 @@
Description of the Module
--------------------------
This module defines a **collector worker application** that runs on Hadoop.
This module defines a set of oozie workflows for the **collection** and **transformation** of metadata records.
Both workflows interact with the Metadata Store Manager (MdSM) to handle the logical transactions required to ensure
the consistency of the read/write operations on the data as the MdSM in fact keeps track of the logical-physical mapping
of each MDStore.
It is responsible for harvesting metadata using different plugins.
## Metadata collection
The collector worker uses a message queue to inform the progress
of the harvesting action (using a message queue for sending **ONGOING** messages) furthermore,
It gives, at the end of the job, some information about the status
of the collection i.e Number of records collected(using a message queue for sending **REPORT** messages).
The **metadata collection workflow** is responsible for harvesting metadata records from different protocols and responding to
different formats and to store them as on HDFS so that they can be further processed.
To work the collection worker need some parameter like:
### Collector Plugins
* **hdfsPath**: the path where storing the sequential file
* **apidescriptor**: the JSON encoding of the API Descriptor
* **namenode**: the Name Node URI
* **userHDFS**: the user wich create the hdfs seq file
* **rabbitUser**: the user to connect with RabbitMq for messaging
* **rabbitPassWord**: the password to connect with RabbitMq for messaging
* **rabbitHost**: the host of the RabbitMq server
* **rabbitOngoingQueue**: the name of the ongoing queue
* **rabbitReportQueue**: the name of the report queue
* **workflowId**: the identifier of the dnet Workflow
Different protocols are managed by dedicated Collector plugins, i.e. java programs implementing a defined interface:
##Plugins
* OAI Plugin
```eu.dnetlib.dhp.collection.plugin.CollectorPlugin```
The list of the supported plugins:
* OAI Plugin: collects from OAI-PMH compatible endpoints
* MDStore plugin: collects from a given D-Net MetadataStore, (identified by moogodb URI, dbName, MDStoreID)
* MDStore dump plugin: collects from an MDStore dump stored on the HDFS location indicated by the `path` parameter
# Transformation Plugins
TODO
## Usage
TODO

View File

@ -7,13 +7,20 @@
<version>1.2.4-SNAPSHOT</version>
</parent>
<artifactId>dhp-aggregation</artifactId>
<<<<<<< HEAD
=======
>>>>>>> upstream/master
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<<<<<<< HEAD
<version>4.0.1</version>
=======
<version>${net.alchim31.maven.version}</version>
>>>>>>> upstream/master
<executions>
<execution>
<id>scala-compile-first</id>
@ -30,6 +37,16 @@
<goal>testCompile</goal>
</goals>
</execution>
<<<<<<< HEAD
=======
<execution>
<id>scala-doc</id>
<phase>process-resources</phase> <!-- or wherever -->
<goals>
<goal>doc</goal>
</goals>
</execution>
>>>>>>> upstream/master
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
@ -38,10 +55,19 @@
</plugins>
</build>
<<<<<<< HEAD
=======
>>>>>>> upstream/master
<dependencies>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
@ -55,6 +81,7 @@
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
<<<<<<< HEAD
</dependency>
<dependency>
@ -70,6 +97,10 @@
=======
</dependency>
>>>>>>> upstream/master
<dependency>
<groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId>
@ -89,14 +120,11 @@
<artifactId>jaxen</artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-csv -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-csv</artifactId>
<version>1.8</version>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
<groupId>org.apache.poi</groupId>
@ -109,8 +137,11 @@
<artifactId>commons-compress</artifactId>
</dependency>
<dependency>
<groupId>org.mongodb</groupId>
<artifactId>mongo-java-driver</artifactId>
</dependency>
</dependencies>
</project>
</project>

View File

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.actionmanager.bipfinder;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
@ -28,15 +29,16 @@ import eu.dnetlib.dhp.schema.oaf.Result;
public class CollectAndSave implements Serializable {
private static final Logger log = LoggerFactory.getLogger(CollectAndSave.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static <I extends Result> void main(String[] args) throws Exception {
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
CollectAndSave.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json"));
Objects
.requireNonNull(
CollectAndSave.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json")));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
@ -75,7 +77,6 @@ public class CollectAndSave implements Serializable {
.union(sc.sequenceFile(inputPath + "/otherresearchproduct", Text.class, Text.class))
.union(sc.sequenceFile(inputPath + "/software", Text.class, Text.class))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
;
}
private static void removeOutputDir(SparkSession spark, String path) {

View File

@ -36,7 +36,7 @@ import scala.Tuple2;
*/
public class SparkAtomicActionScoreJob implements Serializable {
private static String DOI = "doi";
private static final String DOI = "doi";
private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@ -87,7 +87,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
private static <I extends Result> void prepareResults(SparkSession spark, String inputPath, String outputPath,
String bipScorePath, Class<I> inputClazz) {
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
JavaRDD<BipDeserialize> bipDeserializeJavaRDD = sc
.textFile(bipScorePath)
@ -101,8 +101,6 @@ public class SparkAtomicActionScoreJob implements Serializable {
return bs;
}).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class));
System.out.println(bipScores.count());
Dataset<I> results = readPath(spark, inputPath, inputClazz);
results.createOrReplaceTempView("result");
@ -124,7 +122,7 @@ public class SparkAtomicActionScoreJob implements Serializable {
ret.setId(value._2().getId());
return ret;
}, Encoders.bean(BipScore.class))
.groupByKey((MapFunction<BipScore, String>) value -> value.getId(), Encoders.STRING())
.groupByKey((MapFunction<BipScore, String>) BipScore::getId, Encoders.STRING())
.mapGroups((MapGroupsFunction<String, BipScore, Result>) (k, it) -> {
Result ret = new Result();
ret.setDataInfo(getDataInfo());

View File

@ -0,0 +1,49 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import java.util.Optional;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class Constants {
public static final String DOI = "doi";
public static final String UPDATE_DATA_INFO_TYPE = "update";
public static final String UPDATE_SUBJECT_FOS_CLASS_ID = "subject:fos";
public static final String UPDATE_CLASS_NAME = "Inferred by OpenAIRE";
public static final String UPDATE_MEASURE_BIP_CLASS_ID = "measure:bip";
public static final String FOS_CLASS_ID = "FOS";
public static final String FOS_CLASS_NAME = "Fields of Science and Technology classification";
public static final String NULL = "NULL";
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private Constants() {
}
public static Boolean isSparkSessionManaged(ArgumentApplicationParser parser) {
return Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
}
public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) {
return spark
.read()
.textFile(inputPath)
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
}
}

View File

@ -0,0 +1,77 @@
package eu.dnetlib.dhp.actionmanager.createunresolvedentities;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.collection.GetCSV;
public class GetFOSData implements Serializable {
private static final Logger log = LoggerFactory.getLogger(GetFOSData.class);
public static final char DEFAULT_DELIMITER = '\t';
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
Objects
.requireNonNull(
GetFOSData.class
.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/createunresolvedentities/get_fos_parameters.json"))));
parser.parseArgument(args);
// the path where the original fos csv file is stored
final String sourcePath = parser.get("sourcePath");
log.info("sourcePath {}", sourcePath);
// the path where to put the file as json
final String outputPath = parser.get("outputPath");
log.info("outputPath {}", outputPath);
final String hdfsNameNode = parser.get("hdfsNameNode");
log.info("hdfsNameNode {}", hdfsNameNode);
final String classForName = parser.get("classForName");
log.info("classForName {}", classForName);
final char delimiter = Optional
.ofNullable(parser.get("delimiter"))
.map(s -> s.charAt(0))
.orElse(DEFAULT_DELIMITER);
log.info("delimiter {}", delimiter);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
new GetFOSData().doRewrite(sourcePath, outputPath, classForName, delimiter, fileSystem);
}
public void doRewrite(String inputPath, String outputFile, String classForName, char delimiter, FileSystem fs)
throws IOException, ClassNotFoundException {
// reads the csv and writes it as its json equivalent
try (InputStreamReader reader = new InputStreamReader(fs.open(new Path(inputPath)))) {
GetCSV.getCsv(fs, reader, outputFile, classForName, delimiter);
}
}
}

Some files were not shown because too many files have changed in this diff Show More