reformatted code according to the updated style descriptor

Claudio Atzori 2020-04-28 11:23:29 +02:00
parent e6d68d1364
commit 6f5b899038
160 changed files with 1844 additions and 1497 deletions
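A minimal before/after sketch of the import convention this commit appears to apply, inferred from the hunks below rather than from the style descriptor itself (the class and the eu.dnetlib.example import are hypothetical): the old layout was a single ASCII-sorted import block, while the new one groups imports as java/javax, then org, then com, then other namespaces, with a blank line between groups.

-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.example.SomeHelper;
-import java.util.List;
-import org.apache.commons.lang3.StringUtils;
+import java.util.List;
+
+import org.apache.commons.lang3.StringUtils;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.example.SomeHelper;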

View File

@@ -4,6 +4,7 @@ package eu.dnetlib.maven.plugin.properties;
 import java.io.File;
 import java.util.ArrayList;
 import java.util.List;
+
 import org.apache.commons.lang.ArrayUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.maven.plugin.AbstractMojo;

View File

@@ -12,7 +12,6 @@
 package eu.dnetlib.maven.plugin.properties;
 
-import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
@@ -25,6 +24,7 @@ import java.util.List;
 import java.util.Map.Entry;
 import java.util.Properties;
 import java.util.Set;
+
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang.StringUtils;
@@ -36,6 +36,8 @@ import org.springframework.core.io.DefaultResourceLoader;
 import org.springframework.core.io.Resource;
 import org.springframework.core.io.ResourceLoader;
 
+import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
+
 /**
  * Writes project properties for the keys listed in specified properties files. Based on:
  * http://site.kuali.org/maven/plugins/properties-maven-plugin/1.3.2/write-project-properties-mojo.html

View File

@@ -8,6 +8,7 @@ import static org.mockito.Mockito.lenient;
 import java.io.*;
 import java.util.Properties;
+
 import org.apache.maven.plugin.MojoExecutionException;
 import org.apache.maven.project.MavenProject;
 import org.junit.jupiter.api.*;

View File

@@ -3,6 +3,7 @@ package eu.dnetlib.data.mdstore.manager.common.model;
 import java.io.Serializable;
 import java.util.UUID;
+
 import javax.persistence.Column;
 import javax.persistence.Entity;
 import javax.persistence.Id;

View File

@@ -2,6 +2,7 @@
 package eu.dnetlib.data.mdstore.manager.common.model;
 
 import java.io.Serializable;
+
 import javax.persistence.Column;
 import javax.persistence.Entity;
 import javax.persistence.Id;

View File

@@ -3,6 +3,7 @@ package eu.dnetlib.data.mdstore.manager.common.model;
 import java.io.Serializable;
 import java.util.Date;
+
 import javax.persistence.Column;
 import javax.persistence.Entity;
 import javax.persistence.Id;

View File

@@ -3,6 +3,7 @@ package eu.dnetlib.data.mdstore.manager.common.model;
 import java.io.Serializable;
 import java.util.Date;
+
 import javax.persistence.Column;
 import javax.persistence.Entity;
 import javax.persistence.Id;

View File

@@ -1,7 +1,6 @@
 package eu.dnetlib.dhp.application;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.Serializable;
@@ -9,10 +8,13 @@ import java.io.StringWriter;
 import java.util.*;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;
+
 import org.apache.commons.cli.*;
 import org.apache.commons.codec.binary.Base64;
 import org.apache.commons.io.IOUtils;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
 
 public class ArgumentApplicationParser implements Serializable {
 	private final Options options = new Options();

View File

@@ -6,6 +6,7 @@ import static eu.dnetlib.dhp.common.ThrowingSupport.rethrowAsRuntimeException;
 import java.util.Arrays;
 import java.util.List;
 import java.util.stream.Collectors;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;

View File

@@ -1,12 +1,14 @@
 package eu.dnetlib.dhp.common;
 
-import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer;
 import java.util.Objects;
 import java.util.function.Function;
+
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.SparkSession;
 
+import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer;
+
 /** SparkSession utility methods. */
 public class SparkSessionSupport {

View File

@@ -1,9 +1,10 @@
 package eu.dnetlib.dhp.model.mdstore;
 
-import eu.dnetlib.dhp.utils.DHPUtils;
 import java.io.Serializable;
 
+import eu.dnetlib.dhp.utils.DHPUtils;
+
 /** This class models a record inside the new Metadata store collection on HDFS * */
 public class MetadataRecord implements Serializable {

View File

@@ -1,13 +1,14 @@
 package eu.dnetlib.dhp.parser.utility;
 
-import com.ximpleware.AutoPilot;
-import com.ximpleware.VTDNav;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
+import com.ximpleware.AutoPilot;
+import com.ximpleware.VTDNav;
+
 /** Created by sandro on 9/29/16. */
 public class VtdUtilityParser {

View File

@@ -1,18 +1,21 @@
 package eu.dnetlib.dhp.utils;
 
-import com.jayway.jsonpath.JsonPath;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;
-import net.minidev.json.JSONArray;
+
 import org.apache.commons.codec.binary.Base64;
 import org.apache.commons.codec.binary.Base64OutputStream;
 import org.apache.commons.codec.binary.Hex;
 
+import com.jayway.jsonpath.JsonPath;
+
+import net.minidev.json.JSONArray;
+
 public class DHPUtils {
 	public static String md5(final String s) {

View File

@@ -1,11 +1,12 @@
 package eu.dnetlib.dhp.utils;
 
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.cxf.jaxws.JaxWsProxyFactoryBean;
 
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
 public class ISLookupClientFactory {
 	private static final Log log = LogFactory.getLog(ISLookupClientFactory.class);

View File

@@ -5,6 +5,7 @@ import java.text.ParseException;
 import java.text.SimpleDateFormat;
 import java.util.Calendar;
 import java.util.GregorianCalendar;
+
 import net.sf.saxon.expr.XPathContext;
 import net.sf.saxon.om.Item;
 import net.sf.saxon.om.Sequence;

View File

@@ -4,6 +4,7 @@ package eu.dnetlib.dhp.utils.saxon;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
 import java.util.Date;
+
 import net.sf.saxon.expr.XPathContext;
 import net.sf.saxon.om.Sequence;
 import net.sf.saxon.trans.XPathException;

View File

@@ -1,13 +1,14 @@
 package eu.dnetlib.dhp.utils.saxon;
 
+import org.apache.commons.lang3.StringUtils;
+
 import net.sf.saxon.expr.XPathContext;
 import net.sf.saxon.om.Item;
 import net.sf.saxon.om.Sequence;
 import net.sf.saxon.trans.XPathException;
 import net.sf.saxon.value.SequenceType;
 import net.sf.saxon.value.StringValue;
-import org.apache.commons.lang3.StringUtils;
 
 public class PickFirst extends AbstractExtensionFunction {

View File

@@ -2,9 +2,11 @@
 package eu.dnetlib.dhp.utils.saxon;
 
 import java.io.StringReader;
+
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerException;
 import javax.xml.transform.stream.StreamSource;
+
 import net.sf.saxon.Configuration;
 import net.sf.saxon.TransformerFactoryImpl;

View File

@@ -1,11 +1,12 @@
 package eu.dnetlib.message;
 
-import com.fasterxml.jackson.core.JsonProcessingException;
-import com.fasterxml.jackson.databind.ObjectMapper;
 import java.io.IOException;
 import java.util.Map;
 
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
 public class Message {
 	private String workflowId;

View File

@@ -1,13 +1,14 @@
 package eu.dnetlib.message;
 
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.concurrent.LinkedBlockingQueue;
+
 import com.rabbitmq.client.AMQP;
 import com.rabbitmq.client.Channel;
 import com.rabbitmq.client.DefaultConsumer;
 import com.rabbitmq.client.Envelope;
-import java.io.IOException;
-import java.nio.charset.StandardCharsets;
-import java.util.concurrent.LinkedBlockingQueue;
 
 public class MessageConsumer extends DefaultConsumer {

View File

@@ -1,15 +1,16 @@
 package eu.dnetlib.message;
 
-import com.rabbitmq.client.Channel;
-import com.rabbitmq.client.Connection;
-import com.rabbitmq.client.ConnectionFactory;
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.TimeoutException;
 
+import com.rabbitmq.client.Channel;
+import com.rabbitmq.client.Connection;
+import com.rabbitmq.client.ConnectionFactory;
+
 public class MessageManager {
 	private final String messageHost;

View File

@@ -1,11 +1,13 @@
 package eu.dnetlib.scholexplorer.relation;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
 import java.io.Serializable;
 import java.util.HashMap;
+
 import org.apache.commons.io.IOUtils;
 
+import com.fasterxml.jackson.databind.ObjectMapper;
+
 public class RelationMapper extends HashMap<String, RelInfo> implements Serializable {
 	public static RelationMapper load() throws Exception {

View File

@@ -9,6 +9,7 @@ import java.nio.file.Path;
 import java.util.Arrays;
 import java.util.List;
 import java.util.stream.Collectors;
+
 import org.apache.hadoop.conf.Configuration;
 import org.junit.jupiter.api.Nested;
 import org.junit.jupiter.api.Test;

View File

@@ -3,13 +3,15 @@ package eu.dnetlib.dhp.common;
 import static org.mockito.Mockito.*;
 
-import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer;
 import java.util.function.Function;
+
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.SparkSession;
 import org.junit.jupiter.api.Nested;
 import org.junit.jupiter.api.Test;
 
+import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer;
+
 public class SparkSessionSupportTest {
 	@Nested

View File

@@ -6,6 +6,7 @@ import static org.junit.jupiter.api.Assertions.*;
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
+
 import org.junit.jupiter.api.Test;
 
 public class MessageTest {

View File

@@ -1,10 +1,12 @@
 package eu.dnetlib.dhp.schema.action;
 
-import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
-import eu.dnetlib.dhp.schema.oaf.Oaf;
 import java.io.Serializable;
 
+import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
+
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+
 @JsonDeserialize(using = AtomicActionDeserializer.class)
 public class AtomicAction<T extends Oaf> implements Serializable {

View File

@@ -1,14 +1,16 @@
 package eu.dnetlib.dhp.schema.action;
 
+import java.io.IOException;
+
 import com.fasterxml.jackson.core.JsonParser;
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.DeserializationContext;
 import com.fasterxml.jackson.databind.JsonDeserializer;
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
+
 import eu.dnetlib.dhp.schema.oaf.Oaf;
-import java.io.IOException;
 
 public class AtomicActionDeserializer extends JsonDeserializer {

View File

@@ -1,12 +1,14 @@
 package eu.dnetlib.dhp.schema.common;
 
-import com.google.common.collect.Maps;
-import eu.dnetlib.dhp.schema.oaf.*;
 import java.util.Map;
 import java.util.Optional;
 import java.util.function.Function;
 
+import com.google.common.collect.Maps;
+
+import eu.dnetlib.dhp.schema.oaf.*;
+
 /** Oaf model utility methods. */
 public class ModelSupport {

View File

@@ -1,10 +1,11 @@
 package eu.dnetlib.dhp.schema.oaf;
 
-import eu.dnetlib.dhp.schema.common.ModelConstants;
 import java.io.Serializable;
 import java.util.List;
 
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+
 public class Dataset extends Result implements Serializable {
 	private Field<String> storagedate;

View File

@@ -1,10 +1,12 @@
 package eu.dnetlib.dhp.schema.oaf;
 
-import com.fasterxml.jackson.annotation.JsonIgnore;
 import java.io.Serializable;
+
 import org.apache.commons.lang3.StringUtils;
 
+import com.fasterxml.jackson.annotation.JsonIgnore;
+
 public class GeoLocation implements Serializable {
 	private String point;

View File

@@ -1,10 +1,12 @@
 package eu.dnetlib.dhp.schema.oaf;
 
-import com.fasterxml.jackson.annotation.JsonIgnore;
 import java.io.Serializable;
+
 import org.apache.commons.lang3.StringUtils;
 
+import com.fasterxml.jackson.annotation.JsonIgnore;
+
 public class KeyValue implements Serializable {
 	private String key;

View File

@@ -1,10 +1,11 @@
 package eu.dnetlib.dhp.schema.oaf;
 
-import eu.dnetlib.dhp.schema.common.ModelConstants;
 import java.io.Serializable;
 import java.util.List;
 
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+
 public class OtherResearchProduct extends Result implements Serializable {
 	private List<Field<String>> contactperson;

View File

@@ -1,9 +1,10 @@
 package eu.dnetlib.dhp.schema.oaf;
 
-import eu.dnetlib.dhp.schema.common.ModelConstants;
 import java.io.Serializable;
 
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+
 public class Publication extends Result implements Serializable {
 	// publication specific

View File

@@ -1,10 +1,12 @@
 package eu.dnetlib.dhp.schema.oaf;
 
-import com.fasterxml.jackson.annotation.JsonIgnore;
 import java.io.Serializable;
+
 import org.apache.commons.lang3.StringUtils;
 
+import com.fasterxml.jackson.annotation.JsonIgnore;
+
 public class Qualifier implements Serializable {
 	private String classid;

View File

@@ -1,10 +1,11 @@
 package eu.dnetlib.dhp.schema.oaf;
 
-import eu.dnetlib.dhp.schema.common.ModelConstants;
 import java.io.Serializable;
 import java.util.List;
 
+import eu.dnetlib.dhp.schema.common.ModelConstants;
+
 public class Software extends Result implements Serializable {
 	private List<Field<String>> documentationUrl;

View File

@@ -1,14 +1,16 @@
 package eu.dnetlib.dhp.schema.scholexplorer;
 
-import eu.dnetlib.dhp.schema.oaf.Dataset;
-import eu.dnetlib.dhp.schema.oaf.OafEntity;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+
 import org.apache.commons.lang3.StringUtils;
 
+import eu.dnetlib.dhp.schema.oaf.Dataset;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+
 public class DLIDataset extends Dataset {
 	private String originalObjIdentifier;

View File

@@ -1,12 +1,14 @@
 package eu.dnetlib.dhp.schema.scholexplorer;
 
-import eu.dnetlib.dhp.schema.oaf.OafEntity;
-import eu.dnetlib.dhp.schema.oaf.Publication;
 import java.io.Serializable;
 import java.util.*;
+
 import org.apache.commons.lang3.StringUtils;
 
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.Publication;
+
 public class DLIPublication extends Publication implements Serializable {
 	private String originalObjIdentifier;

View File

@@ -1,15 +1,17 @@
 package eu.dnetlib.dhp.schema.scholexplorer;
 
-import eu.dnetlib.dhp.schema.oaf.Oaf;
-import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+
 import org.apache.commons.lang3.StringUtils;
 
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+
 public class DLIUnknown extends Oaf implements Serializable {
 	private String id;

View File

@@ -3,12 +3,15 @@ package eu.dnetlib.dhp.schema.action;
 import static org.junit.jupiter.api.Assertions.*;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.schema.oaf.Relation;
 import java.io.IOException;
+
 import org.apache.commons.lang3.StringUtils;
 import org.junit.jupiter.api.Test;
 
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.schema.oaf.Relation;
+
 /** @author claudio.atzori */
 public class AtomicActionTest {

View File

@@ -4,11 +4,12 @@ package eu.dnetlib.dhp.schema.common;
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
+import org.junit.jupiter.api.Nested;
+import org.junit.jupiter.api.Test;
+
 import eu.dnetlib.dhp.schema.oaf.OafEntity;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.schema.oaf.Result;
-import org.junit.jupiter.api.Nested;
-import org.junit.jupiter.api.Test;
 
 public class ModelSupportTest {

View File

@@ -5,6 +5,7 @@ import static org.junit.jupiter.api.Assertions.*;
 import java.util.Arrays;
 import java.util.List;
+
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;

View File

@@ -1,16 +1,19 @@
 package eu.dnetlib.dhp.schema.scholexplorer;
 
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collections;
+
+import org.junit.jupiter.api.Test;
+
 import com.fasterxml.jackson.core.JsonProcessingException;
 import com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.SerializationFeature;
+
 import eu.dnetlib.dhp.schema.oaf.Qualifier;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.Collections;
-import org.junit.jupiter.api.Test;
 
 public class DLItest {

View File

@@ -1,9 +1,23 @@
 package eu.dnetlib.dhp.actionmanager;
 
+import java.io.Serializable;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.NoSuchElementException;
+import java.util.stream.Collectors;
+
+import org.dom4j.Document;
+import org.dom4j.Element;
+import org.dom4j.io.SAXReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import com.google.common.base.Splitter;
 import com.google.common.collect.Iterables;
 import com.google.common.collect.Lists;
+
 import eu.dnetlib.actionmanager.rmi.ActionManagerException;
 import eu.dnetlib.actionmanager.set.ActionManagerSet;
 import eu.dnetlib.actionmanager.set.ActionManagerSet.ImpactTypes;
@@ -11,17 +25,6 @@ import eu.dnetlib.dhp.actionmanager.partition.PartitionActionSetsByPayloadTypeJob;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
-import java.io.Serializable;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.NoSuchElementException;
-import java.util.stream.Collectors;
-import org.dom4j.Document;
-import org.dom4j.Element;
-import org.dom4j.io.SAXReader;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 public class ISClient implements Serializable {

View File

@@ -1,9 +1,10 @@
 package eu.dnetlib.dhp.actionmanager.migration;
 
-import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
 import java.util.Comparator;
 
+import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
+
 public class LicenseComparator implements Comparator<Qualifier> {
 	@Override

View File

@@ -1,12 +1,6 @@
 package eu.dnetlib.dhp.actionmanager.migration;
 
-import com.google.common.base.Splitter;
-import com.google.common.collect.Lists;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.utils.ISLookupClientFactory;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.OutputStream;
@@ -15,6 +9,7 @@ import java.util.LinkedList;
 import java.util.List;
 import java.util.Properties;
 import java.util.stream.Collectors;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
@@ -26,6 +21,14 @@ import org.apache.hadoop.util.ToolRunner;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+
 public class MigrateActionSet {
 	private static final Logger log = LoggerFactory.getLogger(MigrateActionSet.class);

View File

@@ -6,16 +6,19 @@ import static eu.dnetlib.data.proto.KindProtos.Kind.relation;
 import static eu.dnetlib.data.proto.TypeProtos.*;
 import static eu.dnetlib.data.proto.TypeProtos.Type.*;
 
-import com.google.common.collect.Lists;
-import com.googlecode.protobuf.format.JsonFormat;
-import eu.dnetlib.data.proto.*;
-import eu.dnetlib.dhp.schema.oaf.*;
 import java.io.Serializable;
 import java.util.List;
 import java.util.Optional;
 import java.util.stream.Collectors;
+
 import org.apache.commons.lang3.StringUtils;
 
+import com.google.common.collect.Lists;
+import com.googlecode.protobuf.format.JsonFormat;
+
+import eu.dnetlib.data.proto.*;
+import eu.dnetlib.dhp.schema.oaf.*;
+
 public class ProtoConverter implements Serializable {
 	public static final String UNKNOWN = "UNKNOWN";

View File

@@ -3,22 +3,12 @@ package eu.dnetlib.dhp.actionmanager.migration;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.google.common.base.Splitter;
-import com.google.common.collect.Lists;
-import com.google.protobuf.InvalidProtocolBufferException;
-import eu.dnetlib.data.proto.OafProtos;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.action.AtomicAction;
-import eu.dnetlib.dhp.schema.oaf.*;
-import eu.dnetlib.dhp.utils.ISLookupClientFactory;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.LinkedList;
 import java.util.Objects;
 import java.util.Optional;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.fs.FileSystem;
@@ -30,6 +20,19 @@ import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.base.Splitter;
+import com.google.common.collect.Lists;
+import com.google.protobuf.InvalidProtocolBufferException;
+
+import eu.dnetlib.data.proto.OafProtos;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.action.AtomicAction;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import scala.Tuple2;
 
 public class TransformActions implements Serializable {

View File

@@ -4,13 +4,10 @@ package eu.dnetlib.dhp.actionmanager.partition;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import static org.apache.spark.sql.functions.*;
 
-import eu.dnetlib.dhp.actionmanager.ISClient;
-import eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJob;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.HdfsSupport;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Optional;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.io.Text;
 import org.apache.spark.SparkConf;
@@ -21,6 +18,11 @@ import org.apache.spark.sql.types.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import eu.dnetlib.dhp.actionmanager.ISClient;
+import eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJob;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.HdfsSupport;
+
 /** Partitions given set of action sets by payload type. */
 public class PartitionActionSetsByPayloadTypeJob {

View File

@@ -3,11 +3,12 @@ package eu.dnetlib.dhp.actionmanager.promote;
 import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass;
 
+import java.util.function.BiFunction;
+
 import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier;
 import eu.dnetlib.dhp.schema.oaf.Oaf;
 import eu.dnetlib.dhp.schema.oaf.OafEntity;
 import eu.dnetlib.dhp.schema.oaf.Relation;
-import java.util.function.BiFunction;
 
 /** OAF model merging support. */
 public class MergeAndGet {

View File

@@ -4,16 +4,11 @@ package eu.dnetlib.dhp.actionmanager.promote;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier;
-import eu.dnetlib.dhp.common.HdfsSupport;
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.*;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.function.BiFunction;
 import java.util.function.Function;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
@@ -24,6 +19,14 @@ import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier;
+import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+
 /** Applies a given action payload file to graph table of compatible type. */
 public class PromoteActionPayloadForGraphTableJob {
 	private static final Logger logger = LoggerFactory.getLogger(PromoteActionPayloadForGraphTableJob.class);

View File

@@ -3,12 +3,11 @@ package eu.dnetlib.dhp.actionmanager.promote;
 import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass;
 
-import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier;
-import eu.dnetlib.dhp.schema.oaf.Oaf;
 import java.util.Objects;
 import java.util.Optional;
 import java.util.function.BiFunction;
 import java.util.function.Function;
+
 import org.apache.spark.api.java.function.FilterFunction;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.Dataset;
@@ -16,6 +15,9 @@ import org.apache.spark.sql.Encoder;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.TypedColumn;
 import org.apache.spark.sql.expressions.Aggregator;
+
+import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
 import scala.Tuple2;
 
 /** Promote action payload functions. */
View File

@@ -6,16 +6,13 @@ import static org.apache.spark.sql.functions.*;
 import static org.junit.jupiter.api.Assertions.assertIterableEquals;
 import static scala.collection.JavaConversions.mutableSeqAsJavaList;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.actionmanager.ISClient;
-import eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest;
-import eu.dnetlib.dhp.schema.oaf.*;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.*;
 import java.util.stream.Collectors;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Job;
@@ -33,6 +30,12 @@ import org.junit.jupiter.api.io.TempDir;
 import org.mockito.Mock;
 import org.mockito.Mockito;
 import org.mockito.junit.jupiter.MockitoExtension;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.actionmanager.ISClient;
+import eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJobTest;
+import eu.dnetlib.dhp.schema.oaf.*;
 import scala.Tuple2;
 import scala.collection.mutable.Seq;

View File

@@ -6,12 +6,14 @@ import static eu.dnetlib.dhp.actionmanager.promote.MergeAndGet.functionFor;
 import static org.junit.jupiter.api.Assertions.*;
 import static org.mockito.Mockito.*;
 
-import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier;
-import eu.dnetlib.dhp.schema.oaf.*;
 import java.util.function.BiFunction;
+
 import org.junit.jupiter.api.Nested;
 import org.junit.jupiter.api.Test;
 
+import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier;
+import eu.dnetlib.dhp.schema.oaf.*;
+
 public class MergeAndGetTest {
 	@Nested

View File

@@ -4,9 +4,6 @@ package eu.dnetlib.dhp.actionmanager.promote;
 import static org.junit.jupiter.api.Assertions.*;
 import static org.junit.jupiter.params.provider.Arguments.arguments;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.*;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
@@ -16,6 +13,7 @@ import java.util.List;
 import java.util.Objects;
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
+
 import org.apache.commons.io.FileUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.function.MapFunction;
@@ -27,6 +25,11 @@ import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.Arguments;
 import org.junit.jupiter.params.provider.MethodSource;
 
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+
 public class PromoteActionPayloadForGraphTableJobTest {
 	private static final ClassLoader cl = PromoteActionPayloadForGraphTableJobTest.class.getClassLoader();

View File

@@ -4,13 +4,12 @@ package eu.dnetlib.dhp.actionmanager.promote;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertThrows;
 
-import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier;
-import eu.dnetlib.dhp.schema.oaf.Oaf;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Objects;
 import java.util.function.BiFunction;
 import java.util.function.Function;
+
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
@@ -20,6 +19,9 @@ import org.junit.jupiter.api.BeforeAll;
 import org.junit.jupiter.api.Nested;
 import org.junit.jupiter.api.Test;
 
+import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableSupplier;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+
 public class PromoteActionPayloadFunctionsTest {
 	private static SparkSession spark;

View File

@@ -1,18 +1,12 @@
 package eu.dnetlib.dhp.collection;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
-import eu.dnetlib.dhp.model.mdstore.Provenance;
-import eu.dnetlib.message.Message;
-import eu.dnetlib.message.MessageManager;
-import eu.dnetlib.message.MessageType;
 import java.io.ByteArrayInputStream;
 import java.nio.charset.StandardCharsets;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Objects;
+
 import org.apache.commons.cli.*;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
@@ -30,6 +24,15 @@ import org.dom4j.Document;
 import org.dom4j.Node;
 import org.dom4j.io.SAXReader;
 
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
+import eu.dnetlib.dhp.model.mdstore.Provenance;
+import eu.dnetlib.message.Message;
+import eu.dnetlib.message.MessageManager;
+import eu.dnetlib.message.MessageType;
+
 public class GenerateNativeStoreSparkJob {
 	public static MetadataRecord parseRecord(

View File

@@ -1,9 +1,10 @@
 package eu.dnetlib.dhp.collection.plugin;
 
+import java.util.stream.Stream;
+
 import eu.dnetlib.collector.worker.model.ApiDescriptor;
 import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
-import java.util.stream.Stream;
 
 public interface CollectorPlugin {

View File

@@ -1,12 +1,6 @@
 package eu.dnetlib.dhp.collection.plugin.oai;
 
-import com.google.common.base.Splitter;
-import com.google.common.collect.Iterators;
-import com.google.common.collect.Lists;
-import eu.dnetlib.collector.worker.model.ApiDescriptor;
-import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
-import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
@@ -15,6 +9,14 @@ import java.util.Spliterators;
 import java.util.stream.Stream;
 import java.util.stream.StreamSupport;
 
+import com.google.common.base.Splitter;
+import com.google.common.collect.Iterators;
+import com.google.common.collect.Lists;
+
+import eu.dnetlib.collector.worker.model.ApiDescriptor;
+import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
+import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
+
 public class OaiCollectorPlugin implements CollectorPlugin {
 	private static final String FORMAT_PARAM = "format";

View File

@@ -1,15 +1,13 @@
 package eu.dnetlib.dhp.collection.plugin.oai;
 
-import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
-import eu.dnetlib.dhp.collection.worker.utils.HttpConnector;
-import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner;
 import java.io.StringReader;
 import java.io.UnsupportedEncodingException;
 import java.net.URLEncoder;
 import java.util.Iterator;
 import java.util.Queue;
 import java.util.concurrent.PriorityBlockingQueue;
+
 import org.apache.commons.lang.StringUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -18,6 +16,10 @@ import org.dom4j.DocumentException;
 import org.dom4j.Node;
 import org.dom4j.io.SAXReader;
 
+import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
+import eu.dnetlib.dhp.collection.worker.utils.HttpConnector;
+import eu.dnetlib.dhp.collection.worker.utils.XmlCleaner;
+
 public class OaiIterator implements Iterator<String> {
 	private static final Log log = LogFactory.getLog(OaiIterator.class); // NOPMD by marko on

View File

@@ -1,9 +1,10 @@
 package eu.dnetlib.dhp.collection.plugin.oai;
 
-import eu.dnetlib.dhp.collection.worker.utils.HttpConnector;
 import java.util.Iterator;
 
+import eu.dnetlib.dhp.collection.worker.utils.HttpConnector;
+
 public class OaiIteratorFactory {
 	private HttpConnector httpConnector;

View File

@@ -1,19 +1,12 @@
 package eu.dnetlib.dhp.collection.worker;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.collector.worker.model.ApiDescriptor;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
-import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
-import eu.dnetlib.message.Message;
-import eu.dnetlib.message.MessageManager;
-import eu.dnetlib.message.MessageType;
 import java.io.IOException;
 import java.net.URI;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -23,6 +16,16 @@ import org.apache.hadoop.io.Text;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.collector.worker.model.ApiDescriptor;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
+import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
+import eu.dnetlib.message.Message;
+import eu.dnetlib.message.MessageManager;
+import eu.dnetlib.message.MessageType;
+
 public class DnetCollectorWorker {
 	private static final Logger log = LoggerFactory.getLogger(DnetCollectorWorker.class);

View File

@@ -1,12 +1,13 @@
 package eu.dnetlib.dhp.collection.worker;
 
+import org.apache.commons.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
 import eu.dnetlib.message.MessageManager;
-import org.apache.commons.io.IOUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 /**
  * DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. This module

View File

@@ -1,7 +1,6 @@
 package eu.dnetlib.dhp.collection.worker.utils;
 
-import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.*;
@@ -9,15 +8,19 @@ import java.security.GeneralSecurityException;
 import java.security.cert.X509Certificate;
 import java.util.List;
 import java.util.Map;
+
 import javax.net.ssl.HttpsURLConnection;
 import javax.net.ssl.SSLContext;
 import javax.net.ssl.TrustManager;
 import javax.net.ssl.X509TrustManager;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang.math.NumberUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 
+import eu.dnetlib.dhp.collection.worker.DnetCollectorException;
+
 public class HttpConnector {
 	private static final Log log = LogFactory.getLog(HttpConnector.class);

View File

@@ -1,16 +1,19 @@
 package eu.dnetlib.dhp.transformation;
 
+import java.io.ByteArrayInputStream;
+import java.io.StringWriter;
+import java.util.Map;
+
+import javax.xml.transform.stream.StreamSource;
+
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.util.LongAccumulator;
+
 import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
 import eu.dnetlib.dhp.transformation.functions.Cleaner;
 import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
-import java.io.ByteArrayInputStream;
-import java.io.StringWriter;
-import java.util.Map;
-import javax.xml.transform.stream.StreamSource;
 import net.sf.saxon.s9api.*;
-import org.apache.spark.api.java.function.MapFunction;
-import org.apache.spark.util.LongAccumulator;
 
 public class TransformFunction implements MapFunction<MetadataRecord, MetadataRecord> {

View File

@@ -1,18 +1,11 @@
 package eu.dnetlib.dhp.transformation;
 
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
-import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
-import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper;
-import eu.dnetlib.dhp.utils.DHPUtils;
-import eu.dnetlib.message.Message;
-import eu.dnetlib.message.MessageManager;
-import eu.dnetlib.message.MessageType;
 import java.io.ByteArrayInputStream;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Objects;
+
 import org.apache.commons.cli.*;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.sql.Dataset;
@@ -25,6 +18,15 @@ import org.dom4j.DocumentException;
 import org.dom4j.Node;
 import org.dom4j.io.SAXReader;
 
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
+import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
+import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper;
+import eu.dnetlib.dhp.utils.DHPUtils;
+import eu.dnetlib.message.Message;
+import eu.dnetlib.message.MessageManager;
+import eu.dnetlib.message.MessageType;
+
 public class TransformSparkJobNode {
 	public static void main(String[] args) throws Exception {

View File

@@ -1,10 +1,11 @@

 package eu.dnetlib.dhp.transformation.functions;

-import eu.dnetlib.dhp.transformation.vocabulary.Term;
-import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
 import java.util.Map;
 import java.util.Optional;
+
+import eu.dnetlib.dhp.transformation.vocabulary.Term;
+import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
 import net.sf.saxon.s9api.*;
 import scala.Serializable;

View File

@@ -1,12 +1,14 @@

 package eu.dnetlib.dhp.transformation.vocabulary;

-import com.fasterxml.jackson.databind.ObjectMapper;
 import java.io.Serializable;
 import java.net.URL;
 import java.nio.charset.Charset;
+
 import org.apache.commons.io.IOUtils;

+import com.fasterxml.jackson.databind.ObjectMapper;
+
 public class VocabularyHelper implements Serializable {

 	private static final String OPENAIRE_URL = "http://api.openaire.eu/vocabularies/%s.json";

View File

@@ -3,18 +3,21 @@ package eu.dnetlib.dhp.collection;

 import static org.junit.jupiter.api.Assertions.*;

-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
-import eu.dnetlib.dhp.model.mdstore.Provenance;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
+
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;

+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
+import eu.dnetlib.dhp.model.mdstore.Provenance;
+
 public class CollectionJobTest {

 	private Path testDir;

View File

@@ -4,17 +4,20 @@ package eu.dnetlib.dhp.collector.worker;

 import static org.junit.jupiter.api.Assertions.assertNotNull;
 import static org.mockito.Mockito.*;

+import java.io.File;
+
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
 import com.fasterxml.jackson.databind.ObjectMapper;
+
 import eu.dnetlib.collector.worker.model.ApiDescriptor;
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.collection.worker.DnetCollectorWorker;
 import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
 import eu.dnetlib.message.Message;
 import eu.dnetlib.message.MessageManager;
-import java.io.File;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;

 public class DnetCollectorWorkerApplicationTests {

View File

@@ -3,18 +3,14 @@ package eu.dnetlib.dhp.transformation;

 import static org.junit.jupiter.api.Assertions.assertNotNull;

-import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
-import eu.dnetlib.dhp.transformation.functions.Cleaner;
-import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
-import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper;
-import eu.dnetlib.dhp.utils.DHPUtils;
 import java.io.StringWriter;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.HashMap;
 import java.util.Map;
+
 import javax.xml.transform.stream.StreamSource;
-import net.sf.saxon.s9api.*;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.util.LongAccumulator;
 import org.dom4j.Document;
@@ -27,6 +23,13 @@ import org.junit.jupiter.api.io.TempDir;
 import org.mockito.Mock;
 import org.mockito.junit.jupiter.MockitoExtension;

+import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
+import eu.dnetlib.dhp.transformation.functions.Cleaner;
+import eu.dnetlib.dhp.transformation.vocabulary.Vocabulary;
+import eu.dnetlib.dhp.transformation.vocabulary.VocabularyHelper;
+import eu.dnetlib.dhp.utils.DHPUtils;
+import net.sf.saxon.s9api.*;
+
 @ExtendWith(MockitoExtension.class)
 public class TransformationJobTest {

View File

@@ -1,19 +1,12 @@

 package eu.dnetlib.dhp.oa.dedup;

-import com.fasterxml.jackson.databind.DeserializationFeature;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.HdfsSupport;
-import eu.dnetlib.dhp.schema.oaf.*;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
-import eu.dnetlib.pace.config.DedupConfig;
 import java.io.IOException;
 import java.io.Serializable;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.List;
+
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.SaveMode;
@@ -23,6 +16,16 @@ import org.dom4j.DocumentException;
 import org.dom4j.Element;
 import org.dom4j.io.SAXReader;

+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import eu.dnetlib.pace.config.DedupConfig;
+
 abstract class AbstractSparkAction implements Serializable {

 	protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()

View File

@@ -4,15 +4,18 @@ package eu.dnetlib.dhp.oa.dedup;

 import static java.util.Collections.reverseOrder;
 import static java.util.Map.Entry.comparingByValue;
 import static java.util.stream.Collectors.toMap;
+
 import static org.apache.commons.lang.StringUtils.endsWith;
 import static org.apache.commons.lang.StringUtils.substringBefore;

-import eu.dnetlib.dhp.schema.oaf.Field;
 import java.time.Year;
 import java.util.*;
 import java.util.stream.Collectors;
+
 import org.apache.commons.lang.StringUtils;
+
+import eu.dnetlib.dhp.schema.oaf.Field;

 public class DatePicker {

 	private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";

View File

@@ -1,13 +1,9 @@

 package eu.dnetlib.dhp.oa.dedup;

-import com.fasterxml.jackson.databind.DeserializationFeature;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.google.common.collect.Lists;
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.*;
 import java.util.Collection;
 import java.util.Iterator;
+
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.MapGroupsFunction;
 import org.apache.spark.sql.Dataset;
@@ -15,6 +11,13 @@ import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Lists;
+
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+
 import scala.Tuple2;

 public class DedupRecordFactory {

View File

@@ -1,8 +1,24 @@

 package eu.dnetlib.dhp.oa.dedup;

+import java.io.StringReader;
+import java.security.MessageDigest;
+import java.text.Normalizer;
+import java.util.*;
+import java.util.stream.Collectors;
+
+import org.apache.commons.codec.binary.Hex;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.spark.SparkContext;
+import org.apache.spark.util.LongAccumulator;
+import org.dom4j.Document;
+import org.dom4j.DocumentException;
+import org.dom4j.Element;
+import org.dom4j.io.SAXReader;
+
 import com.google.common.collect.Sets;
 import com.wcohen.ss.JaroWinkler;
+
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
@@ -12,19 +28,6 @@ import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
 import eu.dnetlib.pace.model.Person;
-import java.io.StringReader;
-import java.security.MessageDigest;
-import java.text.Normalizer;
-import java.util.*;
-import java.util.stream.Collectors;
-import org.apache.commons.codec.binary.Hex;
-import org.apache.commons.lang3.StringUtils;
-import org.apache.spark.SparkContext;
-import org.apache.spark.util.LongAccumulator;
-import org.dom4j.Document;
-import org.dom4j.DocumentException;
-import org.dom4j.Element;
-import org.dom4j.io.SAXReader;
 import scala.Tuple2;

 public class DedupUtility {

View File

@@ -1,15 +1,17 @@

 package eu.dnetlib.dhp.oa.dedup;

+import java.util.Map;
+import java.util.stream.Collectors;
+
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.util.LongAccumulator;
+
 import eu.dnetlib.dhp.oa.dedup.model.Block;
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
 import eu.dnetlib.pace.util.BlockProcessor;
-import java.util.Map;
-import java.util.stream.Collectors;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.util.LongAccumulator;
 import scala.Serializable;
 import scala.Tuple2;

View File

@@ -1,6 +1,16 @@

 package eu.dnetlib.dhp.oa.dedup;

+import java.io.IOException;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.dom4j.DocumentException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.common.EntityType;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
@@ -11,14 +21,6 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
 import eu.dnetlib.pace.config.DedupConfig;
-import java.io.IOException;
-import org.apache.commons.io.IOUtils;
-import org.apache.spark.SparkConf;
-import org.apache.spark.sql.SaveMode;
-import org.apache.spark.sql.SparkSession;
-import org.dom4j.DocumentException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;

 public class SparkCreateDedupRecord extends AbstractSparkAction {

View File

@@ -1,23 +1,11 @@

 package eu.dnetlib.dhp.oa.dedup;

-import com.google.common.hash.Hashing;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.oa.dedup.graph.ConnectedComponent;
-import eu.dnetlib.dhp.oa.dedup.graph.GraphProcessor;
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.DataInfo;
-import eu.dnetlib.dhp.schema.oaf.Qualifier;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import eu.dnetlib.dhp.utils.ISLookupClientFactory;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
-import eu.dnetlib.pace.config.DedupConfig;
-import eu.dnetlib.pace.util.MapDocumentUtil;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaPairRDD;
@@ -32,6 +20,21 @@ import org.apache.spark.sql.SparkSession;
 import org.dom4j.DocumentException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import com.google.common.hash.Hashing;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.oa.dedup.graph.ConnectedComponent;
+import eu.dnetlib.dhp.oa.dedup.graph.GraphProcessor;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.Qualifier;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+
 import scala.Tuple2;

 public class SparkCreateMergeRels extends AbstractSparkAction {

View File

@@ -1,6 +1,21 @@

 package eu.dnetlib.dhp.oa.dedup;

+import java.io.IOException;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SaveMode;
+import org.apache.spark.sql.SparkSession;
+import org.dom4j.DocumentException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.oa.dedup.model.Block;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
@@ -13,19 +28,6 @@ import eu.dnetlib.pace.model.FieldListImpl;
 import eu.dnetlib.pace.model.FieldValueImpl;
 import eu.dnetlib.pace.model.MapDocument;
 import eu.dnetlib.pace.util.MapDocumentUtil;
-import java.io.IOException;
-import org.apache.commons.io.IOUtils;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.sql.Encoders;
-import org.apache.spark.sql.SaveMode;
-import org.apache.spark.sql.SparkSession;
-import org.dom4j.DocumentException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import scala.Tuple2;

 public class SparkCreateSimRels extends AbstractSparkAction {

View File

@@ -3,18 +3,19 @@ package eu.dnetlib.dhp.oa.dedup;

 import static org.apache.spark.sql.functions.col;

+import org.apache.commons.io.IOUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.*;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.utils.ISLookupClientFactory;
 import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
-import org.apache.commons.io.IOUtils;
-import org.apache.spark.SparkConf;
-import org.apache.spark.api.java.function.MapFunction;
-import org.apache.spark.sql.*;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import scala.Tuple2;

 public class SparkPropagateRelation extends AbstractSparkAction {

View File

@@ -1,11 +1,13 @@

 package eu.dnetlib.dhp.oa.dedup;

-import eu.dnetlib.pace.util.Reporter;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
+
 import org.apache.spark.util.LongAccumulator;
+
+import eu.dnetlib.pace.util.Reporter;
 import scala.Serializable;
 import scala.Tuple2;

View File

@@ -1,16 +1,8 @@

 package eu.dnetlib.dhp.oa.dedup;

-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.DataInfo;
-import eu.dnetlib.dhp.schema.oaf.Oaf;
-import eu.dnetlib.dhp.schema.oaf.OafEntity;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import eu.dnetlib.dhp.utils.ISLookupClientFactory;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
-import eu.dnetlib.pace.util.MapDocumentUtil;
 import java.io.IOException;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
@@ -28,6 +20,16 @@ import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.Oaf;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.utils.ISLookupClientFactory;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+
 import scala.Tuple2;

 public class SparkUpdateEntity extends AbstractSparkAction {

View File

@@ -1,15 +1,18 @@

 package eu.dnetlib.dhp.oa.dedup.graph;

-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.oa.dedup.DedupUtility;
-import eu.dnetlib.pace.util.PaceException;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.Set;
+
 import org.apache.commons.lang.StringUtils;
 import org.codehaus.jackson.annotate.JsonIgnore;

+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.oa.dedup.DedupUtility;
+import eu.dnetlib.pace.util.PaceException;
+
 public class ConnectedComponent implements Serializable {

 	private Set<String> docIds;

View File

@@ -1,8 +1,6 @@

 package eu.dnetlib.dhp.oa.dedup.model;

-import com.google.common.collect.Lists;
-import eu.dnetlib.pace.model.MapDocument;
 import java.io.Serializable;
 import java.util.ArrayList;
 import java.util.Comparator;
@@ -12,6 +10,10 @@ import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import java.util.stream.StreamSupport;

+import com.google.common.collect.Lists;
+
+import eu.dnetlib.pace.model.MapDocument;
+
 public class Block implements Serializable {

 	private String key;

View File

@@ -1,15 +1,17 @@

 package eu.dnetlib.dhp.oa.dedup;

-import eu.dnetlib.dhp.schema.oaf.Publication;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
 import java.util.stream.Collectors;
+
 import org.apache.commons.io.IOUtils;
 import org.codehaus.jackson.map.ObjectMapper;
 import org.junit.jupiter.api.BeforeEach;
+
+import eu.dnetlib.dhp.schema.oaf.Publication;

 public class MergeAuthorTest {

 	private List<Publication> publicationsToMerge;

View File

@@ -2,19 +2,16 @@

 package eu.dnetlib.dhp.oa.dedup;

 import static java.nio.file.Files.createTempDirectory;
+
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.mockito.Mockito.lenient;

-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
-import eu.dnetlib.pace.util.MapDocumentUtil;
 import java.io.File;
 import java.io.IOException;
 import java.io.Serializable;
 import java.net.URISyntaxException;
 import java.nio.file.Paths;
+
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
@@ -31,6 +28,12 @@ import org.junit.jupiter.api.extension.ExtendWith;
 import org.mockito.Mock;
 import org.mockito.Mockito;
 import org.mockito.junit.jupiter.MockitoExtension;

+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+
 import scala.Tuple2;

 @ExtendWith(MockitoExtension.class)

View File

@@ -1,10 +1,11 @@

 package eu.dnetlib.dhp.oa.dedup.jpath;

+import org.junit.jupiter.api.Test;
+
 import eu.dnetlib.pace.config.DedupConfig;
 import eu.dnetlib.pace.model.MapDocument;
 import eu.dnetlib.pace.util.MapDocumentUtil;
-import org.junit.jupiter.api.Test;

 public class JsonPathTest {

View File

@@ -4,15 +4,18 @@ package eu.dnetlib.dedup;

 import static java.util.Collections.reverseOrder;
 import static java.util.Map.Entry.comparingByValue;
 import static java.util.stream.Collectors.toMap;
+
 import static org.apache.commons.lang.StringUtils.endsWith;
 import static org.apache.commons.lang.StringUtils.substringBefore;

-import eu.dnetlib.dhp.schema.oaf.Field;
 import java.time.Year;
 import java.util.*;
 import java.util.stream.Collectors;
+
 import org.apache.commons.lang.StringUtils;
+
+import eu.dnetlib.dhp.schema.oaf.Field;

 public class DatePicker {

 	private static final String DATE_PATTERN = "\\d{4}-\\d{2}-\\d{2}";

View File

@@ -1,19 +1,22 @@

 package eu.dnetlib.dedup;

-import com.fasterxml.jackson.databind.DeserializationFeature;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.google.common.collect.Lists;
-import eu.dnetlib.dhp.schema.oaf.*;
-import eu.dnetlib.pace.config.DedupConfig;
-import eu.dnetlib.pace.util.MapDocumentUtil;
 import java.util.Collection;
+
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;

+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Lists;
+
+import eu.dnetlib.dhp.schema.oaf.*;
+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+
 import scala.Tuple2;

 public class DedupRecordFactory {

View File

@@ -1,14 +1,6 @@

 package eu.dnetlib.dedup;

-import com.google.common.collect.Sets;
-import com.wcohen.ss.JaroWinkler;
-import eu.dnetlib.dhp.schema.oaf.Author;
-import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
-import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
-import eu.dnetlib.pace.config.DedupConfig;
-import eu.dnetlib.pace.model.MapDocument;
-import eu.dnetlib.pace.model.Person;
 import java.io.IOException;
 import java.io.StringWriter;
 import java.nio.charset.StandardCharsets;
@@ -16,6 +8,7 @@ import java.security.MessageDigest;
 import java.text.Normalizer;
 import java.util.*;
 import java.util.stream.Collectors;
+
 import org.apache.commons.codec.binary.Hex;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
@@ -27,6 +20,16 @@ import org.apache.spark.SparkContext;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.util.LongAccumulator;

+import com.google.common.collect.Sets;
+import com.wcohen.ss.JaroWinkler;
+
+import eu.dnetlib.dhp.schema.oaf.Author;
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.model.MapDocument;
+import eu.dnetlib.pace.model.Person;
+
 import scala.Tuple2;

 public class DedupUtility {

View File

@@ -1,12 +1,9 @@

 package eu.dnetlib.dedup;

-import eu.dnetlib.pace.config.DedupConfig;
-import eu.dnetlib.pace.model.MapDocument;
-import eu.dnetlib.pace.util.BlockProcessor;
-import eu.dnetlib.pace.util.MapDocumentUtil;
 import java.util.*;
 import java.util.stream.Collectors;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.spark.api.java.JavaPairRDD;
@@ -16,6 +13,11 @@ import org.apache.spark.api.java.function.Function2;
 import org.apache.spark.api.java.function.PairFlatMapFunction;
 import org.apache.spark.api.java.function.PairFunction;
 import org.apache.spark.util.LongAccumulator;

+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.model.MapDocument;
+import eu.dnetlib.pace.util.BlockProcessor;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+
 import scala.Serializable;
 import scala.Tuple2;

View File

@@ -1,15 +1,9 @@

 package eu.dnetlib.dedup;

-import com.google.common.hash.Hashing;
-import eu.dnetlib.dedup.graph.ConnectedComponent;
-import eu.dnetlib.dedup.graph.GraphProcessor;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import eu.dnetlib.pace.config.DedupConfig;
-import eu.dnetlib.pace.util.MapDocumentUtil;
 import java.util.ArrayList;
 import java.util.List;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
@@ -21,6 +15,15 @@ import org.apache.spark.rdd.RDD;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;

+import com.google.common.hash.Hashing;
+
+import eu.dnetlib.dedup.graph.ConnectedComponent;
+import eu.dnetlib.dedup.graph.GraphProcessor;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+
 import scala.Tuple2;

 public class SparkCreateConnectedComponent {

View File

@@ -1,15 +1,17 @@

 package eu.dnetlib.dedup;

-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.oaf.OafEntity;
-import eu.dnetlib.pace.config.DedupConfig;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.SparkSession;

+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.OafEntity;
+import eu.dnetlib.pace.config.DedupConfig;
+
 public class SparkCreateDedupRecord {

 	public static void main(String[] args) throws Exception {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(

View File

@@ -1,18 +1,20 @@

 package eu.dnetlib.dedup;

-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import eu.dnetlib.pace.config.DedupConfig;
-import eu.dnetlib.pace.model.MapDocument;
-import eu.dnetlib.pace.util.MapDocumentUtil;
 import java.util.List;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.api.java.JavaPairRDD;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;

+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.pace.config.DedupConfig;
+import eu.dnetlib.pace.model.MapDocument;
+import eu.dnetlib.pace.util.MapDocumentUtil;
+
 import scala.Tuple2;

 /**

View File

@@ -1,13 +1,15 @@

 package eu.dnetlib.dedup;

-import eu.dnetlib.pace.util.Reporter;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
+
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.spark.util.LongAccumulator;
+
+import eu.dnetlib.pace.util.Reporter;
 import scala.Serializable;
 import scala.Tuple2;

View File

@@ -1,15 +1,18 @@

 package eu.dnetlib.dedup.graph;

-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dedup.DedupUtility;
-import eu.dnetlib.pace.util.PaceException;
 import java.io.IOException;
 import java.io.Serializable;
 import java.util.Set;
+
 import org.apache.commons.lang.StringUtils;
 import org.codehaus.jackson.annotate.JsonIgnore;

+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dedup.DedupUtility;
+import eu.dnetlib.pace.util.PaceException;
+
 public class ConnectedComponent implements Serializable {

 	private Set<String> docIds;

View File

@@ -1,17 +1,20 @@

 package eu.dnetlib.dedup.sx;

-import com.fasterxml.jackson.databind.DeserializationFeature;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.oaf.DataInfo;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import eu.dnetlib.dhp.utils.DHPUtils;
 import java.io.IOException;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.sql.*;

+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.DataInfo;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.dhp.utils.DHPUtils;
+
 import scala.Tuple2;

 public class SparkPropagateRelationsJob {

View File

@@ -1,8 +1,19 @@

 package eu.dnetlib.dedup.sx;

+import java.io.IOException;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.PairFunction;
+import org.apache.spark.sql.*;
+
 import com.fasterxml.jackson.databind.DeserializationFeature;
 import com.fasterxml.jackson.databind.ObjectMapper;
+
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.schema.oaf.DataInfo;
 import eu.dnetlib.dhp.schema.oaf.Oaf;
@@ -11,14 +22,6 @@ import eu.dnetlib.dhp.schema.scholexplorer.DLIDataset;
 import eu.dnetlib.dhp.schema.scholexplorer.DLIPublication;
 import eu.dnetlib.dhp.schema.scholexplorer.DLIUnknown;
 import eu.dnetlib.dhp.utils.DHPUtils;
-import java.io.IOException;
-import org.apache.commons.io.IOUtils;
-import org.apache.hadoop.io.compress.GzipCodec;
-import org.apache.spark.api.java.JavaPairRDD;
-import org.apache.spark.api.java.JavaRDD;
-import org.apache.spark.api.java.JavaSparkContext;
-import org.apache.spark.api.java.function.PairFunction;
-import org.apache.spark.sql.*;
 import scala.Tuple2;

 public class SparkUpdateEntityJob {

View File

@@ -3,10 +3,8 @@ package eu.dnetlib.dhp.oa.graph.hive;

 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;

-import com.fasterxml.jackson.databind.ObjectMapper;
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.common.ModelSupport;
 import java.util.Optional;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
@@ -16,6 +14,11 @@ import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+
 public class GraphHiveImporterJob {

 	private static final Logger log = LoggerFactory.getLogger(GraphHiveImporterJob.class);

View File

@@ -1,3 +1,4 @@

 package eu.dnetlib.dhp.oa.graph.raw;

+import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
@@ -10,6 +11,19 @@ import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance;
 import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier;
 import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;

+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.lang3.StringUtils;
+import org.dom4j.Document;
+import org.dom4j.DocumentFactory;
+import org.dom4j.DocumentHelper;
+import org.dom4j.Node;
+
 import eu.dnetlib.dhp.oa.graph.raw.common.MigrationConstants;
 import eu.dnetlib.dhp.schema.oaf.Author;
 import eu.dnetlib.dhp.schema.oaf.Context;
@@ -29,440 +43,429 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
 import eu.dnetlib.dhp.schema.oaf.Result;
 import eu.dnetlib.dhp.schema.oaf.Software;
 import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import org.apache.commons.lang3.StringUtils;
-import org.dom4j.Document;
-import org.dom4j.DocumentFactory;
-import org.dom4j.DocumentHelper;
-import org.dom4j.Node;

public abstract class AbstractMdRecordToOafMapper {
	protected final Map<String, String> code2name;

	protected static final Qualifier MAIN_TITLE_QUALIFIER = qualifier(
		"main title", "main title", "dnet:dataCite_title", "dnet:dataCite_title");

	protected AbstractMdRecordToOafMapper(final Map<String, String> code2name) {
		this.code2name = code2name;
	}
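	// processMdRecord parses one MDStore record: it registers the XPath namespaces used by the
	// D-Net metadata records, rewrites DataCite kernel-4 references to kernel-3, and delegates
	// entity creation to createOafs.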
	public List<Oaf> processMdRecord(final String xml) {
		try {
			final Map<String, String> nsContext = new HashMap<>();
			nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr");
			nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri");
			nsContext.put("oaf", "http://namespace.openaire.eu/oaf");
			nsContext.put("oai", "http://www.openarchives.org/OAI/2.0/");
			nsContext.put("prov", "http://www.openarchives.org/OAI/2.0/provenance");
			nsContext.put("dc", "http://purl.org/dc/elements/1.1/");
			nsContext.put("datacite", "http://datacite.org/schema/kernel-3");
			DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext);

			final Document doc = DocumentHelper
				.parseText(
					xml
						.replaceAll(
							"http://datacite.org/schema/kernel-4", "http://datacite.org/schema/kernel-3"));

			final String type = doc.valueOf("//dr:CobjCategory/@type");
			final KeyValue collectedFrom = keyValue(
				createOpenaireId(10, doc.valueOf("//oaf:collectedFrom/@id"), true),
				doc.valueOf("//oaf:collectedFrom/@name"));
			final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id"))
				? collectedFrom
				: keyValue(
					createOpenaireId(10, doc.valueOf("//oaf:hostedBy/@id"), true),
					doc.valueOf("//oaf:hostedBy/@name"));

			final DataInfo info = prepareDataInfo(doc);
			final long lastUpdateTimestamp = new Date().getTime();

			return createOafs(doc, type, collectedFrom, hostedBy, info, lastUpdateTimestamp);
		} catch (final Exception e) {
			throw new RuntimeException(e);
		}
	}
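	// createOafs dispatches on the //dr:CobjCategory/@type value: an empty type defaults to
	// publication and unknown types fall through to OtherResearchProduct; project and result
	// relations are appended only when at least one entity was produced.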
	protected List<Oaf> createOafs(
		final Document doc,
		final String type,
		final KeyValue collectedFrom,
		final KeyValue hostedBy,
		final DataInfo info,
		final long lastUpdateTimestamp) {

		final List<Oaf> oafs = new ArrayList<>();

		switch (type.toLowerCase()) {
		case "":
		case "publication":
			final Publication p = new Publication();
			populateResultFields(p, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
			p.setResulttype(MigrationConstants.PUBLICATION_RESULTTYPE_QUALIFIER);
			p.setJournal(prepareJournal(doc, info));
			oafs.add(p);
			break;
		case "dataset":
			final Dataset d = new Dataset();
			populateResultFields(d, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
			d.setResulttype(MigrationConstants.DATASET_RESULTTYPE_QUALIFIER);
			d.setStoragedate(prepareDatasetStorageDate(doc, info));
			d.setDevice(prepareDatasetDevice(doc, info));
			d.setSize(prepareDatasetSize(doc, info));
			d.setVersion(prepareDatasetVersion(doc, info));
			d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info));
			d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info));
			d.setGeolocation(prepareDatasetGeoLocations(doc, info));
			oafs.add(d);
			break;
		case "software":
			final Software s = new Software();
			populateResultFields(s, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
			s.setResulttype(MigrationConstants.SOFTWARE_RESULTTYPE_QUALIFIER);
			s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
			s.setLicense(prepareSoftwareLicenses(doc, info));
			s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
			s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info));
			oafs.add(s);
			break;
		case "otherresearchproducts":
		default:
			final OtherResearchProduct o = new OtherResearchProduct();
			populateResultFields(o, doc, collectedFrom, hostedBy, info, lastUpdateTimestamp);
			o.setResulttype(MigrationConstants.OTHER_RESULTTYPE_QUALIFIER);
			o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
			o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
			o.setTool(prepareOtherResearchProductTools(doc, info));
			oafs.add(o);
			break;
		}

		if (!oafs.isEmpty()) {
			oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp));
			oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp));
		}

		return oafs;
	}
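	// For each //oaf:projectid, addProjectRels emits the two inverse "resultProject/outcome"
	// relations (isProducedBy and produces) linking the record to the project.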
	private List<Oaf> addProjectRels(
		final Document doc,
		final KeyValue collectedFrom,
		final DataInfo info,
		final long lastUpdateTimestamp) {

		final List<Oaf> res = new ArrayList<>();

		final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);

		for (final Object o : doc.selectNodes("//oaf:projectid")) {

			final String originalId = ((Node) o).getText();

			if (StringUtils.isNotBlank(originalId)) {
				final String projectId = createOpenaireId(40, originalId, true);

				final Relation r1 = new Relation();
				r1.setRelType("resultProject");
				r1.setSubRelType("outcome");
				r1.setRelClass("isProducedBy");
				r1.setSource(docId);
				r1.setTarget(projectId);
				r1.setCollectedfrom(Arrays.asList(collectedFrom));
				r1.setDataInfo(info);
				r1.setLastupdatetimestamp(lastUpdateTimestamp);
				res.add(r1);

				final Relation r2 = new Relation();
				r2.setRelType("resultProject");
				r2.setSubRelType("outcome");
				r2.setRelClass("produces");
				r2.setSource(projectId);
				r2.setTarget(docId);
				r2.setCollectedfrom(Arrays.asList(collectedFrom));
				r2.setDataInfo(info);
				r2.setLastupdatetimestamp(lastUpdateTimestamp);
				res.add(r2);
			}
		}

		return res;
	}
	protected abstract List<Oaf> addOtherResultRels(
		final Document doc,
		final KeyValue collectedFrom,
		final DataInfo info,
		final long lastUpdateTimestamp);
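	// populateResultFields copies the fields shared by all result subtypes from the XML
	// document; fields marked "NOT PRESENT IN MDSTORES" are initialized to empty lists.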
	private void populateResultFields(
		final Result r,
		final Document doc,
		final KeyValue collectedFrom,
		final KeyValue hostedBy,
		final DataInfo info,
		final long lastUpdateTimestamp) {
		r.setDataInfo(info);
		r.setLastupdatetimestamp(lastUpdateTimestamp);
		r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
		r.setOriginalId(Arrays.asList(doc.valueOf("//dri:objIdentifier")));
		r.setCollectedfrom(Arrays.asList(collectedFrom));
		r
			.setPid(
				prepareListStructProps(
					doc, "//oaf:identifier", "@identifierType", "dnet:pid_types", "dnet:pid_types", info));
		r.setDateofcollection(doc.valueOf("//dr:dateOfCollection"));
		r.setDateoftransformation(doc.valueOf("//dr:dateOfTransformation"));
		r.setExtraInfo(new ArrayList<>()); // NOT PRESENT IN MDSTORES
		r.setOaiprovenance(prepareOAIprovenance(doc));
		r.setAuthor(prepareAuthors(doc, info));
		r.setLanguage(prepareLanguages(doc));
		r.setCountry(new ArrayList<>()); // NOT PRESENT IN MDSTORES
		r.setSubject(prepareSubjects(doc, info));
		r.setTitle(prepareTitles(doc, info));
		r.setRelevantdate(prepareRelevantDates(doc, info));
		r.setDescription(prepareDescriptions(doc, info));
		r.setDateofacceptance(prepareField(doc, "//oaf:dateAccepted", info));
		r.setPublisher(preparePublisher(doc, info));
		r.setEmbargoenddate(prepareField(doc, "//oaf:embargoenddate", info));
		r.setSource(prepareSources(doc, info));
		r.setFulltext(new ArrayList<>()); // NOT PRESENT IN MDSTORES
		r.setFormat(prepareFormats(doc, info));
		r.setContributor(prepareContributors(doc, info));
		r.setResourcetype(prepareResourceType(doc, info));
		r.setCoverage(prepareCoverages(doc, info));
		r.setContext(prepareContexts(doc, info));
		r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
		r.setInstance(prepareInstances(doc, info, collectedFrom, hostedBy));
	}
	private List<Context> prepareContexts(final Document doc, final DataInfo info) {
		final List<Context> list = new ArrayList<>();
		for (final Object o : doc.selectNodes("//oaf:concept")) {
			final String cid = ((Node) o).valueOf("@id");
			if (StringUtils.isNotBlank(cid)) {
				final Context c = new Context();
				c.setId(cid);
				c.setDataInfo(Arrays.asList(info));
				list.add(c);
			}
		}
		return list;
	}

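	// Editor's sketch, not part of the original class: a self-contained check of the
	// extraction that prepareContexts performs, using the same dom4j API; the record
	// content and concept id are invented, and local-name() is used here only to
	// avoid namespace-prefix setup.
	private static void demoConceptExtraction() throws org.dom4j.DocumentException {
		final Document doc = new org.dom4j.io.SAXReader()
			.read(new java.io.StringReader(
				"<record><concept xmlns=\"http://namespace.openaire.eu/oaf\" id=\"egi::country::gr\"/></record>"));
		for (final Object o : doc.selectNodes("//*[local-name()='concept']")) {
			System.out.println(((Node) o).valueOf("@id")); // prints: egi::country::gr
		}
	}
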
	protected abstract Qualifier prepareResourceType(Document doc, DataInfo info);

	protected abstract List<Instance> prepareInstances(
		Document doc, DataInfo info, KeyValue collectedfrom, KeyValue hostedby);

	protected abstract List<Field<String>> prepareSources(Document doc, DataInfo info);

	protected abstract List<StructuredProperty> prepareRelevantDates(Document doc, DataInfo info);

	protected abstract List<Field<String>> prepareCoverages(Document doc, DataInfo info);

	protected abstract List<Field<String>> prepareContributors(Document doc, DataInfo info);

	protected abstract List<Field<String>> prepareFormats(Document doc, DataInfo info);

	protected abstract Field<String> preparePublisher(Document doc, DataInfo info);

	protected abstract List<Field<String>> prepareDescriptions(Document doc, DataInfo info);

	protected abstract List<StructuredProperty> prepareTitles(Document doc, DataInfo info);

	protected abstract List<StructuredProperty> prepareSubjects(Document doc, DataInfo info);

	protected abstract Qualifier prepareLanguages(Document doc);

	protected abstract List<Author> prepareAuthors(Document doc, DataInfo info);

	protected abstract List<Field<String>> prepareOtherResearchProductTools(
		Document doc, DataInfo info);

	protected abstract List<Field<String>> prepareOtherResearchProductContactGroups(
		Document doc, DataInfo info);

	protected abstract List<Field<String>> prepareOtherResearchProductContactPersons(
		Document doc, DataInfo info);

	protected abstract Qualifier prepareSoftwareProgrammingLanguage(Document doc, DataInfo info);

	protected abstract Field<String> prepareSoftwareCodeRepositoryUrl(Document doc, DataInfo info);

	protected abstract List<StructuredProperty> prepareSoftwareLicenses(Document doc, DataInfo info);

	protected abstract List<Field<String>> prepareSoftwareDocumentationUrls(
		Document doc, DataInfo info);

	protected abstract List<GeoLocation> prepareDatasetGeoLocations(Document doc, DataInfo info);

	protected abstract Field<String> prepareDatasetMetadataVersionNumber(Document doc, DataInfo info);

	protected abstract Field<String> prepareDatasetLastMetadataUpdate(Document doc, DataInfo info);

	protected abstract Field<String> prepareDatasetVersion(Document doc, DataInfo info);

	protected abstract Field<String> prepareDatasetSize(Document doc, DataInfo info);

	protected abstract Field<String> prepareDatasetDevice(Document doc, DataInfo info);

	protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);

	private Journal prepareJournal(final Document doc, final DataInfo info) {
		final Node n = doc.selectSingleNode("//oaf:journal");
		if (n != null) {
			final String name = n.getText();
			final String issnPrinted = n.valueOf("@issn");
			final String issnOnline = n.valueOf("@eissn");
			final String issnLinking = n.valueOf("@lissn");
			final String ep = n.valueOf("@ep");
			final String iss = n.valueOf("@iss");
			final String sp = n.valueOf("@sp");
			final String vol = n.valueOf("@vol");
			final String edition = n.valueOf("@edition");
			if (StringUtils.isNotBlank(name)) {
				return journal(
					name,
					issnPrinted,
					issnOnline,
					issnLinking,
					ep,
					iss,
					sp,
					vol,
					edition,
					null,
					null,
					info);
			}
		}
		return null;
	}

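	// Editor's note (example values invented): the journal details come from a single
	// element, the name as text and the bibliographic fields as attributes, e.g.
	//
	//   <oaf:journal issn="0302-9743" eissn="1611-3349" vol="42" iss="3" sp="1" ep="17">
	//     Lecture Notes in Computer Science
	//   </oaf:journal>
	//
	// When the name is blank the whole journal is dropped, even if ISSNs are present.
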
	protected Qualifier prepareQualifier(
		final Node node, final String xpath, final String schemeId, final String schemeName) {
		final String classId = node.valueOf(xpath);
		final String className = code2name.get(classId);
		return qualifier(classId, className, schemeId, schemeName);
	}

	protected List<StructuredProperty> prepareListStructProps(
		final Node node,
		final String xpath,
		final String xpathClassId,
		final String schemeId,
		final String schemeName,
		final DataInfo info) {
		final List<StructuredProperty> res = new ArrayList<>();
		for (final Object o : node.selectNodes(xpath)) {
			final Node n = (Node) o;
			final String classId = n.valueOf(xpathClassId);
			final String className = code2name.get(classId);
			res.add(structuredProperty(n.getText(), classId, className, schemeId, schemeName, info));
		}
		return res;
	}

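	// Editor's note (identifier value invented): called as in populateResultFields
	// above, an element such as
	//
	//   <oaf:identifier identifierType="doi">10.1000/182</oaf:identifier>
	//
	// yields structuredProperty("10.1000/182", "doi", code2name.get("doi"),
	// "dnet:pid_types", "dnet:pid_types", info), i.e. the attribute selected by
	// xpathClassId becomes the classid and code2name resolves its human-readable name.
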
	protected List<StructuredProperty> prepareListStructProps(
		final Node node, final String xpath, final Qualifier qualifier, final DataInfo info) {
		final List<StructuredProperty> res = new ArrayList<>();
		for (final Object o : node.selectNodes(xpath)) {
			final Node n = (Node) o;
			res.add(structuredProperty(n.getText(), qualifier, info));
		}
		return res;
	}

	protected List<StructuredProperty> prepareListStructProps(
		final Node node, final String xpath, final DataInfo info) {
		final List<StructuredProperty> res = new ArrayList<>();
		for (final Object o : node.selectNodes(xpath)) {
			final Node n = (Node) o;
			res.add(
				structuredProperty(
					n.getText(),
					n.valueOf("@classid"),
					n.valueOf("@classname"),
					n.valueOf("@schemeid"),
					n.valueOf("@schemename"),
					info));
		}
		return res;
	}

	protected OAIProvenance prepareOAIprovenance(final Document doc) {
		final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']");
		if (n == null) {
			return null;
		}
		final String identifier = n.valueOf("./*[local-name()='identifier']");
		final String baseURL = n.valueOf("./*[local-name()='baseURL']");
		final String metadataNamespace = n.valueOf("./*[local-name()='metadataNamespace']");
		final boolean altered = n.valueOf("@altered").equalsIgnoreCase("true");
		final String datestamp = n.valueOf("./*[local-name()='datestamp']");
		final String harvestDate = n.valueOf("@harvestDate");
		return oaiIProvenance(identifier, baseURL, metadataNamespace, altered, datestamp, harvestDate);
	}

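	// Editor's note (values invented, element layout assumed from the xpaths above):
	// the originDescription this method reads typically looks like
	//
	//   <provenance>
	//     <originDescription altered="true" harvestDate="2020-04-28T11:23:29Z">
	//       <baseURL>http://example.org/oai</baseURL>
	//       <identifier>oai:example.org:123</identifier>
	//       <datestamp>2020-04-27</datestamp>
	//       <metadataNamespace>http://www.openarchives.org/OAI/2.0/oai_dc/</metadataNamespace>
	//     </originDescription>
	//   </provenance>
	//
	// A missing node short-circuits to null, so records without OAI provenance are safe.
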
	protected DataInfo prepareDataInfo(final Document doc) {
		final Node n = doc.selectSingleNode("//oaf:datainfo");
		if (n == null) {
			return dataInfo(
				false, null, false, false, MigrationConstants.REPOSITORY_PROVENANCE_ACTIONS, "0.9");
		}
		final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");
		final String paClassName = n.valueOf("./oaf:provenanceaction/@classname");
		final String paSchemeId = n.valueOf("./oaf:provenanceaction/@schemeid");
		final String paSchemeName = n.valueOf("./oaf:provenanceaction/@schemename");
		final boolean deletedbyinference = Boolean.parseBoolean(n.valueOf("./oaf:deletedbyinference"));
		final String inferenceprovenance = n.valueOf("./oaf:inferenceprovenance");
		final Boolean inferred = Boolean.parseBoolean(n.valueOf("./oaf:inferred"));
		final String trust = n.valueOf("./oaf:trust");
		return dataInfo(
			deletedbyinference,
			inferenceprovenance,
			inferred,
			false,
			qualifier(paClassId, paClassName, paSchemeId, paSchemeName),
			trust);
	}

	protected Field<String> prepareField(final Node node, final String xpath, final DataInfo info) {
		return field(node.valueOf(xpath), info);
	}

	protected List<Field<String>> prepareListFields(
		final Node node, final String xpath, final DataInfo info) {
		return listFields(info, prepareListString(node, xpath));
	}

	protected List<String> prepareListString(final Node node, final String xpath) {
		final List<String> res = new ArrayList<>();
		for (final Object o : node.selectNodes(xpath)) {
			final String s = ((Node) o).getText().trim();
			if (StringUtils.isNotBlank(s)) {
				res.add(s);
			}
		}
		return res;
	}
}

View File

@ -3,11 +3,8 @@ package eu.dnetlib.dhp.oa.graph.raw;

 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.common.HdfsSupport;
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.*;
 import java.util.Optional;
+
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.spark.SparkConf;

@ -19,6 +16,11 @@ import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.common.HdfsSupport;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+
 public class DispatchEntitiesApplication {

 	private static final Logger log = LoggerFactory.getLogger(DispatchEntitiesApplication.class);
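Taken together, the two hunks show the import layout this commit applies across the codebase: `java.*`/`javax.*` imports come first, then third-party packages such as `org.apache.*` and `org.slf4j.*`, and finally the remaining groups (`eu.dnetlib.*` project imports among them), with a blank line between groups. A minimal sketch of the resulting order (these class names are only examples, not a statement about any particular file):

import java.util.Optional;

import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;

import eu.dnetlib.dhp.common.HdfsSupport;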

Some files were not shown because too many files have changed in this diff.