forked from D-Net/dnet-hadoop
Merge branch 'master' into dhp_oaf_model
This commit is contained in:
commit
603b1bd0bb
|
@ -6,7 +6,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-build-assembly-resources</artifactId>
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
|
||||
<artifactId>dhp-build-properties-maven-plugin</artifactId>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-code-style</artifactId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
|
||||
<packaging>jar</packaging>
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>dhp-build</artifactId>
|
||||
<packaging>pom</packaging>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
</parent>
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
package eu.dnetlib.dhp.common;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
/** Provides serializable and throwing extensions to standard functional interfaces. */
|
||||
|
@ -10,6 +11,16 @@ public class FunctionalInterfaceSupport {
|
|||
private FunctionalInterfaceSupport() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Serializable consumer of any kind of objects. To be used within spark processing pipelines when supplying
|
||||
* functions externally.
|
||||
*
|
||||
* @param <T>
|
||||
*/
|
||||
@FunctionalInterface
|
||||
public interface SerializableConsumer<T> extends Consumer<T>, Serializable {
|
||||
}
|
||||
|
||||
/**
|
||||
* Serializable supplier of any kind of objects. To be used within spark processing pipelines when supplying
|
||||
* functions externally.
|
||||
|
|
|
@ -16,6 +16,12 @@ import com.google.common.collect.Iterables;
|
|||
import com.google.common.collect.Lists;
|
||||
import com.google.common.hash.Hashing;
|
||||
|
||||
/**
|
||||
* PacePerson tries to derive information from the fullname string of an author. Such information includes Names, Surnames
|
||||
* and the Fullname split into terms. It provides also an additional field for the original data. The calculation of the
|
||||
* names and the surnames is not always possible. When it is impossible to assert which are the names and the surnames,
|
||||
* the lists are empty.
|
||||
*/
|
||||
public class PacePerson {
|
||||
|
||||
private static final String UTF8 = "UTF-8";
|
||||
|
@ -26,10 +32,19 @@ public class PacePerson {
|
|||
|
||||
private static Set<String> particles = null;
|
||||
|
||||
/**
|
||||
* Capitalizes a string
|
||||
*
|
||||
* @param s the string to capitalize
|
||||
* @return the lowercased input string with the first letter of each space- or hyphen-delimited word capitalized
|
||||
*/
|
||||
public static final String capitalize(final String s) {
|
||||
return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends a dot to a string whose length is exactly 1; any other string is returned unchanged
|
||||
*/
|
||||
public static final String dotAbbreviations(final String s) {
|
||||
return s.length() == 1 ? s + "." : s;
|
||||
}
|
||||
|
@ -46,6 +61,12 @@ public class PacePerson {
|
|||
return h;
|
||||
}
|
||||
|
||||
/**
|
||||
* The constructor of the class. It fills the fields of the class based on the input fullname.
|
||||
*
|
||||
* @param s the input string (fullname of the author)
|
||||
* @param aggressive set the string normalization type
|
||||
*/
|
||||
public PacePerson(String s, final boolean aggressive) {
|
||||
original = s;
|
||||
s = Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||
|
@ -64,6 +85,7 @@ public class PacePerson {
|
|||
// s = s.replaceAll("[\\W&&[^,-]]", "");
|
||||
}
|
||||
|
||||
// if the string contains a comma, it can derive surname and name by splitting on it
|
||||
if (s.contains(",")) {
|
||||
final String[] arr = s.split(",");
|
||||
if (arr.length == 1) {
|
||||
|
@ -74,21 +96,23 @@ public class PacePerson {
|
|||
fullname.addAll(surname);
|
||||
fullname.addAll(name);
|
||||
}
|
||||
} else {
|
||||
} else { // otherwise, it should rely on CAPS terms and short terms
|
||||
fullname = splitTerms(s);
|
||||
|
||||
int lastInitialPosition = fullname.size();
|
||||
boolean hasSurnameInUpperCase = false;
|
||||
|
||||
// computes lastInitialPosition and hasSurnameInUpperCase
|
||||
for (int i = 0; i < fullname.size(); i++) {
|
||||
final String term = fullname.get(i);
|
||||
if (term.length() == 1) {
|
||||
lastInitialPosition = i;
|
||||
lastInitialPosition = i; // first word in the name longer than 1 (to avoid name with dots)
|
||||
} else if (term.equals(term.toUpperCase())) {
|
||||
hasSurnameInUpperCase = true;
|
||||
hasSurnameInUpperCase = true; // if one of the words is CAPS
|
||||
}
|
||||
}
|
||||
|
||||
// manages particular cases of fullnames
|
||||
if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
|
||||
name = fullname.subList(0, lastInitialPosition + 1);
|
||||
surname = fullname.subList(lastInitialPosition + 1, fullname.size());
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
|
||||
package eu.dnetlib.dhp.common;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class PacePersonTest {
|
||||
|
||||
@Test
|
||||
public void pacePersonTest1() {
|
||||
|
||||
PacePerson p = new PacePerson("Artini, Michele", false);
|
||||
assertEquals("Artini", p.getSurnameString());
|
||||
assertEquals("Michele", p.getNameString());
|
||||
assertEquals("Artini, Michele", p.getNormalisedFullname());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void pacePersonTest2() {
|
||||
PacePerson p = new PacePerson("Michele G. Artini", false);
|
||||
assertEquals("Artini, Michele G.", p.getNormalisedFullname());
|
||||
assertEquals("Michele G", p.getNameString());
|
||||
assertEquals("Artini", p.getSurnameString());
|
||||
}
|
||||
|
||||
}
|
|
@ -5,7 +5,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp</artifactId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
<relativePath>../</relativePath>
|
||||
</parent>
|
||||
|
||||
|
|
|
@ -14,6 +14,7 @@ public class ModelConstants {
|
|||
public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource";
|
||||
public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions";
|
||||
public static final String DNET_COUNTRY_TYPE = "dnet:countries";
|
||||
public static final String DNET_REVIEW_LEVELS = "dnet:review_levels";
|
||||
|
||||
public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository";
|
||||
public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry";
|
||||
|
@ -25,6 +26,10 @@ public class ModelConstants {
|
|||
public static final String ORP_RESULTTYPE_CLASSID = "other";
|
||||
|
||||
public static final String RESULT_RESULT = "resultResult";
|
||||
/**
|
||||
* @deprecated Use {@link ModelConstants#RELATIONSHIP} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public static final String PUBLICATION_DATASET = "publicationDataset";
|
||||
public static final String IS_RELATED_TO = "isRelatedTo";
|
||||
public static final String SUPPLEMENT = "supplement";
|
||||
|
@ -34,6 +39,12 @@ public class ModelConstants {
|
|||
public static final String IS_PART_OF = "IsPartOf";
|
||||
public static final String HAS_PARTS = "HasParts";
|
||||
public static final String RELATIONSHIP = "relationship";
|
||||
public static final String CITATION = "citation";
|
||||
public static final String CITES = "cites";
|
||||
public static final String IS_CITED_BY = "IsCitedBy";
|
||||
public static final String REVIEW = "review";
|
||||
public static final String REVIEWS = "reviews";
|
||||
public static final String IS_REVIEWED_BY = "IsReviewedBy";
|
||||
|
||||
public static final String RESULT_PROJECT = "resultProject";
|
||||
public static final String OUTCOME = "outcome";
|
||||
|
|
|
@ -58,6 +58,18 @@ public class ModelSupport {
|
|||
oafTypes.put("relation", Relation.class);
|
||||
}
|
||||
|
||||
public static final Map<Class, String> idPrefixMap = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
idPrefixMap.put(Datasource.class, "10");
|
||||
idPrefixMap.put(Organization.class, "20");
|
||||
idPrefixMap.put(Project.class, "40");
|
||||
idPrefixMap.put(Dataset.class, "50");
|
||||
idPrefixMap.put(OtherResearchProduct.class, "50");
|
||||
idPrefixMap.put(Software.class, "50");
|
||||
idPrefixMap.put(Publication.class, "50");
|
||||
}
|
||||
|
||||
public static final Map<String, String> entityIdPrefix = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
|
@ -289,6 +301,10 @@ public class ModelSupport {
|
|||
private ModelSupport() {
|
||||
}
|
||||
|
||||
public static <E extends OafEntity> String getIdPrefix(Class<E> clazz) {
|
||||
return idPrefixMap.get(clazz);
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks subclass-superclass relationship.
|
||||
*
|
||||
|
|
|
@ -10,6 +10,7 @@ public class Dataset extends Result implements Serializable {
|
|||
|
||||
private Field<String> storagedate;
|
||||
|
||||
// candidate for removal
|
||||
private Field<String> device;
|
||||
|
||||
private Field<String> size;
|
||||
|
|
|
@ -31,7 +31,7 @@ public class Instance implements Serializable {
|
|||
// typed results
|
||||
private Field<String> processingchargecurrency;
|
||||
|
||||
private Field<String> refereed; // peer-review status
|
||||
private Qualifier refereed; // peer-review status
|
||||
|
||||
public Field<String> getLicense() {
|
||||
return license;
|
||||
|
@ -113,11 +113,11 @@ public class Instance implements Serializable {
|
|||
this.processingchargecurrency = processingchargecurrency;
|
||||
}
|
||||
|
||||
public Field<String> getRefereed() {
|
||||
public Qualifier getRefereed() {
|
||||
return refereed;
|
||||
}
|
||||
|
||||
public void setRefereed(Field<String> refereed) {
|
||||
public void setRefereed(Qualifier refereed) {
|
||||
this.refereed = refereed;
|
||||
}
|
||||
|
||||
|
|
|
@ -2,8 +2,10 @@
|
|||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class Result extends OafEntity implements Serializable {
|
||||
|
||||
|
@ -260,21 +262,29 @@ public class Result extends OafEntity implements Serializable {
|
|||
StructuredProperty baseMainTitle = null;
|
||||
if (title != null) {
|
||||
baseMainTitle = getMainTitle(title);
|
||||
title.remove(baseMainTitle);
|
||||
if (baseMainTitle != null) {
|
||||
final StructuredProperty p = baseMainTitle;
|
||||
title = title.stream().filter(t -> t != p).collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
|
||||
StructuredProperty newMainTitle = null;
|
||||
if (r.getTitle() != null) {
|
||||
newMainTitle = getMainTitle(r.getTitle());
|
||||
r.getTitle().remove(newMainTitle);
|
||||
if (newMainTitle != null && title != null) {
|
||||
final StructuredProperty p = newMainTitle;
|
||||
title = title.stream().filter(t -> t != p).collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
|
||||
if (newMainTitle != null && compareTrust(this, r) < 0)
|
||||
if (newMainTitle != null && compareTrust(this, r) < 0) {
|
||||
baseMainTitle = newMainTitle;
|
||||
}
|
||||
|
||||
title = mergeLists(title, r.getTitle());
|
||||
if (title != null && baseMainTitle != null)
|
||||
if (title != null && baseMainTitle != null) {
|
||||
title.add(baseMainTitle);
|
||||
}
|
||||
|
||||
relevantdate = mergeLists(relevantdate, r.getRelevantdate());
|
||||
|
||||
|
|
|
@ -10,8 +10,10 @@ public class Software extends Result implements Serializable {
|
|||
|
||||
private List<Field<String>> documentationUrl;
|
||||
|
||||
// candidate for removal
|
||||
private List<StructuredProperty> license;
|
||||
|
||||
// candidate for removal
|
||||
private Field<String> codeRepositoryUrl;
|
||||
|
||||
private Qualifier programmingLanguage;
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>dhp-actionmanager</artifactId>
|
||||
|
||||
|
|
|
@ -96,12 +96,21 @@ public class ProtoConverter implements Serializable {
|
|||
.stream()
|
||||
.distinct()
|
||||
.collect(Collectors.toCollection(ArrayList::new)) : null);
|
||||
i.setRefereed(mapStringField(ri.getRefereed()));
|
||||
i.setRefereed(mapRefereed(ri.getRefereed()));
|
||||
i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount()));
|
||||
i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency()));
|
||||
return i;
|
||||
}
|
||||
|
||||
private static Qualifier mapRefereed(FieldTypeProtos.StringField refereed) {
|
||||
Qualifier q = new Qualifier();
|
||||
q.setClassid(refereed.getValue());
|
||||
q.setSchemename(refereed.getValue());
|
||||
q.setSchemeid("dnet:review_levels");
|
||||
q.setSchemename("dnet:review_levels");
|
||||
return q;
|
||||
}
|
||||
|
||||
private static List<ExternalReference> convertExternalRefs(OafProtos.Oaf oaf) {
|
||||
ResultProtos.Result r = oaf.getEntity().getResult();
|
||||
if (r.getExternalReferenceCount() > 0) {
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
<parent>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<artifactId>dhp-aggregation</artifactId>
|
||||
|
||||
|
|
|
@ -8,6 +8,7 @@ import java.io.File;
|
|||
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
@ -19,6 +20,7 @@ import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
|
|||
import eu.dnetlib.message.Message;
|
||||
import eu.dnetlib.message.MessageManager;
|
||||
|
||||
@Disabled
|
||||
public class DnetCollectorWorkerApplicationTests {
|
||||
|
||||
private final ArgumentApplicationParser argumentParser = mock(ArgumentApplicationParser.class);
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -1,66 +1,70 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>dhp-broker-events</artifactId>
|
||||
<artifactId>dhp-broker-events</artifactId>
|
||||
|
||||
<dependencies>
|
||||
<dependencies>
|
||||
|
||||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>commons-io</groupId>
|
||||
<artifactId>commons-io</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-hive_2.11</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-hive_2.11</artifactId>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-pace-core</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>dom4j</groupId>
|
||||
<artifactId>dom4j</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>jaxen</groupId>
|
||||
<artifactId>jaxen</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>dom4j</groupId>
|
||||
<artifactId>dom4j</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>jaxen</groupId>
|
||||
<artifactId>jaxen</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-openaire-broker-common</artifactId>
|
||||
<version>[2.0.0,3.0.0)</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-openaire-broker-common</artifactId>
|
||||
<version>[3.0.1,4.0.0)</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
</dependencies>
|
||||
|
||||
|
||||
</project>
|
|
@ -0,0 +1,37 @@
|
|||
<?xml version='1.0' encoding='UTF-8'?>
|
||||
<dfxml xmloutputversion='1.0'>
|
||||
<metadata
|
||||
xmlns='http://afflib.org/tcpflow/'
|
||||
xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'
|
||||
xmlns:dc='http://purl.org/dc/elements/1.1/'>
|
||||
<dc:type>Feature Extraction</dc:type>
|
||||
</metadata>
|
||||
<creator version='1.0'>
|
||||
<program>TCPFLOW</program>
|
||||
<version>1.5.0</version>
|
||||
<build_environment>
|
||||
<compiler>4.2.1 (4.2.1 Compatible Apple LLVM 11.0.0 (clang-1100.0.33.8))</compiler>
|
||||
<CPPFLAGS>-D_THREAD_SAFE -pthread -I/usr/local/include -I/usr/local/include -DUTC_OFFSET=+0000 </CPPFLAGS>
|
||||
<CFLAGS>-g -D_THREAD_SAFE -pthread -g -O3 -MD -Wpointer-arith -Wmissing-declarations -Wmissing-prototypes -Wshadow -Wwrite-strings -Wcast-align -Waggregate-return -Wbad-function-cast -Wcast-qual -Wundef -Wredundant-decls -Wdisabled-optimization -Wfloat-equal -Wmultichar -Wc++-compat -Wmissing-noreturn -Wall -Wstrict-prototypes -MD -D_FORTIFY_SOURCE=2 -Wpointer-arith -Wmissing-declarations -Wmissing-prototypes -Wshadow -Wwrite-strings -Wcast-align -Waggregate-return -Wbad-function-cast -Wcast-qual -Wundef -Wredundant-decls -Wdisabled-optimization -Wfloat-equal -Wmultichar -Wc++-compat -Wmissing-noreturn -Wall -Wstrict-prototypes</CFLAGS>
|
||||
<CXXFLAGS>-g -D_THREAD_SAFE -pthread -g -O3 -Wall -MD -D_FORTIFY_SOURCE=2 -Wpointer-arith -Wshadow -Wwrite-strings -Wcast-align -Wredundant-decls -Wdisabled-optimization -Wfloat-equal -Wmultichar -Wmissing-noreturn -Woverloaded-virtual -Wsign-promo -funit-at-a-time -Weffc++ -std=c++11 -Wall -MD -D_FORTIFY_SOURCE=2 -Wpointer-arith -Wshadow -Wwrite-strings -Wcast-align -Wredundant-decls -Wdisabled-optimization -Wfloat-equal -Wmultichar -Wmissing-noreturn -Woverloaded-virtual -Wsign-promo -funit-at-a-time -Weffc++ </CXXFLAGS>
|
||||
<LDFLAGS>-L/usr/local/lib -L/usr/local/lib </LDFLAGS>
|
||||
<LIBS>-lpython2.7 -lpython2.7 -lpcap -lbz2 -lexpat -lsqlite3 -lcrypto -lssl -lcrypto -ldl -lz </LIBS>
|
||||
<compilation_date>2019-10-11T01:16:58</compilation_date>
|
||||
<library name="boost" version="107100"/>
|
||||
<library name="sqlite" version="3.28.0" source_id="2019-04-15 14:49:49 378230ae7f4b721c8b8d83c8ceb891449685cd23b1702a57841f1be40b5daapl"/>
|
||||
</build_environment>
|
||||
<execution_environment>
|
||||
<os_sysname>Darwin</os_sysname>
|
||||
<os_release>19.5.0</os_release>
|
||||
<os_version>Darwin Kernel Version 19.5.0: Tue May 26 20:41:44 PDT 2020; root:xnu-6153.121.2~2/RELEASE_X86_64</os_version>
|
||||
<host>Micheles-MBP.local</host>
|
||||
<arch>x86_64</arch>
|
||||
<command_line>tcpflow</command_line>
|
||||
<uid>501</uid>
|
||||
<username>michele</username>
|
||||
<start_time>2020-06-15T14:55:03Z</start_time>
|
||||
</execution_environment>
|
||||
</creator>
|
||||
<configuration>
|
||||
</configuration>
|
||||
<tdelta>0</tdelta>
|
|
@ -1,9 +1,15 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
public class Event {
|
||||
public class Event implements Serializable {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = -5936790326505781395L;
|
||||
|
||||
private String eventId;
|
||||
|
||||
|
|
|
@ -6,18 +6,13 @@ import java.util.Date;
|
|||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.lang3.time.DateUtils;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class EventFactory {
|
||||
|
||||
|
@ -37,15 +32,12 @@ public class EventFactory {
|
|||
|
||||
final Map<String, Object> map = createMapFromResult(updateInfo);
|
||||
|
||||
final String payload = createPayload(updateInfo);
|
||||
|
||||
final String eventId = calculateEventId(
|
||||
updateInfo.getTopicPath(), updateInfo.getTarget().getOriginalId().get(0),
|
||||
updateInfo.getHighlightValueAsString());
|
||||
updateInfo.getTopicPath(), updateInfo.getTarget().getOriginalId(), updateInfo.getHighlightValueAsString());
|
||||
|
||||
res.setEventId(eventId);
|
||||
res.setProducerId(PRODUCER_ID);
|
||||
res.setPayload(payload);
|
||||
res.setPayload(updateInfo.asBrokerPayload().toJSON());
|
||||
res.setMap(map);
|
||||
res.setTopic(updateInfo.getTopicPath());
|
||||
res.setCreationDate(now);
|
||||
|
@ -54,65 +46,34 @@ public class EventFactory {
|
|||
return res;
|
||||
}
|
||||
|
||||
private static String createPayload(final UpdateInfo<?> updateInfo) {
|
||||
final OpenAireEventPayload payload = new OpenAireEventPayload();
|
||||
// TODO
|
||||
|
||||
updateInfo.compileHighlight(payload);
|
||||
|
||||
return payload.toJSON();
|
||||
}
|
||||
|
||||
private static Map<String, Object> createMapFromResult(final UpdateInfo<?> updateInfo) {
|
||||
final Map<String, Object> map = new HashMap<>();
|
||||
|
||||
final Result source = updateInfo.getSource();
|
||||
final Result target = updateInfo.getTarget();
|
||||
final OpenaireBrokerResult source = updateInfo.getSource();
|
||||
final OpenaireBrokerResult target = updateInfo.getTarget();
|
||||
|
||||
final List<KeyValue> collectedFrom = target.getCollectedfrom();
|
||||
if (collectedFrom.size() == 1) {
|
||||
map.put("target_datasource_id", collectedFrom.get(0).getKey());
|
||||
map.put("target_datasource_name", collectedFrom.get(0).getValue());
|
||||
}
|
||||
map.put("target_datasource_id", target.getCollectedFromId());
|
||||
map.put("target_datasource_name", target.getCollectedFromName());
|
||||
|
||||
final List<String> ids = target.getOriginalId();
|
||||
if (ids.size() > 0) {
|
||||
map.put("target_publication_id", ids.get(0));
|
||||
}
|
||||
map.put("target_publication_id", target.getOriginalId());
|
||||
|
||||
final List<StructuredProperty> titles = target.getTitle();
|
||||
final List<String> titles = target.getTitles();
|
||||
if (titles.size() > 0) {
|
||||
map.put("target_publication_title", titles.get(0));
|
||||
}
|
||||
|
||||
final long date = parseDateTolong(target.getDateofacceptance().getValue());
|
||||
final long date = parseDateTolong(target.getPublicationdate());
|
||||
if (date > 0) {
|
||||
map.put("target_dateofacceptance", date);
|
||||
}
|
||||
|
||||
final List<StructuredProperty> subjects = target.getSubject();
|
||||
if (subjects.size() > 0) {
|
||||
map
|
||||
.put(
|
||||
"target_publication_subject_list",
|
||||
subjects.stream().map(StructuredProperty::getValue).collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
final List<Author> authors = target.getAuthor();
|
||||
if (authors.size() > 0) {
|
||||
map
|
||||
.put(
|
||||
"target_publication_author_list",
|
||||
authors.stream().map(Author::getFullname).collect(Collectors.toList()));
|
||||
}
|
||||
map.put("target_publication_subject_list", target.getSubjects());
|
||||
map.put("target_publication_author_list", target.getCreators());
|
||||
|
||||
// PROVENANCE INFO
|
||||
map.put("trust", updateInfo.getTrust());
|
||||
final List<KeyValue> sourceCollectedFrom = source.getCollectedfrom();
|
||||
if (sourceCollectedFrom.size() == 1) {
|
||||
map.put("provenance_datasource_id", sourceCollectedFrom.get(0).getKey());
|
||||
map.put("provenance_datasource_name", sourceCollectedFrom.get(0).getValue());
|
||||
}
|
||||
map.put("provenance_datasource_id", source.getCollectedFromId());
|
||||
map.put("provenance_datasource_name", source.getCollectedFromName());
|
||||
map.put("provenance_publication_id_list", source.getOriginalId());
|
||||
|
||||
return map;
|
||||
|
|
|
@ -3,59 +3,33 @@ package eu.dnetlib.dhp.broker.oa;
|
|||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Column;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.sql.TypedColumn;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.broker.model.Event;
|
||||
import eu.dnetlib.dhp.broker.model.EventFactory;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsRelatedTo;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedBy;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedTo;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetReferences;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMissingProject;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMoreProject;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsReferencedBy;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsRelatedTo;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedBy;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedTo;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationReferences;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAuthorOrcid;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingOpenAccess;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPid;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSoftware;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSubject;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSoftware;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject;
|
||||
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.EventFinder;
|
||||
import eu.dnetlib.dhp.broker.oa.util.EventGroup;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultAggregator;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.OpenaireBrokerResultAggregator;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedEntityFactory;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
|
@ -63,49 +37,23 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
|
|||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class GenerateEventsApplication {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(GenerateEventsApplication.class);
|
||||
|
||||
// Simple Matchers
|
||||
private static final UpdateMatcher<Result, ?> enrichMissingAbstract = new EnrichMissingAbstract();
|
||||
private static final UpdateMatcher<Result, ?> enrichMissingAuthorOrcid = new EnrichMissingAuthorOrcid();
|
||||
private static final UpdateMatcher<Result, ?> enrichMissingOpenAccess = new EnrichMissingOpenAccess();
|
||||
private static final UpdateMatcher<Result, ?> enrichMissingPid = new EnrichMissingPid();
|
||||
private static final UpdateMatcher<Result, ?> enrichMissingPublicationDate = new EnrichMissingPublicationDate();
|
||||
private static final UpdateMatcher<Result, ?> enrichMissingSubject = new EnrichMissingSubject();
|
||||
private static final UpdateMatcher<Result, ?> enrichMoreOpenAccess = new EnrichMoreOpenAccess();
|
||||
private static final UpdateMatcher<Result, ?> enrichMorePid = new EnrichMorePid();
|
||||
private static final UpdateMatcher<Result, ?> enrichMoreSubject = new EnrichMoreSubject();
|
||||
|
||||
// Advanced matchers
|
||||
private static final UpdateMatcher<Pair<Result, List<Project>>, ?> enrichMissingProject = new EnrichMissingProject();
|
||||
private static final UpdateMatcher<Pair<Result, List<Project>>, ?> enrichMoreProject = new EnrichMoreProject();
|
||||
|
||||
private static final UpdateMatcher<Pair<Result, List<Software>>, ?> enrichMissingSoftware = new EnrichMissingSoftware();
|
||||
private static final UpdateMatcher<Pair<Result, List<Software>>, ?> enrichMoreSoftware = new EnrichMoreSoftware();
|
||||
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMisissingPublicationIsRelatedTo = new EnrichMissingPublicationIsRelatedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsReferencedBy = new EnrichMissingPublicationIsReferencedBy();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationReferences = new EnrichMissingPublicationReferences();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedTo = new EnrichMissingPublicationIsSupplementedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<Publication>>, ?> enrichMissingPublicationIsSupplementedBy = new EnrichMissingPublicationIsSupplementedBy();
|
||||
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMisissingDatasetIsRelatedTo = new EnrichMissingDatasetIsRelatedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsReferencedBy = new EnrichMissingDatasetIsReferencedBy();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetReferences = new EnrichMissingDatasetReferences();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedTo = new EnrichMissingDatasetIsSupplementedTo();
|
||||
private static final UpdateMatcher<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>, ?> enrichMissingDatasetIsSupplementedBy = new EnrichMissingDatasetIsSupplementedBy();
|
||||
|
||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
GenerateEventsApplication.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json")));
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_broker_events.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final Boolean isSparkSessionManaged = Optional
|
||||
|
@ -120,20 +68,32 @@ public class GenerateEventsApplication {
|
|||
final String eventsPath = parser.get("eventsPath");
|
||||
log.info("eventsPath: {}", eventsPath);
|
||||
|
||||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
log.info("isLookupUrl: {}", isLookupUrl);
|
||||
|
||||
final String dedupConfigProfileId = parser.get("dedupConfProfile");
|
||||
log.info("dedupConfigProfileId: {}", dedupConfigProfileId);
|
||||
|
||||
final SparkConf conf = new SparkConf();
|
||||
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||
conf.registerKryoClasses(BrokerConstants.getModelClasses());
|
||||
|
||||
final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId);
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
removeOutputDir(spark, eventsPath);
|
||||
|
||||
final JavaRDD<Event> eventsRdd = sc.emptyRDD();
|
||||
|
||||
eventsRdd.union(generateSimpleEvents(spark, graphPath, Publication.class));
|
||||
eventsRdd.union(generateSimpleEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class));
|
||||
eventsRdd.union(generateSimpleEvents(spark, graphPath, Software.class));
|
||||
eventsRdd.union(generateSimpleEvents(spark, graphPath, OtherResearchProduct.class));
|
||||
|
||||
eventsRdd.saveAsTextFile(eventsPath, GzipCodec.class);
|
||||
spark
|
||||
.emptyDataset(Encoders.kryo(Event.class))
|
||||
.union(generateEvents(spark, graphPath, Publication.class, dedupConfig))
|
||||
.union(generateEvents(spark, graphPath, eu.dnetlib.dhp.schema.oaf.Dataset.class, dedupConfig))
|
||||
.union(generateEvents(spark, graphPath, Software.class, dedupConfig))
|
||||
.union(generateEvents(spark, graphPath, OtherResearchProduct.class, dedupConfig))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(eventsPath);
|
||||
});
|
||||
|
||||
}
|
||||
|
@ -142,130 +102,85 @@ public class GenerateEventsApplication {
|
|||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
private static <R extends Result> JavaRDD<Event> generateSimpleEvents(final SparkSession spark,
|
||||
private static <SRC extends Result> Dataset<Event> generateEvents(
|
||||
final SparkSession spark,
|
||||
final String graphPath,
|
||||
final Class<R> resultClazz) {
|
||||
final Class<SRC> sourceClass,
|
||||
final DedupConfig dedupConfig) {
|
||||
|
||||
final Dataset<R> results = readPath(
|
||||
spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), resultClazz)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference());
|
||||
final Dataset<OpenaireBrokerResult> results = expandResultsWithRelations(spark, graphPath, sourceClass);
|
||||
|
||||
final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class)
|
||||
final Dataset<Relation> mergedRels = readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
|
||||
|
||||
final Column c = null; // TODO
|
||||
final TypedColumn<Tuple2<OpenaireBrokerResult, Relation>, ResultGroup> aggr = new ResultAggregator()
|
||||
.toColumn();
|
||||
|
||||
final Dataset<Row> aa = results
|
||||
.joinWith(rels, results.col("id").equalTo(rels.col("source")), "inner")
|
||||
.groupBy(rels.col("target"))
|
||||
.agg(c)
|
||||
.filter(x -> x.size() > 1)
|
||||
// generateSimpleEvents(...)
|
||||
// flatMap()
|
||||
// toRdd()
|
||||
return results
|
||||
.joinWith(mergedRels, results.col("result.id").equalTo(mergedRels.col("source")), "inner")
|
||||
.groupByKey(
|
||||
(MapFunction<Tuple2<OpenaireBrokerResult, Relation>, String>) t -> t._2.getTarget(), Encoders.STRING())
|
||||
.agg(aggr)
|
||||
.map((MapFunction<Tuple2<String, ResultGroup>, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class))
|
||||
.filter(ResultGroup::isValid)
|
||||
.map(
|
||||
(MapFunction<ResultGroup, EventGroup>) g -> EventFinder.generateEvents(g, dedupConfig),
|
||||
Encoders.kryo(EventGroup.class))
|
||||
.flatMap(group -> group.getData().iterator(), Encoders.kryo(Event.class));
|
||||
}
|
||||
|
||||
private static <SRC extends Result> Dataset<OpenaireBrokerResult> expandResultsWithRelations(
|
||||
final SparkSession spark,
|
||||
final String graphPath,
|
||||
final Class<SRC> sourceClass) {
|
||||
final Dataset<Project> projects = readPath(spark, graphPath + "/project", Project.class);
|
||||
final Dataset<eu.dnetlib.dhp.schema.oaf.Dataset> datasets = readPath(
|
||||
spark, graphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
|
||||
final Dataset<Software> softwares = readPath(spark, graphPath + "/software", Software.class);
|
||||
final Dataset<Publication> publications = readPath(spark, graphPath + "/publication", Publication.class);
|
||||
|
||||
final Dataset<Relation> rels = readPath(spark, graphPath + "/relation", Relation.class)
|
||||
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
|
||||
|
||||
final Dataset<OpenaireBrokerResult> r0 = readPath(
|
||||
spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), Result.class)
|
||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||
.map(ConversionUtils::oafResultToBrokerResult, Encoders.kryo(OpenaireBrokerResult.class));
|
||||
|
||||
final Dataset<OpenaireBrokerResult> r1 = join(r0, rels, relatedEntities(projects, rels, RelatedProject.class));
|
||||
final Dataset<OpenaireBrokerResult> r2 = join(r1, rels, relatedEntities(softwares, rels, RelatedProject.class));
|
||||
final Dataset<OpenaireBrokerResult> r3 = join(r2, rels, relatedEntities(datasets, rels, RelatedProject.class));
|
||||
final Dataset<OpenaireBrokerResult> r4 = join(
|
||||
r3, rels, relatedEntities(publications, rels, RelatedProject.class));
|
||||
;
|
||||
|
||||
return null;
|
||||
|
||||
return r4;
|
||||
}
|
||||
|
||||
private List<Event> generateSimpleEvents(final Collection<Result> children) {
|
||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||
|
||||
for (final Result target : children) {
|
||||
list.addAll(enrichMissingAbstract.searchUpdatesForRecord(target, children));
|
||||
list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, children));
|
||||
list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, children));
|
||||
list.addAll(enrichMissingPid.searchUpdatesForRecord(target, children));
|
||||
list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, children));
|
||||
list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, children));
|
||||
list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, children));
|
||||
list.addAll(enrichMorePid.searchUpdatesForRecord(target, children));
|
||||
list.addAll(enrichMoreSubject.searchUpdatesForRecord(target, children));
|
||||
}
|
||||
|
||||
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||
private static <T, RT> Dataset<RT> relatedEntities(final Dataset<T> targets,
|
||||
final Dataset<Relation> rels,
|
||||
final Class<RT> clazz) {
|
||||
return rels
|
||||
.joinWith(targets, targets.col("id").equalTo(rels.col("target")), "inner")
|
||||
.map(
|
||||
t -> RelatedEntityFactory.newRelatedEntity(t._1.getSource(), t._1.getRelType(), t._2, clazz),
|
||||
Encoders.kryo(clazz));
|
||||
}
|
||||
|
||||
private List<Event> generateProjectsEvents(final Collection<Pair<Result, List<Project>>> childrenWithProjects) {
|
||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||
private static <T> Dataset<OpenaireBrokerResult> join(final Dataset<OpenaireBrokerResult> sources,
|
||||
final Dataset<Relation> rels,
|
||||
final Dataset<T> typedRels) {
|
||||
|
||||
for (final Pair<Result, List<Project>> target : childrenWithProjects) {
|
||||
list.addAll(enrichMissingProject.searchUpdatesForRecord(target, childrenWithProjects));
|
||||
list.addAll(enrichMoreProject.searchUpdatesForRecord(target, childrenWithProjects));
|
||||
}
|
||||
|
||||
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private List<Event> generateSoftwareEvents(final Collection<Pair<Result, List<Software>>> childrenWithSoftwares) {
|
||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||
|
||||
for (final Pair<Result, List<Software>> target : childrenWithSoftwares) {
|
||||
list.addAll(enrichMissingSoftware.searchUpdatesForRecord(target, childrenWithSoftwares));
|
||||
list.addAll(enrichMoreSoftware.searchUpdatesForRecord(target, childrenWithSoftwares));
|
||||
}
|
||||
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private List<Event> generatePublicationRelatedEvents(final String relType,
|
||||
final Collection<Pair<Result, Map<String, List<Publication>>>> childrenWithRels) {
|
||||
|
||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||
|
||||
final List<Pair<Result, List<Publication>>> cleanedChildrens = childrenWithRels
|
||||
.stream()
|
||||
.filter(p -> p.getRight().containsKey(relType))
|
||||
.map(p -> Pair.of(p.getLeft(), p.getRight().get(relType)))
|
||||
.filter(p -> p.getRight().size() > 0)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
for (final Pair<Result, List<Publication>> target : cleanedChildrens) {
|
||||
if (relType.equals("isRelatedTo")) {
|
||||
list.addAll(enrichMisissingPublicationIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
} else if (relType.equals("references")) {
|
||||
list.addAll(enrichMissingPublicationReferences.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
} else if (relType.equals("isReferencedBy")) {
|
||||
list.addAll(enrichMissingPublicationIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
} else if (relType.equals("isSupplementedTo")) {
|
||||
list.addAll(enrichMissingPublicationIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
} else if (relType.equals("isSupplementedBy")) {
|
||||
list.addAll(enrichMissingPublicationIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
}
|
||||
}
|
||||
|
||||
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||
|
||||
}
|
||||
|
||||
private List<Event> generateDatasetRelatedEvents(final String relType,
|
||||
final Collection<Pair<Result, Map<String, List<eu.dnetlib.dhp.schema.oaf.Dataset>>>> childrenWithRels) {
|
||||
|
||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||
|
||||
final List<Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>>> cleanedChildrens = childrenWithRels
|
||||
.stream()
|
||||
.filter(p -> p.getRight().containsKey(relType))
|
||||
.map(p -> Pair.of(p.getLeft(), p.getRight().get(relType)))
|
||||
.filter(p -> p.getRight().size() > 0)
|
||||
.collect(Collectors.toList());
|
||||
|
||||
for (final Pair<Result, List<eu.dnetlib.dhp.schema.oaf.Dataset>> target : cleanedChildrens) {
|
||||
if (relType.equals("isRelatedTo")) {
|
||||
list.addAll(enrichMisissingDatasetIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
} else if (relType.equals("references")) {
|
||||
list.addAll(enrichMissingDatasetReferences.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
} else if (relType.equals("isReferencedBy")) {
|
||||
list.addAll(enrichMissingDatasetIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
} else if (relType.equals("isSupplementedTo")) {
|
||||
list.addAll(enrichMissingDatasetIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
} else if (relType.equals("isSupplementedBy")) {
|
||||
list.addAll(enrichMissingDatasetIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens));
|
||||
}
|
||||
}
|
||||
|
||||
return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList());
|
||||
final TypedColumn<Tuple2<OpenaireBrokerResult, T>, OpenaireBrokerResult> aggr = new OpenaireBrokerResultAggregator<T>()
|
||||
.toColumn();
|
||||
;
|
||||
|
||||
return sources
|
||||
.joinWith(typedRels, sources.col("result.id").equalTo(rels.col("source")), "left_outer")
|
||||
.groupByKey(
|
||||
(MapFunction<Tuple2<OpenaireBrokerResult, T>, String>) t -> t._1.getOpenaireId(), Encoders.STRING())
|
||||
.agg(aggr)
|
||||
.map(t -> t._2, Encoders.kryo(OpenaireBrokerResult.class));
|
||||
}
|
||||
|
||||
public static <R> Dataset<R> readPath(
|
||||
|
@ -277,4 +192,23 @@ public class GenerateEventsApplication {
|
|||
.textFile(inputPath)
|
||||
.map((MapFunction<String, R>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz));
|
||||
}
|
||||
|
||||
private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception {
|
||||
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
||||
|
||||
final String conf = isLookUpService
|
||||
.getResourceProfileByQuery(
|
||||
String
|
||||
.format(
|
||||
"for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()",
|
||||
profId));
|
||||
|
||||
final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class);
|
||||
dedupConfig.getPace().initModel();
|
||||
dedupConfig.getPace().initTranslationMap();
|
||||
// dedupConfig.getWf().setConfigurationId("???");
|
||||
|
||||
return dedupConfig;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -6,28 +6,45 @@ import java.util.Collection;
|
|||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.apache.commons.codec.digest.DigestUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
|
||||
public abstract class UpdateMatcher<K, T> {
|
||||
public abstract class UpdateMatcher<T> {
|
||||
|
||||
private final boolean multipleUpdate;
|
||||
private final Function<T, Topic> topicFunction;
|
||||
private final BiConsumer<OpenaireBrokerResult, T> compileHighlightFunction;
|
||||
private final Function<T, String> highlightToStringFunction;
|
||||
|
||||
public UpdateMatcher(final boolean multipleUpdate) {
|
||||
public UpdateMatcher(final boolean multipleUpdate, final Function<T, Topic> topicFunction,
|
||||
final BiConsumer<OpenaireBrokerResult, T> compileHighlightFunction,
|
||||
final Function<T, String> highlightToStringFunction) {
|
||||
this.multipleUpdate = multipleUpdate;
|
||||
this.topicFunction = topicFunction;
|
||||
this.compileHighlightFunction = compileHighlightFunction;
|
||||
this.highlightToStringFunction = highlightToStringFunction;
|
||||
}
|
||||
|
||||
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final K res, final Collection<K> others) {
|
||||
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final OpenaireBrokerResult res,
|
||||
final Collection<OpenaireBrokerResult> others,
|
||||
final DedupConfig dedupConfig) {
|
||||
|
||||
final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
|
||||
|
||||
for (final K source : others) {
|
||||
for (final OpenaireBrokerResult source : others) {
|
||||
if (source != res) {
|
||||
for (final UpdateInfo<T> info : findUpdates(source, res)) {
|
||||
for (final T hl : findDifferences(source, res)) {
|
||||
final Topic topic = getTopicFunction().apply(hl);
|
||||
final UpdateInfo<T> info = new UpdateInfo<>(topic, hl, source, res, getCompileHighlightFunction(),
|
||||
getHighlightToStringFunction(), dedupConfig);
|
||||
final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
|
||||
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {
|
||||
} else {
|
||||
|
@ -51,18 +68,30 @@ public abstract class UpdateMatcher<K, T> {
|
|||
}
|
||||
}
|
||||
|
||||
protected abstract List<UpdateInfo<T>> findUpdates(K source, K target);
|
||||
protected abstract List<T> findDifferences(OpenaireBrokerResult source, OpenaireBrokerResult target);
|
||||
|
||||
protected abstract UpdateInfo<T> generateUpdateInfo(final T highlightValue,
|
||||
final K source,
|
||||
final K target);
|
||||
|
||||
protected static boolean isMissing(final List<Field<String>> list) {
|
||||
return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue());
|
||||
protected static boolean isMissing(final List<String> list) {
|
||||
return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0));
|
||||
}
|
||||
|
||||
protected boolean isMissing(final Field<String> field) {
|
||||
return field == null || StringUtils.isBlank(field.getValue());
|
||||
protected boolean isMissing(final String field) {
|
||||
return StringUtils.isBlank(field);
|
||||
}
|
||||
|
||||
/** Returns the {@code multipleUpdate} flag that was passed to the constructor. */
public boolean isMultipleUpdate() {
|
||||
return multipleUpdate;
|
||||
}
|
||||
|
||||
/**
 * Returns the function supplied at construction time that maps a highlight value to its {@link Topic};
 * {@code searchUpdatesForRecord} applies it to every value returned by {@code findDifferences}.
 */
public Function<T, Topic> getTopicFunction() {
|
||||
return topicFunction;
|
||||
}
|
||||
|
||||
/**
 * Returns the consumer supplied at construction time that receives a result and a highlight value;
 * {@code searchUpdatesForRecord} hands it to every {@code UpdateInfo} it creates.
 */
public BiConsumer<OpenaireBrokerResult, T> getCompileHighlightFunction() {
|
||||
return compileHighlightFunction;
|
||||
}
|
||||
|
||||
/**
 * Returns the function supplied at construction time that renders a highlight value as a string;
 * {@code searchUpdatesForRecord} hands it to every {@code UpdateInfo} it creates.
 */
public Function<T, String> getHighlightToStringFunction() {
|
||||
return highlightToStringFunction;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -5,59 +5,41 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
|
||||
import eu.dnetlib.broker.objects.Dataset;
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public abstract class AbstractEnrichMissingDataset
|
||||
extends UpdateMatcher<Pair<Result, List<Dataset>>, eu.dnetlib.broker.objects.Dataset> {
|
||||
|
||||
private final Topic topic;
|
||||
extends UpdateMatcher<Dataset> {
|
||||
|
||||
public AbstractEnrichMissingDataset(final Topic topic) {
|
||||
super(true);
|
||||
this.topic = topic;
|
||||
super(true,
|
||||
rel -> topic,
|
||||
(p, rel) -> p.getDatasets().add(rel),
|
||||
rel -> rel.getOriginalId());
|
||||
}
|
||||
|
||||
protected abstract boolean filterByType(String relType);
|
||||
|
||||
@Override
|
||||
protected final List<UpdateInfo<eu.dnetlib.broker.objects.Dataset>> findUpdates(
|
||||
final Pair<Result, List<Dataset>> source,
|
||||
final Pair<Result, List<Dataset>> target) {
|
||||
protected final List<Dataset> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
|
||||
final Set<String> existingDatasets = target
|
||||
.getRight()
|
||||
.getDatasets()
|
||||
.stream()
|
||||
.map(Dataset::getId)
|
||||
.filter(rel -> filterByType(rel.getRelType()))
|
||||
.map(Dataset::getOriginalId)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
.getRight()
|
||||
.getDatasets()
|
||||
.stream()
|
||||
.filter(d -> !existingDatasets.contains(d.getId()))
|
||||
.map(ConversionUtils::oafDatasetToBrokerDataset)
|
||||
.map(i -> generateUpdateInfo(i, source, target))
|
||||
.filter(rel -> filterByType(rel.getRelType()))
|
||||
.filter(d -> !existingDatasets.contains(d.getOriginalId()))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
protected final UpdateInfo<eu.dnetlib.broker.objects.Dataset> generateUpdateInfo(
|
||||
final eu.dnetlib.broker.objects.Dataset highlightValue,
|
||||
final Pair<Result, List<Dataset>> source,
|
||||
final Pair<Result, List<Dataset>> target) {
|
||||
return new UpdateInfo<>(
|
||||
getTopic(),
|
||||
highlightValue, source.getLeft(), target.getLeft(),
|
||||
(p, rel) -> p.getDatasets().add(rel),
|
||||
rel -> rel.getInstances().get(0).getUrl());
|
||||
}
|
||||
|
||||
public Topic getTopic() {
|
||||
return topic;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,4 +9,9 @@ public class EnrichMissingDatasetIsReferencedBy extends AbstractEnrichMissingDat
|
|||
super(Topic.ENRICH_MISSING_DATASET_IS_REFERENCED_BY);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("isReferencedBy");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,4 +9,9 @@ public class EnrichMissingDatasetIsRelatedTo extends AbstractEnrichMissingDatase
|
|||
super(Topic.ENRICH_MISSING_DATASET_IS_RELATED_TO);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("isRelatedTo");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,4 +9,9 @@ public class EnrichMissingDatasetIsSupplementedBy extends AbstractEnrichMissingD
|
|||
super(Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_BY);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("isSupplementedBy");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,4 +9,9 @@ public class EnrichMissingDatasetIsSupplementedTo extends AbstractEnrichMissingD
|
|||
super(Topic.ENRICH_MISSING_DATASET_IS_SUPPLEMENTED_TO);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("isSupplementedTo");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,4 +9,9 @@ public class EnrichMissingDatasetReferences extends AbstractEnrichMissingDataset
|
|||
super(Topic.ENRICH_MISSING_DATASET_REFERENCES);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("references");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,41 +1,29 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.Project;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class EnrichMissingProject
|
||||
extends UpdateMatcher<Pair<Result, List<Project>>, eu.dnetlib.broker.objects.Project> {
|
||||
public class EnrichMissingProject extends UpdateMatcher<Project> {
|
||||
|
||||
public EnrichMissingProject() {
|
||||
super(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<eu.dnetlib.broker.objects.Project>> findUpdates(final Pair<Result, List<Project>> source,
|
||||
final Pair<Result, List<Project>> target) {
|
||||
// TODO
|
||||
return Arrays.asList();
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<eu.dnetlib.broker.objects.Project> generateUpdateInfo(
|
||||
final eu.dnetlib.broker.objects.Project highlightValue,
|
||||
final Pair<Result, List<Project>> source,
|
||||
final Pair<Result, List<Project>> target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MISSING_PROJECT,
|
||||
highlightValue, source.getLeft(), target.getLeft(),
|
||||
super(true,
|
||||
prj -> Topic.ENRICH_MISSING_PROJECT,
|
||||
(p, prj) -> p.getProjects().add(prj),
|
||||
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Project> findDifferences(final OpenaireBrokerResult source, final OpenaireBrokerResult target) {
|
||||
if (target.getProjects().isEmpty()) {
|
||||
return source.getProjects();
|
||||
} else {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,40 +1,43 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.matchers.relatedProjects;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.Project;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class EnrichMoreProject extends UpdateMatcher<Pair<Result, List<Project>>, eu.dnetlib.broker.objects.Project> {
|
||||
public class EnrichMoreProject extends UpdateMatcher<Project> {
|
||||
|
||||
public EnrichMoreProject() {
|
||||
super(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<eu.dnetlib.broker.objects.Project>> findUpdates(final Pair<Result, List<Project>> source,
|
||||
final Pair<Result, List<Project>> target) {
|
||||
// TODO
|
||||
return Arrays.asList();
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<eu.dnetlib.broker.objects.Project> generateUpdateInfo(
|
||||
final eu.dnetlib.broker.objects.Project highlightValue,
|
||||
final Pair<Result, List<Project>> source,
|
||||
final Pair<Result, List<Project>> target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MORE_PROJECT,
|
||||
highlightValue, source.getLeft(), target.getLeft(),
|
||||
super(true,
|
||||
prj -> Topic.ENRICH_MORE_PROJECT,
|
||||
(p, prj) -> p.getProjects().add(prj),
|
||||
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
|
||||
prj -> projectAsString(prj));
|
||||
}
|
||||
|
||||
private static String projectAsString(final Project prj) {
|
||||
return prj.getFunder() + "::" + prj.getFundingProgram() + "::" + prj.getCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<eu.dnetlib.broker.objects.Project> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
|
||||
final Set<String> existingProjects = target
|
||||
.getProjects()
|
||||
.stream()
|
||||
.map(EnrichMoreProject::projectAsString)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
.getProjects()
|
||||
.stream()
|
||||
.filter(p -> !existingProjects.contains(projectAsString(p)))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -5,59 +5,41 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.Publication;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public abstract class AbstractEnrichMissingPublication
|
||||
extends UpdateMatcher<Pair<Result, List<Publication>>, eu.dnetlib.broker.objects.Publication> {
|
||||
|
||||
private final Topic topic;
|
||||
public abstract class AbstractEnrichMissingPublication extends UpdateMatcher<Publication> {
|
||||
|
||||
public AbstractEnrichMissingPublication(final Topic topic) {
|
||||
super(true);
|
||||
this.topic = topic;
|
||||
super(true,
|
||||
rel -> topic,
|
||||
(p, rel) -> p.getPublications().add(rel),
|
||||
rel -> rel.getOriginalId());
|
||||
|
||||
}
|
||||
|
||||
protected abstract boolean filterByType(String relType);
|
||||
|
||||
@Override
|
||||
protected final List<UpdateInfo<eu.dnetlib.broker.objects.Publication>> findUpdates(
|
||||
final Pair<Result, List<Publication>> source,
|
||||
final Pair<Result, List<Publication>> target) {
|
||||
protected final List<eu.dnetlib.broker.objects.Publication> findDifferences(
|
||||
final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
|
||||
final Set<String> existingPublications = target
|
||||
.getRight()
|
||||
.getPublications()
|
||||
.stream()
|
||||
.map(Publication::getId)
|
||||
.filter(rel -> filterByType(rel.getRelType()))
|
||||
.map(Publication::getOriginalId)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
.getRight()
|
||||
.getPublications()
|
||||
.stream()
|
||||
.filter(d -> !existingPublications.contains(d.getId()))
|
||||
.map(ConversionUtils::oafPublicationToBrokerPublication)
|
||||
.map(i -> generateUpdateInfo(i, source, target))
|
||||
.filter(rel -> filterByType(rel.getRelType()))
|
||||
.filter(p -> !existingPublications.contains(p.getOriginalId()))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
protected final UpdateInfo<eu.dnetlib.broker.objects.Publication> generateUpdateInfo(
|
||||
final eu.dnetlib.broker.objects.Publication highlightValue,
|
||||
final Pair<Result, List<Publication>> source,
|
||||
final Pair<Result, List<Publication>> target) {
|
||||
return new UpdateInfo<>(
|
||||
getTopic(),
|
||||
highlightValue, source.getLeft(), target.getLeft(),
|
||||
(p, rel) -> p.getPublications().add(rel),
|
||||
rel -> rel.getInstances().get(0).getUrl());
|
||||
}
|
||||
|
||||
public Topic getTopic() {
|
||||
return topic;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,4 +9,8 @@ public class EnrichMissingPublicationIsReferencedBy extends AbstractEnrichMissin
|
|||
super(Topic.ENRICH_MISSING_PUBLICATION_IS_REFERENCED_BY);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("isReferencedBy");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,4 +9,9 @@ public class EnrichMissingPublicationIsRelatedTo extends AbstractEnrichMissingPu
|
|||
super(Topic.ENRICH_MISSING_PUBLICATION_IS_RELATED_TO);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("isRelatedTo");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,4 +9,8 @@ public class EnrichMissingPublicationIsSupplementedBy extends AbstractEnrichMiss
|
|||
super(Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_BY);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("isSupplementedBy");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,4 +9,9 @@ public class EnrichMissingPublicationIsSupplementedTo extends AbstractEnrichMiss
|
|||
super(Topic.ENRICH_MISSING_PUBLICATION_IS_SUPPLEMENTED_TO);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("isSupplementedTo");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,4 +9,9 @@ public class EnrichMissingPublicationReferences extends AbstractEnrichMissingPub
|
|||
super(Topic.ENRICH_MISSING_PUBLICATION_REFERENCES);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected boolean filterByType(final String relType) {
|
||||
return relType.equals("references");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,33 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
|
||||
public class EnrichMissingSoftware
|
||||
extends UpdateMatcher<eu.dnetlib.broker.objects.Software> {
|
||||
|
||||
public EnrichMissingSoftware() {
|
||||
super(true,
|
||||
s -> Topic.ENRICH_MISSING_SOFTWARE,
|
||||
(p, s) -> p.getSoftwares().add(s),
|
||||
s -> s.getName());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<eu.dnetlib.broker.objects.Software> findDifferences(
|
||||
final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
|
||||
if (target.getSoftwares().isEmpty()) {
|
||||
return source.getSoftwares();
|
||||
} else {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,40 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.Software;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
|
||||
public class EnrichMoreSoftware extends UpdateMatcher<Software> {
|
||||
|
||||
public EnrichMoreSoftware() {
|
||||
super(true,
|
||||
s -> Topic.ENRICH_MORE_SOFTWARE,
|
||||
(p, s) -> p.getSoftwares().add(s),
|
||||
s -> s.getName());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<eu.dnetlib.broker.objects.Software> findDifferences(
|
||||
final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
|
||||
final Set<String> existingSoftwares = source
|
||||
.getSoftwares()
|
||||
.stream()
|
||||
.map(Software::getName)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return target
|
||||
.getSoftwares()
|
||||
.stream()
|
||||
.filter(p -> !existingSoftwares.contains(p.getName()))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
|
@ -5,34 +5,26 @@ import java.util.ArrayList;
|
|||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class EnrichMissingAbstract extends UpdateMatcher<Result, String> {
|
||||
public class EnrichMissingAbstract extends UpdateMatcher<String> {
|
||||
|
||||
public EnrichMissingAbstract() {
|
||||
super(false);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target) {
|
||||
if (isMissing(target.getDescription()) && !isMissing(source.getDescription())) {
|
||||
return Arrays.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target));
|
||||
}
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<String> generateUpdateInfo(final String highlightValue,
|
||||
final Result source,
|
||||
final Result target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MISSING_ABSTRACT,
|
||||
highlightValue, source, target,
|
||||
super(false,
|
||||
s -> Topic.ENRICH_MISSING_ABSTRACT,
|
||||
(p, s) -> p.getAbstracts().add(s),
|
||||
s -> s);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> findDifferences(final OpenaireBrokerResult source, final OpenaireBrokerResult target) {
|
||||
if (isMissing(target.getAbstracts()) && !isMissing(source.getAbstracts())) {
|
||||
return Arrays.asList(source.getAbstracts().get(0));
|
||||
} else {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,36 +1,43 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.broker.objects.Author;
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class EnrichMissingAuthorOrcid extends UpdateMatcher<Result, Pair<String, String>> {
|
||||
public class EnrichMissingAuthorOrcid extends UpdateMatcher<Author> {
|
||||
|
||||
public EnrichMissingAuthorOrcid() {
|
||||
super(true);
|
||||
super(true,
|
||||
aut -> Topic.ENRICH_MISSING_AUTHOR_ORCID,
|
||||
(p, aut) -> p.getCreators().add(aut),
|
||||
aut -> aut.getOrcid());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) {
|
||||
// return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f));
|
||||
return Arrays.asList();
|
||||
}
|
||||
protected List<Author> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
|
||||
final Set<String> existingOrcids = target
|
||||
.getCreators()
|
||||
.stream()
|
||||
.map(Author::getOrcid)
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
.getCreators()
|
||||
.stream()
|
||||
.filter(a -> StringUtils.isNotBlank(a.getOrcid()))
|
||||
.filter(a -> !existingOrcids.contains(a.getOrcid()))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
@Override
|
||||
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
|
||||
final Result source,
|
||||
final Result target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MISSING_AUTHOR_ORCID,
|
||||
highlightValue, source, target,
|
||||
(p, pair) -> p.getCreators().add(pair.getLeft() + " - ORCID: " + pair.getRight()),
|
||||
pair -> pair.getLeft() + "::" + pair.getRight());
|
||||
}
|
||||
}
|
||||
|
|
|
@ -6,25 +6,27 @@ import java.util.List;
|
|||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.broker.objects.Instance;
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class EnrichMissingOpenAccess extends UpdateMatcher<Result, Instance> {
|
||||
public class EnrichMissingOpenAccess extends UpdateMatcher<Instance> {
|
||||
|
||||
public EnrichMissingOpenAccess() {
|
||||
super(true);
|
||||
super(true,
|
||||
i -> Topic.ENRICH_MISSING_OA_VERSION,
|
||||
(p, i) -> p.getInstances().add(i),
|
||||
Instance::getUrl);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target) {
|
||||
protected List<Instance> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
final long count = target
|
||||
.getInstance()
|
||||
.getInstances()
|
||||
.stream()
|
||||
.map(i -> i.getAccessright().getClassid())
|
||||
.map(Instance::getLicense)
|
||||
.filter(right -> right.equals(BrokerConstants.OPEN_ACCESS))
|
||||
.count();
|
||||
|
||||
|
@ -33,24 +35,10 @@ public class EnrichMissingOpenAccess extends UpdateMatcher<Result, Instance> {
|
|||
}
|
||||
|
||||
return source
|
||||
.getInstance()
|
||||
.getInstances()
|
||||
.stream()
|
||||
.filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
|
||||
.map(ConversionUtils::oafInstanceToBrokerInstances)
|
||||
.flatMap(s -> s)
|
||||
.map(i -> generateUpdateInfo(i, source, target))
|
||||
.filter(i -> i.getLicense().equals(BrokerConstants.OPEN_ACCESS))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<Instance> generateUpdateInfo(final Instance highlightValue,
|
||||
final Result source,
|
||||
final Result target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MISSING_OA_VERSION,
|
||||
highlightValue, source, target,
|
||||
(p, i) -> p.getInstances().add(i),
|
||||
Instance::getUrl);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -5,42 +5,33 @@ import java.util.Arrays;
|
|||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.broker.objects.Pid;
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.TypedValue;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class EnrichMissingPid extends UpdateMatcher<Result, Pid> {
|
||||
public class EnrichMissingPid extends UpdateMatcher<TypedValue> {
|
||||
|
||||
public EnrichMissingPid() {
|
||||
super(true);
|
||||
super(true,
|
||||
pid -> Topic.ENRICH_MISSING_PID,
|
||||
(p, pid) -> p.getPids().add(pid),
|
||||
pid -> pid.getType() + "::" + pid.getValue());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target) {
|
||||
final long count = target.getPid().size();
|
||||
protected List<TypedValue> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
final long count = target.getPids().size();
|
||||
|
||||
if (count > 0) {
|
||||
return Arrays.asList();
|
||||
}
|
||||
|
||||
return source
|
||||
.getPid()
|
||||
.getPids()
|
||||
.stream()
|
||||
.map(ConversionUtils::oafPidToBrokerPid)
|
||||
.map(i -> generateUpdateInfo(i, source, target))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MISSING_PID,
|
||||
highlightValue, source, target,
|
||||
(p, pid) -> p.getPids().add(pid),
|
||||
pid -> pid.getType() + "::" + pid.getValue());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -5,34 +5,28 @@ import java.util.ArrayList;
|
|||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class EnrichMissingPublicationDate extends UpdateMatcher<Result, String> {
|
||||
public class EnrichMissingPublicationDate extends UpdateMatcher<String> {
|
||||
|
||||
public EnrichMissingPublicationDate() {
|
||||
super(false);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<String>> findUpdates(final Result source, final Result target) {
|
||||
if (isMissing(target.getDateofacceptance()) && !isMissing(source.getDateofacceptance())) {
|
||||
return Arrays.asList(generateUpdateInfo(source.getDateofacceptance().getValue(), source, target));
|
||||
}
|
||||
return new ArrayList<>();
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<String> generateUpdateInfo(final String highlightValue,
|
||||
final Result source,
|
||||
final Result target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MISSING_PUBLICATION_DATE,
|
||||
highlightValue, source, target,
|
||||
super(false,
|
||||
date -> Topic.ENRICH_MISSING_PUBLICATION_DATE,
|
||||
(p, date) -> p.setPublicationdate(date),
|
||||
s -> s);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
|
||||
if (isMissing(target.getPublicationdate()) && !isMissing(source.getPublicationdate())) {
|
||||
return Arrays.asList(source.getPublicationdate());
|
||||
} else {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,42 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
|
||||
public class EnrichMissingSoftware
|
||||
extends UpdateMatcher<Pair<Result, List<Software>>, eu.dnetlib.broker.objects.Software> {
|
||||
|
||||
public EnrichMissingSoftware() {
|
||||
super(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<eu.dnetlib.broker.objects.Software>> findUpdates(
|
||||
final Pair<Result, List<Software>> source,
|
||||
final Pair<Result, List<Software>> target) {
|
||||
// TODO
|
||||
return Arrays.asList();
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<eu.dnetlib.broker.objects.Software> generateUpdateInfo(
|
||||
final eu.dnetlib.broker.objects.Software highlightValue,
|
||||
final Pair<Result, List<Software>> source,
|
||||
final Pair<Result, List<Software>> target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MISSING_SOFTWARE,
|
||||
highlightValue, source.getLeft(), target.getLeft(),
|
||||
(p, s) -> p.getSoftwares().add(s),
|
||||
s -> s.getName());
|
||||
}
|
||||
|
||||
}
|
|
@ -5,50 +5,38 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.TypedValue;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class EnrichMissingSubject extends UpdateMatcher<Result, Pair<String, String>> {
|
||||
public class EnrichMissingSubject extends UpdateMatcher<TypedValue> {
|
||||
|
||||
public EnrichMissingSubject() {
|
||||
super(true);
|
||||
super(true,
|
||||
s -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + s.getType()),
|
||||
(p, s) -> p.getSubjects().add(s),
|
||||
s -> subjectAsString(s));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) {
|
||||
final Set<String> existingTypes = target
|
||||
.getSubject()
|
||||
protected List<TypedValue> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
final Set<String> existingSubject = target
|
||||
.getSubjects()
|
||||
.stream()
|
||||
.map(StructuredProperty::getQualifier)
|
||||
.map(Qualifier::getClassid)
|
||||
.map(s -> subjectAsString(s))
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
.getPid()
|
||||
.getSubjects()
|
||||
.stream()
|
||||
.filter(pid -> !existingTypes.contains(pid.getQualifier().getClassid()))
|
||||
.map(ConversionUtils::oafSubjectToPair)
|
||||
.map(i -> generateUpdateInfo(i, source, target))
|
||||
.filter(s -> !existingSubject.contains(subjectAsString(s)))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
|
||||
final Result source,
|
||||
final Result target) {
|
||||
|
||||
return new UpdateInfo<>(
|
||||
Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()),
|
||||
highlightValue, source, target,
|
||||
(p, pair) -> p.getSubjects().add(pair.getRight()),
|
||||
pair -> pair.getLeft() + "::" + pair.getRight());
|
||||
private static String subjectAsString(final TypedValue s) {
|
||||
return s.getType() + "::" + s.getValue();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -6,49 +6,36 @@ import java.util.Set;
|
|||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.broker.objects.Instance;
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class EnrichMoreOpenAccess extends UpdateMatcher<Result, Instance> {
|
||||
public class EnrichMoreOpenAccess extends UpdateMatcher<Instance> {
|
||||
|
||||
public EnrichMoreOpenAccess() {
|
||||
super(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Instance>> findUpdates(final Result source, final Result target) {
|
||||
final Set<String> urls = target
|
||||
.getInstance()
|
||||
.stream()
|
||||
.filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
|
||||
.map(i -> i.getUrl())
|
||||
.flatMap(List::stream)
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
.getInstance()
|
||||
.stream()
|
||||
.filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
|
||||
.map(ConversionUtils::oafInstanceToBrokerInstances)
|
||||
.flatMap(s -> s)
|
||||
.filter(i -> !urls.contains(i.getUrl()))
|
||||
.map(i -> generateUpdateInfo(i, source, target))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<Instance> generateUpdateInfo(final Instance highlightValue,
|
||||
final Result source,
|
||||
final Result target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MORE_OA_VERSION,
|
||||
highlightValue, source, target,
|
||||
super(true,
|
||||
i -> Topic.ENRICH_MORE_OA_VERSION,
|
||||
(p, i) -> p.getInstances().add(i),
|
||||
Instance::getUrl);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<Instance> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
final Set<String> urls = target
|
||||
.getInstances()
|
||||
.stream()
|
||||
.filter(i -> i.getLicense().equals(BrokerConstants.OPEN_ACCESS))
|
||||
.map(i -> i.getUrl())
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
.getInstances()
|
||||
.stream()
|
||||
.filter(i -> i.getLicense().equals(BrokerConstants.OPEN_ACCESS))
|
||||
.filter(i -> !urls.contains(i.getUrl()))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -5,43 +5,37 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.broker.objects.Pid;
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.TypedValue;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class EnrichMorePid extends UpdateMatcher<Result, Pid> {
|
||||
public class EnrichMorePid extends UpdateMatcher<TypedValue> {
|
||||
|
||||
public EnrichMorePid() {
|
||||
super(true);
|
||||
super(true,
|
||||
pid -> Topic.ENRICH_MORE_PID,
|
||||
(p, pid) -> p.getPids().add(pid),
|
||||
pid -> pidAsString(pid));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Pid>> findUpdates(final Result source, final Result target) {
|
||||
protected List<TypedValue> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
final Set<String> existingPids = target
|
||||
.getPid()
|
||||
.getPids()
|
||||
.stream()
|
||||
.map(pid -> pid.getQualifier().getClassid() + "::" + pid.getValue())
|
||||
.map(pid -> pidAsString(pid))
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
.getPid()
|
||||
.getPids()
|
||||
.stream()
|
||||
.filter(pid -> !existingPids.contains(pid.getQualifier().getClassid() + "::" + pid.getValue()))
|
||||
.map(ConversionUtils::oafPidToBrokerPid)
|
||||
.map(i -> generateUpdateInfo(i, source, target))
|
||||
.filter(pid -> !existingPids.contains(pidAsString(pid)))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MORE_PID,
|
||||
highlightValue, source, target,
|
||||
(p, pid) -> p.getPids().add(pid),
|
||||
pid -> pid.getType() + "::" + pid.getValue());
|
||||
private static String pidAsString(final TypedValue pid) {
|
||||
return pid.getType() + "::" + pid.getValue();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,42 +0,0 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
|
||||
public class EnrichMoreSoftware
|
||||
extends UpdateMatcher<Pair<Result, List<Software>>, eu.dnetlib.broker.objects.Software> {
|
||||
|
||||
public EnrichMoreSoftware() {
|
||||
super(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<eu.dnetlib.broker.objects.Software>> findUpdates(
|
||||
final Pair<Result, List<Software>> source,
|
||||
final Pair<Result, List<Software>> target) {
|
||||
// TODO
|
||||
return Arrays.asList();
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<eu.dnetlib.broker.objects.Software> generateUpdateInfo(
|
||||
final eu.dnetlib.broker.objects.Software highlightValue,
|
||||
final Pair<Result, List<Software>> source,
|
||||
final Pair<Result, List<Software>> target) {
|
||||
return new UpdateInfo<>(
|
||||
Topic.ENRICH_MORE_SOFTWARE,
|
||||
highlightValue, source.getLeft(), target.getLeft(),
|
||||
(p, s) -> p.getSoftwares().add(s),
|
||||
s -> s.getName());
|
||||
}
|
||||
|
||||
}
|
|
@ -5,47 +5,37 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.TypedValue;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
|
||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class EnrichMoreSubject extends UpdateMatcher<Result, Pair<String, String>> {
|
||||
public class EnrichMoreSubject extends UpdateMatcher<TypedValue> {
|
||||
|
||||
public EnrichMoreSubject() {
|
||||
super(true);
|
||||
super(true,
|
||||
s -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + s.getType()),
|
||||
(p, s) -> p.getSubjects().add(s),
|
||||
s -> subjectAsString(s));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final Result source, final Result target) {
|
||||
protected List<TypedValue> findDifferences(final OpenaireBrokerResult source,
|
||||
final OpenaireBrokerResult target) {
|
||||
final Set<String> existingSubjects = target
|
||||
.getSubject()
|
||||
.getSubjects()
|
||||
.stream()
|
||||
.map(pid -> pid.getQualifier().getClassid() + "::" + pid.getValue())
|
||||
.map(pid -> subjectAsString(pid))
|
||||
.collect(Collectors.toSet());
|
||||
|
||||
return source
|
||||
.getPid()
|
||||
.getPids()
|
||||
.stream()
|
||||
.filter(pid -> !existingSubjects.contains(pid.getQualifier().getClassid() + "::" + pid.getValue()))
|
||||
.map(ConversionUtils::oafSubjectToPair)
|
||||
.map(i -> generateUpdateInfo(i, source, target))
|
||||
.filter(s -> !existingSubjects.contains(subjectAsString(s)))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
|
||||
final Result source,
|
||||
final Result target) {
|
||||
|
||||
return new UpdateInfo<>(
|
||||
Topic.fromPath("ENRICH/MORE/SUBJECT/" + highlightValue.getLeft()),
|
||||
highlightValue, source, target,
|
||||
(p, pair) -> p.getSubjects().add(pair.getRight()),
|
||||
pair -> pair.getLeft() + "::" + pair.getRight());
|
||||
private static String subjectAsString(final TypedValue s) {
|
||||
return s.getType() + "::" + s.getValue();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,9 +1,27 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import eu.dnetlib.dhp.broker.model.Event;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
|
||||
public class BrokerConstants {
|
||||
|
||||
public final static String OPEN_ACCESS = "OPEN";
|
||||
public final static String IS_MERGED_IN_CLASS = "isMergedIn";
|
||||
public static final String OPEN_ACCESS = "OPEN";
|
||||
public static final String IS_MERGED_IN_CLASS = "isMergedIn";
|
||||
|
||||
public static final float MIN_TRUST = 0.25f;
|
||||
public static final float MAX_TRUST = 1.00f;
|
||||
|
||||
public static Class<?>[] getModelClasses() {
|
||||
final Set<Class<?>> list = new HashSet<>();
|
||||
list.addAll(Arrays.asList(ModelSupport.getOafModelClasses()));
|
||||
list.addAll(Arrays.asList(ResultGroup.class, Event.class));
|
||||
return list.toArray(new Class[] {});
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,49 +1,213 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
import java.util.stream.Stream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.DocumentHelper;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.broker.objects.Instance;
|
||||
import eu.dnetlib.broker.objects.Pid;
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.TypedValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.ExternalReference;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class ConversionUtils {
|
||||
|
||||
public static Stream<Instance> oafInstanceToBrokerInstances(final eu.dnetlib.dhp.schema.oaf.Instance i) {
|
||||
private static final Logger log = LoggerFactory.getLogger(ConversionUtils.class);
|
||||
|
||||
public static List<eu.dnetlib.broker.objects.Instance> oafInstanceToBrokerInstances(final Instance i) {
|
||||
return i.getUrl().stream().map(url -> {
|
||||
final Instance r = new Instance();
|
||||
r.setUrl(url);
|
||||
r.setInstancetype(i.getInstancetype().getClassid());
|
||||
r.setLicense(BrokerConstants.OPEN_ACCESS);
|
||||
r.setHostedby(i.getHostedby().getValue());
|
||||
return r;
|
||||
});
|
||||
return new eu.dnetlib.broker.objects.Instance()
|
||||
.setUrl(url)
|
||||
.setInstancetype(i.getInstancetype().getClassid())
|
||||
.setLicense(BrokerConstants.OPEN_ACCESS)
|
||||
.setHostedby(i.getHostedby().getValue());
|
||||
}).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static Pid oafPidToBrokerPid(final StructuredProperty sp) {
|
||||
final Pid pid = new Pid();
|
||||
pid.setValue(sp.getValue());
|
||||
pid.setType(sp.getQualifier().getClassid());
|
||||
return pid;
|
||||
public static TypedValue oafPidToBrokerPid(final StructuredProperty sp) {
|
||||
return sp != null ? new TypedValue()
|
||||
.setValue(sp.getValue())
|
||||
.setType(sp.getQualifier().getClassid()) : null;
|
||||
}
|
||||
|
||||
public static final Pair<String, String> oafSubjectToPair(final StructuredProperty sp) {
|
||||
return Pair.of(sp.getQualifier().getClassid(), sp.getValue());
|
||||
return sp != null ? Pair.of(sp.getQualifier().getClassid(), sp.getValue()) : null;
|
||||
}
|
||||
|
||||
public static final eu.dnetlib.broker.objects.Dataset oafDatasetToBrokerDataset(final Dataset d) {
|
||||
final eu.dnetlib.broker.objects.Dataset res = new eu.dnetlib.broker.objects.Dataset();
|
||||
// TODO
|
||||
return d != null ? new eu.dnetlib.broker.objects.Dataset()
|
||||
.setOriginalId(d.getOriginalId().get(0))
|
||||
.setTitle(structPropValue(d.getTitle()))
|
||||
.setPids(d.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList()))
|
||||
.setInstances(
|
||||
d
|
||||
.getInstance()
|
||||
.stream()
|
||||
.map(ConversionUtils::oafInstanceToBrokerInstances)
|
||||
.flatMap(List::stream)
|
||||
.collect(Collectors.toList()))
|
||||
.setCollectedFrom(d.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null))
|
||||
: null;
|
||||
}
|
||||
|
||||
public static final OpenaireBrokerResult oafResultToBrokerResult(final Result result) {
|
||||
|
||||
return result != null ? new OpenaireBrokerResult()
|
||||
.setOpenaireId(result.getId())
|
||||
.setOriginalId(result.getOriginalId().get(0))
|
||||
.setTypology(result.getResulttype().getClassid())
|
||||
.setTitles(structPropList(result.getTitle()))
|
||||
.setAbstracts(fieldList(result.getDescription()))
|
||||
.setLanguage(result.getLanguage().getClassid())
|
||||
.setSubjects(structPropTypedList(result.getSubject()))
|
||||
.setCreators(
|
||||
result.getAuthor().stream().map(ConversionUtils::oafAuthorToBrokerAuthor).collect(Collectors.toList()))
|
||||
.setPublicationdate(result.getDateofacceptance().getValue())
|
||||
.setPublisher(fieldValue(result.getPublisher()))
|
||||
.setEmbargoenddate(fieldValue(result.getEmbargoenddate()))
|
||||
.setContributor(fieldList(result.getContributor()))
|
||||
.setJournal(
|
||||
result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null)
|
||||
.setCollectedFromId(result.getCollectedfrom().stream().map(KeyValue::getKey).findFirst().orElse(null))
|
||||
.setCollectedFromName(result.getCollectedfrom().stream().map(KeyValue::getValue).findFirst().orElse(null))
|
||||
.setPids(result.getPid().stream().map(ConversionUtils::oafPidToBrokerPid).collect(Collectors.toList()))
|
||||
.setInstances(
|
||||
result
|
||||
.getInstance()
|
||||
.stream()
|
||||
.map(ConversionUtils::oafInstanceToBrokerInstances)
|
||||
.flatMap(List::stream)
|
||||
.collect(Collectors.toList()))
|
||||
.setExternalReferences(
|
||||
result
|
||||
.getExternalReference()
|
||||
.stream()
|
||||
.map(ConversionUtils::oafExtRefToBrokerExtRef)
|
||||
.collect(Collectors.toList()))
|
||||
: null;
|
||||
}
|
||||
|
||||
private static List<TypedValue> structPropTypedList(final List<StructuredProperty> list) {
|
||||
return list
|
||||
.stream()
|
||||
.map(
|
||||
p -> new TypedValue()
|
||||
.setValue(p.getValue())
|
||||
.setType(p.getQualifier().getClassid()))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static eu.dnetlib.broker.objects.Author oafAuthorToBrokerAuthor(final Author author) {
|
||||
return author != null ? new eu.dnetlib.broker.objects.Author()
|
||||
.setFullname(author.getFullname())
|
||||
.setOrcid(
|
||||
author
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase("orcid"))
|
||||
.map(pid -> pid.getValue())
|
||||
.findFirst()
|
||||
.orElse(null))
|
||||
: null;
|
||||
}
|
||||
|
||||
private static eu.dnetlib.broker.objects.Journal oafJournalToBrokerJournal(final Journal journal) {
|
||||
return journal != null ? new eu.dnetlib.broker.objects.Journal()
|
||||
.setName(journal.getName())
|
||||
.setIssn(journal.getIssnPrinted())
|
||||
.setEissn(journal.getIssnOnline())
|
||||
.setLissn(journal.getIssnLinking()) : null;
|
||||
}
|
||||
|
||||
private static eu.dnetlib.broker.objects.ExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) {
|
||||
return ref != null ? new eu.dnetlib.broker.objects.ExternalReference()
|
||||
.setRefidentifier(ref.getRefidentifier())
|
||||
.setSitename(ref.getSitename())
|
||||
.setType(ref.getQualifier().getClassid())
|
||||
.setUrl(ref.getUrl())
|
||||
: null;
|
||||
}
|
||||
|
||||
public static final eu.dnetlib.broker.objects.Project oafProjectToBrokerProject(final Project p) {
|
||||
if (p == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
final eu.dnetlib.broker.objects.Project res = new eu.dnetlib.broker.objects.Project()
|
||||
.setTitle(fieldValue(p.getTitle()))
|
||||
.setAcronym(fieldValue(p.getAcronym()))
|
||||
.setCode(fieldValue(p.getCode()));
|
||||
|
||||
final String ftree = fieldValue(p.getFundingtree());
|
||||
if (StringUtils.isNotBlank(ftree)) {
|
||||
try {
|
||||
final Document fdoc = DocumentHelper.parseText(ftree);
|
||||
res.setFunder(fdoc.valueOf("/fundingtree/funder/shortname"));
|
||||
res.setJurisdiction(fdoc.valueOf("/fundingtree/funder/jurisdiction"));
|
||||
res.setFundingProgram(fdoc.valueOf("//funding_level_0/name"));
|
||||
} catch (final DocumentException e) {
|
||||
log.error("Error in record " + p.getId() + ": invalid fundingtree: " + ftree);
|
||||
}
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
public static final eu.dnetlib.broker.objects.Publication oafPublicationToBrokerPublication(final Publication d) {
|
||||
final eu.dnetlib.broker.objects.Publication res = new eu.dnetlib.broker.objects.Publication();
|
||||
// TODO
|
||||
return res;
|
||||
public static final eu.dnetlib.broker.objects.Software oafSoftwareToBrokerSoftware(final Software sw) {
|
||||
return sw != null ? new eu.dnetlib.broker.objects.Software()
|
||||
.setName(structPropValue(sw.getTitle()))
|
||||
.setDescription(fieldValue(sw.getDescription()))
|
||||
.setRepository(fieldValue(sw.getCodeRepositoryUrl()))
|
||||
.setLandingPage(fieldValue(sw.getDocumentationUrl()))
|
||||
: null;
|
||||
}
|
||||
|
||||
private static String fieldValue(final Field<String> f) {
|
||||
return f != null ? f.getValue() : null;
|
||||
}
|
||||
|
||||
private static String fieldValue(final List<Field<String>> fl) {
|
||||
return fl != null ? fl.stream().map(Field::getValue).filter(StringUtils::isNotBlank).findFirst().orElse(null)
|
||||
: null;
|
||||
}
|
||||
|
||||
private static String structPropValue(final List<StructuredProperty> props) {
|
||||
return props != null
|
||||
? props.stream().map(StructuredProperty::getValue).filter(StringUtils::isNotBlank).findFirst().orElse(null)
|
||||
: null;
|
||||
}
|
||||
|
||||
private static List<String> fieldList(final List<Field<String>> fl) {
|
||||
return fl != null
|
||||
? fl.stream().map(Field::getValue).filter(StringUtils::isNotBlank).collect(Collectors.toList())
|
||||
: new ArrayList<>();
|
||||
}
|
||||
|
||||
private static List<String> structPropList(final List<StructuredProperty> props) {
|
||||
return props != null
|
||||
? props
|
||||
.stream()
|
||||
.map(StructuredProperty::getValue)
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.collect(Collectors.toList())
|
||||
: new ArrayList<>();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,86 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.dhp.broker.model.EventFactory;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsReferencedBy;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsRelatedTo;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedBy;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetIsSupplementedTo;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedDatasets.EnrichMissingDatasetReferences;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMissingProject;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedProjects.EnrichMoreProject;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsReferencedBy;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsRelatedTo;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedBy;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationIsSupplementedTo;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedPublications.EnrichMissingPublicationReferences;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMissingSoftware;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.relatedSoftware.EnrichMoreSoftware;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAbstract;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingAuthorOrcid;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingOpenAccess;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPid;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingPublicationDate;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMissingSubject;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid;
|
||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject;
|
||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
|
||||
public class EventFinder {
|
||||
|
||||
private static List<UpdateMatcher<?>> matchers = new ArrayList<>();
|
||||
static {
|
||||
matchers.add(new EnrichMissingAbstract());
|
||||
matchers.add(new EnrichMissingAuthorOrcid());
|
||||
matchers.add(new EnrichMissingOpenAccess());
|
||||
matchers.add(new EnrichMissingPid());
|
||||
matchers.add(new EnrichMissingPublicationDate());
|
||||
matchers.add(new EnrichMissingSubject());
|
||||
matchers.add(new EnrichMoreOpenAccess());
|
||||
matchers.add(new EnrichMorePid());
|
||||
matchers.add(new EnrichMoreSubject());
|
||||
|
||||
// Advanced matchers
|
||||
matchers.add(new EnrichMissingProject());
|
||||
matchers.add(new EnrichMoreProject());
|
||||
matchers.add(new EnrichMissingSoftware());
|
||||
matchers.add(new EnrichMoreSoftware());
|
||||
matchers.add(new EnrichMissingPublicationIsRelatedTo());
|
||||
matchers.add(new EnrichMissingPublicationIsReferencedBy());
|
||||
matchers.add(new EnrichMissingPublicationReferences());
|
||||
matchers.add(new EnrichMissingPublicationIsSupplementedTo());
|
||||
matchers.add(new EnrichMissingPublicationIsSupplementedBy());
|
||||
matchers.add(new EnrichMissingDatasetIsRelatedTo());
|
||||
matchers.add(new EnrichMissingDatasetIsReferencedBy());
|
||||
matchers.add(new EnrichMissingDatasetReferences());
|
||||
matchers.add(new EnrichMissingDatasetIsSupplementedTo());
|
||||
matchers.add(new EnrichMissingDatasetIsSupplementedBy());
|
||||
matchers.add(new EnrichMissingAbstract());
|
||||
}
|
||||
|
||||
public static EventGroup generateEvents(final ResultGroup results, final DedupConfig dedupConfig) {
|
||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||
|
||||
for (final OpenaireBrokerResult target : results.getData()) {
|
||||
for (final UpdateMatcher<?> matcher : matchers) {
|
||||
list.addAll(matcher.searchUpdatesForRecord(target, results.getData(), dedupConfig));
|
||||
}
|
||||
}
|
||||
|
||||
return asEventGroup(list);
|
||||
}
|
||||
|
||||
private static EventGroup asEventGroup(final List<UpdateInfo<?>> list) {
|
||||
final EventGroup events = new EventGroup();
|
||||
list.stream().map(EventFactory::newBrokerEvent).forEach(events::addElement);
|
||||
return events;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.dhp.broker.model.Event;
|
||||
|
||||
public class EventGroup implements Serializable {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 765977943803533130L;
|
||||
|
||||
private final List<Event> data = new ArrayList<>();
|
||||
|
||||
public List<Event> getData() {
|
||||
return data;
|
||||
}
|
||||
|
||||
public EventGroup addElement(final Event elem) {
|
||||
data.add(elem);
|
||||
return this;
|
||||
}
|
||||
|
||||
public EventGroup addGroup(final EventGroup group) {
|
||||
data.addAll(group.getData());
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
public class TrustUtils {
|
||||
|
||||
public static float rescale(final double score, final double threshold) {
|
||||
if (score >= BrokerConstants.MAX_TRUST) {
|
||||
return BrokerConstants.MAX_TRUST;
|
||||
}
|
||||
|
||||
final double val = (score - threshold) * (BrokerConstants.MAX_TRUST - BrokerConstants.MIN_TRUST)
|
||||
/ (BrokerConstants.MAX_TRUST - threshold);
|
||||
|
||||
if (val < BrokerConstants.MIN_TRUST) {
|
||||
return BrokerConstants.MIN_TRUST;
|
||||
}
|
||||
if (val > BrokerConstants.MAX_TRUST) {
|
||||
return BrokerConstants.MAX_TRUST;
|
||||
}
|
||||
|
||||
return (float) val;
|
||||
}
|
||||
}
|
|
@ -4,10 +4,20 @@ package eu.dnetlib.dhp.broker.oa.util;
|
|||
import java.util.function.BiConsumer;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.broker.objects.Instance;
|
||||
import eu.dnetlib.broker.objects.OpenAireEventPayload;
|
||||
import eu.dnetlib.broker.objects.Publication;
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.broker.objects.Provenance;
|
||||
import eu.dnetlib.dhp.broker.model.Topic;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
|
||||
public final class UpdateInfo<T> {
|
||||
|
||||
|
@ -15,43 +25,61 @@ public final class UpdateInfo<T> {
|
|||
|
||||
private final T highlightValue;
|
||||
|
||||
private final Result source;
|
||||
private final OpenaireBrokerResult source;
|
||||
|
||||
private final Result target;
|
||||
private final OpenaireBrokerResult target;
|
||||
|
||||
private final BiConsumer<Publication, T> compileHighlight;
|
||||
private final BiConsumer<OpenaireBrokerResult, T> compileHighlight;
|
||||
|
||||
private final Function<T, String> highlightToString;
|
||||
|
||||
private final float trust;
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(UpdateInfo.class);

/**
 * Creates the description of an update suggested by {@code source} for {@code target}.
 * The trust of the update is computed once, here, from the two records and the dedup configuration.
 *
 * @param topic the topic of the update
 * @param highlightValue the value that would be added or changed
 * @param source the record providing the value
 * @param target the record that would receive the value
 * @param compileHighlight writes the highlight value into a broker result
 * @param highlightToString renders the highlight value as a string
 * @param dedupConfig the dedup configuration used to compute the trust
 */
public UpdateInfo(final Topic topic, final T highlightValue, final OpenaireBrokerResult source,
	final OpenaireBrokerResult target,
	final BiConsumer<OpenaireBrokerResult, T> compileHighlight,
	final Function<T, String> highlightToString,
	final DedupConfig dedupConfig) {
	this.topic = topic;
	this.highlightValue = highlightValue;
	this.source = source;
	this.target = target;
	this.compileHighlight = compileHighlight;
	this.highlightToString = highlightToString;
	this.trust = calculateTrust(dedupConfig, source, target);
}
|
||||
|
||||
public T getHighlightValue() {
|
||||
return highlightValue;
|
||||
}
|
||||
|
||||
public Result getSource() {
|
||||
public OpenaireBrokerResult getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public Result getTarget() {
|
||||
public OpenaireBrokerResult getTarget() {
|
||||
return target;
|
||||
}
|
||||
|
||||
private float calculateTrust(final Result source, final Result target) {
|
||||
// TODO
|
||||
return 0.9f;
|
||||
private float calculateTrust(final DedupConfig dedupConfig, final OpenaireBrokerResult r1,
|
||||
final OpenaireBrokerResult r2) {
|
||||
try {
|
||||
final ObjectMapper objectMapper = new ObjectMapper();
|
||||
final MapDocument doc1 = MapDocumentUtil
|
||||
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1));
|
||||
final MapDocument doc2 = MapDocumentUtil
|
||||
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2));
|
||||
|
||||
final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2);
|
||||
final double threshold = dedupConfig.getWf().getThreshold();
|
||||
|
||||
return TrustUtils.rescale(score, threshold);
|
||||
} catch (final Exception e) {
|
||||
log.error("Error computing score between results", e);
|
||||
return BrokerConstants.MIN_TRUST;
|
||||
}
|
||||
}
|
||||
|
||||
protected Topic getTopic() {
|
||||
|
@ -66,12 +94,35 @@ public final class UpdateInfo<T> {
|
|||
return trust;
|
||||
}
|
||||
|
||||
public void compileHighlight(final OpenAireEventPayload payload) {
|
||||
compileHighlight.accept(payload.getHighlight(), getHighlightValue());
|
||||
}
|
||||
|
||||
public String getHighlightValueAsString() {
|
||||
return highlightToString.apply(getHighlightValue());
|
||||
}
|
||||
|
||||
public OpenAireEventPayload asBrokerPayload() {
|
||||
|
||||
compileHighlight.accept(target, getHighlightValue());
|
||||
|
||||
final OpenaireBrokerResult hl = new OpenaireBrokerResult();
|
||||
compileHighlight.accept(hl, getHighlightValue());
|
||||
|
||||
final String provId = getSource().getOriginalId();
|
||||
final String provRepo = getSource().getCollectedFromName();
|
||||
|
||||
final String provUrl = getSource()
|
||||
.getInstances()
|
||||
.stream()
|
||||
.map(Instance::getUrl)
|
||||
.findFirst()
|
||||
.orElse(null);
|
||||
;
|
||||
|
||||
final Provenance provenance = new Provenance().setId(provId).setRepositoryName(provRepo).setUrl(provUrl);
|
||||
|
||||
return new OpenAireEventPayload()
|
||||
.setPublication(target)
|
||||
.setHighlight(hl)
|
||||
.setTrust(trust)
|
||||
.setProvenance(provenance);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -0,0 +1,51 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.simple;
|
||||
|
||||
import org.apache.spark.sql.Encoder;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.expressions.Aggregator;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class ResultAggregator extends Aggregator<Tuple2<OpenaireBrokerResult, Relation>, ResultGroup, ResultGroup> {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = -1492327874705585538L;
|
||||
|
||||
@Override
|
||||
public ResultGroup zero() {
|
||||
return new ResultGroup();
|
||||
}
|
||||
|
||||
@Override
|
||||
public ResultGroup reduce(final ResultGroup group, final Tuple2<OpenaireBrokerResult, Relation> t) {
|
||||
return group.addElement(t._1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ResultGroup merge(final ResultGroup g1, final ResultGroup g2) {
|
||||
return g1.addGroup(g2);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ResultGroup finish(final ResultGroup group) {
|
||||
return group;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<ResultGroup> bufferEncoder() {
|
||||
return Encoders.kryo(ResultGroup.class);
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<ResultGroup> outputEncoder() {
|
||||
return Encoders.kryo(ResultGroup.class);
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.simple;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
|
||||
public class ResultGroup implements Serializable {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = -3360828477088669296L;
|
||||
|
||||
private final List<OpenaireBrokerResult> data = new ArrayList<>();
|
||||
|
||||
public List<OpenaireBrokerResult> getData() {
|
||||
return data;
|
||||
}
|
||||
|
||||
public ResultGroup addElement(final OpenaireBrokerResult elem) {
|
||||
data.add(elem);
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultGroup addGroup(final ResultGroup group) {
|
||||
data.addAll(group.getData());
|
||||
return this;
|
||||
}
|
||||
|
||||
public boolean isValid() {
|
||||
return data.size() > 1;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,69 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
||||
|
||||
import org.apache.spark.sql.Encoder;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.expressions.Aggregator;
|
||||
|
||||
import eu.dnetlib.broker.objects.OpenaireBrokerResult;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class OpenaireBrokerResultAggregator<T>
|
||||
extends Aggregator<Tuple2<OpenaireBrokerResult, T>, OpenaireBrokerResult, OpenaireBrokerResult> {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = -3687878788861013488L;
|
||||
|
||||
@Override
|
||||
public OpenaireBrokerResult zero() {
|
||||
return new OpenaireBrokerResult();
|
||||
}
|
||||
|
||||
@Override
|
||||
public OpenaireBrokerResult finish(final OpenaireBrokerResult g) {
|
||||
return g;
|
||||
}
|
||||
|
||||
@Override
|
||||
public OpenaireBrokerResult reduce(final OpenaireBrokerResult g, final Tuple2<OpenaireBrokerResult, T> t) {
|
||||
if (g.getOriginalId() == null) {
|
||||
return t._1;
|
||||
} else if (t._2 instanceof RelatedSoftware) {
|
||||
g.getSoftwares().add(((RelatedSoftware) t._2).getRelSoftware());
|
||||
} else if (t._2 instanceof RelatedDataset) {
|
||||
g.getDatasets().add(((RelatedDataset) t._2).getRelDataset());
|
||||
} else if (t._2 instanceof RelatedPublication) {
|
||||
g.getPublications().add(((RelatedPublication) t._2).getRelPublication());
|
||||
} else if (t._2 instanceof RelatedProject) {
|
||||
g.getProjects().add(((RelatedProject) t._2).getRelProject());
|
||||
}
|
||||
return g;
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public OpenaireBrokerResult merge(final OpenaireBrokerResult g1, final OpenaireBrokerResult g2) {
|
||||
if (g1.getOriginalId() != null) {
|
||||
g1.getSoftwares().addAll(g2.getSoftwares());
|
||||
g1.getDatasets().addAll(g2.getDatasets());
|
||||
g1.getPublications().addAll(g2.getPublications());
|
||||
g1.getProjects().addAll(g2.getProjects());
|
||||
return g1;
|
||||
} else {
|
||||
return g2;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<OpenaireBrokerResult> bufferEncoder() {
|
||||
return Encoders.kryo(OpenaireBrokerResult.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Encoder<OpenaireBrokerResult> outputEncoder() {
|
||||
return Encoders.kryo(OpenaireBrokerResult.class);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.broker.objects.Dataset;
|
||||
|
||||
public class RelatedDataset implements Serializable {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 774487705184038324L;
|
||||
private final String source;
|
||||
private final String relType;
|
||||
private final Dataset relDataset;
|
||||
|
||||
public RelatedDataset(final String source, final String relType, final Dataset relDataset) {
|
||||
this.source = source;
|
||||
this.relType = relType;
|
||||
this.relDataset = relDataset;
|
||||
}
|
||||
|
||||
public String getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
|
||||
public Dataset getRelDataset() {
|
||||
return relDataset;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
||||
|
||||
import eu.dnetlib.broker.objects.Dataset;
|
||||
import eu.dnetlib.broker.objects.Project;
|
||||
import eu.dnetlib.broker.objects.Publication;
|
||||
import eu.dnetlib.broker.objects.Software;
|
||||
|
||||
public class RelatedEntityFactory {
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public static <RT, T> RT newRelatedEntity(final String sourceId,
|
||||
final String relType,
|
||||
final T target,
|
||||
final Class<RT> clazz) {
|
||||
if (clazz == RelatedProject.class) {
|
||||
return (RT) new RelatedProject(sourceId, relType, (Project) target);
|
||||
}
|
||||
if (clazz == RelatedSoftware.class) {
|
||||
return (RT) new RelatedSoftware(sourceId, relType, (Software) target);
|
||||
}
|
||||
if (clazz == RelatedDataset.class) {
|
||||
return (RT) new RelatedDataset(sourceId, relType, (Dataset) target);
|
||||
}
|
||||
if (clazz == RelatedPublication.class) {
|
||||
return (RT) new RelatedPublication(sourceId, relType, (Publication) target);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.broker.objects.Project;
|
||||
|
||||
public class RelatedProject implements Serializable {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 4941437626549329870L;
|
||||
|
||||
private final String source;
|
||||
private final String relType;
|
||||
private final Project relProject;
|
||||
|
||||
public RelatedProject(final String source, final String relType, final Project relProject) {
|
||||
this.source = source;
|
||||
this.relType = relType;
|
||||
this.relProject = relProject;
|
||||
}
|
||||
|
||||
public String getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
|
||||
public Project getRelProject() {
|
||||
return relProject;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.broker.objects.Publication;
|
||||
|
||||
public class RelatedPublication implements Serializable {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 9021609640411395128L;
|
||||
|
||||
private final String source;
|
||||
private final String relType;
|
||||
private final Publication relPublication;
|
||||
|
||||
public RelatedPublication(final String source, final String relType, final Publication relPublication) {
|
||||
this.source = source;
|
||||
this.relType = relType;
|
||||
this.relPublication = relPublication;
|
||||
}
|
||||
|
||||
public String getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
|
||||
public Publication getRelPublication() {
|
||||
return relPublication;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.withRels;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import eu.dnetlib.broker.objects.Software;
|
||||
|
||||
public class RelatedSoftware implements Serializable {
|
||||
|
||||
/**
|
||||
*
|
||||
*/
|
||||
private static final long serialVersionUID = 7573383356943300157L;
|
||||
private final String source;
|
||||
private final String relType;
|
||||
private final Software relSoftware;
|
||||
|
||||
public RelatedSoftware(final String source, final String relType, final Software relSoftware) {
|
||||
this.source = source;
|
||||
this.relType = relType;
|
||||
this.relSoftware = relSoftware;
|
||||
}
|
||||
|
||||
public String getSource() {
|
||||
return source;
|
||||
}
|
||||
|
||||
public String getRelType() {
|
||||
return relType;
|
||||
}
|
||||
|
||||
public Software getRelSoftware() {
|
||||
return relSoftware;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -0,0 +1,99 @@
|
|||
<workflow-app name="create broker events" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
<parameters>
|
||||
<property>
|
||||
<name>graphInputPath</name>
|
||||
<description>the path where the graph is stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>eventsOutputPath</name>
|
||||
<description>the path where the events will be stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>isLookupUrl</name>
|
||||
<description>the address of the lookUp service</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>dedupConfProfId</name>
|
||||
<description>the id of a valid Dedup Configuration Profile</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="generate_events"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="generate_events">
|
||||
<java>
|
||||
<prepare>
|
||||
<delete path="${eventsOutputPath}"/>
|
||||
</prepare>
|
||||
<main-class>eu.dnetlib.dhp.broker.oa.GenerateEventsApplication</main-class>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--eventsPath</arg><arg>${eventsOutputPath}</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--dedupConfProfile</arg><arg>${dedupConfProfId}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
|
||||
</workflow-app>
|
|
@ -0,0 +1,26 @@
|
|||
[
|
||||
{
|
||||
"paramName": "g",
|
||||
"paramLongName": "graphPath",
|
||||
"paramDescription": "the path where the graph is stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "eventsPath",
|
||||
"paramDescription": "the path where the generated events will be stored",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "lu",
|
||||
"paramLongName": "isLookupUrl",
|
||||
"paramDescription": "the address of the ISLookUpService",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "d",
|
||||
"paramLongName": "dedupConfProfile",
|
||||
"paramDescription": "the id of a valid Dedup Configuration Profile",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,73 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class TrustUtilsTest {
|
||||
|
||||
private static final double THRESHOLD = 0.95;
|
||||
|
||||
@Test
|
||||
public void rescaleTest_1() {
|
||||
verifyValue(-0.3, BrokerConstants.MIN_TRUST);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_2() {
|
||||
verifyValue(0.0, BrokerConstants.MIN_TRUST);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_3() {
|
||||
verifyValue(0.5, BrokerConstants.MIN_TRUST);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_4() {
|
||||
verifyValue(0.95, BrokerConstants.MIN_TRUST);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_5() {
|
||||
verifyValue(0.96, BrokerConstants.MIN_TRUST);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_6() {
|
||||
verifyValue(0.97, 0.3f);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_7() {
|
||||
verifyValue(0.98, 0.45f);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_8() {
|
||||
verifyValue(0.99, 0.6f);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_9() {
|
||||
verifyValue(1.00, BrokerConstants.MAX_TRUST);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_10() {
|
||||
verifyValue(1.01, BrokerConstants.MAX_TRUST);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void rescaleTest_11() {
|
||||
verifyValue(2.00, BrokerConstants.MAX_TRUST);
|
||||
}
|
||||
|
||||
private void verifyValue(final double originalScore, final float expectedTrust) {
|
||||
final float trust = TrustUtils.rescale(originalScore, THRESHOLD);
|
||||
System.out.println(trust);
|
||||
assertTrue(Math.abs(trust - expectedTrust) < 0.01);
|
||||
}
|
||||
|
||||
}
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<artifactId>dhp-dedup-openaire</artifactId>
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.2-SNAPSHOT</version>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
|
|
@ -0,0 +1,92 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<parent>
|
||||
<artifactId>dhp-workflows</artifactId>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<version>1.2.3-SNAPSHOT</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<artifactId>dhp-doiboost</artifactId>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>net.alchim31.maven</groupId>
|
||||
<artifactId>scala-maven-plugin</artifactId>
|
||||
<version>4.0.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>scala-compile-first</id>
|
||||
<phase>initialize</phase>
|
||||
<goals>
|
||||
<goal>add-source</goal>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>scala-test-compile</id>
|
||||
<phase>process-test-resources</phase>
|
||||
<goals>
|
||||
<goal>testCompile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<scalaVersion>${scala.version}</scalaVersion>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
|
||||
</build>
|
||||
|
||||
|
||||
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-client</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
<version>4.3.4</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-common</artifactId>
|
||||
<version>${project.version}</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>org.apache.cxf</groupId>
|
||||
<artifactId>cxf-rt-transports-http</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib.dhp</groupId>
|
||||
<artifactId>dhp-schemas</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_2.11</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_2.11</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
||||
</project>
|
|
@ -0,0 +1,366 @@
|
|||
package eu.dnetlib.doiboost
|
||||
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Organization, Publication, Qualifier, Relation, Result, StructuredProperty}
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import org.apache.commons.lang3.StringUtils
|
||||
import org.codehaus.jackson.map.ObjectMapper
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.io.Source
|
||||
|
||||
|
||||
/**
 * One entry of the hosted-by map, as extracted from JSON by `DoiBoostMappingUtil.toHostedByItem`.
 *
 * `DoiBoostMappingUtil.fixPublication` reads `id` (turned into a datasource identifier),
 * `officialname` (used as the hosted-by label) and `openAccess` (drives the access right).
 *
 * @param id           identifier of the hosting entry, in the `prefix::localId` form expected by `generateDSId`
 * @param officialname display name of the hosting entry
 * @param issn         ISSN field of the entry
 * @param eissn        electronic ISSN field of the entry
 * @param lissn        linking ISSN field of the entry
 * @param openAccess   whether the entry is flagged as open access
 */
case class HostedByItemType(
  id: String,
  officialname: String,
  issn: String,
  eissn: String,
  lissn: String,
  openAccess: Boolean
)
|
||||
|
||||
/**
 * Row type for a paper/affiliation pair. Field names are capitalised because they are matched
 * by name against the columns of the same name selected in `SparkGenerateDoiBoost`.
 *
 * @param PaperId       identifier of the paper
 * @param AffiliationId identifier of the affiliation
 * @param GridId        optional GRID identifier of the affiliation
 * @param OfficialPage  optional web page of the affiliation
 * @param DisplayName   optional display name of the affiliation
 */
case class DoiBoostAffiliation(
  PaperId: Long,
  AffiliationId: Long,
  GridId: Option[String],
  OfficialPage: Option[String],
  DisplayName: Option[String]
)
|
||||
|
||||
/**
 * Constants and helper functions shared by the DOIBoost jobs: identifier generation,
 * construction of small OAF value objects (qualifiers, key/values, data info), publication
 * filtering and the final hosted-by / access-right fix-up.
 */
object DoiBoostMappingUtil {

  val logger: Logger = LoggerFactory.getLogger(getClass)

  //STATIC STRING
  val MAG = "microsoft"
  val MAG_NAME = "Microsoft Academic Graph"
  val ORCID = "ORCID"
  val CROSSREF = "Crossref"
  val UNPAYWALL = "UnpayWall"
  val GRID_AC = "grid.ac"
  val WIKPEDIA = "wikpedia"
  val doiBoostNSPREFIX = "doiboost____"
  val OPENAIRE_PREFIX = "openaire____"
  val SEPARATOR = "::"
  val DNET_LANGUAGES = "dnet:languages"
  val PID_TYPES = "dnet:pid_types"

  /** Lower-cased, trimmed author names that `isValidAuthorName` rejects. */
  val invalidName = List(",", "none none", "none, none", "none &na;", "(:null)", "test test test", "test test", "test", "&na; &na;")

  // A single mapper is reused by toActionSet instead of allocating one per serialized record.
  private lazy val actionMapper = new ObjectMapper()

  /** Qualifier used as country for organizations whose country is not known. */
  def getUnknownCountry(): Qualifier = {
    createQualifier("UNKNOWN", "UNKNOWN", "dnet:countries", "dnet:countries")
  }

  /** Builds the organization identifier for a MAG affiliation from the md5 of `affId`. */
  def generateMAGAffiliationId(affId: String): String = {
    s"20|microsoft___$SEPARATOR${DHPUtils.md5(affId)}"
  }

  /** Wraps `payload` into an AtomicAction and returns (runtime class name of the payload, action as JSON). */
  private def serializeAction[T <: Oaf](payload: T, clazz: Class[T]): (String, String) = {
    val action = new AtomicAction[T]
    action.setClazz(clazz)
    action.setPayload(payload)
    (payload.getClass.getCanonicalName, actionMapper.writeValueAsString(action))
  }

  /**
   * Converts an OAF entity into the (class name, serialized AtomicAction) pair stored in the
   * action set.
   *
   * @param item a Dataset, Publication, Organization or Relation
   * @return the pair described above, or `null` when `item` is of any other type
   */
  def toActionSet(item: Oaf): (String, String) = {
    item match {
      case dataset: Dataset           => serializeAction(dataset, classOf[Dataset])
      case publication: Publication   => serializeAction(publication, classOf[Publication])
      case organization: Organization => serializeAction(organization, classOf[Organization])
      case relation: Relation         => serializeAction(relation, classOf[Relation])
      case _                          => null
    }
  }

  /**
   * Parses one JSON object of the form `{ "<key>": { ...HostedByItemType fields... } }` and
   * returns its first entry.
   *
   * Throws NoSuchElementException when the parsed object has no entries.
   */
  def toHostedByItem(input: String): (String, HostedByItemType) = {
    implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats

    val json: json4s.JValue = parse(input)
    val entries: Map[String, HostedByItemType] = json.extract[Map[String, HostedByItemType]]
    (entries.keys.head, entries.values.head)
  }

  /**
   * Pairs a publication with the key used to join it against the hosted-by map: the first
   * non-empty value among printed, online and linking ISSN of its journal, or the publication
   * id when there is no journal or no ISSN.
   */
  def toISSNPair(publication: Publication): (String, Publication) = {
    val journal = publication.getJournal
    val issnCandidates: Seq[String] =
      if (journal == null) Seq.empty[String]
      else Seq(journal.getIssnPrinted, journal.getIssnOnline, journal.getIssnLinking)

    val joinKey = issnCandidates.find(v => v != null && v.nonEmpty).getOrElse(publication.getId)
    (joinKey, publication)
  }

  /** Builds the organization identifier for a GRID id from the md5 of its lower-cased, trimmed value. */
  def generateGridAffiliationId(gridId: String): String = {
    s"20|grid________::${DHPUtils.md5(gridId.toLowerCase().trim())}"
  }

  /**
   * Copies the instance type of the first instance that has a non-empty class id onto every
   * instance of the list. Does nothing when no instance carries such a type.
   */
  private def propagateInstanceType(instances: java.util.List[Instance]): Unit = {
    val all = instances.asScala
    all
      .find(i => i.getInstancetype != null && i.getInstancetype.getClassid != null && i.getInstancetype.getClassid.nonEmpty)
      .foreach { typed =>
        val instanceType = typed.getInstancetype
        all.foreach(_.setInstancetype(instanceType))
      }
  }

  /**
   * Normalises the instances of a dataset in place: aligns their instance type and marks all of
   * them as hosted by the unknown repository.
   *
   * @return the same `result` object, mutated
   */
  def fixResult(result: Dataset): Dataset = {
    propagateInstanceType(result.getInstance())
    result.getInstance().asScala.foreach(i => i.setHostedby(getUbknownHostedBy()))
    result
  }

  /** Key/value identifying the "Unknown Repository" datasource. */
  def getUbknownHostedBy(): KeyValue = {
    val hb = new KeyValue
    hb.setValue("Unknown Repository")
    hb.setKey(s"10|${OPENAIRE_PREFIX}::55045bd2a65019fd8e6741a755395c8c")
    hb
  }

  def getOpenAccessQualifier(): Qualifier = {
    createQualifier("OPEN", "Open Access", "dnet:access_modes", "dnet:access_modes")
  }

  def getRestrictedQualifier(): Qualifier = {
    createQualifier("RESTRICTED", "Restricted", "dnet:access_modes", "dnet:access_modes")
  }

  /**
   * Finalises a publication after the left join with the hosted-by map.
   *
   * For every instance: aligns the instance type, then sets the hosted-by to the matched map
   * entry (or to the unknown repository when there is no match). When the matched entry is open
   * access, the instance access right and the publication best access right are set to OPEN.
   * Finally the best access right is recomputed from the typed instances that declare an access
   * right: OPEN if any of them is OPEN, RESTRICTED otherwise; it is left untouched when no such
   * instance exists.
   *
   * @param input ((join key, publication), (join key, hosted-by item)); the second element is
   *              `null` when the join found no match
   * @return the same publication object, mutated
   */
  def fixPublication(input: ((String, Publication), (String, HostedByItemType))): Publication = {
    val publication = input._1._2
    val item = if (input._2 != null) input._2._2 else null

    propagateInstanceType(publication.getInstance())

    publication.getInstance().asScala.foreach { i =>
      if (item != null) {
        val hb = new KeyValue
        hb.setValue(item.officialname)
        hb.setKey(generateDSId(item.id))
        // Both access-right updates depend on the open-access flag. Previously the missing
        // braces made the best access right OPEN for every matched item, open access or not.
        if (item.openAccess) {
          i.setAccessright(getOpenAccessQualifier())
          publication.setBestaccessright(getOpenAccessQualifier())
        }
        i.setHostedby(hb)
      } else {
        i.setHostedby(getUbknownHostedBy())
      }
    }

    val accessRights = publication.getInstance().asScala
      .filter(i => i.getInstancetype != null && i.getAccessright != null && i.getAccessright.getClassid != null)
      .map(i => i.getAccessright.getClassid)

    if (accessRights.nonEmpty) {
      if (accessRights.contains("OPEN"))
        publication.setBestaccessright(getOpenAccessQualifier())
      else
        publication.setBestaccessright(getRestrictedQualifier())
    }
    publication
  }

  /** Turns `prefix::localId` into the datasource identifier `10|prefix::md5(localId)`. */
  def generateDSId(input: String): String = {
    val prefix = StringUtils.substringBefore(input, "::")
    val localId = StringUtils.substringAfter(input, "::")
    s"10|${prefix}::${DHPUtils.md5(localId)}"
  }

  /** DataInfo with the default trust of "0.9". */
  def generateDataInfo(): DataInfo = {
    generateDataInfo("0.9")
  }

  /**
   * Decides whether a publication is kept.
   *
   * A publication is rejected when it is null, has no usable title, comes from one of the known
   * test publishers (#4360), has no author with a valid name, or matches the known bogus
   * author/publisher pair (#4368).
   *
   * @return `true` when the publication passes every check
   */
  def filterPublication(publication: Publication): Boolean = {

    //Case empty publication
    if (publication == null)
      return false

    //Case publication with no title
    val titles = publication.getTitle
    if (titles == null || titles.isEmpty)
      return false

    val hasUsableTitle = titles.asScala.exists(t =>
      t.getValue != null && t.getValue.nonEmpty && !t.getValue.equalsIgnoreCase("[NO TITLE AVAILABLE]"))
    if (!hasUsableTitle)
      return false

    // fixes #4360 (test publisher)
    val publisher = if (publication.getPublisher != null) publication.getPublisher.getValue else null
    if (publisher != null && (publisher.equalsIgnoreCase("Test accounts") || publisher.equalsIgnoreCase("CrossRef Test Account")))
      return false

    //Publication with no Author
    if (publication.getAuthor == null || publication.getAuthor.isEmpty)
      return false

    //filter invalid author
    // NOTE(review): when full name, name and surname are all null the fallback is the literal
    // string "null null", which is not in invalidName and therefore counts as valid — confirm
    // whether that is intended.
    val authors = publication.getAuthor.asScala.map { a =>
      if (a.getFullname != null && a.getFullname.nonEmpty) a.getFullname
      else s"${a.getName} ${a.getSurname}"
    }

    if (!authors.exists(isValidAuthorName))
      return false

    // fixes #4368
    // Uses the null-safe `publisher` computed above; a publication without publisher used to
    // throw a NullPointerException here.
    if (authors.exists(_.equalsIgnoreCase("Addie Jackson")) && "Elsevier BV".equalsIgnoreCase(publisher))
      return false

    true
  }

  /** A name is valid when it is non-empty and its lower-cased, trimmed form is not in `invalidName`. */
  def isValidAuthorName(fullName: String): Boolean = {
    fullName != null && fullName.nonEmpty && !invalidName.contains(fullName.toLowerCase.trim)
  }

  /** DataInfo flagged as not deleted, not inferred, not invisible, with the given trust and the `sysimport:actionset` provenance. */
  def generateDataInfo(trust: String): DataInfo = {
    val di = new DataInfo
    di.setDeletedbyinference(false)
    di.setInferred(false)
    di.setInvisible(false)
    di.setTrust(trust)
    di.setProvenanceaction(createQualifier("sysimport:actionset", "dnet:provenanceActions"))
    di
  }

  /** Structured property holding `value`, qualified by (`classId`, `schemeId`). */
  def createSP(value: String, classId: String, schemeId: String): StructuredProperty = {
    val sp = new StructuredProperty
    sp.setQualifier(createQualifier(classId, schemeId))
    sp.setValue(value)
    sp
  }

  /** Same as the three-argument `createSP`, additionally attaching `dataInfo`. */
  def createSP(value: String, classId: String, schemeId: String, dataInfo: DataInfo): StructuredProperty = {
    val sp = createSP(value, classId, schemeId)
    sp.setDataInfo(dataInfo)
    sp
  }

  /** Collected-from key/value: `name` as label, `10|openaire____::md5(idSeed)` as key. */
  private def collectedFrom(name: String, idSeed: String): KeyValue = {
    val cf = new KeyValue
    cf.setValue(name)
    cf.setKey("10|" + OPENAIRE_PREFIX + SEPARATOR + DHPUtils.md5(idSeed))
    cf
  }

  def createCrossrefCollectedFrom(): KeyValue = {
    collectedFrom(CROSSREF, CROSSREF.toLowerCase)
  }

  def createUnpayWallCollectedFrom(): KeyValue = {
    collectedFrom(UNPAYWALL, UNPAYWALL.toLowerCase)
  }

  def createORIDCollectedFrom(): KeyValue = {
    collectedFrom(ORCID, ORCID.toLowerCase)
  }

  /**
   * Builds the result identifier `50|doiboost____::md5(lower-cased doi)`.
   *
   * @param oaf not used by the computation; kept for interface compatibility
   * @param doi the DOI the identifier is derived from
   */
  def generateIdentifier(oaf: Result, doi: String): String = {
    val id = DHPUtils.md5(doi.toLowerCase)
    s"50|${doiBoostNSPREFIX}${SEPARATOR}${id}"
  }

  def createMAGCollectedFrom(): KeyValue = {
    collectedFrom(MAG_NAME, MAG)
  }

  /** Qualifier with explicit class id/name and scheme id/name. */
  def createQualifier(clsName: String, clsValue: String, schName: String, schValue: String): Qualifier = {
    val q = new Qualifier
    q.setClassid(clsName)
    q.setClassname(clsValue)
    q.setSchemeid(schName)
    q.setSchemename(schValue)
    q
  }

  /** Qualifier whose class name equals its class id and whose scheme name equals its scheme id. */
  def createQualifier(cls: String, sch: String): Qualifier = {
    createQualifier(cls, cls, sch, sch)
  }

  /** Wraps `value` into a Field with no other attribute set. */
  def asField[T](value: T): Field[T] = {
    val tmp = new Field[T]
    tmp.setValue(value)
    tmp
  }

}
|
|
@ -0,0 +1,80 @@
|
|||
package eu.dnetlib.doiboost
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||
import eu.dnetlib.dhp.schema.oaf.{Organization, Publication, Relation, Dataset => OafDataset}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.io.Text
|
||||
import org.apache.hadoop.io.compress.GzipCodec
|
||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
/**
 * Spark job that serializes the DOIBoost datasets, publications, organizations and relations
 * as (class name, AtomicAction JSON) pairs under `<targetPath>/actionSet`, then rewrites those
 * pairs as a gzip-compressed Hadoop sequence file under `<targetPath>/rawset`.
 */
object SparkGenerateDOIBoostActionSet {
  val logger: Logger = LoggerFactory.getLogger(getClass)

  def main(args: Array[String]): Unit = {

    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_as_params.json")))
    parser.parseArgument(args)

    val spark: SparkSession = SparkSession
      .builder()
      .config(conf)
      .appName(getClass.getSimpleName)
      .master(parser.get("master"))
      .getOrCreate()

    // The OAF beans are encoded with Kryo; the action-set pairs are plain string tuples.
    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
    implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
    implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
    implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]
    implicit val mapEncoderAS: Encoder[(String, String)] = Encoders.tuple(Encoders.STRING, Encoders.STRING)

    implicit val mapEncoderAtomiAction: Encoder[AtomicAction[OafDataset]] = Encoders.kryo[AtomicAction[OafDataset]]

    val dbPublicationPath = parser.get("dbPublicationPath")
    val dbDatasetPath = parser.get("dbDatasetPath")
    val crossRefRelation = parser.get("crossRefRelation")
    val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath")
    val dbOrganizationPath = parser.get("dbOrganizationPath")
    val workingDirPath = parser.get("targetPath")

    val actionSetPath = s"$workingDirPath/actionSet"

    // Datasets go first and overwrite any previous output; every later step appends to it.
    val datasetActions = spark.read.load(dbDatasetPath).as[OafDataset]
      .map(d => DoiBoostMappingUtil.fixResult(d))
      .map(d => DoiBoostMappingUtil.toActionSet(d))(mapEncoderAS)
    datasetActions.write.mode(SaveMode.Overwrite).save(actionSetPath)

    val publicationActions = spark.read.load(dbPublicationPath).as[Publication]
      .map(p => DoiBoostMappingUtil.toActionSet(p))(mapEncoderAS)
    publicationActions.write.mode(SaveMode.Append).save(actionSetPath)

    val organizationActions = spark.read.load(dbOrganizationPath).as[Organization]
      .map(o => DoiBoostMappingUtil.toActionSet(o))(mapEncoderAS)
    organizationActions.write.mode(SaveMode.Append).save(actionSetPath)

    val crossrefRelationActions = spark.read.load(crossRefRelation).as[Relation]
      .map(r => DoiBoostMappingUtil.toActionSet(r))(mapEncoderAS)
    crossrefRelationActions.write.mode(SaveMode.Append).save(actionSetPath)

    val affiliationRelationActions = spark.read.load(dbaffiliationRelationPath).as[Relation]
      .map(r => DoiBoostMappingUtil.toActionSet(r))(mapEncoderAS)
    affiliationRelationActions.write.mode(SaveMode.Append).save(actionSetPath)

    // Read the accumulated pairs back and export them as a compressed <Text, Text> sequence file.
    val d: Dataset[(String, String)] = spark.read.load(actionSetPath).as[(String, String)]

    d.rdd
      .map(pair => (new Text(pair._1), new Text(pair._2)))
      .saveAsHadoopFile(s"$workingDirPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text, Text]], classOf[GzipCodec])

  }

}
|
|
@ -0,0 +1,140 @@
|
|||
package eu.dnetlib.doiboost
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset, Organization}
|
||||
import eu.dnetlib.doiboost.mag.ConversionUtil
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.functions.col
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
 * Spark job that builds the DOIBoost publication set.
 *
 * Crossref publications are left-joined, in sequence, with the UnpayWall, ORCID and MAG
 * publications found under the working directory, merging each match into the Crossref record.
 * The merged publications are filtered, enriched with hosted-by information, and finally used
 * to produce publication/organization affiliation relations and the organizations that have no
 * GRID id.
 */
object SparkGenerateDoiBoost {

  def main(args: Array[String]): Unit = {

    val logger: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/generate_doiboost_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

    import spark.implicits._

    val hostedByMapPath = parser.get("hostedByMapPath")
    val workingDirPath = parser.get("workingDirPath")

    implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
    implicit val mapEncoderOrg: Encoder[Organization] = Encoders.kryo[Organization]
    implicit val mapEncoderDataset: Encoder[OafDataset] = Encoders.kryo[OafDataset]
    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPub)
    implicit val mapEncoderRel: Encoder[Relation] = Encoders.kryo[Relation]

    /** Loads the publications stored under `<workingDirPath>/<name>`, keyed by their id. */
    def loadById(name: String): Dataset[(String, Publication)] =
      spark.read.load(s"$workingDirPath/$name").as[Publication].map(p => (p.getId, p))(tupleForJoinEncoder)

    /** Merges the right-hand publication of a left join, when present, into the left-hand one. */
    def applyMerge(item: ((String, Publication), (String, Publication))): Publication = {
      val crossrefPub = item._1._2
      if (item._2 != null) {
        val otherPub = item._2._2
        if (otherPub != null) {
          crossrefPub.mergeFrom(otherPub)
        }
      }
      crossrefPub
    }

    /** Affiliation relation between a publication and an organization, collected from MAG. */
    def affiliationRelation(source: String, target: String, relClass: String, pub: Publication): Relation = {
      val r = new Relation
      r.setSource(source)
      r.setTarget(target)
      r.setRelType("resultOrganization")
      r.setRelClass(relClass)
      r.setSubRelType("affiliation")
      r.setDataInfo(pub.getDataInfo)
      r.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
      r
    }

    logger.info("Phase 2) Join Crossref with UnpayWall")

    val crossrefPublication: Dataset[(String, Publication)] = loadById("crossrefPublication")
    val uwPublication: Dataset[(String, Publication)] = loadById("uwPublication")
    crossrefPublication.joinWith(uwPublication, crossrefPublication("_1").equalTo(uwPublication("_1")), "left")
      .map(applyMerge)
      .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/firstJoin")

    logger.info("Phase 3) Join Result with ORCID")
    val fj: Dataset[(String, Publication)] = loadById("firstJoin")
    val orcidPublication: Dataset[(String, Publication)] = loadById("orcidPublication")
    fj.joinWith(orcidPublication, fj("_1").equalTo(orcidPublication("_1")), "left")
      .map(applyMerge)
      .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/secondJoin")

    // This step was also logged as "Phase 3", which made the two joins indistinguishable in the logs.
    logger.info("Phase 4) Join Result with MAG")
    val sj: Dataset[(String, Publication)] = loadById("secondJoin")
    val magPublication: Dataset[(String, Publication)] = loadById("magPublication")
    sj.joinWith(magPublication, sj("_1").equalTo(magPublication("_1")), "left")
      .map(applyMerge)
      .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublication")

    // Drop unusable publications and key the remaining ones by ISSN (or id) for the hosted-by join.
    val doiBoostPublication: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublication").as[Publication]
      .filter(p => DoiBoostMappingUtil.filterPublication(p))
      .map(DoiBoostMappingUtil.toISSNPair)(tupleForJoinEncoder)

    val hostedByDataset: Dataset[(String, HostedByItemType)] = spark.createDataset(spark.sparkContext.textFile(hostedByMapPath).map(DoiBoostMappingUtil.toHostedByItem))

    doiBoostPublication.joinWith(hostedByDataset, doiBoostPublication("_1").equalTo(hostedByDataset("_1")), "left")
      .map(DoiBoostMappingUtil.fixPublication)
      .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationFiltered")

    val affiliationPath = parser.get("affiliationPath")
    val paperAffiliationPath = parser.get("paperAffiliationPath")

    val affiliation = spark.read.load(affiliationPath).select(col("AffiliationId"), col("GridId"), col("OfficialPage"), col("DisplayName"))

    val paperAffiliation = spark.read.load(paperAffiliationPath).select(col("AffiliationId").alias("affId"), col("PaperId"))

    val a: Dataset[DoiBoostAffiliation] = paperAffiliation
      .joinWith(affiliation, paperAffiliation("affId").equalTo(affiliation("AffiliationId")))
      .select(col("_1.PaperId"), col("_2.AffiliationId"), col("_2.GridId"), col("_2.OfficialPage"), col("_2.DisplayName")).as[DoiBoostAffiliation]

    // Key the final publications by their MAG identifier; those without one cannot be joined.
    val magPubs: Dataset[(String, Publication)] = spark.read.load(s"$workingDirPath/doiBoostPublicationFiltered").as[Publication]
      .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p))(tupleForJoinEncoder)
      .filter(s => s._1 != null)

    // Both outputs below derive from the same join, so its definition is shared.
    val pubsWithAffiliation = magPubs.joinWith(a, magPubs("_1").equalTo(a("PaperId")))

    // One relation per direction for every (publication, affiliation) pair.
    pubsWithAffiliation.flatMap(item => {
      val pub: Publication = item._1._2
      val affiliation = item._2
      val affId: String =
        if (affiliation.GridId.isDefined) DoiBoostMappingUtil.generateGridAffiliationId(affiliation.GridId.get)
        else DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString)
      List(
        affiliationRelation(pub.getId, affId, "hasAuthorInstitution", pub),
        affiliationRelation(affId, pub.getId, "isAuthorInstitutionOf", pub)
      )
    })(mapEncoderRel).write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostPublicationAffiliation")

    // Organizations are generated only for affiliations without a GRID id. Filtering first
    // avoids emitting null rows that must be removed afterwards.
    pubsWithAffiliation
      .filter(item => item._2.GridId.isEmpty)
      .map(item => {
        val affiliation = item._2
        val o = new Organization
        o.setCollectedfrom(List(DoiBoostMappingUtil.createMAGCollectedFrom()).asJava)
        o.setDataInfo(DoiBoostMappingUtil.generateDataInfo())
        o.setId(DoiBoostMappingUtil.generateMAGAffiliationId(affiliation.AffiliationId.toString))
        o.setOriginalId(List(affiliation.AffiliationId.toString).asJava)
        affiliation.DisplayName.foreach(name => o.setLegalname(DoiBoostMappingUtil.asField(name)))
        affiliation.OfficialPage.foreach(page => o.setWebsiteurl(DoiBoostMappingUtil.asField(page)))
        o.setCountry(DoiBoostMappingUtil.getUnknownCountry())
        o
      })
      .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/doiBoostOrganization")
  }

}
|
|
@ -0,0 +1,445 @@
|
|||
package eu.dnetlib.doiboost.crossref
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf._
|
||||
import eu.dnetlib.dhp.utils.DHPUtils
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
|
||||
import org.apache.commons.lang.StringUtils
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.JsonAST._
|
||||
import org.json4s.jackson.JsonMethods._
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.collection.mutable
|
||||
import scala.util.matching.Regex
|
||||
|
||||
/**
 * Affiliation of an author, carrying only its name.
 *
 * @param name the affiliation name
 */
case class mappingAffiliation(name: String)
|
||||
|
||||
/**
 * Author record; field names presumably mirror the keys of the Crossref author JSON this class
 * is extracted from — verify against the extraction code.
 *
 * @param given       optional given name
 * @param family      family name
 * @param ORCID       optional ORCID of the author
 * @param affiliation optional affiliation of the author
 */
case class mappingAuthor(
  given: Option[String],
  family: String,
  ORCID: Option[String],
  affiliation: Option[mappingAffiliation]
)
|
||||
|
||||
/**
 * Funder record; field names presumably mirror the keys of the Crossref funder JSON this class
 * is extracted from — verify against the extraction code.
 *
 * @param name  funder name
 * @param DOI   optional DOI of the funder
 * @param award optional list of award identifiers
 */
case class mappingFunder(
  name: String,
  DOI: Option[String],
  award: Option[List[String]]
)
|
||||
|
||||
|
||||
case object Crossref2Oaf {
|
||||
val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass)
|
||||
|
||||
// Maps each supported Crossref work type to the kind of result it becomes: every listed type
// is a "publication" except "dataset", which maps to "dataset".
val mappingCrossrefType = (
  List(
    "book-section",
    "book",
    "book-chapter",
    "book-part",
    "book-series",
    "book-set",
    "book-track",
    "edited-book",
    "reference-book",
    "monograph",
    "journal-article",
    "dissertation",
    "other",
    "peer-review",
    "proceedings",
    "proceedings-article",
    "reference-entry",
    "report",
    "report-series",
    "standard",
    "standard-series",
    "posted-content"
  ).map(crossrefType => crossrefType -> "publication") :+ ("dataset" -> "dataset")
).toMap
|
||||
|
||||
|
||||
// Maps a Crossref work type to the "<code> <label>" instance type assigned to the result.
// The key "report" used to be listed twice with the same value; the duplicate is removed.
// NOTE(review): "posted-content" is present in mappingCrossrefType but has no entry here, while
// "preprint" has an entry here but not there — confirm whether that asymmetry is intended.
val mappingCrossrefSubType = Map(
  "book-section" -> "0013 Part of book or chapter of book",
  "book" -> "0002 Book",
  "book-chapter" -> "0013 Part of book or chapter of book",
  "book-part" -> "0013 Part of book or chapter of book",
  "book-series" -> "0002 Book",
  "book-set" -> "0002 Book",
  "book-track" -> "0002 Book",
  "edited-book" -> "0002 Book",
  "reference-book" -> "0002 Book",
  "monograph" -> "0002 Book",
  "journal-article" -> "0001 Article",
  "dissertation" -> "0006 Doctoral thesis",
  "other" -> "0038 Other literature type",
  "peer-review" -> "0015 Review",
  "proceedings" -> "0004 Conference object",
  "proceedings-article" -> "0004 Conference object",
  "reference-entry" -> "0013 Part of book or chapter of book",
  "report" -> "0017 Report",
  "report-series" -> "0017 Report",
  "standard" -> "0038 Other literature type",
  "standard-series" -> "0038 Other literature type",
  "dataset" -> "0021 Dataset",
  "preprint" -> "0016 Preprint"
)
|
||||
|
||||
/**
 * Copies the fields shared by every Crossref record from `json` into `result`.
 *
 * Sets pid, original ids, identifier, data info, timestamps, collectedfrom, publisher, titles,
 * description, source, relevant dates, date of acceptance, subjects, authors and a single instance.
 *
 * @param result       the object to populate; it is mutated in place
 * @param json         the parsed Crossref record
 * @param cobjCategory category string: characters 0-3 are used as class id, the text from index 5 as class name
 * @return the same `result` instance
 */
def mappingResult(result: Result, json: JValue, cobjCategory: String): Result = {
  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats

  //MAPPING Crossref DOI into PID
  val doi: String = (json \ "DOI").extract[String]
  result.setPid(List(createSP(doi, "doi", PID_TYPES)).asJava)

  //MAPPING Crossref DOI into OriginalId
  //and Other Original Identifier of dataset like clinical-trial-number
  val clinicalTrialNumbers = for (JString(ctr) <- json \ "clinical-trial-number") yield ctr
  val alternativeIds = for (JString(ids) <- json \ "alternative-id") yield ids
  val tmp = clinicalTrialNumbers ::: alternativeIds ::: List(doi)
  result.setOriginalId(tmp.filter(id => id != null).asJava)

  //Set identifier as {50|60} | doiboost____::md5(DOI)
  result.setId(generateIdentifier(result, doi))

  // Add DataInfo
  result.setDataInfo(generateDataInfo())

  result.setLastupdatetimestamp((json \ "indexed" \ "timestamp").extract[Long])
  result.setDateofcollection((json \ "indexed" \ "date-time").extract[String])

  result.setCollectedfrom(List(createCrossrefCollectedFrom()).asJava)

  // Publisher ( Name of work's publisher mapped into Result/Publisher)
  val publisher = (json \ "publisher").extractOrElse[String](null)
  if (publisher != null && publisher.nonEmpty)
    result.setPublisher(asField(publisher))

  // TITLE
  val mainTitles = for {JString(title) <- json \ "title" if title.nonEmpty} yield createSP(title, "main title", "dnet:dataCite_title")
  val originalTitles = for {JString(title) <- json \ "original-title" if title.nonEmpty} yield createSP(title, "alternative title", "dnet:dataCite_title")
  val shortTitles = for {JString(title) <- json \ "short-title" if title.nonEmpty} yield createSP(title, "alternative title", "dnet:dataCite_title")
  val subtitles = for {JString(title) <- json \ "subtitle" if title.nonEmpty} yield createSP(title, "subtitle", "dnet:dataCite_title")
  result.setTitle((mainTitles ::: originalTitles ::: shortTitles ::: subtitles).asJava)

  // DESCRIPTION
  val descriptionList = for {JString(description) <- json \ "abstract"} yield asField(description)
  result.setDescription(descriptionList.asJava)

  // Source
  val sourceList = for {JString(source) <- json \ "source" if source != null && source.nonEmpty} yield asField(source)
  result.setSource(sourceList.asJava)

  //RELEVANT DATE Mapping
  // Every date is read leniently: a missing "date-time" yields null and generateDate falls back to "date-parts".
  def relevantDate(field: String, classId: String) =
    generateDate((json \ field \ "date-time").extractOrElse[String](null), (json \ field \ "date-parts").extract[List[List[Int]]], classId, "dnet:dataCite_date")

  val createdDate = relevantDate("created", "created")
  val postedDate = relevantDate("posted", "available")
  val acceptedDate = relevantDate("accepted", "accepted")
  val publishedPrintDate = relevantDate("published-print", "published-print")
  val publishedOnlineDate = relevantDate("published-online", "published-online")

  val issuedDate = extractDate((json \ "issued" \ "date-time").extractOrElse[String](null), (json \ "issued" \ "date-parts").extract[List[List[Int]]])

  // Prefer the issued date and fall back to the created one only when it could be derived:
  // createdDate is null when neither "date-time" nor a full "date-parts" triple is available,
  // and dereferencing it unconditionally used to raise a NullPointerException.
  val dateOfAcceptance: String =
    if (StringUtils.isNotBlank(issuedDate)) issuedDate
    else if (createdDate != null) createdDate.getValue
    else null

  if (dateOfAcceptance != null)
    result.setDateofacceptance(asField(dateOfAcceptance))

  result.setRelevantdate(List(createdDate, postedDate, acceptedDate, publishedOnlineDate, publishedPrintDate).filter(p => p != null).asJava)

  //Mapping Subject
  val subjectList: List[String] = (json \ "subject").extractOrElse[List[String]](List())
  if (subjectList.nonEmpty) {
    result.setSubject(subjectList.map(s => createSP(s, "keywords", "dnet:subject_classification_typologies")).asJava)
  }

  //Mapping Author
  val authorList: List[mappingAuthor] = (json \ "author").extractOrElse[List[mappingAuthor]](List())
  result.setAuthor(authorList.map(a => generateAuhtor(a.given.orNull, a.family, a.ORCID.orNull)).asJava)

  // Mapping instance
  val instance = new Instance()

  // Only the first non-blank license URL is kept.
  val license = for {
    JString(lic) <- json \ "license" \ "URL"
  } yield asField(lic)
  val l = license.filter(d => StringUtils.isNotBlank(d.getValue))
  if (l.nonEmpty)
    instance.setLicense(l.head)

  // A "has-review" relation is taken as evidence that the work was peer reviewed.
  val has_review = (json \ "relation" \ "has-review" \ "id")
  if (has_review != JNothing) {
    instance.setRefereed(
      createQualifier("0001", "peerReviewed", "dnet:review_levels", "dnet:review_levels"))
  }

  instance.setAccessright(getRestrictedQualifier())
  instance.setInstancetype(createQualifier(cobjCategory.substring(0, 4), cobjCategory.substring(5), "dnet:publication_resource", "dnet:publication_resource"))
  instance.setCollectedfrom(createCrossrefCollectedFrom())
  if (dateOfAcceptance != null)
    instance.setDateofacceptance(asField(dateOfAcceptance))

  // The record URL is optional: read it leniently so that the null filter below is effective.
  val s: String = (json \ "URL").extractOrElse[String](null)
  val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null).distinct
  if (links.nonEmpty)
    instance.setUrl(links.asJava)

  result.setInstance(List(instance).asJava)
  result.setResourcetype(createQualifier(cobjCategory.substring(0, 4), "dnet:dataCite_resource"))

  result
}
|
||||
|
||||
|
||||
/**
 * Builds an Author from the name parts found in a Crossref record.
 *
 * @param given  given name, may be null
 * @param family family name, may be null
 * @param orcid  ORCID identifier, may be null or blank; when present it becomes the author's only pid
 * @return the populated Author
 */
def generateAuhtor(given: String, family: String, orcid: String): Author = {
  val a = new Author
  a.setName(given)
  a.setSurname(family)
  // Join only the parts that are actually present: interpolating a null given name
  // used to produce full names such as "null Smith".
  a.setFullname(List(given, family).filter(part => StringUtils.isNotBlank(part)).mkString(" "))
  if (StringUtils.isNotBlank(orcid))
    a.setPid(List(createSP(orcid, ORCID, PID_TYPES)).asJava)

  a
}
|
||||
|
||||
/**
 * Converts one Crossref JSON record into OAF objects.
 *
 * @param input the Crossref record as a JSON string
 * @return an empty list when the record has no "type" or its type is not mapped; otherwise the
 *         funder relations (if any) followed by the mapped result
 */
def convert(input: String): List[Oaf] = {
  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
  lazy val json: json4s.JValue = parse(input)

  val objectType = (json \ "type").extractOrElse[String](null)
  val objectSubType = (json \ "subtype").extractOrElse[String](null)
  if (objectType == null)
    return List()

  val result = generateItemFromType(objectType, objectSubType)
  if (result == null)
    return List()

  // The category is looked up by type first, then by subtype, then defaults to "other literature".
  val cOBJCategory = mappingCrossrefSubType
    .get(objectType)
    .orElse(mappingCrossrefSubType.get(objectSubType))
    .getOrElse("0038 Other literature type")

  mappingResult(result, json, cOBJCategory)

  val funderList: List[mappingFunder] = (json \ "funder").extractOrElse[List[mappingFunder]](List())
  val relations: List[Oaf] =
    if (funderList.isEmpty) List()
    else mappingFunderToRelations(funderList, result.getId, createCrossrefCollectedFrom(), result.getDataInfo, result.getLastupdatetimestamp)

  // Type-specific enrichment of the already populated result.
  result match {
    case publication: Publication => convertPublication(publication, json, cOBJCategory)
    case dataset: Dataset => convertDataset(dataset)
  }

  relations ::: List(result)
}
|
||||
|
||||
|
||||
/**
 * Creates result-to-project relations ("resultProject" / "outcome" / "isProducedBy") from the
 * funders declared in a Crossref record.
 *
 * Funders are recognised by DOI when one is present, otherwise by name. Depending on the funder,
 * one relation is produced per award number, and/or one relation towards a fixed project identifier.
 *
 * @param funders  funders of the record, may be null
 * @param sourceId identifier used as source of every relation
 * @param cf       value set as collectedfrom on every relation
 * @param di       data info set on every relation
 * @param ts       value set as last update timestamp on every relation
 * @return the relations, in the order in which they were generated
 */
def mappingFunderToRelations(funders: List[mappingFunder], sourceId: String, cf: KeyValue, di: DataInfo, ts: Long): List[Relation] = {

  val queue = new mutable.Queue[Relation]

  // Compiled once per invocation instead of once per award.
  val awardECRegex: Regex = "[0-9]{4,9}".r

  // Keeps the text between the first "_" and the following "/".
  def snsfRule(award: String): String = {
    val tmp1 = StringUtils.substringAfter(award, "_")
    val tmp2 = StringUtils.substringBefore(tmp1, "/")
    logger.debug(s"From $award to $tmp2")
    tmp2
  }

  // Returns the (string-wise) greatest run of 4 to 9 digits found in the award, or null when there is none.
  def extractECAward(award: String): String = {
    val candidates = awardECRegex.findAllIn(award).toList
    if (candidates.nonEmpty) candidates.max else null
  }

  def generateRelation(sourceId: String, targetId: String, nsPrefix: String): Relation = {
    val r = new Relation
    r.setSource(sourceId)
    r.setTarget(s"$nsPrefix::$targetId")
    r.setRelType("resultProject")
    r.setRelClass("isProducedBy")
    r.setSubRelType("outcome")
    r.setCollectedfrom(List(cf).asJava)
    r.setDataInfo(di)
    r.setLastupdatetimestamp(ts)
    r
  }

  // Queues one relation per award whose extracted code is non-empty; the target is the md5 of that code.
  def generateSimpleRelationFromAward(funder: mappingFunder, nsPrefix: String, extractField: String => String): Unit = {
    if (funder.award.isDefined && funder.award.get.nonEmpty)
      funder.award.get.map(extractField).filter(a => a != null && a.nonEmpty).foreach(
        award => {
          val targetId = DHPUtils.md5(award)
          queue += generateRelation(sourceId, targetId, nsPrefix)
        }
      )
  }

  if (funders != null)
    funders.foreach(funder => {
      if (funder.DOI.isDefined && funder.DOI.get.nonEmpty) {
        funder.DOI.get match {
          case "10.13039/100010663" |
               "10.13039/100010661" |
               "10.13039/501100007601" |
               "10.13039/501100000780" |
               "10.13039/100010665" => generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
          // NOTE(review): "10.13039/501100000780" was also listed in this case, where it could never be
          // reached because the case above already matches it. The effective behaviour (h2020 only) is kept;
          // confirm whether that funder should produce relations in both namespaces.
          case "10.13039/100011199" |
               "10.13039/100004431" |
               "10.13039/501100004963" => generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
          case "10.13039/501100000781" => generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
            generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
          case "10.13039/100000001" => generateSimpleRelationFromAward(funder, "nsf_________", a => a)
          case "10.13039/501100001665" => generateSimpleRelationFromAward(funder, "anr_________", a => a)
          case "10.13039/501100002341" => generateSimpleRelationFromAward(funder, "aka_________", a => a)
          // NOTE(review): this case strips "SFI" from the award but uses the same namespace as the case above - confirm.
          case "10.13039/501100001602" => generateSimpleRelationFromAward(funder, "aka_________", a => a.replace("SFI", ""))
          case "10.13039/501100000923" => generateSimpleRelationFromAward(funder, "arc_________", a => a)
          case "10.13039/501100000038" => queue += generateRelation(sourceId, "1e5e62235d094afd01cd56e65112fc63", "nserc_______")
          case "10.13039/501100000155" => queue += generateRelation(sourceId, "1e5e62235d094afd01cd56e65112fc63", "sshrc_______")
          case "10.13039/501100000024" => queue += generateRelation(sourceId, "1e5e62235d094afd01cd56e65112fc63", "cihr________")
          case "10.13039/501100002848" => generateSimpleRelationFromAward(funder, "conicytf____", a => a)
          case "10.13039/501100003448" => generateSimpleRelationFromAward(funder, "gsrt________", extractECAward)
          case "10.13039/501100010198" => generateSimpleRelationFromAward(funder, "sgov________", a => a)
          case "10.13039/501100004564" => generateSimpleRelationFromAward(funder, "mestd_______", extractECAward)
          case "10.13039/501100003407" => generateSimpleRelationFromAward(funder, "miur________", a => a)
            queue += generateRelation(sourceId, "1e5e62235d094afd01cd56e65112fc63", "miur________")
          case "10.13039/501100006588" |
               "10.13039/501100004488" => generateSimpleRelationFromAward(funder, "irb_hr______", a => a.replaceAll("Project No.", "").replaceAll("HRZZ-", ""))
          case "10.13039/501100006769" => generateSimpleRelationFromAward(funder, "rsf_________", a => a)
          case "10.13039/501100001711" => generateSimpleRelationFromAward(funder, "snsf________", snsfRule)
          case "10.13039/501100004410" => generateSimpleRelationFromAward(funder, "tubitakf____", a => a)
          // The award-based rule used to be keyed on the mistyped DOI "10.10.13039/100004440" and therefore
          // never fired; both rules for this funder now share the correct DOI.
          case "10.13039/100004440" => generateSimpleRelationFromAward(funder, "wt__________", a => a)
            queue += generateRelation(sourceId, "1e5e62235d094afd01cd56e65112fc63", "wt__________")
          case _ => logger.debug("no match for " + funder.DOI.get)
        }
      } else {
        funder.name match {
          case "European Union’s Horizon 2020 research and innovation program" => generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
          case "European Union's" =>
            generateSimpleRelationFromAward(funder, "corda__h2020", extractECAward)
            generateSimpleRelationFromAward(funder, "corda_______", extractECAward)
          case "The French National Research Agency (ANR)" |
               "The French National Research Agency" => generateSimpleRelationFromAward(funder, "anr_________", a => a)
          case "CONICYT, Programa de Formación de Capital Humano Avanzado" => generateSimpleRelationFromAward(funder, "conicytf____", extractECAward)
          case "Wellcome Trust Masters Fellowship" => queue += generateRelation(sourceId, "1e5e62235d094afd01cd56e65112fc63", "wt__________")
          case _ => logger.debug("no match for " + funder.name)
        }
      }
    })

  queue.toList
}
|
||||
|
||||
/** Hook for dataset-specific mapping; it currently leaves the dataset untouched. */
def convertDataset(dataset: Dataset): Unit = {
  // TODO check if there are other info to map into the Dataset
  ()
}
|
||||
|
||||
|
||||
/**
 * Adds publication-specific information to an already mapped publication.
 *
 * When the category contains "book", a "<container title> ISBN: <isbn>" entry is appended to the
 * publication sources; otherwise a Journal (name, ISSNs, volume, start/end page) is attached,
 * provided the record has a container title.
 *
 * @param publication  the publication to enrich; it is mutated in place
 * @param json         the parsed Crossref record
 * @param cobjCategory category string of the record
 */
def convertPublication(publication: Publication, json: JValue, cobjCategory: String): Unit = {
  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
  val containerTitles = for {JString(ct) <- json \ "container-title"} yield ct

  //Mapping book
  if (cobjCategory.toLowerCase.contains("book")) {
    val ISBN = for {JString(isbn) <- json \ "ISBN"} yield isbn
    if (ISBN.nonEmpty && containerTitles.nonEmpty) {
      val source = s"${containerTitles.head} ISBN: ${ISBN.head}"
      if (publication.getSource != null) {
        val l: List[Field[String]] = publication.getSource.asScala.toList
        val ll: List[Field[String]] = l ::: List(asField(source))
        publication.setSource(ll.asJava)
      }
      else
        publication.setSource(List(asField(source)).asJava)
    }
  } else {
    // Mapping Journal
    val issnInfos = for {JArray(issn_types) <- json \ "issn-type"
                         JObject(issn_type) <- issn_types
                         JField("type", JString(tp)) <- issn_type
                         JField("value", JString(vl)) <- issn_type
                         } yield Tuple2(tp, vl)

    val volume = (json \ "volume").extractOrElse[String](null)
    if (containerTitles.nonEmpty) {
      val journal = new Journal
      journal.setName(containerTitles.head)
      issnInfos.foreach {
        case ("electronic", value) => journal.setIssnOnline(value)
        case ("print", value) => journal.setIssnPrinted(value)
        // Any other ISSN type is skipped: without this case an unexpected type raised a MatchError.
        case _ =>
      }
      journal.setVol(volume)

      // "page" is split on "-": the first token is the start page, the second (if any) the end page.
      val page = (json \ "page").extractOrElse[String](null)
      if (page != null) {
        val pp = page.split("-")
        if (pp.nonEmpty)
          journal.setSp(pp.head)
        if (pp.size > 1)
          journal.setEp(pp(1))
      }
      publication.setJournal(journal)
    }
  }
}
|
||||
|
||||
/**
 * Chooses the textual form of a Crossref date.
 *
 * @param dt       the "date-time" value; returned as-is when it is not blank
 * @param datePart the "date-parts" value; used only when it holds exactly one [year, month, day] triple
 * @return `dt`, or the triple rendered as year-MM-dd when that rendering is 10 characters long, otherwise null
 */
def extractDate(dt: String, datePart: List[List[Int]]): String = {
  if (StringUtils.isNotBlank(dt))
    dt
  else if (datePart == null || datePart.size != 1 || datePart.head.size != 3)
    null
  else {
    val ymd = datePart.head
    val rendered = "%s-%02d-%02d".format(ymd.head, ymd(1), ymd(2))
    // A length other than 10 means the year does not have four digits (or a part is out of range).
    if (rendered.length == 10) rendered else null
  }
}
|
||||
|
||||
/**
 * Wraps the date selected by extractDate into a structured property.
 *
 * @param dt       the "date-time" value of the date
 * @param datePart the "date-parts" value of the date
 * @param classId  class id passed to createSP
 * @param schemeId scheme id passed to createSP
 * @return the structured property, or null when no date could be extracted
 */
def generateDate(dt: String, datePart: List[List[Int]], classId: String, schemeId: String): StructuredProperty = {
  val value = extractDate(dt, datePart)
  if (StringUtils.isBlank(value)) null
  else createSP(value, classId, schemeId)
}
|
||||
|
||||
/**
 * Instantiates the OAF result matching a Crossref type.
 *
 * @param objectType    the Crossref "type" value, looked up in mappingCrossrefType
 * @param objectSubType the Crossref "subtype" value (not used by the lookup)
 * @return a new Publication or Dataset, or null when the type is not mapped to either
 */
def generateItemFromType(objectType: String, objectSubType: String): Result = {
  mappingCrossrefType.get(objectType) match {
    case Some(kind) if kind.equalsIgnoreCase("publication") => new Publication()
    case Some(kind) if kind.equalsIgnoreCase("dataset") => new Dataset()
    case _ => null
  }
}
|
||||
|
||||
}
|
|
@ -0,0 +1,103 @@
|
|||
|
||||
package eu.dnetlib.doiboost.crossref;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.util.zip.Inflater;
|
||||
|
||||
import org.apache.commons.codec.binary.Base64;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.IntWritable;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class CrossrefImporter {
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
CrossrefImporter.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/import_from_es.json")));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String hdfsuri = parser.get("namenode");
|
||||
System.out.println("HDFS URI" + hdfsuri);
|
||||
Path hdfswritepath = new Path(parser.get("targetPath"));
|
||||
System.out.println("TargetPath: " + hdfsuri);
|
||||
|
||||
final Long timestamp = StringUtils.isNotBlank(parser.get("timestamp"))
|
||||
? Long.parseLong(parser.get("timestamp"))
|
||||
: -1;
|
||||
|
||||
if (timestamp > 0)
|
||||
System.out.println("Timestamp added " + timestamp);
|
||||
|
||||
// ====== Init HDFS File System Object
|
||||
Configuration conf = new Configuration();
|
||||
// Set FileSystem URI
|
||||
conf.set("fs.defaultFS", hdfsuri);
|
||||
// Because of Maven
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
|
||||
ESClient client = timestamp > 0
|
||||
? new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref", timestamp)
|
||||
: new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref");
|
||||
|
||||
try (SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(
|
||||
conf,
|
||||
SequenceFile.Writer.file(hdfswritepath),
|
||||
SequenceFile.Writer.keyClass(IntWritable.class),
|
||||
SequenceFile.Writer.valueClass(Text.class))) {
|
||||
|
||||
int i = 0;
|
||||
long start = System.currentTimeMillis();
|
||||
long end = 0;
|
||||
final IntWritable key = new IntWritable(i);
|
||||
final Text value = new Text();
|
||||
while (client.hasNext()) {
|
||||
key.set(i++);
|
||||
value.set(client.next());
|
||||
writer.append(key, value);
|
||||
if (i % 100000 == 0) {
|
||||
end = System.currentTimeMillis();
|
||||
final float time = (end - start) / 1000.0F;
|
||||
System.out
|
||||
.println(
|
||||
String.format("Imported %d records last 100000 imported in %f seconds", i, time));
|
||||
start = System.currentTimeMillis();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static String decompressBlob(final String blob) {
|
||||
try {
|
||||
byte[] byteArray = Base64.decodeBase64(blob.getBytes());
|
||||
final Inflater decompresser = new Inflater();
|
||||
decompresser.setInput(byteArray);
|
||||
final ByteArrayOutputStream bos = new ByteArrayOutputStream(byteArray.length);
|
||||
byte[] buffer = new byte[8192];
|
||||
while (!decompresser.finished()) {
|
||||
int size = decompresser.inflate(buffer);
|
||||
bos.write(buffer, 0, size);
|
||||
}
|
||||
byte[] unzippeddata = bos.toByteArray();
|
||||
decompresser.end();
|
||||
return new String(unzippeddata);
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException("Wrong record:" + blob, e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,114 @@
|
|||
|
||||
package eu.dnetlib.doiboost.crossref;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpPost;
|
||||
import org.apache.http.entity.StringEntity;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.jayway.jsonpath.JsonPath;
|
||||
|
||||
public class ESClient implements Iterator<String> {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ESClient.class);
|
||||
|
||||
static final String blobPath = "$.hits[*].hits[*]._source.blob";
|
||||
static final String scrollIdPath = "$._scroll_id";
|
||||
static final String JSON_NO_TS = "{\"size\":1000}";
|
||||
static final String JSON_WITH_TS = "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}";
|
||||
static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}";
|
||||
|
||||
private final String scrollId;
|
||||
|
||||
private List<String> buffer;
|
||||
|
||||
private final String esHost;
|
||||
|
||||
public ESClient(final String esHost, final String esIndex) throws IOException {
|
||||
|
||||
this.esHost = esHost;
|
||||
final String body = getResponse(
|
||||
String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), JSON_NO_TS);
|
||||
scrollId = getJPathString(scrollIdPath, body);
|
||||
buffer = getBlobs(body);
|
||||
}
|
||||
|
||||
public ESClient(final String esHost, final String esIndex, final long timestamp)
|
||||
throws IOException {
|
||||
this.esHost = esHost;
|
||||
final String body = getResponse(
|
||||
String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex),
|
||||
String.format(JSON_WITH_TS, timestamp));
|
||||
scrollId = getJPathString(scrollIdPath, body);
|
||||
buffer = getBlobs(body);
|
||||
}
|
||||
|
||||
private String getResponse(final String url, final String json) {
|
||||
CloseableHttpClient client = HttpClients.createDefault();
|
||||
try {
|
||||
|
||||
HttpPost httpPost = new HttpPost(url);
|
||||
if (json != null) {
|
||||
StringEntity entity = new StringEntity(json);
|
||||
httpPost.setEntity(entity);
|
||||
httpPost.setHeader("Accept", "application/json");
|
||||
httpPost.setHeader("Content-type", "application/json");
|
||||
}
|
||||
CloseableHttpResponse response = client.execute(httpPost);
|
||||
|
||||
return IOUtils.toString(response.getEntity().getContent());
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException("Error on executing request ", e);
|
||||
} finally {
|
||||
try {
|
||||
client.close();
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Unable to close client ", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private String getJPathString(final String jsonPath, final String json) {
|
||||
try {
|
||||
Object o = JsonPath.read(json, jsonPath);
|
||||
if (o instanceof String)
|
||||
return (String) o;
|
||||
return null;
|
||||
} catch (Exception e) {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> getBlobs(final String body) {
|
||||
final List<String> res = JsonPath.read(body, "$.hits.hits[*]._source.blob");
|
||||
return res;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return (buffer != null && !buffer.isEmpty());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
final String nextItem = buffer.remove(0);
|
||||
if (buffer.isEmpty()) {
|
||||
|
||||
final String json_param = String.format(JSON_SCROLL, scrollId);
|
||||
final String body = getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param);
|
||||
try {
|
||||
buffer = getBlobs(body);
|
||||
} catch (Throwable e) {
|
||||
logger.error("Error on get next page: body:" + body);
|
||||
}
|
||||
}
|
||||
return nextItem;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,98 @@
|
|||
package eu.dnetlib.doiboost.crossref
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf
|
||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Dataset => OafDataset}
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.hadoop.io.{IntWritable, Text}
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
|
||||
/** Pair of an author string and a first-page string. */
case class Reference(
  author: String,
  firstPage: String
)
|
||||
|
||||
/**
 * Spark job that converts the sequence file written by CrossrefImporter into OAF objects.
 *
 * The compressed records are decompressed and converted with Crossref2Oaf, the mixed output is
 * stored under `<targetPath>/mixObject`, and then publications, datasets and relations are
 * de-duplicated and saved under `<targetPath>/publication`, `/dataset` and `/relations`.
 */
object SparkMapDumpIntoOAF {

  /**
   * Chooses which of two results with the same id is kept: the one with the greater last update
   * timestamp. The first one wins on ties; a result without a timestamp loses against the other;
   * a null result loses against a non-null one.
   *
   * Shared by publications and datasets. It takes plain parameters because a typed pattern such as
   * `case (p1: Publication, p2: Publication)` never matches a null value.
   */
  private def mostRecent[T <: oaf.Result](r1: T, r2: T): T = {
    if (r1 == null) r2
    else if (r2 == null) r1
    else {
      val t1 = r1.getLastupdatetimestamp
      val t2 = r2.getLastupdatetimestamp
      if (t1 == null) r2
      else if (t2 == null) r1
      else if (t1 < t2) r2
      else r1
    }
  }

  /**
   * Entry point.
   *
   * @param args arguments described by `convert_map_to_oaf_params.json`; `master`, `sourcePath`
   *             and `targetPath` are read
   */
  def main(args: Array[String]): Unit = {

    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(SparkMapDumpIntoOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(SparkMapDumpIntoOAF.getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

    implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
    implicit val mapEncoderRelatons: Encoder[Relation] = Encoders.kryo[Relation]
    implicit val mapEncoderDatasets: Encoder[oaf.Dataset] = Encoders.kryo[OafDataset]

    val sc = spark.sparkContext
    val targetPath = parser.get("targetPath")

    // Step 1: decompress and convert every record, materialising the mixed output.
    sc.sequenceFile(parser.get("sourcePath"), classOf[IntWritable], classOf[Text])
      .map(k => k._2.toString).map(CrossrefImporter.decompressBlob)
      .flatMap(k => Crossref2Oaf.convert(k)).saveAsObjectFile(s"${targetPath}/mixObject")

    val inputRDD = sc.objectFile[Oaf](s"${targetPath}/mixObject").filter(p => p != null)

    // Step 2: one publication per identifier.
    val distinctPubs: RDD[Publication] = inputRDD
      .filter(k => k.isInstanceOf[Publication])
      .map(k => k.asInstanceOf[Publication])
      .map(p => (p.getId, p))
      .reduceByKey((p1, p2) => mostRecent(p1, p2))
      .map(_._2)

    val pubs: Dataset[Publication] = spark.createDataset(distinctPubs)
    pubs.write.mode(SaveMode.Overwrite).save(s"${targetPath}/publication")

    // Step 3: one dataset per identifier.
    val distincDatasets: RDD[OafDataset] = inputRDD
      .filter(k => k.isInstanceOf[OafDataset])
      .map(k => k.asInstanceOf[OafDataset])
      .map(p => (p.getId, p))
      .reduceByKey((d1, d2) => mostRecent(d1, d2))
      .map(_._2)

    spark.createDataset(distincDatasets).write.mode(SaveMode.Overwrite).save(s"${targetPath}/dataset")

    // Step 4: one relation per (source, target) pair; any of the duplicates is kept.
    val distinctRels = inputRDD
      .filter(k => k.isInstanceOf[Relation])
      .map(k => k.asInstanceOf[Relation])
      .map(r => (s"${r.getSource}::${r.getTarget}", r))
      .reduceByKey((r1, r2) => if (r1 == null) r2 else r1)
      .map(_._2)

    val rels: Dataset[Relation] = spark.createDataset(distinctRels)
    rels.write.mode(SaveMode.Overwrite).save(s"${targetPath}/relations")
  }

}
|
|
@ -0,0 +1,316 @@
|
|||
package eu.dnetlib.doiboost.mag
|
||||
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty}
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||
import org.json4s
|
||||
import org.json4s.DefaultFormats
|
||||
import org.json4s.jackson.JsonMethods.parse
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil._
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.collection.mutable
|
||||
import scala.util.matching.Regex
|
||||
|
||||
|
||||
// Data model of the MAG input. Field names and types are kept exactly as declared;
// presumably they mirror the columns of the corresponding MAG tables - TODO confirm.

/** A paper, with its bibliographic attributes and optional journal / conference references. */
case class MagPapers(PaperId: Long, Rank: Integer, Doi: String,
                     DocType: String, PaperTitle: String, OriginalTitle: String,
                     BookTitle: String, Year: Option[Integer], Date: Option[java.sql.Timestamp], Publisher: String,
                     JournalId: Option[Long], ConferenceSeriesId: Option[Long], ConferenceInstanceId: Option[Long],
                     Volume: String, Issue: String, FirstPage: String, LastPage: String,
                     ReferenceCount: Option[Long], CitationCount: Option[Long], EstimatedCitation: Option[Long],
                     OriginalVenue: String, FamilyId: Option[Long], CreatedDate: java.sql.Timestamp)

/** The abstract of a paper. */
case class MagPaperAbstract(PaperId: Long, IndexedAbstract: String)

/** An author. */
case class MagAuthor(AuthorId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], LastKnownAffiliationId: Option[Long], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp])

/** An affiliation. */
case class MagAffiliation(AffiliationId: Long, Rank: Int, NormalizedName: String, DisplayName: String, GridId: String, OfficialPage: String, WikiPage: String, PaperCount: Long, CitationCount: Long, Latitude: Option[Float], Longitude: Option[Float], CreatedDate: java.sql.Timestamp)

/** Link between a paper, one of its authors and, optionally, an affiliation. */
case class MagPaperAuthorAffiliation(PaperId: Long, AuthorId: Long, AffiliationId: Option[Long], AuthorSequenceNumber: Int, OriginalAuthor: String, OriginalAffiliation: String)

/** An author together with an affiliation string. */
case class MagAuthorAffiliation(author: MagAuthor, affiliation: String)

/** A paper id with the list of its authors. */
case class MagPaperWithAuthorList(PaperId: Long, authors: List[MagAuthorAffiliation])

/** A paper id with a single author and affiliation string. */
case class MagPaperAuthorDenormalized(PaperId: Long, author: MagAuthor, affiliation: String)

/** A URL associated with a paper. */
case class MagPaperUrl(PaperId: Long, SourceType: Option[Int], SourceUrl: Option[String], LanguageCode: Option[String])

/** A single source URL. */
case class MagUrlInstance(SourceUrl: String)

/** A paper id with the list of its URL instances. */
case class MagUrl(PaperId: Long, instances: List[MagUrlInstance])

/** A field of study with its score. */
case class MagSubject(FieldOfStudyId: Long, DisplayName: String, MainType: Option[String], Score: Float)

/** A paper id with the list of its subjects. */
case class MagFieldOfStudy(PaperId: Long, subjects: List[MagSubject])

/** A journal. */
case class MagJournal(JournalId: Long, Rank: Option[Int], NormalizedName: Option[String], DisplayName: Option[String], Issn: Option[String], Publisher: Option[String], Webpage: Option[String], PaperCount: Option[Long], CitationCount: Option[Long], CreatedDate: Option[java.sql.Timestamp])

/** A conference instance associated with a paper. */
case class MagConferenceInstance(ci: Long, DisplayName: Option[String], Location: Option[String], StartDate: Option[java.sql.Timestamp], EndDate: Option[java.sql.Timestamp], PaperId: Long)
|
||||
|
||||
case object ConversionUtil {
|
||||
|
||||
/**
  * Picks the MAG identifier out of a list of original identifiers.
  * An identifier qualifies when it consists only of decimal digits.
  *
  * @param pids the candidate identifiers
  * @return the first all-digit identifier, or null when none qualifies
  */
def extractMagIdentifier(pids: mutable.Buffer[String]): String = {
  val digitsOnly: Regex = "^[0-9]+$".r
  // Every candidate is tested (not only up to the first hit), exactly as before.
  val numericIds = pids.filter(candidate => digitsOnly.findFirstIn(candidate).isDefined)
  numericIds.headOption.orNull
}
|
||||
|
||||
|
||||
/**
  * Merges two publications, tolerating a missing side.
  *
  * @param a the publication that receives the merge (mutated in place when both are present)
  * @param b the publication merged into `a`
  * @return `a` after `a.mergeFrom(b)` when both are non-null, otherwise whichever one is
  *         non-null (null when both are null)
  */
def mergePublication(a: Publication, b: Publication): Publication = {
  if (a == null) b
  else if (b == null) a
  else {
    a.mergeFrom(b)
    a
  }
}
|
||||
|
||||
/**
  * Chooses between two MAG paper records, preferring the one created last.
  *
  * Decision order:
  *  - if one record is null the other is returned;
  *  - if `p1` has no creation date, `p2` is returned;
  *  - if only `p2` has no creation date, `p1` is returned;
  *  - otherwise `p2` wins only when `p1` was created strictly before it.
  *
  * @param p1 first candidate (may be null)
  * @param p2 second candidate (may be null)
  * @return the selected record
  */
def choiceLatestMagArtitcle(p1: MagPapers, p2: MagPapers): MagPapers = {
  if (p1 == null) p2
  else if (p2 == null) p1
  else if (p1.CreatedDate == null) p2
  else if (p2.CreatedDate == null) p1
  else if (p1.CreatedDate.before(p2.CreatedDate)) p2
  else p1
}
|
||||
|
||||
|
||||
/**
  * Sets the description of a publication from its abstract, when one is available.
  *
  * @param inputItem a ((key, publication), abstract) pair; the abstract may be null
  * @return the same publication instance, with its description replaced when an abstract is present
  */
def updatePubsWithDescription(inputItem: ((String, Publication), MagPaperAbstract)): Publication = {
  val publication = inputItem._1._2
  Option(inputItem._2).foreach { paperAbstract =>
    publication.setDescription(List(asField(paperAbstract.IndexedAbstract)).asJava)
  }
  publication
}
|
||||
|
||||
|
||||
/**
  * Attaches conference information to a publication as a `Journal` element.
  *
  * Every conference attribute is optional: place and name are copied only when defined, and
  * the conference date only when both start and end dates are defined. Previously the name
  * was read with an unguarded `Option.get`, which threw when the display name was missing.
  *
  * @param inputItem a ((key, publication), conference instance) pair; the conference may be null
  * @return the same publication instance; its journal is replaced when a conference is present
  */
def updatePubsWithConferenceInfo(inputItem: ((String, Publication), MagConferenceInstance)): Publication = {
  val publication: Publication = inputItem._1._2
  val ci: MagConferenceInstance = inputItem._2

  if (ci != null) {
    val j: Journal = new Journal
    ci.Location.foreach(place => j.setConferenceplace(place))
    ci.DisplayName.foreach(name => j.setName(name))
    for (start <- ci.StartDate; end <- ci.EndDate) {
      // Keep only the leading "yyyy-MM-dd" part of the timestamp string representation.
      j.setConferencedate(s"${start.toString.substring(0, 10)} - ${end.toString.substring(0, 10)}")
    }
    publication.setJournal(j)
  }
  publication
}
|
||||
|
||||
/**
  * Sets the subjects of a publication from its MAG fields of study.
  *
  * Each field of study contributes its display name and, when a main type is defined, the
  * main type itself plus (if it contains a dot) the segment preceding the first dot. The two
  * main-type derived entries carry a data info built from the field score.
  *
  * @param item a ((key, publication), fields of study) pair; the fields of study may be null
  * @return the same publication instance; its subjects are replaced when at least one is available
  */
def updatePubsWithSubject(item: ((String, Publication), MagFieldOfStudy)): Publication = {
  val publication = item._1._2
  val fieldOfStudy = item._2

  val hasSubjects = fieldOfStudy != null && fieldOfStudy.subjects != null && fieldOfStudy.subjects.nonEmpty
  if (hasSubjects) {
    val scheme = "dnet:subject_classification_typologies"
    val properties: List[StructuredProperty] = fieldOfStudy.subjects.flatMap { subject =>
      val nameProperty = createSP(subject.DisplayName, "keyword", scheme)
      val scoreInfo = DoiBoostMappingUtil.generateDataInfo(subject.Score.toString)

      val mainTypeProperties: List[StructuredProperty] = subject.MainType.toList.flatMap { mainType =>
        val whole = createSP(mainType, "keyword", scheme)
        whole.setDataInfo(scoreInfo)
        if (mainType.contains(".")) {
          val root = createSP(mainType.split("\\.").head, "keyword", scheme)
          root.setDataInfo(scoreInfo)
          List(whole, root)
        } else {
          List(whole)
        }
      }

      nameProperty :: mainTypeProperties
    }
    publication.setSubject(properties.asJava)
  }
  publication
}
|
||||
|
||||
|
||||
|
||||
/**
  * Gives a publication a single instance listing its source URLs followed by its
  * Microsoft Academic landing page.
  *
  * Null URL groups, null URL entries and null or empty `SourceUrl` values are skipped, so the
  * method no longer throws a NullPointerException on them.
  *
  * @param a a (publication, grouped URLs) pair; the grouped URLs may be null
  * @return the same publication instance, with its instance list replaced
  */
def addInstances(a: (Publication, MagUrl)): Publication = {
  val pub = a._1
  val urls = a._2

  // Built once: the landing page is always the last URL of the instance.
  val magLandingPage = s"https://academic.microsoft.com/#/detail/${extractMagIdentifier(pub.getOriginalId.asScala)}"

  val sourceUrls: List[String] =
    if (urls != null && urls.instances != null)
      urls.instances
        .filter(k => k != null && k.SourceUrl != null && k.SourceUrl.nonEmpty)
        .map(k => k.SourceUrl)
    else
      Nil

  val i = new Instance
  i.setUrl((sourceUrls ::: List(magLandingPage)).asJava)
  i.setCollectedfrom(createMAGCollectedFrom())
  pub.setInstance(List(i).asJava)
  pub
}
|
||||
|
||||
|
||||
/**
  * Replaces the inverted-index JSON of an abstract with its reconstructed plain text.
  *
  * @param input the abstract whose `IndexedAbstract` holds the inverted-index JSON
  * @return a new abstract for the same paper carrying the converted text
  */
def transformPaperAbstract(input: MagPaperAbstract): MagPaperAbstract = {
  val plainText = convertInvertedIndexString(input.IndexedAbstract)
  MagPaperAbstract(input.PaperId, plainText)
}
|
||||
|
||||
|
||||
/**
  * Builds an OAF publication from a MAG paper, its journal and its authors.
  *
  * The journal and the author list are both optional. A missing author list, or an author
  * without display name, used to raise an exception and is now tolerated: authors are set
  * only when a list is present, and the full name only when a display name is defined.
  *
  * @param inputParams a ((paper, journal), authors) triple; journal and authors may be null
  * @return the newly created publication
  */
def createOAFFromJournalAuthorPaper(inputParams: ((MagPapers, MagJournal), MagPaperWithAuthorList)): Publication = {
  val paper = inputParams._1._1
  val journal = inputParams._1._2
  val authors = inputParams._2

  val pub = new Publication
  pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", PID_TYPES)).asJava)
  pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava)

  //Set identifier as 50|doiboost____::md5(DOI)
  pub.setId(generateIdentifier(pub, paper.Doi.toLowerCase))

  val mainTitles = createSP(paper.PaperTitle, "main title", "dnet:dataCite_title")
  val originalTitles = createSP(paper.OriginalTitle, "alternative title", "dnet:dataCite_title")
  pub.setTitle(List(mainTitles, originalTitles).asJava)

  pub.setSource(List(asField(paper.BookTitle)).asJava)

  // The author list is null for papers that have no grouped authors.
  if (authors != null && authors.authors != null) {
    val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
      val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author

      f.author.DisplayName.foreach(name => a.setFullname(name))

      if (f.affiliation != null)
        a.setAffiliation(List(asField(f.affiliation)).asJava)
      a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", PID_TYPES)).asJava)
      a
    }
    pub.setAuthor(authorsOAF.asJava)
  }

  if (paper.Date != null && paper.Date.isDefined) {
    // Keep only the leading "yyyy-MM-dd" part of the timestamp string representation.
    pub.setDateofacceptance(asField(paper.Date.get.toString.substring(0, 10)))
  }
  pub.setPublisher(asField(paper.Publisher))

  if (journal != null && journal.DisplayName.isDefined) {
    val j = new Journal

    j.setName(journal.DisplayName.get)
    j.setSp(paper.FirstPage)
    j.setEp(paper.LastPage)
    // The journal publisher, when known, overrides the publisher taken from the paper.
    if (journal.Publisher.isDefined)
      pub.setPublisher(asField(journal.Publisher.get))
    if (journal.Issn.isDefined)
      j.setIssnPrinted(journal.Issn.get)
    j.setVol(paper.Volume)
    j.setIss(paper.Issue)
    pub.setJournal(j)
  }
  pub.setCollectedfrom(List(createMAGCollectedFrom()).asJava)
  pub.setDataInfo(generateDataInfo())
  pub
}
|
||||
|
||||
|
||||
/**
  * Builds an OAF publication from a MAG paper, its authors and its abstract.
  *
  * `paper.Date` is an `Option`: the acceptance date is now derived from the wrapped timestamp.
  * The previous code stringified the `Option` itself, producing values such as "Some(2020-"
  * and failing on `None`. A missing author list or author display name is tolerated.
  *
  * @param inputParams a ((paper, authors), abstract) triple; authors and abstract may be null
  * @return the newly created publication
  */
def createOAF(inputParams: ((MagPapers, MagPaperWithAuthorList), MagPaperAbstract)): Publication = {
  val paper = inputParams._1._1
  val authors = inputParams._1._2
  val description = inputParams._2

  val pub = new Publication
  pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", PID_TYPES)).asJava)
  pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava)

  //Set identifier as 50 | doiboost____::md5(DOI)
  pub.setId(generateIdentifier(pub, paper.Doi.toLowerCase))

  val mainTitles = createSP(paper.PaperTitle, "main title", "dnet:dataCite_title")
  val originalTitles = createSP(paper.OriginalTitle, "alternative title", "dnet:dataCite_title")
  pub.setTitle(List(mainTitles, originalTitles).asJava)

  pub.setSource(List(asField(paper.BookTitle)).asJava)

  if (description != null) {
    pub.setDescription(List(asField(description.IndexedAbstract)).asJava)
  }

  if (paper.Date != null && paper.Date.isDefined) {
    // Keep only the leading "yyyy-MM-dd" part of the timestamp string representation.
    pub.setDateofacceptance(asField(paper.Date.get.toString.substring(0, 10)))
  }

  if (authors != null && authors.authors != null) {
    val authorsOAF = authors.authors.map { f: MagAuthorAffiliation =>
      val a: eu.dnetlib.dhp.schema.oaf.Author = new eu.dnetlib.dhp.schema.oaf.Author

      f.author.DisplayName.foreach(name => a.setFullname(name))

      if (f.affiliation != null)
        a.setAffiliation(List(asField(f.affiliation)).asJava)

      a.setPid(List(createSP(s"https://academic.microsoft.com/#/detail/${f.author.AuthorId}", "URL", PID_TYPES)).asJava)
      a
    }
    pub.setAuthor(authorsOAF.asJava)
  }

  pub
}
|
||||
|
||||
|
||||
/**
  * Rebuilds a text from a JSON inverted index.
  *
  * The JSON is read through two members: `IndexLength`, the number of word slots, and
  * `InvertedIndex`, a map from each word to the slots it occupies. Slots that no word
  * claims stay empty, so they still contribute a separating blank to the result.
  *
  * @param json_input the JSON document holding the inverted index
  * @return the words joined by single blanks in slot order, or "" when `IndexLength` is not positive
  */
def convertInvertedIndexString(json_input: String): String = {
  implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats
  lazy val json: json4s.JValue = parse(json_input)

  val slotCount = (json \ "IndexLength").extract[Int]
  if (slotCount <= 0) {
    ""
  } else {
    val slots = Array.fill[String](slotCount)("")
    val wordPositions = (json \ "InvertedIndex").extract[Map[String, List[Int]]]
    for ((word, positions) <- wordPositions; position <- positions) {
      slots(position) = word
    }
    slots.mkString(" ")
  }
}
|
||||
}
|
|
@ -0,0 +1,92 @@
|
|||
package eu.dnetlib.doiboost.mag
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.{SaveMode, SparkSession}
|
||||
import org.apache.spark.sql.types._
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
import org.apache.spark.sql.functions._
|
||||
|
||||
/**
  * Spark job that reads the tab-separated MAG dump files and saves each of them as a
  * dataset under the target path, using the column layout declared in `stream`.
  */
object SparkImportMagIntoDataset {

  /** Maps the type names used in the column descriptors to Spark SQL types. */
  val datatypedict = Map(
    "int" -> IntegerType,
    "uint" -> IntegerType,
    "long" -> LongType,
    "ulong" -> LongType,
    "float" -> FloatType,
    "string" -> StringType,
    "DateTime" -> DateType
  )

  /**
    * For every stream name: the file path relative to the source path and the ordered column
    * descriptors, each written as `name:type` with a trailing `?` marking a nullable column.
    */
  val stream = Map(
    "Affiliations" -> ("mag/Affiliations.txt", Seq("AffiliationId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "GridId:string", "OfficialPage:string", "WikiPage:string", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
    "Authors" -> ("mag/Authors.txt", Seq("AuthorId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "LastKnownAffiliationId:long?", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
    "ConferenceInstances" -> ("mag/ConferenceInstances.txt", Seq("ConferenceInstanceId:long", "NormalizedName:string", "DisplayName:string", "ConferenceSeriesId:long", "Location:string", "OfficialUrl:string", "StartDate:DateTime?", "EndDate:DateTime?", "AbstractRegistrationDate:DateTime?", "SubmissionDeadlineDate:DateTime?", "NotificationDueDate:DateTime?", "FinalVersionDueDate:DateTime?", "PaperCount:long", "CitationCount:long", "Latitude:float?", "Longitude:float?", "CreatedDate:DateTime")),
    "ConferenceSeries" -> ("mag/ConferenceSeries.txt", Seq("ConferenceSeriesId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
    "EntityRelatedEntities" -> ("advanced/EntityRelatedEntities.txt", Seq("EntityId:long", "EntityType:string", "RelatedEntityId:long", "RelatedEntityType:string", "RelatedType:int", "Score:float")),
    "FieldOfStudyChildren" -> ("advanced/FieldOfStudyChildren.txt", Seq("FieldOfStudyId:long", "ChildFieldOfStudyId:long")),
    "FieldOfStudyExtendedAttributes" -> ("advanced/FieldOfStudyExtendedAttributes.txt", Seq("FieldOfStudyId:long", "AttributeType:int", "AttributeValue:string")),
    "FieldsOfStudy" -> ("advanced/FieldsOfStudy.txt", Seq("FieldOfStudyId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "MainType:string", "Level:int", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
    "Journals" -> ("mag/Journals.txt", Seq("JournalId:long", "Rank:uint", "NormalizedName:string", "DisplayName:string", "Issn:string", "Publisher:string", "Webpage:string", "PaperCount:long", "CitationCount:long", "CreatedDate:DateTime")),
    "PaperAbstractsInvertedIndex" -> ("nlp/PaperAbstractsInvertedIndex.txt.*", Seq("PaperId:long", "IndexedAbstract:string")),
    "PaperAuthorAffiliations" -> ("mag/PaperAuthorAffiliations.txt", Seq("PaperId:long", "AuthorId:long", "AffiliationId:long?", "AuthorSequenceNumber:uint", "OriginalAuthor:string", "OriginalAffiliation:string")),
    "PaperCitationContexts" -> ("nlp/PaperCitationContexts.txt", Seq("PaperId:long", "PaperReferenceId:long", "CitationContext:string")),
    "PaperExtendedAttributes" -> ("mag/PaperExtendedAttributes.txt", Seq("PaperId:long", "AttributeType:int", "AttributeValue:string")),
    "PaperFieldsOfStudy" -> ("advanced/PaperFieldsOfStudy.txt", Seq("PaperId:long", "FieldOfStudyId:long", "Score:float")),
    "PaperRecommendations" -> ("advanced/PaperRecommendations.txt", Seq("PaperId:long", "RecommendedPaperId:long", "Score:float")),
    "PaperReferences" -> ("mag/PaperReferences.txt", Seq("PaperId:long", "PaperReferenceId:long")),
    "PaperResources" -> ("mag/PaperResources.txt", Seq("PaperId:long", "ResourceType:int", "ResourceUrl:string", "SourceUrl:string", "RelationshipType:int")),
    "PaperUrls" -> ("mag/PaperUrls.txt", Seq("PaperId:long", "SourceType:int?", "SourceUrl:string", "LanguageCode:string")),
    "Papers" -> ("mag/Papers.txt", Seq("PaperId:long", "Rank:uint", "Doi:string", "DocType:string", "PaperTitle:string", "OriginalTitle:string", "BookTitle:string", "Year:int?", "Date:DateTime?", "Publisher:string", "JournalId:long?", "ConferenceSeriesId:long?", "ConferenceInstanceId:long?", "Volume:string", "Issue:string", "FirstPage:string", "LastPage:string", "ReferenceCount:long", "CitationCount:long", "EstimatedCitation:long", "OriginalVenue:string", "FamilyId:long?", "CreatedDate:DateTime")),
    "RelatedFieldOfStudy" -> ("advanced/RelatedFieldOfStudy.txt", Seq("FieldOfStudyId1:long", "Type1:string", "FieldOfStudyId2:long", "Type2:string", "Rank:float"))
  )

  /**
    * Builds the Spark schema of a stream from its column descriptors.
    *
    * @param streamName a key of `stream`
    * @return one field per descriptor, in declaration order
    */
  def getSchema(streamName: String): StructType = {
    val descriptors: Seq[String] = stream(streamName)._2
    val fields = descriptors.map { descriptor =>
      val parts = descriptor.split(":")
      val declaredType = parts.last
      val nullable = declaredType.endsWith("?")
      val typeName = if (nullable) declaredType.replace("?", "") else declaredType
      StructField(parts.head, datatypedict(typeName), nullable)
    }
    StructType(fields)
  }

  /**
    * Entry point: reads every stream from `sourcePath` as headerless tab-separated text and
    * writes it, overwriting previous output, under `targetPath/<stream name>`.
    *
    * @param args command line arguments, parsed according to `convert_mag_to_oaf_params.json`
    */
  def main(args: Array[String]): Unit = {
    val logger: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/convert_mag_to_oaf_params.json")))
    parser.parseArgument(args)

    val spark: SparkSession = SparkSession
      .builder()
      .config(conf)
      .appName(getClass.getSimpleName)
      .master(parser.get("master"))
      .getOrCreate()

    val sourcePath = parser.get("sourcePath")
    val targetPath = parser.get("targetPath")

    for ((name, (relativePath, _)) <- stream) {
      val schema: StructType = getSchema(name)
      val df = spark.read
        .option("header", "false")
        .option("charset", "UTF8")
        .option("delimiter", "\t")
        .schema(schema)
        .csv(s"$sourcePath/$relativePath")
      logger.info(s"Converting $name")

      df.write.mode(SaveMode.Overwrite).save(s"$targetPath/$name")
    }
  }

}
|
|
@ -0,0 +1,157 @@
|
|||
package eu.dnetlib.doiboost.mag
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql._
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/**
  * Spark job that turns the imported MAG datasets into OAF publications.
  *
  * The job runs as a sequence of phases; each phase writes an intermediate dataset under
  * the target path, and the following phase reads it back.
  */
object SparkPreProcessMAG {

  /**
    * Entry point.
    *
    * @param args command line arguments, parsed according to `preprocess_mag_params.json`
    */
  def main(args: Array[String]): Unit = {

    val logger: Logger = LoggerFactory.getLogger(getClass)
    val conf: SparkConf = new SparkConf()
    val parser = new ArgumentApplicationParser(IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/mag/preprocess_mag_params.json")))
    parser.parseArgument(args)
    val spark: SparkSession =
      SparkSession
        .builder()
        .config(conf)
        .appName(getClass.getSimpleName)
        .master(parser.get("master")).getOrCreate()

    val sourcePath = parser.get("sourcePath")
    val targetPath = parser.get("targetPath")
    import spark.implicits._
    implicit val mapEncoderPubs: Encoder[Publication] = org.apache.spark.sql.Encoders.kryo[Publication]
    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)

    // Reloads the publications written by a previous step, keyed by their MAG identifier.
    def loadPublicationsByMagId(step: String): Dataset[(String, Publication)] =
      spark.read.load(s"$targetPath/$step").as[Publication]
        .map(p => (ConversionUtil.extractMagIdentifier(p.getOriginalId.asScala), p)).as[(String, Publication)]

    logger.info("Phase 1) make unique DOI in Papers:")
    val d: Dataset[MagPapers] = spark.read.load(s"$sourcePath/Papers").as[MagPapers]

    // Filtering Papers with DOI, and since for the same DOI we have multiple version of item with different PapersId we get the last one
    val result: RDD[MagPapers] = d.where(col("Doi").isNotNull)
      .rdd
      .map { p: MagPapers => Tuple2(p.Doi, p) }
      .reduceByKey((p1: MagPapers, p2: MagPapers) => ConversionUtil.choiceLatestMagArtitcle(p1, p2))
      .map(_._2)

    val distinctPaper: Dataset[MagPapers] = spark.createDataset(result)
    distinctPaper.write.mode(SaveMode.Overwrite).save(s"$targetPath/Papers_distinct")

    // This step only converts the abstracts; they are joined to the publications in Phase 6.
    logger.info("Phase 2) convert PaperAbstractsInvertedIndex into plain-text abstracts")
    val pa = spark.read.load(s"$sourcePath/PaperAbstractsInvertedIndex").as[MagPaperAbstract]
    pa.map(ConversionUtil.transformPaperAbstract).write.mode(SaveMode.Overwrite).save(s"$targetPath/PaperAbstract")

    logger.info("Phase 3) Group Author by PaperId")
    val authors = spark.read.load(s"$sourcePath/Authors").as[MagAuthor]

    val affiliation = spark.read.load(s"$sourcePath/Affiliations").as[MagAffiliation]
    val paperAuthorAffiliation = spark.read.load(s"$sourcePath/PaperAuthorAffiliations").as[MagPaperAuthorAffiliation]

    paperAuthorAffiliation.joinWith(authors, paperAuthorAffiliation("AuthorId").equalTo(authors("AuthorId")))
      .map { case (a: MagPaperAuthorAffiliation, b: MagAuthor) => (a.AffiliationId, MagPaperAuthorDenormalized(a.PaperId, b, null)) }
      .joinWith(affiliation, affiliation("AffiliationId").equalTo(col("_1")), "left")
      .map(s => {
        val mpa = s._1._2
        val af = s._2
        // Left join: the affiliation is null when the author has none for this paper.
        if (af != null) {
          MagPaperAuthorDenormalized(mpa.PaperId, mpa.author, af.DisplayName)
        } else
          mpa
      }).groupBy("PaperId").agg(collect_list(struct($"author", $"affiliation")).as("authors"))
      .write.mode(SaveMode.Overwrite).save(s"$targetPath/merge_step_1_paper_authors")

    logger.info("Phase 4) create First Version of publication Entity with Paper Journal and Authors")

    val journals = spark.read.load(s"$sourcePath/Journals").as[MagJournal]

    val papers = spark.read.load(s"$targetPath/Papers_distinct").as[MagPapers]

    val paperWithAuthors = spark.read.load(s"$targetPath/merge_step_1_paper_authors").as[MagPaperWithAuthorList]

    val firstJoin = papers.joinWith(journals, papers("JournalId").equalTo(journals("JournalId")), "left")
    firstJoin.joinWith(paperWithAuthors, firstJoin("_1.PaperId").equalTo(paperWithAuthors("PaperId")), "left")
      .map { a => ConversionUtil.createOAFFromJournalAuthorPaper(a) }
      .write.mode(SaveMode.Overwrite).save(s"$targetPath/merge_step_2")

    var magPubs: Dataset[(String, Publication)] = loadPublicationsByMagId("merge_step_2")

    val conference = spark.read.load(s"$sourcePath/ConferenceInstances")
      .select($"ConferenceInstanceId".as("ci"), $"DisplayName", $"Location", $"StartDate", $"EndDate")
    val conferenceInstance = conference.joinWith(papers, papers("ConferenceInstanceId").equalTo(conference("ci")))
      .select($"_1.ci", $"_1.DisplayName", $"_1.Location", $"_1.StartDate", $"_1.EndDate", $"_2.PaperId").as[MagConferenceInstance]

    magPubs.joinWith(conferenceInstance, col("_1").equalTo(conferenceInstance("PaperId")), "left")
      .map(item => ConversionUtil.updatePubsWithConferenceInfo(item))
      .write
      .mode(SaveMode.Overwrite)
      .save(s"$targetPath/merge_step_2_conference")

    magPubs = loadPublicationsByMagId("merge_step_2_conference")

    val paperUrlDataset = spark.read.load(s"$sourcePath/PaperUrls").as[MagPaperUrl].groupBy("PaperId").agg(collect_list(struct("sourceUrl")).as("instances")).as[MagUrl]

    logger.info("Phase 5) enrich publication with URL and Instances")
    magPubs.joinWith(paperUrlDataset, col("_1").equalTo(paperUrlDataset("PaperId")), "left")
      .map { a: ((String, Publication), MagUrl) => ConversionUtil.addInstances((a._1._2, a._2)) }
      .write.mode(SaveMode.Overwrite)
      .save(s"$targetPath/merge_step_3")

    logger.info("Phase 6) Enrich Publication with description")
    val paperAbstract = spark.read.load(s"$targetPath/PaperAbstract").as[MagPaperAbstract]

    magPubs = loadPublicationsByMagId("merge_step_3")

    magPubs.joinWith(paperAbstract, col("_1").equalTo(paperAbstract("PaperId")), "left")
      .map(item => ConversionUtil.updatePubsWithDescription(item))
      .write.mode(SaveMode.Overwrite).save(s"$targetPath/merge_step_4")

    logger.info("Phase 7) Enrich Publication with FieldOfStudy")

    magPubs = loadPublicationsByMagId("merge_step_4")

    val fos = spark.read.load(s"$sourcePath/FieldsOfStudy").select($"FieldOfStudyId".alias("fos"), $"DisplayName", $"MainType")

    val pfos = spark.read.load(s"$sourcePath/PaperFieldsOfStudy")

    val paperField = pfos.joinWith(fos, fos("fos").equalTo(pfos("FieldOfStudyId")))
      .select($"_1.FieldOfStudyId", $"_2.DisplayName", $"_2.MainType", $"_1.PaperId", $"_1.Score")
      .groupBy($"PaperId").agg(collect_list(struct($"FieldOfStudyId", $"DisplayName", $"MainType", $"Score")).as("subjects"))
      .as[MagFieldOfStudy]

    magPubs.joinWith(paperField, col("_1")
      .equalTo(paperField("PaperId")), "left")
      .map(item => ConversionUtil.updatePubsWithSubject(item))
      .write.mode(SaveMode.Overwrite)
      .save(s"$targetPath/mag_publication")

    // Publications sharing the same identifier are merged into a single record.
    val s: RDD[Publication] = spark.read.load(s"$targetPath/mag_publication").as[Publication]
      .map(p => Tuple2(p.getId, p)).rdd.reduceByKey((a: Publication, b: Publication) => ConversionUtil.mergePublication(a, b))
      .map(_._2)

    spark.createDataset(s).as[Publication].write.mode(SaveMode.Overwrite).save(s"$targetPath/mag_publication_u")

  }
}
|
|
@ -0,0 +1,146 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URI;
|
||||
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.CompressionCodec;
|
||||
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import eu.dnetlib.doiboost.orcid.json.JsonWriter;
|
||||
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
||||
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||
|
||||
public class ActivitiesDecompressor {
|
||||
|
||||
private static final int MAX_XML_WORKS_PARSED = -1;
|
||||
private static final int XML_WORKS_PARSED_COUNTER_LOG_INTERVAL = 100000;
|
||||
|
||||
public static void parseGzActivities(Configuration conf, String inputUri, Path outputPath)
|
||||
throws Exception {
|
||||
String uri = inputUri;
|
||||
FileSystem fs = FileSystem.get(URI.create(uri), conf);
|
||||
Path inputPath = new Path(uri);
|
||||
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
|
||||
CompressionCodec codec = factory.getCodec(inputPath);
|
||||
if (codec == null) {
|
||||
System.err.println("No codec found for " + uri);
|
||||
System.exit(1);
|
||||
}
|
||||
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
|
||||
InputStream gzipInputStream = null;
|
||||
try {
|
||||
gzipInputStream = codec.createInputStream(fs.open(inputPath));
|
||||
parseTarActivities(fs, conf, gzipInputStream, outputPath);
|
||||
|
||||
} finally {
|
||||
Log.debug("Closing gzip stream");
|
||||
IOUtils.closeStream(gzipInputStream);
|
||||
}
|
||||
}
|
||||
|
||||
private static void parseTarActivities(
|
||||
FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) {
|
||||
int counter = 0;
|
||||
int doiFound = 0;
|
||||
int errorFromOrcidFound = 0;
|
||||
int xmlParserErrorFound = 0;
|
||||
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
|
||||
TarArchiveEntry entry = null;
|
||||
|
||||
try (SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(
|
||||
conf,
|
||||
SequenceFile.Writer.file(outputPath),
|
||||
SequenceFile.Writer.keyClass(Text.class),
|
||||
SequenceFile.Writer.valueClass(Text.class))) {
|
||||
while ((entry = tais.getNextTarEntry()) != null) {
|
||||
String filename = entry.getName();
|
||||
|
||||
try {
|
||||
if (entry.isDirectory() || !filename.contains("works")) {
|
||||
|
||||
} else {
|
||||
Log.debug("XML work entry name: " + entry.getName());
|
||||
counter++;
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
|
||||
// tarInput
|
||||
String line;
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
while ((line = br.readLine()) != null) {
|
||||
buffer.append(line);
|
||||
}
|
||||
WorkData workData = XMLRecordParser.VTDParseWorkData(buffer.toString().getBytes());
|
||||
if (workData != null) {
|
||||
if (workData.getErrorCode() != null) {
|
||||
errorFromOrcidFound += 1;
|
||||
Log
|
||||
.debug(
|
||||
"error from Orcid with code "
|
||||
+ workData.getErrorCode()
|
||||
+ " for entry "
|
||||
+ entry.getName());
|
||||
continue;
|
||||
}
|
||||
if (workData.isDoiFound()) {
|
||||
String jsonData = JsonWriter.create(workData);
|
||||
Log.debug("oid: " + workData.getOid() + " data: " + jsonData);
|
||||
|
||||
final Text key = new Text(workData.getOid());
|
||||
final Text value = new Text(jsonData);
|
||||
|
||||
try {
|
||||
writer.append(key, value);
|
||||
} catch (IOException e) {
|
||||
Log.debug("Writing to sequence file: " + e.getMessage());
|
||||
Log.debug(e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
doiFound += 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString());
|
||||
xmlParserErrorFound += 1;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
Log
|
||||
.warn(
|
||||
"Parsing work from tar archive and xml work: " + filename + " " + e.getMessage());
|
||||
Log.warn(e);
|
||||
}
|
||||
|
||||
if ((counter % XML_WORKS_PARSED_COUNTER_LOG_INTERVAL) == 0) {
|
||||
Log.info("Current xml works parsed: " + counter);
|
||||
}
|
||||
|
||||
if ((MAX_XML_WORKS_PARSED > -1) && (counter > MAX_XML_WORKS_PARSED)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
Log.warn("Parsing work from gzip archive: " + e.getMessage());
|
||||
Log.warn(e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
Log.info("Activities parse completed");
|
||||
Log.info("Total XML works parsed: " + counter);
|
||||
Log.info("Total doi found: " + doiFound);
|
||||
Log.info("Error from Orcid found: " + errorFromOrcidFound);
|
||||
Log.info("Error parsing xml work found: " + xmlParserErrorFound);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,81 @@
|
|||
package eu.dnetlib.doiboost.orcid
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.{Author, Publication}
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{ORCID, PID_TYPES, createSP, generateDataInfo, generateIdentifier}
|
||||
import org.apache.commons.lang.StringUtils
|
||||
import org.codehaus.jackson.map.ObjectMapper
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
|
||||
/** An ORCID author record: identifier, name parts, credit name and error code. */
case class ORCIDItem(
  oid: String,
  name: String,
  surname: String,
  creditName: String,
  errorCode: String
)

/** A DOI together with the ORCID authors associated with it. */
case class ORCIDElement(
  doi: String,
  authors: List[ORCIDItem]
)
|
||||
/** Helpers converting ORCID author/DOI associations into OAF publications. */
object ORCIDToOAF {
  val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
  val mapper = new ObjectMapper

  /**
    * Tells whether a string can be parsed as JSON.
    *
    * @param inputStr the text to check
    * @return true when the parser accepts it, false when it raises an IOException
    */
  def isJsonValid(inputStr: String): Boolean = {
    import java.io.IOException
    try {
      mapper.readTree(inputStr)
      true
    } catch {
      case e: IOException =>
        false
    }
  }

  /**
    * Splits a raw input line into its DOI and its JSON list of ORCID authors.
    *
    * The list starts at the first '[' and stops before the last character of the line; the
    * DOI is what lies between the first character and the character preceding the '['.
    * NOTE(review): this looks like the textual form of a (doi,[...]) pair — confirm against
    * the producer of these lines.
    *
    * @param input the raw line
    * @return the (doi, json list) pair, or null when the '[' is missing or found before
    *         index 5, or when the list is not valid JSON
    */
  def extractValueFromInputString(input: String): (String, String) = {
    val i = input.indexOf('[')
    if (i < 5) {
      null
    } else {
      val orcidList = input.substring(i, input.length - 1)
      val doi = input.substring(1, i - 1)
      if (isJsonValid(orcidList)) (doi, orcidList) else null
    }
  }

  /**
    * Creates a publication carrying only the DOI and the ORCID authors of the given element.
    *
    * The DOI stored as pid is lower-cased, like the one used to generate the identifier.
    * Only non-fatal failures are turned into a null result, and their cause is logged.
    *
    * @param input the DOI with its authors
    * @return the publication, or null when it cannot be generated
    */
  def convertTOOAF(input: ORCIDElement): Publication = {
    val doi = input.doi
    val pub: Publication = new Publication
    pub.setPid(List(createSP(doi.toLowerCase, "doi", PID_TYPES)).asJava)
    pub.setDataInfo(generateDataInfo())
    pub.setId(generateIdentifier(pub, doi.toLowerCase))
    try {
      pub.setAuthor(input.authors.map(a => {
        generateAuthor(a.name, a.surname, a.creditName, a.oid)
      }).asJava)
      pub.setCollectedfrom(List(DoiBoostMappingUtil.createORIDCollectedFrom()).asJava)
      pub
    } catch {
      case scala.util.control.NonFatal(e) =>
        logger.error(s"ERROR ON GENERATE Publication from $input", e)
        null
    }
  }

  /**
    * Builds an OAF author.
    *
    * The full name is `fullName` when it is non-empty; otherwise it is composed from the
    * given and family names that are not blank, so that a missing part no longer shows up as
    * the text "null". When nothing is available the full name is left unset.
    *
    * @param given    the given name (may be null)
    * @param family   the family name (may be null)
    * @param fullName the preferred full name (may be null or empty)
    * @param orcid    the ORCID identifier; added as pid only when not blank
    * @return the author
    */
  def generateAuthor(given: String, family: String, fullName: String, orcid: String): Author = {
    val a = new Author
    a.setName(given)
    a.setSurname(family)
    if (fullName != null && fullName.nonEmpty)
      a.setFullname(fullName)
    else {
      val composed = List(given, family).filter(part => StringUtils.isNotBlank(part)).mkString(" ")
      if (composed.nonEmpty)
        a.setFullname(composed)
    }
    if (StringUtils.isNotBlank(orcid))
      a.setPid(List(createSP(orcid, ORCID, PID_TYPES)).asJava)

    a
  }

}
|
|
@ -0,0 +1,51 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class OrcidAuthorsDOIsDataGen extends OrcidDSManager {
|
||||
|
||||
private String activitiesFileNameTarGz;
|
||||
private String outputAuthorsDOIsPath;
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
OrcidAuthorsDOIsDataGen orcidAuthorsDOIsDataGen = new OrcidAuthorsDOIsDataGen();
|
||||
orcidAuthorsDOIsDataGen.loadArgs(args);
|
||||
orcidAuthorsDOIsDataGen.generateAuthorsDOIsData();
|
||||
}
|
||||
|
||||
public void generateAuthorsDOIsData() throws Exception {
|
||||
Configuration conf = initConfigurationObject();
|
||||
FileSystem fs = initFileSystemObject(conf);
|
||||
String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(activitiesFileNameTarGz);
|
||||
Path outputPath = new Path(hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(outputAuthorsDOIsPath));
|
||||
ActivitiesDecompressor.parseGzActivities(conf, tarGzUri, outputPath);
|
||||
}
|
||||
|
||||
private void loadArgs(String[] args) throws IOException, Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
OrcidAuthorsDOIsDataGen.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/create_orcid_authors_dois_data.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
hdfsServerUri = parser.get("hdfsServerUri");
|
||||
Log.info("HDFS URI: " + hdfsServerUri);
|
||||
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
|
||||
Log.info("Default Path: " + hdfsOrcidDefaultPath);
|
||||
activitiesFileNameTarGz = parser.get("activitiesFileNameTarGz");
|
||||
Log.info("Activities File Name: " + activitiesFileNameTarGz);
|
||||
outputAuthorsDOIsPath = parser.get("outputAuthorsDOIsPath");
|
||||
Log.info("Output Authors DOIs Data: " + outputAuthorsDOIsPath);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,81 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class OrcidDSManager {
|
||||
|
||||
protected String hdfsServerUri;
|
||||
protected String hdfsOrcidDefaultPath;
|
||||
private String summariesFileNameTarGz;
|
||||
private String outputAuthorsPath;
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
OrcidDSManager orcidDSManager = new OrcidDSManager();
|
||||
orcidDSManager.loadArgs(args);
|
||||
orcidDSManager.generateAuthors();
|
||||
}
|
||||
|
||||
public void generateAuthors() throws Exception {
|
||||
Configuration conf = initConfigurationObject();
|
||||
FileSystem fs = initFileSystemObject(conf);
|
||||
String tarGzUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(summariesFileNameTarGz);
|
||||
Path outputPath = new Path(
|
||||
hdfsServerUri
|
||||
.concat(hdfsOrcidDefaultPath)
|
||||
.concat(outputAuthorsPath)
|
||||
.concat("authors.seq"));
|
||||
SummariesDecompressor.parseGzSummaries(conf, tarGzUri, outputPath);
|
||||
}
|
||||
|
||||
protected Configuration initConfigurationObject() {
|
||||
// ====== Init HDFS File System Object
|
||||
Configuration conf = new Configuration();
|
||||
// Set FileSystem URI
|
||||
conf.set("fs.defaultFS", hdfsServerUri.concat(hdfsOrcidDefaultPath));
|
||||
// Because of Maven
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
return conf;
|
||||
}
|
||||
|
||||
protected FileSystem initFileSystemObject(Configuration conf) {
|
||||
// Get the filesystem - HDFS
|
||||
FileSystem fs = null;
|
||||
try {
|
||||
fs = FileSystem.get(URI.create(hdfsServerUri.concat(hdfsOrcidDefaultPath)), conf);
|
||||
} catch (IOException e) {
|
||||
// TODO Auto-generated catch block
|
||||
e.printStackTrace();
|
||||
}
|
||||
return fs;
|
||||
}
|
||||
|
||||
private void loadArgs(String[] args) throws IOException, Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
OrcidDSManager.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/create_orcid_authors_data.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
hdfsServerUri = parser.get("hdfsServerUri");
|
||||
Log.info("HDFS URI: " + hdfsServerUri);
|
||||
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
|
||||
Log.info("Default Path: " + hdfsOrcidDefaultPath);
|
||||
summariesFileNameTarGz = parser.get("summariesFileNameTarGz");
|
||||
Log.info("Summaries File Name: " + summariesFileNameTarGz);
|
||||
outputAuthorsPath = parser.get("outputAuthorsPath");
|
||||
Log.info("Output Authors Data: " + outputAuthorsPath);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,203 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class OrcidDownloader extends OrcidDSManager {
|
||||
|
||||
static final int REQ_LIMIT = 24;
|
||||
// static final int REQ_MAX_TEST = 100;
|
||||
static final int RECORD_PARSED_COUNTER_LOG_INTERVAL = 10000;
|
||||
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
|
||||
static final String lastUpdate = "2019-09-30 00:00:00";
|
||||
private String lambdaFileName;
|
||||
private String outputPath;
|
||||
private String token;
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
OrcidDownloader orcidDownloader = new OrcidDownloader();
|
||||
orcidDownloader.loadArgs(args);
|
||||
orcidDownloader.parseLambdaFile();
|
||||
}
|
||||
|
||||
private String downloadRecord(String orcidId) {
|
||||
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
|
||||
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
|
||||
httpGet.addHeader("Authorization", String.format("Bearer %s", token));
|
||||
CloseableHttpResponse response = client.execute(httpGet);
|
||||
if (response.getStatusLine().getStatusCode() != 200) {
|
||||
Log
|
||||
.warn(
|
||||
"Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
|
||||
return new String("");
|
||||
}
|
||||
return IOUtils.toString(response.getEntity().getContent());
|
||||
|
||||
} catch (Throwable e) {
|
||||
Log.warn("Downloading " + orcidId, e.getMessage());
|
||||
|
||||
}
|
||||
return new String("");
|
||||
}
|
||||
|
||||
public void parseLambdaFile() throws Exception {
|
||||
int parsedRecordsCounter = 0;
|
||||
int downloadedRecordsCounter = 0;
|
||||
int savedRecordsCounter = 0;
|
||||
long startDownload = 0;
|
||||
Configuration conf = initConfigurationObject();
|
||||
FileSystem fs = initFileSystemObject(conf);
|
||||
String lambdaFileUri = hdfsServerUri.concat(hdfsOrcidDefaultPath).concat(lambdaFileName);
|
||||
Path hdfsreadpath = new Path(lambdaFileUri);
|
||||
FSDataInputStream lambdaFileStream = fs.open(hdfsreadpath);
|
||||
Path hdfsoutputPath = new Path(
|
||||
hdfsServerUri
|
||||
.concat(hdfsOrcidDefaultPath)
|
||||
.concat(outputPath)
|
||||
.concat("orcid_records.seq"));
|
||||
|
||||
try (SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(
|
||||
conf,
|
||||
SequenceFile.Writer.file(hdfsoutputPath),
|
||||
SequenceFile.Writer.keyClass(Text.class),
|
||||
SequenceFile.Writer.valueClass(Text.class))) {
|
||||
|
||||
try (BufferedReader br = new BufferedReader(new InputStreamReader(lambdaFileStream))) {
|
||||
String line;
|
||||
int nReqTmp = 0;
|
||||
startDownload = System.currentTimeMillis();
|
||||
long startReqTmp = System.currentTimeMillis();
|
||||
while ((line = br.readLine()) != null) {
|
||||
parsedRecordsCounter++;
|
||||
// skip headers line
|
||||
if (parsedRecordsCounter == 1) {
|
||||
continue;
|
||||
}
|
||||
String[] values = line.split(",");
|
||||
List<String> recordInfo = Arrays.asList(values);
|
||||
String orcidId = recordInfo.get(0);
|
||||
if (isModified(orcidId, recordInfo.get(3))) {
|
||||
String record = downloadRecord(orcidId);
|
||||
downloadedRecordsCounter++;
|
||||
if (!record.isEmpty()) {
|
||||
String compressRecord = ArgumentApplicationParser.compressArgument(record);
|
||||
final Text key = new Text(recordInfo.get(0));
|
||||
final Text value = new Text(compressRecord);
|
||||
|
||||
try {
|
||||
writer.append(key, value);
|
||||
savedRecordsCounter++;
|
||||
} catch (IOException e) {
|
||||
Log.warn("Writing to sequence file: " + e.getMessage());
|
||||
Log.warn(e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
long endReq = System.currentTimeMillis();
|
||||
nReqTmp++;
|
||||
if (nReqTmp == REQ_LIMIT) {
|
||||
long reqSessionDuration = endReq - startReqTmp;
|
||||
if (reqSessionDuration <= 1000) {
|
||||
Log
|
||||
.warn(
|
||||
"\nreqSessionDuration: "
|
||||
+ reqSessionDuration
|
||||
+ " nReqTmp: "
|
||||
+ nReqTmp
|
||||
+ " wait ....");
|
||||
Thread.sleep(1000 - reqSessionDuration);
|
||||
} else {
|
||||
nReqTmp = 0;
|
||||
startReqTmp = System.currentTimeMillis();
|
||||
}
|
||||
}
|
||||
|
||||
// if (parsedRecordsCounter > REQ_MAX_TEST) {
|
||||
// break;
|
||||
// }
|
||||
if ((parsedRecordsCounter % RECORD_PARSED_COUNTER_LOG_INTERVAL) == 0) {
|
||||
Log
|
||||
.info(
|
||||
"Current parsed: "
|
||||
+ parsedRecordsCounter
|
||||
+ " downloaded: "
|
||||
+ downloadedRecordsCounter
|
||||
+ " saved: "
|
||||
+ savedRecordsCounter);
|
||||
// if (parsedRecordsCounter > REQ_MAX_TEST) {
|
||||
// break;
|
||||
// }
|
||||
}
|
||||
}
|
||||
long endDownload = System.currentTimeMillis();
|
||||
long downloadTime = endDownload - startDownload;
|
||||
Log.info("Download time: " + ((downloadTime / 1000) / 60) + " minutes");
|
||||
}
|
||||
}
|
||||
lambdaFileStream.close();
|
||||
Log.info("Download started at: " + new Date(startDownload).toString());
|
||||
Log.info("Parsed Records Counter: " + parsedRecordsCounter);
|
||||
Log.info("Downloaded Records Counter: " + downloadedRecordsCounter);
|
||||
Log.info("Saved Records Counter: " + savedRecordsCounter);
|
||||
}
|
||||
|
||||
private void loadArgs(String[] args) throws IOException, Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
OrcidDownloader.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/download_orcid_data.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
hdfsServerUri = parser.get("hdfsServerUri");
|
||||
Log.info("HDFS URI: " + hdfsServerUri);
|
||||
hdfsOrcidDefaultPath = parser.get("hdfsOrcidDefaultPath");
|
||||
Log.info("Default Path: " + hdfsOrcidDefaultPath);
|
||||
lambdaFileName = parser.get("lambdaFileName");
|
||||
Log.info("Lambda File Name: " + lambdaFileName);
|
||||
outputPath = parser.get("outputPath");
|
||||
Log.info("Output Data: " + outputPath);
|
||||
token = parser.get("token");
|
||||
}
|
||||
|
||||
private boolean isModified(String orcidId, String modifiedDate) {
|
||||
Date modifiedDateDt = null;
|
||||
Date lastUpdateDt = null;
|
||||
try {
|
||||
if (modifiedDate.length() != 19) {
|
||||
modifiedDate = modifiedDate.substring(0, 19);
|
||||
}
|
||||
modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
|
||||
lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
|
||||
} catch (Exception e) {
|
||||
Log.warn("[" + orcidId + "] Parsing date: ", e.getMessage());
|
||||
return true;
|
||||
}
|
||||
return modifiedDateDt.after(lastUpdateDt);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,44 @@
|
|||
package eu.dnetlib.doiboost.orcid
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication
|
||||
import eu.dnetlib.doiboost.mag.ConversionUtil
|
||||
import org.apache.commons.io.IOUtils
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||
import org.slf4j.{Logger, LoggerFactory}
|
||||
|
||||
/** Spark job that converts ORCID elements read as JSON into OAF publications. */
object SparkConvertORCIDToOAF {

  /**
    * Reads `ORCIDElement`s from `sourcePath`, converts each one with
    * `ORCIDToOAF.convertTOOAF`, drops the conversions that returned `null`, merges the
    * publications sharing the same id with `ConversionUtil.mergePublication` and writes the
    * result to `targetPath`, overwriting existing data.
    *
    * @param args command-line arguments, parsed against `convert_map_to_oaf_params.json`
    */
  def main(args: Array[String]): Unit = {
    val logger: Logger = LoggerFactory.getLogger(SparkConvertORCIDToOAF.getClass)
    val conf: SparkConf = new SparkConf()

    val paramsJson = IOUtils.toString(SparkConvertORCIDToOAF.getClass.getResourceAsStream("/eu/dnetlib/dhp/doiboost/convert_map_to_oaf_params.json"))
    val parser = new ArgumentApplicationParser(paramsJson)
    parser.parseArgument(args)

    val spark: SparkSession = SparkSession
      .builder()
      .config(conf)
      .appName(getClass.getSimpleName)
      .master(parser.get("master"))
      .getOrCreate()

    implicit val mapEncoderPubs: Encoder[Publication] = Encoders.kryo[Publication]
    implicit val tupleForJoinEncoder: Encoder[(String, Publication)] = Encoders.tuple(Encoders.STRING, mapEncoderPubs)
    import spark.implicits._

    val sourcePath = parser.get("sourcePath")
    val targetPath = parser.get("targetPath")
    val dataset:Dataset[ORCIDElement] = spark.read.json(sourcePath).as[ORCIDElement]

    logger.info("Converting ORCID to OAF")

    // Conversion failures come back as null and are removed before keying by id.
    val converted: Dataset[Publication] = dataset.map(o => ORCIDToOAF.convertTOOAF(o)).filter(p => p != null)
    val keyedById: Dataset[(String, Publication)] = converted.map(p => (p.getId, p))
    val d: RDD[Publication] = keyedById.rdd.reduceByKey(ConversionUtil.mergePublication).map(_._2)

    spark.createDataset(d).as[Publication].write.mode(SaveMode.Overwrite).save(targetPath)
  }

}
|
|
@ -0,0 +1,180 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.hadoop.mapreduce.Job;
|
||||
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.esotericsoftware.minlog.Log;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.gson.JsonElement;
|
||||
import com.google.gson.JsonParser;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkGenerateDoiAuthorList {
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
Logger logger = LoggerFactory.getLogger(SparkGenerateDoiAuthorList.class);
|
||||
logger.info("[ SparkGenerateDoiAuthorList STARTED]");
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkGenerateDoiAuthorList.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/gen_doi_author_list_orcid_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
final String workingPath = parser.get("workingPath");
|
||||
logger.info("workingPath: ", workingPath);
|
||||
final String outputDoiAuthorListPath = parser.get("outputDoiAuthorListPath");
|
||||
logger.info("outputDoiAuthorListPath: ", outputDoiAuthorListPath);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaPairRDD<Text, Text> summariesRDD = sc
|
||||
.sequenceFile(workingPath + "../orcid_summaries/output/authors.seq", Text.class, Text.class);
|
||||
Dataset<AuthorData> summariesDataset = spark
|
||||
.createDataset(
|
||||
summariesRDD.map(seq -> loadAuthorFromJson(seq._1(), seq._2())).rdd(),
|
||||
Encoders.bean(AuthorData.class));
|
||||
|
||||
JavaPairRDD<Text, Text> activitiesRDD = sc
|
||||
.sequenceFile(workingPath + "/output/*.seq", Text.class, Text.class);
|
||||
Dataset<WorkData> activitiesDataset = spark
|
||||
.createDataset(
|
||||
activitiesRDD.map(seq -> loadWorkFromJson(seq._1(), seq._2())).rdd(),
|
||||
Encoders.bean(WorkData.class));
|
||||
|
||||
Function<Tuple2<String, AuthorData>, Tuple2<String, List<AuthorData>>> toAuthorListFunction = data -> {
|
||||
try {
|
||||
String doi = data._1();
|
||||
if (doi == null) {
|
||||
return null;
|
||||
}
|
||||
AuthorData author = data._2();
|
||||
if (author == null) {
|
||||
return null;
|
||||
}
|
||||
List<AuthorData> toAuthorList = Arrays.asList(author);
|
||||
return new Tuple2<>(doi, toAuthorList);
|
||||
} catch (Exception e) {
|
||||
Log.error("toAuthorListFunction ERROR", e);
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
JavaRDD<Tuple2<String, List<AuthorData>>> doisRDD = activitiesDataset
|
||||
.joinWith(
|
||||
summariesDataset,
|
||||
activitiesDataset.col("oid").equalTo(summariesDataset.col("oid")), "inner")
|
||||
.map(
|
||||
(MapFunction<Tuple2<WorkData, AuthorData>, Tuple2<String, AuthorData>>) value -> {
|
||||
WorkData w = value._1;
|
||||
AuthorData a = value._2;
|
||||
return new Tuple2<>(w.getDoi(), a);
|
||||
},
|
||||
Encoders.tuple(Encoders.STRING(), Encoders.bean(AuthorData.class)))
|
||||
.filter(Objects::nonNull)
|
||||
.toJavaRDD()
|
||||
.map(toAuthorListFunction);
|
||||
|
||||
JavaPairRDD
|
||||
.fromJavaRDD(doisRDD)
|
||||
.reduceByKey((d1, d2) -> {
|
||||
try {
|
||||
if (d1 != null && d2 != null) {
|
||||
Stream<AuthorData> mergedStream = Stream
|
||||
.concat(
|
||||
d1.stream(),
|
||||
d2.stream());
|
||||
List<AuthorData> mergedAuthors = mergedStream.collect(Collectors.toList());
|
||||
return mergedAuthors;
|
||||
}
|
||||
if (d1 != null) {
|
||||
return d1;
|
||||
}
|
||||
if (d2 != null) {
|
||||
return d2;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
Log.error("mergeAuthorsFunction ERROR", e);
|
||||
return null;
|
||||
}
|
||||
return null;
|
||||
})
|
||||
.mapToPair(
|
||||
s -> {
|
||||
ObjectMapper mapper = new ObjectMapper();
|
||||
return new Tuple2<>(s._1(), mapper.writeValueAsString(s._2()));
|
||||
})
|
||||
.repartition(10)
|
||||
.saveAsTextFile(workingPath + outputDoiAuthorListPath);
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static AuthorData loadAuthorFromJson(Text orcidId, Text json) {
|
||||
AuthorData authorData = new AuthorData();
|
||||
authorData.setOid(orcidId.toString());
|
||||
JsonElement jElement = new JsonParser().parse(json.toString());
|
||||
authorData.setName(getJsonValue(jElement, "name"));
|
||||
authorData.setSurname(getJsonValue(jElement, "surname"));
|
||||
authorData.setCreditName(getJsonValue(jElement, "creditname"));
|
||||
return authorData;
|
||||
}
|
||||
|
||||
private static WorkData loadWorkFromJson(Text orcidId, Text json) {
|
||||
WorkData workData = new WorkData();
|
||||
workData.setOid(orcidId.toString());
|
||||
JsonElement jElement = new JsonParser().parse(json.toString());
|
||||
workData.setDoi(getJsonValue(jElement, "doi"));
|
||||
return workData;
|
||||
}
|
||||
|
||||
private static String getJsonValue(JsonElement jElement, String property) {
|
||||
if (jElement.getAsJsonObject().has(property)) {
|
||||
JsonElement name = null;
|
||||
name = jElement.getAsJsonObject().get(property);
|
||||
if (name != null && !name.isJsonNull()) {
|
||||
return name.getAsString();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,165 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.util.LongAccumulator;
|
||||
import org.mortbay.log.Log;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.doiboost.orcid.model.DownloadedRecordData;
|
||||
import scala.Tuple2;
|
||||
|
||||
public class SparkOrcidGenerateAuthors {
|
||||
|
||||
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
|
||||
static final String lastUpdate = "2019-09-30 00:00:00";
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
Logger logger = LoggerFactory.getLogger(SparkOrcidGenerateAuthors.class);
|
||||
logger.info("[ SparkOrcidGenerateAuthors STARTED]");
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkOrcidGenerateAuthors.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
final String workingPath = parser.get("workingPath");
|
||||
logger.info("workingPath: ", workingPath);
|
||||
final String outputAuthorsPath = parser.get("outputAuthorsPath");
|
||||
logger.info("outputAuthorsPath: ", outputAuthorsPath);
|
||||
final String token = parser.get("token");
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
LongAccumulator parsedRecordsAcc = sc.sc().longAccumulator("parsedRecords");
|
||||
LongAccumulator modifiedRecordsAcc = sc.sc().longAccumulator("modifiedRecords");
|
||||
LongAccumulator downloadedRecordsAcc = sc.sc().longAccumulator("downloadedRecords");
|
||||
LongAccumulator alreadyDownloadedRecords = sc.sc().longAccumulator("alreadyDownloadedRecords");
|
||||
JavaRDD<String> lamdaFileRDD = sc.textFile(workingPath + "lamdafiles");
|
||||
|
||||
JavaRDD<String> downloadedRDD = sc.textFile(workingPath + "downloaded");
|
||||
Function<String, String> getOrcidIdFunction = line -> {
|
||||
try {
|
||||
String[] values = line.split(",");
|
||||
return values[0].substring(1);
|
||||
} catch (Exception e) {
|
||||
return new String("");
|
||||
}
|
||||
};
|
||||
List<String> downloadedRecords = downloadedRDD.map(getOrcidIdFunction).collect();
|
||||
|
||||
Function<String, Boolean> isModifiedAfterFilter = line -> {
|
||||
String[] values = line.split(",");
|
||||
String orcidId = values[0];
|
||||
parsedRecordsAcc.add(1);
|
||||
if (isModified(orcidId, values[3])) {
|
||||
modifiedRecordsAcc.add(1);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
};
|
||||
Function<String, Boolean> isNotDownloadedFilter = line -> {
|
||||
String[] values = line.split(",");
|
||||
String orcidId = values[0];
|
||||
if (downloadedRecords.contains(orcidId)) {
|
||||
alreadyDownloadedRecords.add(1);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
};
|
||||
Function<String, Tuple2<String, String>> downloadRecordFunction = line -> {
|
||||
String[] values = line.split(",");
|
||||
String orcidId = values[0];
|
||||
String modifiedDate = values[3];
|
||||
return downloadRecord(orcidId, modifiedDate, token, downloadedRecordsAcc);
|
||||
};
|
||||
|
||||
lamdaFileRDD
|
||||
.filter(isModifiedAfterFilter)
|
||||
.filter(isNotDownloadedFilter)
|
||||
.map(downloadRecordFunction)
|
||||
.rdd()
|
||||
.saveAsTextFile(workingPath.concat(outputAuthorsPath));
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
private static boolean isModified(String orcidId, String modifiedDate) {
|
||||
Date modifiedDateDt = null;
|
||||
Date lastUpdateDt = null;
|
||||
try {
|
||||
if (modifiedDate.length() != 19) {
|
||||
modifiedDate = modifiedDate.substring(0, 19);
|
||||
}
|
||||
modifiedDateDt = new SimpleDateFormat(DATE_FORMAT).parse(modifiedDate);
|
||||
lastUpdateDt = new SimpleDateFormat(DATE_FORMAT).parse(lastUpdate);
|
||||
} catch (Exception e) {
|
||||
Log.warn("[" + orcidId + "] Parsing date: ", e.getMessage());
|
||||
return true;
|
||||
}
|
||||
return modifiedDateDt.after(lastUpdateDt);
|
||||
}
|
||||
|
||||
private static Tuple2<String, String> downloadRecord(String orcidId, String modifiedDate, String token,
|
||||
LongAccumulator downloadedRecordsAcc) {
|
||||
final DownloadedRecordData data = new DownloadedRecordData();
|
||||
data.setOrcidId(orcidId);
|
||||
data.setModifiedDate(modifiedDate);
|
||||
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
HttpGet httpGet = new HttpGet("https://api.orcid.org/v3.0/" + orcidId + "/record");
|
||||
httpGet.addHeader("Accept", "application/vnd.orcid+xml");
|
||||
httpGet.addHeader("Authorization", String.format("Bearer %s", token));
|
||||
CloseableHttpResponse response = client.execute(httpGet);
|
||||
int statusCode = response.getStatusLine().getStatusCode();
|
||||
data.setStatusCode(statusCode);
|
||||
if (statusCode != 200) {
|
||||
Log
|
||||
.warn(
|
||||
"Downloading " + orcidId + " status code: " + response.getStatusLine().getStatusCode());
|
||||
return data.toTuple2();
|
||||
}
|
||||
downloadedRecordsAcc.add(1);
|
||||
data
|
||||
.setCompressedData(
|
||||
ArgumentApplicationParser.compressArgument(IOUtils.toString(response.getEntity().getContent())));
|
||||
} catch (Throwable e) {
|
||||
Log.warn("Downloading " + orcidId, e.getMessage());
|
||||
data.setErrorMessage(e.getMessage());
|
||||
return data.toTuple2();
|
||||
}
|
||||
return data.toTuple2();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class SparkPartitionLambdaFile {
|
||||
|
||||
public static void main(String[] args) throws IOException, Exception {
|
||||
Logger logger = LoggerFactory.getLogger(SparkOrcidGenerateAuthors.class);
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkOrcidGenerateAuthors.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/doiboost/gen_orcid_authors_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
final String workingPath = parser.get("workingPath");
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(
|
||||
conf,
|
||||
isSparkSessionManaged,
|
||||
spark -> {
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
JavaRDD<String> lamdaFileRDD = sc.textFile(workingPath + "last_modified.csv");
|
||||
|
||||
lamdaFileRDD
|
||||
.repartition(20)
|
||||
.saveAsTextFile(workingPath.concat("lamdafiles"));
|
||||
});
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,158 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.URI;
|
||||
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
import org.apache.hadoop.io.SequenceFile;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.CompressionCodec;
|
||||
import org.apache.hadoop.io.compress.CompressionCodecFactory;
|
||||
import org.mortbay.log.Log;
|
||||
|
||||
import eu.dnetlib.doiboost.orcid.json.JsonWriter;
|
||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||
import eu.dnetlib.doiboost.orcid.xml.XMLRecordParser;
|
||||
|
||||
public class SummariesDecompressor {
|
||||
|
||||
private static final int MAX_XML_RECORDS_PARSED = -1;
|
||||
|
||||
public static void parseGzSummaries(Configuration conf, String inputUri, Path outputPath)
|
||||
throws Exception {
|
||||
String uri = inputUri;
|
||||
FileSystem fs = FileSystem.get(URI.create(uri), conf);
|
||||
Path inputPath = new Path(uri);
|
||||
CompressionCodecFactory factory = new CompressionCodecFactory(conf);
|
||||
CompressionCodec codec = factory.getCodec(inputPath);
|
||||
if (codec == null) {
|
||||
System.err.println("No codec found for " + uri);
|
||||
System.exit(1);
|
||||
}
|
||||
CompressionCodecFactory.removeSuffix(uri, codec.getDefaultExtension());
|
||||
InputStream gzipInputStream = null;
|
||||
try {
|
||||
gzipInputStream = codec.createInputStream(fs.open(inputPath));
|
||||
parseTarSummaries(fs, conf, gzipInputStream, outputPath);
|
||||
|
||||
} finally {
|
||||
Log.debug("Closing gzip stream");
|
||||
IOUtils.closeStream(gzipInputStream);
|
||||
}
|
||||
}
|
||||
|
||||
private static void parseTarSummaries(
|
||||
FileSystem fs, Configuration conf, InputStream gzipInputStream, Path outputPath) {
|
||||
int counter = 0;
|
||||
int nameFound = 0;
|
||||
int surnameFound = 0;
|
||||
int creditNameFound = 0;
|
||||
int errorFromOrcidFound = 0;
|
||||
int xmlParserErrorFound = 0;
|
||||
try (TarArchiveInputStream tais = new TarArchiveInputStream(gzipInputStream)) {
|
||||
TarArchiveEntry entry = null;
|
||||
|
||||
try (SequenceFile.Writer writer = SequenceFile
|
||||
.createWriter(
|
||||
conf,
|
||||
SequenceFile.Writer.file(outputPath),
|
||||
SequenceFile.Writer.keyClass(Text.class),
|
||||
SequenceFile.Writer.valueClass(Text.class))) {
|
||||
while ((entry = tais.getNextTarEntry()) != null) {
|
||||
String filename = entry.getName();
|
||||
try {
|
||||
if (entry.isDirectory()) {
|
||||
Log.debug("Directory entry name: " + entry.getName());
|
||||
} else {
|
||||
Log.debug("XML record entry name: " + entry.getName());
|
||||
counter++;
|
||||
BufferedReader br = new BufferedReader(new InputStreamReader(tais)); // Read directly from
|
||||
// tarInput
|
||||
String line;
|
||||
StringBuffer buffer = new StringBuffer();
|
||||
while ((line = br.readLine()) != null) {
|
||||
buffer.append(line);
|
||||
}
|
||||
AuthorData authorData = XMLRecordParser.VTDParseAuthorData(buffer.toString().getBytes());
|
||||
if (authorData != null) {
|
||||
if (authorData.getErrorCode() != null) {
|
||||
errorFromOrcidFound += 1;
|
||||
Log
|
||||
.debug(
|
||||
"error from Orcid with code "
|
||||
+ authorData.getErrorCode()
|
||||
+ " for oid "
|
||||
+ entry.getName());
|
||||
continue;
|
||||
}
|
||||
String jsonData = JsonWriter.create(authorData);
|
||||
Log.debug("oid: " + authorData.getOid() + " data: " + jsonData);
|
||||
|
||||
final Text key = new Text(authorData.getOid());
|
||||
final Text value = new Text(jsonData);
|
||||
|
||||
try {
|
||||
writer.append(key, value);
|
||||
} catch (IOException e) {
|
||||
Log.debug("Writing to sequence file: " + e.getMessage());
|
||||
Log.debug(e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
if (authorData.getName() != null) {
|
||||
nameFound += 1;
|
||||
}
|
||||
if (authorData.getSurname() != null) {
|
||||
surnameFound += 1;
|
||||
}
|
||||
if (authorData.getCreditName() != null) {
|
||||
creditNameFound += 1;
|
||||
}
|
||||
|
||||
} else {
|
||||
Log.warn("Data not retrievable [" + entry.getName() + "] " + buffer.toString());
|
||||
xmlParserErrorFound += 1;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
Log
|
||||
.warn(
|
||||
"Parsing record from tar archive and xml record: "
|
||||
+ filename
|
||||
+ " "
|
||||
+ e.getMessage());
|
||||
Log.warn(e);
|
||||
}
|
||||
|
||||
if ((counter % 100000) == 0) {
|
||||
Log.info("Current xml records parsed: " + counter);
|
||||
}
|
||||
|
||||
if ((MAX_XML_RECORDS_PARSED > -1) && (counter > MAX_XML_RECORDS_PARSED)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
Log.warn("Parsing record from gzip archive: " + e.getMessage());
|
||||
Log.warn(e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
Log.info("Summaries parse completed");
|
||||
Log.info("Total XML records parsed: " + counter);
|
||||
Log.info("Name found: " + nameFound);
|
||||
Log.info("Surname found: " + surnameFound);
|
||||
Log.info("Credit name found: " + creditNameFound);
|
||||
Log.info("Error from Orcid found: " + errorFromOrcidFound);
|
||||
Log.info("Error parsing xml record found: " + xmlParserErrorFound);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,28 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid.json;
|
||||
|
||||
import com.google.gson.JsonObject;
|
||||
|
||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
||||
|
||||
public class JsonWriter {
|
||||
|
||||
public static String create(AuthorData authorData) {
|
||||
JsonObject author = new JsonObject();
|
||||
author.addProperty("oid", authorData.getOid());
|
||||
author.addProperty("name", authorData.getName());
|
||||
author.addProperty("surname", authorData.getSurname());
|
||||
if (authorData.getCreditName() != null) {
|
||||
author.addProperty("creditname", authorData.getCreditName());
|
||||
}
|
||||
return author.toString();
|
||||
}
|
||||
|
||||
public static String create(WorkData workData) {
|
||||
JsonObject work = new JsonObject();
|
||||
work.addProperty("oid", workData.getOid());
|
||||
work.addProperty("doi", workData.getDoi());
|
||||
return work.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
public class AuthorData implements Serializable {
|
||||
|
||||
private String oid;
|
||||
private String name;
|
||||
private String surname;
|
||||
private String creditName;
|
||||
private String errorCode;
|
||||
|
||||
public String getErrorCode() {
|
||||
return errorCode;
|
||||
}
|
||||
|
||||
public void setErrorCode(String errorCode) {
|
||||
this.errorCode = errorCode;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getSurname() {
|
||||
return surname;
|
||||
}
|
||||
|
||||
public void setSurname(String surname) {
|
||||
this.surname = surname;
|
||||
}
|
||||
|
||||
public String getCreditName() {
|
||||
return creditName;
|
||||
}
|
||||
|
||||
public void setCreditName(String creditName) {
|
||||
this.creditName = creditName;
|
||||
}
|
||||
|
||||
public String getOid() {
|
||||
return oid;
|
||||
}
|
||||
|
||||
public void setOid(String oid) {
|
||||
this.oid = oid;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,76 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
import org.apache.hadoop.io.Text;
|
||||
|
||||
import com.google.gson.JsonObject;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
public class DownloadedRecordData implements Serializable {
|
||||
|
||||
private String orcidId;
|
||||
private String modifiedDate;
|
||||
private String statusCode;
|
||||
private String compressedData;
|
||||
private String errorMessage;
|
||||
|
||||
public Tuple2<String, String> toTuple2() {
|
||||
JsonObject data = new JsonObject();
|
||||
data.addProperty("statusCode", getStatusCode());
|
||||
data.addProperty("modifiedDate", getModifiedDate());
|
||||
if (getCompressedData() != null) {
|
||||
data.addProperty("compressedData", getCompressedData());
|
||||
}
|
||||
if (getErrorMessage() != null) {
|
||||
data.addProperty("errorMessage", getErrorMessage());
|
||||
}
|
||||
return new Tuple2<>(orcidId, data.toString());
|
||||
}
|
||||
|
||||
public String getErrorMessage() {
|
||||
return errorMessage;
|
||||
}
|
||||
|
||||
public void setErrorMessage(String errorMessage) {
|
||||
this.errorMessage = errorMessage;
|
||||
}
|
||||
|
||||
public String getOrcidId() {
|
||||
return orcidId;
|
||||
}
|
||||
|
||||
public void setOrcidId(String orcidId) {
|
||||
this.orcidId = orcidId;
|
||||
}
|
||||
|
||||
public int getStatusCode() {
|
||||
try {
|
||||
return Integer.parseInt(statusCode);
|
||||
} catch (Exception e) {
|
||||
return -2;
|
||||
}
|
||||
}
|
||||
|
||||
public void setStatusCode(int statusCode) {
|
||||
this.statusCode = Integer.toString(statusCode);
|
||||
}
|
||||
|
||||
public String getCompressedData() {
|
||||
return compressedData;
|
||||
}
|
||||
|
||||
public void setCompressedData(String compressedData) {
|
||||
this.compressedData = compressedData;
|
||||
}
|
||||
|
||||
public String getModifiedDate() {
|
||||
return modifiedDate;
|
||||
}
|
||||
|
||||
public void setModifiedDate(String modifiedDate) {
|
||||
this.modifiedDate = modifiedDate;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid.model;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
/**
 * Serializable bean holding the fields extracted from an ORCID work record,
 * or the error code reported in place of a record.
 */
public class WorkData implements Serializable {

	private String oid;
	private String doi;
	private boolean doiFound = false;
	private String errorCode;

	/** @return the ORCID identifier, or null when not set */
	public String getOid() {
		return this.oid;
	}

	/** @param oid the ORCID identifier */
	public void setOid(String oid) {
		this.oid = oid;
	}

	/** @return the DOI, or null when not set */
	public String getDoi() {
		return this.doi;
	}

	/** @param doi the DOI */
	public void setDoi(String doi) {
		this.doi = doi;
	}

	/** @return the DOI-found flag; false until explicitly set */
	public boolean isDoiFound() {
		return this.doiFound;
	}

	/** @param doiFound the DOI-found flag */
	public void setDoiFound(boolean doiFound) {
		this.doiFound = doiFound;
	}

	/** @return the error code, or null when not set */
	public String getErrorCode() {
		return this.errorCode;
	}

	/** @param errorCode the error code */
	public void setErrorCode(String errorCode) {
		this.errorCode = errorCode;
	}
}
|
|
@ -0,0 +1,123 @@
|
|||
|
||||
package eu.dnetlib.doiboost.orcid.xml;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import com.ximpleware.AutoPilot;
|
||||
import com.ximpleware.EOFException;
|
||||
import com.ximpleware.EncodingException;
|
||||
import com.ximpleware.EntityException;
|
||||
import com.ximpleware.ParseException;
|
||||
import com.ximpleware.VTDGen;
|
||||
import com.ximpleware.VTDNav;
|
||||
|
||||
import eu.dnetlib.dhp.parser.utility.VtdException;
|
||||
import eu.dnetlib.dhp.parser.utility.VtdUtilityParser;
|
||||
import eu.dnetlib.doiboost.orcid.model.AuthorData;
|
||||
import eu.dnetlib.doiboost.orcid.model.WorkData;
|
||||
|
||||
public class XMLRecordParser {
|
||||
|
||||
private static final String NS_COMMON_URL = "http://www.orcid.org/ns/common";
|
||||
private static final String NS_COMMON = "common";
|
||||
private static final String NS_PERSON_URL = "http://www.orcid.org/ns/person";
|
||||
private static final String NS_PERSON = "person";
|
||||
private static final String NS_DETAILS_URL = "http://www.orcid.org/ns/personal-details";
|
||||
private static final String NS_DETAILS = "personal-details";
|
||||
private static final String NS_OTHER_URL = "http://www.orcid.org/ns/other-name";
|
||||
private static final String NS_OTHER = "other-name";
|
||||
private static final String NS_RECORD_URL = "http://www.orcid.org/ns/record";
|
||||
private static final String NS_RECORD = "record";
|
||||
private static final String NS_ERROR_URL = "http://www.orcid.org/ns/error";
|
||||
|
||||
private static final String NS_WORK = "work";
|
||||
private static final String NS_WORK_URL = "http://www.orcid.org/ns/work";
|
||||
|
||||
private static final String NS_ERROR = "error";
|
||||
|
||||
public static AuthorData VTDParseAuthorData(byte[] bytes)
|
||||
throws VtdException, EncodingException, EOFException, EntityException, ParseException {
|
||||
final VTDGen vg = new VTDGen();
|
||||
vg.setDoc(bytes);
|
||||
vg.parse(true);
|
||||
final VTDNav vn = vg.getNav();
|
||||
final AutoPilot ap = new AutoPilot(vn);
|
||||
ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
|
||||
ap.declareXPathNameSpace(NS_PERSON, NS_PERSON_URL);
|
||||
ap.declareXPathNameSpace(NS_DETAILS, NS_DETAILS_URL);
|
||||
ap.declareXPathNameSpace(NS_OTHER, NS_OTHER_URL);
|
||||
ap.declareXPathNameSpace(NS_RECORD, NS_RECORD_URL);
|
||||
ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
|
||||
|
||||
AuthorData authorData = new AuthorData();
|
||||
final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
|
||||
if (!errors.isEmpty()) {
|
||||
authorData.setErrorCode(errors.get(0));
|
||||
return authorData;
|
||||
}
|
||||
|
||||
List<VtdUtilityParser.Node> recordNodes = VtdUtilityParser
|
||||
.getTextValuesWithAttributes(
|
||||
ap, vn, "//record:record", Arrays.asList("path"));
|
||||
if (!recordNodes.isEmpty()) {
|
||||
final String oid = (recordNodes.get(0).getAttributes().get("path")).substring(1);
|
||||
authorData.setOid(oid);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
||||
final List<String> names = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:given-names");
|
||||
if (!names.isEmpty()) {
|
||||
authorData.setName(names.get(0));
|
||||
}
|
||||
|
||||
final List<String> surnames = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:family-name");
|
||||
if (!surnames.isEmpty()) {
|
||||
authorData.setSurname(surnames.get(0));
|
||||
}
|
||||
|
||||
final List<String> creditNames = VtdUtilityParser.getTextValue(ap, vn, "//personal-details:credit-name");
|
||||
if (!creditNames.isEmpty()) {
|
||||
authorData.setCreditName(creditNames.get(0));
|
||||
}
|
||||
return authorData;
|
||||
}
|
||||
|
||||
public static WorkData VTDParseWorkData(byte[] bytes)
|
||||
throws VtdException, EncodingException, EOFException, EntityException, ParseException {
|
||||
final VTDGen vg = new VTDGen();
|
||||
vg.setDoc(bytes);
|
||||
vg.parse(true);
|
||||
final VTDNav vn = vg.getNav();
|
||||
final AutoPilot ap = new AutoPilot(vn);
|
||||
ap.declareXPathNameSpace(NS_COMMON, NS_COMMON_URL);
|
||||
ap.declareXPathNameSpace(NS_WORK, NS_WORK_URL);
|
||||
ap.declareXPathNameSpace(NS_ERROR, NS_ERROR_URL);
|
||||
|
||||
WorkData workData = new WorkData();
|
||||
final List<String> errors = VtdUtilityParser.getTextValue(ap, vn, "//error:response-code");
|
||||
if (!errors.isEmpty()) {
|
||||
workData.setErrorCode(errors.get(0));
|
||||
return workData;
|
||||
}
|
||||
|
||||
List<VtdUtilityParser.Node> workNodes = VtdUtilityParser
|
||||
.getTextValuesWithAttributes(ap, vn, "//work:work", Arrays.asList("path"));
|
||||
if (!workNodes.isEmpty()) {
|
||||
final String oid = (workNodes.get(0).getAttributes().get("path")).split("/")[1];
|
||||
workData.setOid(oid);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
||||
final List<String> dois = VtdUtilityParser
|
||||
.getTextValue(
|
||||
ap, vn, "//common:external-id-type[text()=\"doi\"]/../common:external-id-value");
|
||||
if (!dois.isEmpty()) {
|
||||
workData.setDoi(dois.get(0));
|
||||
workData.setDoiFound(true);
|
||||
}
|
||||
return workData;
|
||||
}
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue