Compare commits

...

23 Commits

Author SHA1 Message Date
Claudio Atzori c15c8c0ad0 map datasource identities (including piwik ids) as original IDs 2020-06-15 16:07:30 +02:00
Claudio Atzori 0d52816244 WIP: graph cleaner implementation 2020-06-13 13:06:04 +02:00
Claudio Atzori bed65a1be6 WIP: graph cleaner implementation 2020-06-12 18:25:47 +02:00
Claudio Atzori c4d9f1837f [maven-release-plugin] prepare for next development iteration 2020-06-12 12:21:08 +02:00
Claudio Atzori f0746a7605 [maven-release-plugin] prepare release dhp-1.2.2 2020-06-12 12:21:03 +02:00
Claudio Atzori 463489f59f code formatting 2020-06-12 12:03:25 +02:00
Claudio Atzori 4bcad1c9c3 Merge branch 'graph_cleaning' 2020-06-12 11:40:25 +02:00
Claudio Atzori cdb1956fe9 WIP: graph cleaner implementation 2020-06-12 11:36:59 +02:00
Alessia Bardi b347499745 do not use deprecated subreltype 2020-06-12 10:58:02 +02:00
Alessia Bardi ed8879ed8b deprecate PUBLICATION_DATASET 2020-06-12 10:55:56 +02:00
Alessia Bardi 3ade2631b3 Constants for new rels: citations and reviews 2020-06-12 10:52:12 +02:00
Claudio Atzori 97b1c4057c WIP: graph cleaner implementation 2020-06-12 10:45:18 +02:00
Claudio Atzori ba8a024af9 avoid NPEs merging titles 2020-06-12 10:45:11 +02:00
Michele Artini 30ea1bda88 oozie workflow 2020-06-12 10:42:35 +02:00
Michele Artini c22cb5a3c6 refactoring 2020-06-12 09:47:55 +02:00
Claudio Atzori d1d92c4d8c fixed integration of claims in the graph 2020-06-11 10:12:00 +02:00
Claudio Atzori 953da4a427 Merge branch 'master' into graph_cleaning 2020-06-10 21:36:56 +02:00
Claudio Atzori f1bce64391 WIP: graph cleaner implementation 2020-06-10 21:36:31 +02:00
Claudio Atzori 67c7b31ba6 Merge branch 'master' into graph_cleaning 2020-06-10 15:00:35 +02:00
Claudio Atzori a2fdf85ba1 WIP: graph cleaner implementation 2020-06-09 19:52:53 +02:00
Claudio Atzori d9f33582c5 WIP: graph cleaner implementation 2020-06-09 17:20:40 +02:00
Claudio Atzori 3d871c6651 Merge branch 'master' into graph_cleaning 2020-06-08 15:23:24 +02:00
Claudio Atzori b2349659cf WIP: graph property fixing implementation 2020-06-05 18:37:38 +02:00
70 changed files with 4279 additions and 441 deletions

View File

@ -6,7 +6,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId> <artifactId>dhp-build</artifactId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
</parent> </parent>
<artifactId>dhp-build-assembly-resources</artifactId> <artifactId>dhp-build-assembly-resources</artifactId>

View File

@ -6,7 +6,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-build</artifactId> <artifactId>dhp-build</artifactId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
</parent> </parent>
<artifactId>dhp-build-properties-maven-plugin</artifactId> <artifactId>dhp-build-properties-maven-plugin</artifactId>

View File

@ -5,7 +5,7 @@
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-code-style</artifactId> <artifactId>dhp-code-style</artifactId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
<packaging>jar</packaging> <packaging>jar</packaging>

View File

@ -4,7 +4,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId> <artifactId>dhp</artifactId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
</parent> </parent>
<artifactId>dhp-build</artifactId> <artifactId>dhp-build</artifactId>
<packaging>pom</packaging> <packaging>pom</packaging>

View File

@ -5,7 +5,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId> <artifactId>dhp</artifactId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
<relativePath>../</relativePath> <relativePath>../</relativePath>
</parent> </parent>

View File

@ -2,6 +2,7 @@
package eu.dnetlib.dhp.common; package eu.dnetlib.dhp.common;
import java.io.Serializable; import java.io.Serializable;
import java.util.function.Consumer;
import java.util.function.Supplier; import java.util.function.Supplier;
/** Provides serializable and throwing extensions to standard functional interfaces. */ /** Provides serializable and throwing extensions to standard functional interfaces. */
@ -10,6 +11,16 @@ public class FunctionalInterfaceSupport {
private FunctionalInterfaceSupport() { private FunctionalInterfaceSupport() {
} }
/**
* Serializable consumer of any kind of objects. To be used withing spark processing pipelines when supplying
* functions externally.
*
* @param <T>
*/
@FunctionalInterface
public interface SerializableConsumer<T> extends Consumer<T>, Serializable {
}
/** /**
* Serializable supplier of any kind of objects. To be used withing spark processing pipelines when supplying * Serializable supplier of any kind of objects. To be used withing spark processing pipelines when supplying
* functions externally. * functions externally.

View File

@ -17,12 +17,11 @@ import com.google.common.collect.Lists;
import com.google.common.hash.Hashing; import com.google.common.hash.Hashing;
/** /**
* PacePerson tries to derive information from the fullname string of an author. * PacePerson tries to derive information from the fullname string of an author. Such informations are Names, Surnames
* Such informations are Names, Surnames an Fullname split into terms. It provides also an additional field for * an Fullname split into terms. It provides also an additional field for the original data. The calculation of the
* the original data. * names and the surnames is not always possible. When it is impossible to assert which are the names and the surnames,
* The calculation of the names and the surnames is not always possible. When it is impossible to assert which are the * the lists are empty.
* names and the surnames, the lists are empty. */
* */
public class PacePerson { public class PacePerson {
private static final String UTF8 = "UTF-8"; private static final String UTF8 = "UTF-8";
@ -34,18 +33,18 @@ public class PacePerson {
private static Set<String> particles = null; private static Set<String> particles = null;
/** /**
* Capitalizes a string * Capitalizes a string
* *
* @param s the string to capitalize * @param s the string to capitalize
* @return the input string with capital letter * @return the input string with capital letter
* */ */
public static final String capitalize(final String s) { public static final String capitalize(final String s) {
return WordUtils.capitalize(s.toLowerCase(), ' ', '-'); return WordUtils.capitalize(s.toLowerCase(), ' ', '-');
} }
/** /**
* Adds a dot to a string with length equals to 1 * Adds a dot to a string with length equals to 1
* */ */
public static final String dotAbbreviations(final String s) { public static final String dotAbbreviations(final String s) {
return s.length() == 1 ? s + "." : s; return s.length() == 1 ? s + "." : s;
} }
@ -63,11 +62,11 @@ public class PacePerson {
} }
/** /**
* The constructor of the class. It fills the fields of the class basing on the input fullname. * The constructor of the class. It fills the fields of the class basing on the input fullname.
* *
* @param s the input string (fullname of the author) * @param s the input string (fullname of the author)
* @param aggressive set the string normalization type * @param aggressive set the string normalization type
* */ */
public PacePerson(String s, final boolean aggressive) { public PacePerson(String s, final boolean aggressive) {
original = s; original = s;
s = Normalizer.normalize(s, Normalizer.Form.NFD); s = Normalizer.normalize(s, Normalizer.Form.NFD);
@ -86,7 +85,7 @@ public class PacePerson {
// s = s.replaceAll("[\\W&&[^,-]]", ""); // s = s.replaceAll("[\\W&&[^,-]]", "");
} }
//if the string contains a comma, it can derive surname and name by splitting on it // if the string contains a comma, it can derive surname and name by splitting on it
if (s.contains(",")) { if (s.contains(",")) {
final String[] arr = s.split(","); final String[] arr = s.split(",");
if (arr.length == 1) { if (arr.length == 1) {
@ -97,23 +96,23 @@ public class PacePerson {
fullname.addAll(surname); fullname.addAll(surname);
fullname.addAll(name); fullname.addAll(name);
} }
} else { //otherwise, it should rely on CAPS terms and short terms } else { // otherwise, it should rely on CAPS terms and short terms
fullname = splitTerms(s); fullname = splitTerms(s);
int lastInitialPosition = fullname.size(); int lastInitialPosition = fullname.size();
boolean hasSurnameInUpperCase = false; boolean hasSurnameInUpperCase = false;
//computes lastInitialPosition and hasSurnameInUpperCase // computes lastInitialPosition and hasSurnameInUpperCase
for (int i = 0; i < fullname.size(); i++) { for (int i = 0; i < fullname.size(); i++) {
final String term = fullname.get(i); final String term = fullname.get(i);
if (term.length() == 1) { if (term.length() == 1) {
lastInitialPosition = i; //first word in the name longer than 1 (to avoid name with dots) lastInitialPosition = i; // first word in the name longer than 1 (to avoid name with dots)
} else if (term.equals(term.toUpperCase())) { } else if (term.equals(term.toUpperCase())) {
hasSurnameInUpperCase = true; //if one of the words is CAPS hasSurnameInUpperCase = true; // if one of the words is CAPS
} }
} }
//manages particular cases of fullnames // manages particular cases of fullnames
if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini if (lastInitialPosition < fullname.size() - 1) { // Case: Michele G. Artini
name = fullname.subList(0, lastInitialPosition + 1); name = fullname.subList(0, lastInitialPosition + 1);
surname = fullname.subList(lastInitialPosition + 1, fullname.size()); surname = fullname.subList(lastInitialPosition + 1, fullname.size());

View File

@ -1,26 +1,27 @@
package eu.dnetlib.dhp.common;
import org.junit.jupiter.api.Test; package eu.dnetlib.dhp.common;
import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.*;
import org.junit.jupiter.api.Test;
public class PacePersonTest { public class PacePersonTest {
@Test @Test
public void pacePersonTest1(){ public void pacePersonTest1() {
PacePerson p = new PacePerson("Artini, Michele", false); PacePerson p = new PacePerson("Artini, Michele", false);
assertEquals("Artini",p.getSurnameString()); assertEquals("Artini", p.getSurnameString());
assertEquals("Michele", p.getNameString()); assertEquals("Michele", p.getNameString());
assertEquals("Artini, Michele", p.getNormalisedFullname()); assertEquals("Artini, Michele", p.getNormalisedFullname());
} }
@Test @Test
public void pacePersonTest2(){ public void pacePersonTest2() {
PacePerson p = new PacePerson("Michele G. Artini", false); PacePerson p = new PacePerson("Michele G. Artini", false);
assertEquals("Artini, Michele G.", p.getNormalisedFullname()); assertEquals("Artini, Michele G.", p.getNormalisedFullname());
assertEquals("Michele G", p.getNameString()); assertEquals("Michele G", p.getNameString());
assertEquals("Artini", p.getSurnameString()); assertEquals("Artini", p.getSurnameString());
} }
} }

View File

@ -5,7 +5,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId> <artifactId>dhp</artifactId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
<relativePath>../</relativePath> <relativePath>../</relativePath>
</parent> </parent>

View File

@ -14,6 +14,7 @@ public class ModelConstants {
public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource"; public static final String DNET_DATA_CITE_RESOURCE = "dnet:dataCite_resource";
public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions"; public static final String DNET_PROVENANCE_ACTIONS = "dnet:provenanceActions";
public static final String DNET_COUNTRY_TYPE = "dnet:countries"; public static final String DNET_COUNTRY_TYPE = "dnet:countries";
public static final String DNET_REVIEW_LEVELS = "dnet:review_levels";
public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository"; public static final String SYSIMPORT_CROSSWALK_REPOSITORY = "sysimport:crosswalk:repository";
public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry"; public static final String SYSIMPORT_CROSSWALK_ENTITYREGISTRY = "sysimport:crosswalk:entityregistry";
@ -25,6 +26,10 @@ public class ModelConstants {
public static final String ORP_RESULTTYPE_CLASSID = "other"; public static final String ORP_RESULTTYPE_CLASSID = "other";
public static final String RESULT_RESULT = "resultResult"; public static final String RESULT_RESULT = "resultResult";
/**
* @deprecated Use {@link ModelConstants#RELATIONSHIP} instead.
*/
@Deprecated
public static final String PUBLICATION_DATASET = "publicationDataset"; public static final String PUBLICATION_DATASET = "publicationDataset";
public static final String IS_RELATED_TO = "isRelatedTo"; public static final String IS_RELATED_TO = "isRelatedTo";
public static final String SUPPLEMENT = "supplement"; public static final String SUPPLEMENT = "supplement";
@ -34,6 +39,12 @@ public class ModelConstants {
public static final String IS_PART_OF = "IsPartOf"; public static final String IS_PART_OF = "IsPartOf";
public static final String HAS_PARTS = "HasParts"; public static final String HAS_PARTS = "HasParts";
public static final String RELATIONSHIP = "relationship"; public static final String RELATIONSHIP = "relationship";
public static final String CITATION = "citation";
public static final String CITES = "cites";
public static final String IS_CITED_BY = "IsCitedBy";
public static final String REVIEW = "review";
public static final String REVIEWS = "reviews";
public static final String IS_REVIEWED_BY = "IsReviewedBy";
public static final String RESULT_PROJECT = "resultProject"; public static final String RESULT_PROJECT = "resultProject";
public static final String OUTCOME = "outcome"; public static final String OUTCOME = "outcome";

View File

@ -31,7 +31,7 @@ public class Instance implements Serializable {
// typed results // typed results
private Field<String> processingchargecurrency; private Field<String> processingchargecurrency;
private Field<String> refereed; // peer-review status private Qualifier refereed; // peer-review status
public Field<String> getLicense() { public Field<String> getLicense() {
return license; return license;
@ -113,11 +113,11 @@ public class Instance implements Serializable {
this.processingchargecurrency = processingchargecurrency; this.processingchargecurrency = processingchargecurrency;
} }
public Field<String> getRefereed() { public Qualifier getRefereed() {
return refereed; return refereed;
} }
public void setRefereed(Field<String> refereed) { public void setRefereed(Qualifier refereed) {
this.refereed = refereed; this.refereed = refereed;
} }

View File

@ -254,28 +254,25 @@ public class Result extends OafEntity implements Serializable {
final StructuredProperty p = baseMainTitle; final StructuredProperty p = baseMainTitle;
title = title.stream().filter(t -> t != p).collect(Collectors.toList()); title = title.stream().filter(t -> t != p).collect(Collectors.toList());
} }
//
//
// title.remove(baseMainTitle);
} }
StructuredProperty newMainTitle = null; StructuredProperty newMainTitle = null;
if (r.getTitle() != null) { if (r.getTitle() != null) {
newMainTitle = getMainTitle(r.getTitle()); newMainTitle = getMainTitle(r.getTitle());
if (newMainTitle != null) { if (newMainTitle != null && title != null) {
final StructuredProperty p = newMainTitle; final StructuredProperty p = newMainTitle;
title = title.stream().filter(t -> t != p).collect(Collectors.toList()); title = title.stream().filter(t -> t != p).collect(Collectors.toList());
} }
// r.getTitle().remove(newMainTitle);
} }
if (newMainTitle != null && compareTrust(this, r) < 0) if (newMainTitle != null && compareTrust(this, r) < 0) {
baseMainTitle = newMainTitle; baseMainTitle = newMainTitle;
}
title = mergeLists(title, r.getTitle()); title = mergeLists(title, r.getTitle());
if (title != null && baseMainTitle != null) if (title != null && baseMainTitle != null) {
title.add(baseMainTitle); title.add(baseMainTitle);
}
relevantdate = mergeLists(relevantdate, r.getRelevantdate()); relevantdate = mergeLists(relevantdate, r.getRelevantdate());

View File

@ -4,7 +4,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
</parent> </parent>
<artifactId>dhp-actionmanager</artifactId> <artifactId>dhp-actionmanager</artifactId>

View File

@ -96,12 +96,21 @@ public class ProtoConverter implements Serializable {
.stream() .stream()
.distinct() .distinct()
.collect(Collectors.toCollection(ArrayList::new)) : null); .collect(Collectors.toCollection(ArrayList::new)) : null);
i.setRefereed(mapStringField(ri.getRefereed())); i.setRefereed(mapRefereed(ri.getRefereed()));
i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount())); i.setProcessingchargeamount(mapStringField(ri.getProcessingchargeamount()));
i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency())); i.setProcessingchargecurrency(mapStringField(ri.getProcessingchargecurrency()));
return i; return i;
} }
private static Qualifier mapRefereed(FieldTypeProtos.StringField refereed) {
Qualifier q = new Qualifier();
q.setClassid(refereed.getValue());
q.setSchemename(refereed.getValue());
q.setSchemeid("dnet:review_levels");
q.setSchemename("dnet:review_levels");
return q;
}
private static List<ExternalReference> convertExternalRefs(OafProtos.Oaf oaf) { private static List<ExternalReference> convertExternalRefs(OafProtos.Oaf oaf) {
ResultProtos.Result r = oaf.getEntity().getResult(); ResultProtos.Result r = oaf.getEntity().getResult();
if (r.getExternalReferenceCount() > 0) { if (r.getExternalReferenceCount() > 0) {

View File

@ -4,7 +4,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
</parent> </parent>
<artifactId>dhp-aggregation</artifactId> <artifactId>dhp-aggregation</artifactId>

View File

@ -8,6 +8,7 @@ import java.io.File;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
@ -19,6 +20,7 @@ import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
import eu.dnetlib.message.Message; import eu.dnetlib.message.Message;
import eu.dnetlib.message.MessageManager; import eu.dnetlib.message.MessageManager;
@Disabled
public class DnetCollectorWorkerApplicationTests { public class DnetCollectorWorkerApplicationTests {
private final ArgumentApplicationParser argumentParser = mock(ArgumentApplicationParser.class); private final ArgumentApplicationParser argumentParser = mock(ArgumentApplicationParser.class);

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -1,11 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -51,7 +51,7 @@ public class GenerateEventsApplication {
IOUtils IOUtils
.toString( .toString(
GenerateEventsApplication.class GenerateEventsApplication.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json"))); .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/generate_broker_events.json")));
parser.parseArgument(args); parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional final Boolean isSparkSessionManaged = Optional
@ -149,7 +149,8 @@ public class GenerateEventsApplication {
return r4; return r4;
} }
private static <T, RT> Dataset<RT> relatedEntities(final Dataset<T> targets, final Dataset<Relation> rels, private static <T, RT> Dataset<RT> relatedEntities(final Dataset<T> targets,
final Dataset<Relation> rels,
final Class<RT> clazz) { final Class<RT> clazz) {
return rels return rels
.joinWith(targets, targets.col("id").equalTo(rels.col("target")), "inner") .joinWith(targets, targets.col("id").equalTo(rels.col("target")), "inner")

View File

@ -6,10 +6,14 @@ import java.util.Collection;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.function.BiConsumer;
import java.util.function.Function;
import org.apache.commons.codec.digest.DigestUtils; import org.apache.commons.codec.digest.DigestUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.broker.objects.Publication;
import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
import eu.dnetlib.dhp.schema.oaf.Field; import eu.dnetlib.dhp.schema.oaf.Field;
@ -18,9 +22,17 @@ import eu.dnetlib.pace.config.DedupConfig;
public abstract class UpdateMatcher<T> { public abstract class UpdateMatcher<T> {
private final boolean multipleUpdate; private final boolean multipleUpdate;
private final Function<T, Topic> topicFunction;
private final BiConsumer<Publication, T> compileHighlightFunction;
private final Function<T, String> highlightToStringFunction;
public UpdateMatcher(final boolean multipleUpdate) { public UpdateMatcher(final boolean multipleUpdate, final Function<T, Topic> topicFunction,
final BiConsumer<Publication, T> compileHighlightFunction,
final Function<T, String> highlightToStringFunction) {
this.multipleUpdate = multipleUpdate; this.multipleUpdate = multipleUpdate;
this.topicFunction = topicFunction;
this.compileHighlightFunction = compileHighlightFunction;
this.highlightToStringFunction = highlightToStringFunction;
} }
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final ResultWithRelations res, public Collection<UpdateInfo<T>> searchUpdatesForRecord(final ResultWithRelations res,
@ -31,7 +43,11 @@ public abstract class UpdateMatcher<T> {
for (final ResultWithRelations source : others) { for (final ResultWithRelations source : others) {
if (source != res) { if (source != res) {
for (final UpdateInfo<T> info : findUpdates(source, res, dedupConfig)) { for (final T hl : findDifferences(source, res)) {
final Topic topic = getTopicFunction().apply(hl);
final UpdateInfo<T> info = new UpdateInfo<>(topic, hl, source, res, getCompileHighlightFunction(),
getHighlightToStringFunction(),
dedupConfig);
final String s = DigestUtils.md5Hex(info.getHighlightValueAsString()); final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) { if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {
} else { } else {
@ -55,8 +71,7 @@ public abstract class UpdateMatcher<T> {
} }
} }
protected abstract List<UpdateInfo<T>> findUpdates(ResultWithRelations source, ResultWithRelations target, protected abstract List<T> findDifferences(ResultWithRelations source, ResultWithRelations target);
DedupConfig dedupConfig);
protected static boolean isMissing(final List<Field<String>> list) { protected static boolean isMissing(final List<Field<String>> list) {
return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue()); return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue());
@ -66,4 +81,20 @@ public abstract class UpdateMatcher<T> {
return field == null || StringUtils.isBlank(field.getValue()); return field == null || StringUtils.isBlank(field.getValue());
} }
public boolean isMultipleUpdate() {
return multipleUpdate;
}
public Function<T, Topic> getTopicFunction() {
return topicFunction;
}
public BiConsumer<Publication, T> getCompileHighlightFunction() {
return compileHighlightFunction;
}
public Function<T, String> getHighlightToStringFunction() {
return highlightToStringFunction;
}
} }

View File

@ -8,29 +8,26 @@ import java.util.stream.Collectors;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedDataset;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.pace.config.DedupConfig;
public abstract class AbstractEnrichMissingDataset public abstract class AbstractEnrichMissingDataset
extends UpdateMatcher<eu.dnetlib.broker.objects.Dataset> { extends UpdateMatcher<eu.dnetlib.broker.objects.Dataset> {
private final Topic topic;
public AbstractEnrichMissingDataset(final Topic topic) { public AbstractEnrichMissingDataset(final Topic topic) {
super(true); super(true,
this.topic = topic; rel -> topic,
(p, rel) -> p.getDatasets().add(rel),
rel -> rel.getInstances().get(0).getUrl());
} }
protected abstract boolean filterByType(String relType); protected abstract boolean filterByType(String relType);
@Override @Override
protected final List<UpdateInfo<eu.dnetlib.broker.objects.Dataset>> findUpdates( protected final List<eu.dnetlib.broker.objects.Dataset> findDifferences(
final ResultWithRelations source, final ResultWithRelations source,
final ResultWithRelations target, final ResultWithRelations target) {
final DedupConfig dedupConfig) {
final Set<String> existingDatasets = target final Set<String> existingDatasets = target
.getDatasets() .getDatasets()
@ -47,26 +44,8 @@ public abstract class AbstractEnrichMissingDataset
.map(RelatedDataset::getRelDataset) .map(RelatedDataset::getRelDataset)
.filter(d -> !existingDatasets.contains(d.getId())) .filter(d -> !existingDatasets.contains(d.getId()))
.map(ConversionUtils::oafDatasetToBrokerDataset) .map(ConversionUtils::oafDatasetToBrokerDataset)
.map(i -> generateUpdateInfo(i, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
protected final UpdateInfo<eu.dnetlib.broker.objects.Dataset> generateUpdateInfo(
final eu.dnetlib.broker.objects.Dataset highlightValue,
final ResultWithRelations source,
final ResultWithRelations target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>(
getTopic(),
highlightValue, source, target,
(p, rel) -> p.getDatasets().add(rel),
rel -> rel.getInstances().get(0).getUrl(),
dedupConfig);
}
public Topic getTopic() {
return topic;
}
} }

View File

@ -5,26 +5,25 @@ import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import eu.dnetlib.broker.objects.Project;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMissingProject public class EnrichMissingProject
extends UpdateMatcher<eu.dnetlib.broker.objects.Project> { extends UpdateMatcher<eu.dnetlib.broker.objects.Project> {
public EnrichMissingProject() { public EnrichMissingProject() {
super(true); super(true,
prj -> Topic.ENRICH_MISSING_PROJECT,
(p, prj) -> p.getProjects().add(prj),
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
} }
@Override @Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Project>> findUpdates(final ResultWithRelations source, protected List<Project> findDifferences(final ResultWithRelations source, final ResultWithRelations target) {
final ResultWithRelations target,
final DedupConfig dedupConfig) {
if (source.getProjects().isEmpty()) { if (source.getProjects().isEmpty()) {
return Arrays.asList(); return Arrays.asList();
} else { } else {
@ -33,21 +32,7 @@ public class EnrichMissingProject
.stream() .stream()
.map(RelatedProject::getRelProject) .map(RelatedProject::getRelProject)
.map(ConversionUtils::oafProjectToBrokerProject) .map(ConversionUtils::oafProjectToBrokerProject)
.map(p -> generateUpdateInfo(p, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
} }
public UpdateInfo<eu.dnetlib.broker.objects.Project> generateUpdateInfo(
final eu.dnetlib.broker.objects.Project highlightValue,
final ResultWithRelations source,
final ResultWithRelations target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_PROJECT,
highlightValue, source, target,
(p, prj) -> p.getProjects().add(prj),
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode(), dedupConfig);
}
} }

View File

@ -8,22 +8,22 @@ import java.util.stream.Collectors;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedProject;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMoreProject extends UpdateMatcher<eu.dnetlib.broker.objects.Project> { public class EnrichMoreProject extends UpdateMatcher<eu.dnetlib.broker.objects.Project> {
public EnrichMoreProject() { public EnrichMoreProject() {
super(true); super(true,
prj -> Topic.ENRICH_MORE_PROJECT,
(p, prj) -> p.getProjects().add(prj),
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode());
} }
@Override @Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Project>> findUpdates(final ResultWithRelations source, protected List<eu.dnetlib.broker.objects.Project> findDifferences(final ResultWithRelations source,
final ResultWithRelations target, final ResultWithRelations target) {
final DedupConfig dedupConfig) {
final Set<String> existingProjects = source final Set<String> existingProjects = source
.getProjects() .getProjects()
@ -38,20 +38,7 @@ public class EnrichMoreProject extends UpdateMatcher<eu.dnetlib.broker.objects.P
.map(RelatedProject::getRelProject) .map(RelatedProject::getRelProject)
.filter(p -> !existingProjects.contains(p.getId())) .filter(p -> !existingProjects.contains(p.getId()))
.map(ConversionUtils::oafProjectToBrokerProject) .map(ConversionUtils::oafProjectToBrokerProject)
.map(p -> generateUpdateInfo(p, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
public UpdateInfo<eu.dnetlib.broker.objects.Project> generateUpdateInfo(
final eu.dnetlib.broker.objects.Project highlightValue,
final ResultWithRelations source,
final ResultWithRelations target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>(
Topic.ENRICH_MORE_PROJECT,
highlightValue, source, target,
(p, prj) -> p.getProjects().add(prj),
prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode(), dedupConfig);
}
} }

View File

@ -8,29 +8,27 @@ import java.util.stream.Collectors;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedPublication;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.pace.config.DedupConfig;
public abstract class AbstractEnrichMissingPublication public abstract class AbstractEnrichMissingPublication
extends UpdateMatcher<eu.dnetlib.broker.objects.Publication> { extends UpdateMatcher<eu.dnetlib.broker.objects.Publication> {
private final Topic topic;
public AbstractEnrichMissingPublication(final Topic topic) { public AbstractEnrichMissingPublication(final Topic topic) {
super(true); super(true,
this.topic = topic; rel -> topic,
(p, rel) -> p.getPublications().add(rel),
rel -> rel.getInstances().get(0).getUrl());
} }
protected abstract boolean filterByType(String relType); protected abstract boolean filterByType(String relType);
@Override @Override
protected final List<UpdateInfo<eu.dnetlib.broker.objects.Publication>> findUpdates( protected final List<eu.dnetlib.broker.objects.Publication> findDifferences(
final ResultWithRelations source, final ResultWithRelations source,
final ResultWithRelations target, final ResultWithRelations target) {
final DedupConfig dedupConfig) {
final Set<String> existingPublications = target final Set<String> existingPublications = target
.getPublications() .getPublications()
@ -47,24 +45,7 @@ public abstract class AbstractEnrichMissingPublication
.map(RelatedPublication::getRelPublication) .map(RelatedPublication::getRelPublication)
.filter(d -> !existingPublications.contains(d.getId())) .filter(d -> !existingPublications.contains(d.getId()))
.map(ConversionUtils::oafResultToBrokerPublication) .map(ConversionUtils::oafResultToBrokerPublication)
.map(i -> generateUpdateInfo(i, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
protected final UpdateInfo<eu.dnetlib.broker.objects.Publication> generateUpdateInfo(
final eu.dnetlib.broker.objects.Publication highlightValue,
final ResultWithRelations source,
final ResultWithRelations target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>(
getTopic(),
highlightValue, source, target,
(p, rel) -> p.getPublications().add(rel),
rel -> rel.getInstances().get(0).getUrl(), dedupConfig);
}
public Topic getTopic() {
return topic;
}
} }

View File

@ -8,23 +8,23 @@ import java.util.stream.Collectors;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMissingSoftware public class EnrichMissingSoftware
extends UpdateMatcher<eu.dnetlib.broker.objects.Software> { extends UpdateMatcher<eu.dnetlib.broker.objects.Software> {
public EnrichMissingSoftware() { public EnrichMissingSoftware() {
super(true); super(true,
s -> Topic.ENRICH_MISSING_SOFTWARE,
(p, s) -> p.getSoftwares().add(s),
s -> s.getName());
} }
@Override @Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Software>> findUpdates( protected List<eu.dnetlib.broker.objects.Software> findDifferences(
final ResultWithRelations source, final ResultWithRelations source,
final ResultWithRelations target, final ResultWithRelations target) {
final DedupConfig dedupConfig) {
if (source.getSoftwares().isEmpty()) { if (source.getSoftwares().isEmpty()) {
return Arrays.asList(); return Arrays.asList();
@ -34,21 +34,8 @@ public class EnrichMissingSoftware
.stream() .stream()
.map(RelatedSoftware::getRelSoftware) .map(RelatedSoftware::getRelSoftware)
.map(ConversionUtils::oafSoftwareToBrokerSoftware) .map(ConversionUtils::oafSoftwareToBrokerSoftware)
.map(p -> generateUpdateInfo(p, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
} }
public UpdateInfo<eu.dnetlib.broker.objects.Software> generateUpdateInfo(
final eu.dnetlib.broker.objects.Software highlightValue,
final ResultWithRelations source,
final ResultWithRelations target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_SOFTWARE,
highlightValue, source, target,
(p, s) -> p.getSoftwares().add(s),
s -> s.getName(), dedupConfig);
}
} }

View File

@ -8,24 +8,24 @@ import java.util.stream.Collectors;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.RelatedSoftware;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMoreSoftware public class EnrichMoreSoftware
extends UpdateMatcher<eu.dnetlib.broker.objects.Software> { extends UpdateMatcher<eu.dnetlib.broker.objects.Software> {
public EnrichMoreSoftware() { public EnrichMoreSoftware() {
super(true); super(true,
s -> Topic.ENRICH_MORE_SOFTWARE,
(p, s) -> p.getSoftwares().add(s),
s -> s.getName());
} }
@Override @Override
protected List<UpdateInfo<eu.dnetlib.broker.objects.Software>> findUpdates( protected List<eu.dnetlib.broker.objects.Software> findDifferences(
final ResultWithRelations source, final ResultWithRelations source,
final ResultWithRelations target, final ResultWithRelations target) {
final DedupConfig dedupConfig) {
final Set<String> existingSoftwares = source final Set<String> existingSoftwares = source
.getSoftwares() .getSoftwares()
@ -40,20 +40,7 @@ public class EnrichMoreSoftware
.map(RelatedSoftware::getRelSoftware) .map(RelatedSoftware::getRelSoftware)
.filter(p -> !existingSoftwares.contains(p.getId())) .filter(p -> !existingSoftwares.contains(p.getId()))
.map(ConversionUtils::oafSoftwareToBrokerSoftware) .map(ConversionUtils::oafSoftwareToBrokerSoftware)
.map(p -> generateUpdateInfo(p, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
public UpdateInfo<eu.dnetlib.broker.objects.Software> generateUpdateInfo(
final eu.dnetlib.broker.objects.Software highlightValue,
final ResultWithRelations source,
final ResultWithRelations target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>(
Topic.ENRICH_MORE_SOFTWARE,
highlightValue, source, target,
(p, s) -> p.getSoftwares().add(s),
s -> s.getName(), dedupConfig);
}
} }

View File

@ -7,38 +7,25 @@ import java.util.List;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMissingAbstract extends UpdateMatcher<String> { public class EnrichMissingAbstract extends UpdateMatcher<String> {
public EnrichMissingAbstract() { public EnrichMissingAbstract() {
super(false); super(false,
s -> Topic.ENRICH_MISSING_ABSTRACT,
(p, s) -> p.getAbstracts().add(s),
s -> s);
} }
@Override @Override
protected List<UpdateInfo<String>> findUpdates(final ResultWithRelations source, protected List<String> findDifferences(final ResultWithRelations source,
final ResultWithRelations target, final ResultWithRelations target) {
final DedupConfig dedupConfig) {
if (isMissing(target.getResult().getDescription()) && !isMissing(source.getResult().getDescription())) { if (isMissing(target.getResult().getDescription()) && !isMissing(source.getResult().getDescription())) {
return Arrays return Arrays
.asList( .asList(source.getResult().getDescription().get(0).getValue());
generateUpdateInfo(
source.getResult().getDescription().get(0).getValue(), source, target, dedupConfig));
} }
return new ArrayList<>(); return new ArrayList<>();
} }
public UpdateInfo<String> generateUpdateInfo(final String highlightValue,
final ResultWithRelations source,
final ResultWithRelations target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_ABSTRACT,
highlightValue, source, target,
(p, s) -> p.getAbstracts().add(s),
s -> s, dedupConfig);
}
} }

View File

@ -8,22 +8,22 @@ import java.util.stream.Collectors;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMissingAuthorOrcid extends UpdateMatcher<String> { public class EnrichMissingAuthorOrcid extends UpdateMatcher<String> {
public EnrichMissingAuthorOrcid() { public EnrichMissingAuthorOrcid() {
super(true); super(true,
aut -> Topic.ENRICH_MISSING_AUTHOR_ORCID,
(p, aut) -> p.getCreators().add(aut),
aut -> aut);
} }
@Override @Override
protected List<UpdateInfo<String>> findUpdates(final ResultWithRelations source, protected List<String> findDifferences(final ResultWithRelations source,
final ResultWithRelations target, final ResultWithRelations target) {
final DedupConfig dedupConfig) {
final Set<String> existingOrcids = target final Set<String> existingOrcids = target
.getResult() .getResult()
@ -35,7 +35,7 @@ public class EnrichMissingAuthorOrcid extends UpdateMatcher<String> {
.map(pid -> pid.getValue()) .map(pid -> pid.getValue())
.collect(Collectors.toSet()); .collect(Collectors.toSet());
final List<UpdateInfo<String>> list = new ArrayList<>(); final List<String> list = new ArrayList<>();
for (final Author author : source.getResult().getAuthor()) { for (final Author author : source.getResult().getAuthor()) {
final String name = author.getFullname(); final String name = author.getFullname();
@ -43,26 +43,11 @@ public class EnrichMissingAuthorOrcid extends UpdateMatcher<String> {
for (final StructuredProperty pid : author.getPid()) { for (final StructuredProperty pid : author.getPid()) {
if (pid.getQualifier().getClassid().equalsIgnoreCase("orcid") if (pid.getQualifier().getClassid().equalsIgnoreCase("orcid")
&& !existingOrcids.contains(pid.getValue())) { && !existingOrcids.contains(pid.getValue())) {
list list.add(name + " [ORCID: " + pid.getValue() + "]");
.add(
generateUpdateInfo(name + " [ORCID: " + pid.getValue() + "]", source, target, dedupConfig));
;
} }
} }
} }
return list; return list;
} }
public UpdateInfo<String> generateUpdateInfo(final String highlightValue,
final ResultWithRelations source,
final ResultWithRelations target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_AUTHOR_ORCID,
highlightValue, source, target,
(p, aut) -> p.getCreators().add(aut),
aut -> aut,
dedupConfig);
}
} }

View File

@ -10,20 +10,20 @@ import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMissingOpenAccess extends UpdateMatcher<Instance> { public class EnrichMissingOpenAccess extends UpdateMatcher<Instance> {
public EnrichMissingOpenAccess() { public EnrichMissingOpenAccess() {
super(true); super(true,
i -> Topic.ENRICH_MISSING_OA_VERSION,
(p, i) -> p.getInstances().add(i),
Instance::getUrl);
} }
@Override @Override
protected List<UpdateInfo<Instance>> findUpdates(final ResultWithRelations source, protected List<Instance> findDifferences(final ResultWithRelations source,
final ResultWithRelations target, final ResultWithRelations target) {
final DedupConfig dedupConfig) {
final long count = target final long count = target
.getResult() .getResult()
.getInstance() .getInstance()
@ -43,19 +43,7 @@ public class EnrichMissingOpenAccess extends UpdateMatcher<Instance> {
.filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS)) .filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS))
.map(ConversionUtils::oafInstanceToBrokerInstances) .map(ConversionUtils::oafInstanceToBrokerInstances)
.flatMap(List::stream) .flatMap(List::stream)
.map(i -> generateUpdateInfo(i, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
public UpdateInfo<Instance> generateUpdateInfo(final Instance highlightValue,
final ResultWithRelations source,
final ResultWithRelations target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_OA_VERSION,
highlightValue, source, target,
(p, i) -> p.getInstances().add(i),
Instance::getUrl, dedupConfig);
}
} }

View File

@ -9,20 +9,20 @@ import eu.dnetlib.broker.objects.Pid;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMissingPid extends UpdateMatcher<Pid> { public class EnrichMissingPid extends UpdateMatcher<Pid> {
public EnrichMissingPid() { public EnrichMissingPid() {
super(true); super(true,
pid -> Topic.ENRICH_MISSING_PID,
(p, pid) -> p.getPids().add(pid),
pid -> pid.getType() + "::" + pid.getValue());
} }
@Override @Override
protected List<UpdateInfo<Pid>> findUpdates(final ResultWithRelations source, protected List<Pid> findDifferences(final ResultWithRelations source,
final ResultWithRelations target, final ResultWithRelations target) {
final DedupConfig dedupConfig) {
final long count = target.getResult().getPid().size(); final long count = target.getResult().getPid().size();
if (count > 0) { if (count > 0) {
@ -34,19 +34,7 @@ public class EnrichMissingPid extends UpdateMatcher<Pid> {
.getPid() .getPid()
.stream() .stream()
.map(ConversionUtils::oafPidToBrokerPid) .map(ConversionUtils::oafPidToBrokerPid)
.map(i -> generateUpdateInfo(i, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue,
final ResultWithRelations source,
final ResultWithRelations target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_PID,
highlightValue, source, target,
(p, pid) -> p.getPids().add(pid),
pid -> pid.getType() + "::" + pid.getValue(), dedupConfig);
}
} }

View File

@ -7,39 +7,25 @@ import java.util.List;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMissingPublicationDate extends UpdateMatcher<String> { public class EnrichMissingPublicationDate extends UpdateMatcher<String> {
public EnrichMissingPublicationDate() { public EnrichMissingPublicationDate() {
super(false); super(false,
date -> Topic.ENRICH_MISSING_PUBLICATION_DATE,
(p, date) -> p.setPublicationdate(date),
s -> s);
} }
@Override @Override
protected List<UpdateInfo<String>> findUpdates(final ResultWithRelations source, protected List<String> findDifferences(final ResultWithRelations source,
final ResultWithRelations target, final ResultWithRelations target) {
final DedupConfig dedupConfig) {
if (isMissing(target.getResult().getDateofacceptance()) if (isMissing(target.getResult().getDateofacceptance())
&& !isMissing(source.getResult().getDateofacceptance())) { && !isMissing(source.getResult().getDateofacceptance())) {
return Arrays return Arrays.asList(source.getResult().getDateofacceptance().getValue());
.asList(
generateUpdateInfo(
source.getResult().getDateofacceptance().getValue(), source, target, dedupConfig));
} }
return new ArrayList<>(); return new ArrayList<>();
} }
public UpdateInfo<String> generateUpdateInfo(final String highlightValue,
final ResultWithRelations source,
final ResultWithRelations target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>(
Topic.ENRICH_MISSING_PUBLICATION_DATE,
highlightValue, source, target,
(p, date) -> p.setPublicationdate(date),
s -> s, dedupConfig);
}
} }

View File

@ -10,22 +10,22 @@ import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMissingSubject extends UpdateMatcher<Pair<String, String>> { public class EnrichMissingSubject extends UpdateMatcher<Pair<String, String>> {
public EnrichMissingSubject() { public EnrichMissingSubject() {
super(true); super(true,
pair -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + pair.getLeft()),
(p, pair) -> p.getSubjects().add(pair.getRight()),
pair -> pair.getLeft() + "::" + pair.getRight());
} }
@Override @Override
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final ResultWithRelations source, protected List<Pair<String, String>> findDifferences(final ResultWithRelations source,
final ResultWithRelations target, final ResultWithRelations target) {
final DedupConfig dedupConfig) {
final Set<String> existingTypes = target final Set<String> existingTypes = target
.getResult() .getResult()
.getSubject() .getSubject()
@ -40,20 +40,7 @@ public class EnrichMissingSubject extends UpdateMatcher<Pair<String, String>> {
.stream() .stream()
.filter(pid -> !existingTypes.contains(pid.getQualifier().getClassid())) .filter(pid -> !existingTypes.contains(pid.getQualifier().getClassid()))
.map(ConversionUtils::oafSubjectToPair) .map(ConversionUtils::oafSubjectToPair)
.map(i -> generateUpdateInfo(i, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
final ResultWithRelations source,
final ResultWithRelations target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>(
Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()),
highlightValue, source, target,
(p, pair) -> p.getSubjects().add(pair.getRight()),
pair -> pair.getLeft() + "::" + pair.getRight(), dedupConfig);
}
} }

View File

@ -10,20 +10,20 @@ import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import eu.dnetlib.dhp.broker.oa.util.BrokerConstants;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMoreOpenAccess extends UpdateMatcher<Instance> { public class EnrichMoreOpenAccess extends UpdateMatcher<Instance> {
public EnrichMoreOpenAccess() { public EnrichMoreOpenAccess() {
super(true); super(true,
i -> Topic.ENRICH_MORE_OA_VERSION,
(p, i) -> p.getInstances().add(i),
Instance::getUrl);
} }
@Override @Override
protected List<UpdateInfo<Instance>> findUpdates(final ResultWithRelations source, protected List<Instance> findDifferences(final ResultWithRelations source,
final ResultWithRelations target, final ResultWithRelations target) {
final DedupConfig dedupConfig) {
final Set<String> urls = target final Set<String> urls = target
.getResult() .getResult()
.getInstance() .getInstance()
@ -41,19 +41,7 @@ public class EnrichMoreOpenAccess extends UpdateMatcher<Instance> {
.map(ConversionUtils::oafInstanceToBrokerInstances) .map(ConversionUtils::oafInstanceToBrokerInstances)
.flatMap(List::stream) .flatMap(List::stream)
.filter(i -> !urls.contains(i.getUrl())) .filter(i -> !urls.contains(i.getUrl()))
.map(i -> generateUpdateInfo(i, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
public UpdateInfo<Instance> generateUpdateInfo(final Instance highlightValue,
final ResultWithRelations source,
final ResultWithRelations target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>(
Topic.ENRICH_MORE_OA_VERSION,
highlightValue, source, target,
(p, i) -> p.getInstances().add(i),
Instance::getUrl, dedupConfig);
}
} }

View File

@ -9,20 +9,20 @@ import eu.dnetlib.broker.objects.Pid;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMorePid extends UpdateMatcher<Pid> { public class EnrichMorePid extends UpdateMatcher<Pid> {
public EnrichMorePid() { public EnrichMorePid() {
super(true); super(true,
pid -> Topic.ENRICH_MORE_PID,
(p, pid) -> p.getPids().add(pid),
pid -> pid.getType() + "::" + pid.getValue());
} }
@Override @Override
protected List<UpdateInfo<Pid>> findUpdates(final ResultWithRelations source, protected List<Pid> findDifferences(final ResultWithRelations source,
final ResultWithRelations target, final ResultWithRelations target) {
final DedupConfig dedupConfig) {
final Set<String> existingPids = target final Set<String> existingPids = target
.getResult() .getResult()
.getPid() .getPid()
@ -36,19 +36,7 @@ public class EnrichMorePid extends UpdateMatcher<Pid> {
.stream() .stream()
.filter(pid -> !existingPids.contains(pid.getQualifier().getClassid() + "::" + pid.getValue())) .filter(pid -> !existingPids.contains(pid.getQualifier().getClassid() + "::" + pid.getValue()))
.map(ConversionUtils::oafPidToBrokerPid) .map(ConversionUtils::oafPidToBrokerPid)
.map(i -> generateUpdateInfo(i, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
public UpdateInfo<Pid> generateUpdateInfo(final Pid highlightValue,
final ResultWithRelations source,
final ResultWithRelations target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>(
Topic.ENRICH_MORE_PID,
highlightValue, source, target,
(p, pid) -> p.getPids().add(pid),
pid -> pid.getType() + "::" + pid.getValue(), dedupConfig);
}
} }

View File

@ -10,20 +10,20 @@ import org.apache.commons.lang3.tuple.Pair;
import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.model.Topic;
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils;
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations; import eu.dnetlib.dhp.broker.oa.util.aggregators.withRels.ResultWithRelations;
import eu.dnetlib.pace.config.DedupConfig;
public class EnrichMoreSubject extends UpdateMatcher<Pair<String, String>> { public class EnrichMoreSubject extends UpdateMatcher<Pair<String, String>> {
public EnrichMoreSubject() { public EnrichMoreSubject() {
super(true); super(true,
pair -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + pair.getLeft()),
(p, pair) -> p.getSubjects().add(pair.getRight()),
pair -> pair.getLeft() + "::" + pair.getRight());
} }
@Override @Override
protected List<UpdateInfo<Pair<String, String>>> findUpdates(final ResultWithRelations source, protected List<Pair<String, String>> findDifferences(final ResultWithRelations source,
final ResultWithRelations target, final ResultWithRelations target) {
final DedupConfig dedupConfig) {
final Set<String> existingSubjects = target final Set<String> existingSubjects = target
.getResult() .getResult()
.getSubject() .getSubject()
@ -37,20 +37,7 @@ public class EnrichMoreSubject extends UpdateMatcher<Pair<String, String>> {
.stream() .stream()
.filter(pid -> !existingSubjects.contains(pid.getQualifier().getClassid() + "::" + pid.getValue())) .filter(pid -> !existingSubjects.contains(pid.getQualifier().getClassid() + "::" + pid.getValue()))
.map(ConversionUtils::oafSubjectToPair) .map(ConversionUtils::oafSubjectToPair)
.map(i -> generateUpdateInfo(i, source, target, dedupConfig))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
public UpdateInfo<Pair<String, String>> generateUpdateInfo(final Pair<String, String> highlightValue,
final ResultWithRelations source,
final ResultWithRelations target,
final DedupConfig dedupConfig) {
return new UpdateInfo<>(
Topic.fromPath("ENRICH/MORE/SUBJECT/" + highlightValue.getLeft()),
highlightValue, source, target,
(p, pair) -> p.getSubjects().add(pair.getRight()),
pair -> pair.getLeft() + "::" + pair.getRight(), dedupConfig);
}
} }

View File

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,99 @@
<workflow-app name="create broker events" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>graphInputPath</name>
<description>the path where the graph is stored</description>
</property>
<property>
<name>eventsOutputPath</name>
<description>the path where the the events will be stored</description>
</property>
<property>
<name>isLookupUrl</name>
<description>the address of the lookUp service</description>
</property>
<property>
<name>dedupConfProfId</name>
<description>the id of a valid Dedup Configuration Profile</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to="generate_events"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="generate_events">
<java>
<prepare>
<delete path="${eventsOutputPath}"/>
</prepare>
<main-class>eu.dnetlib.dhp.broker.oa.GenerateEventsApplication</main-class>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--eventsPath</arg><arg>${eventsOutputPath}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--dedupConfProfile</arg><arg>${dedupConfProfId}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,26 @@
[
{
"paramName": "g",
"paramLongName": "graphPath",
"paramDescription": "the path where there the graph is stored",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "eventsPath",
"paramDescription": "the path where the generated events will be stored",
"paramRequired": true
},
{
"paramName": "lu",
"paramLongName": "isLookupUrl",
"paramDescription": "the address of the ISLookUpService",
"paramRequired": true
},
{
"paramName": "d",
"paramLongName": "dedupConfProfile",
"paramDescription": "the id of a valid Dedup Configuration Profile",
"paramRequired": true
}
]

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<artifactId>dhp-dedup-openaire</artifactId> <artifactId>dhp-dedup-openaire</artifactId>

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -1,10 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -166,8 +166,10 @@ case object Crossref2Oaf {
val has_review = (json \ "relation" \"has-review" \ "id") val has_review = (json \ "relation" \"has-review" \ "id")
if(has_review != JNothing) if(has_review != JNothing) {
instance.setRefereed(asField("peerReviewed")) instance.setRefereed(
createQualifier("0001", "peerReviewed", "dnet:review_levels", "dnet:review_levels"))
}
instance.setAccessright(getRestrictedQualifier()) instance.setAccessright(getRestrictedQualifier())

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -0,0 +1,109 @@
package eu.dnetlib.dhp.oa.graph.clean;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class CleanGraphSparkJob {
private static final Logger log = LoggerFactory.getLogger(CleanGraphSparkJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
CleanGraphSparkJob.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/graph/input_clean_graph_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
String graphTableClassName = parser.get("graphTableClassName");
log.info("graphTableClassName: {}", graphTableClassName);
Class<? extends OafEntity> entityClazz = (Class<? extends OafEntity>) Class.forName(graphTableClassName);
final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
fixGraphTable(spark, vocs, inputPath, entityClazz, outputPath);
});
}
private static <T extends Oaf> void fixGraphTable(
SparkSession spark,
VocabularyGroup vocs,
String inputPath,
Class<T> clazz,
String outputPath) {
final CleaningRuleMap mapping = CleaningRuleMap.create(vocs);
readTableFromPath(spark, inputPath, clazz)
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
private static <T extends Oaf> Dataset<T> readTableFromPath(
SparkSession spark, String inputEntityPath, Class<T> clazz) {
log.info("Reading Graph table from: {}", inputEntityPath);
return spark
.read()
.textFile(inputEntityPath)
.map(
(MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, clazz),
Encoders.bean(clazz));
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
}

View File

@ -0,0 +1,40 @@
package eu.dnetlib.dhp.oa.graph.clean;
import java.io.Serializable;
import java.util.HashMap;
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class CleaningRuleMap extends HashMap<Class, SerializableConsumer<Object>> implements Serializable {
/**
* Creates the mapping for the Oaf types subject to cleaning
*
* @param vocabularies
*/
public static CleaningRuleMap create(VocabularyGroup vocabularies) {
CleaningRuleMap mapping = new CleaningRuleMap();
mapping.put(Qualifier.class, o -> {
Qualifier q = (Qualifier) o;
if (vocabularies.vocabularyExists(q.getSchemeid())) {
Qualifier newValue = vocabularies.lookup(q.getSchemeid(), q.getClassid());
q.setClassid(newValue.getClassid());
q.setClassname(newValue.getClassname());
}
});
mapping.put(StructuredProperty.class, o -> {
StructuredProperty sp = (StructuredProperty) o;
// TODO implement a policy
/*
* if (StringUtils.isBlank(sp.getValue())) { sp.setValue(null); sp.setQualifier(null); sp.setDataInfo(null);
* }
*/
});
return mapping;
}
}

View File

@ -0,0 +1,82 @@
package eu.dnetlib.dhp.oa.graph.clean;
import java.io.Serializable;
import java.lang.reflect.Field;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Objects;
import eu.dnetlib.dhp.schema.oaf.Oaf;
public class OafCleaner implements Serializable {
public static <E extends Oaf> E apply(E oaf, CleaningRuleMap mapping) {
try {
navigate(oaf, mapping);
} catch (IllegalAccessException e) {
throw new RuntimeException(e);
}
return oaf;
}
private static void navigate(Object o, CleaningRuleMap mapping) throws IllegalAccessException {
if (isPrimitive(o)) {
return;
} else if (isIterable(o.getClass())) {
for (final Object elem : (Iterable<?>) o) {
navigate(elem, mapping);
}
} else if (hasMapping(o, mapping)) {
mapping.get(o.getClass()).accept(o);
} else {
for (final Field f : getAllFields(o.getClass())) {
f.setAccessible(true);
final Object val = f.get(o);
if (!isPrimitive(val) && hasMapping(val, mapping)) {
mapping.get(val.getClass()).accept(val);
} else {
navigate(f.get(o), mapping);
}
}
}
}
private static boolean hasMapping(Object o, CleaningRuleMap mapping) {
return mapping.containsKey(o.getClass());
}
private static boolean isIterable(final Class<?> cl) {
return Iterable.class.isAssignableFrom(cl);
}
private static boolean isPrimitive(Object o) {
return Objects.isNull(o)
|| o.getClass().isPrimitive()
|| o instanceof Class
|| o instanceof Integer
|| o instanceof Double
|| o instanceof Float
|| o instanceof Long
|| o instanceof Boolean
|| o instanceof String
|| o instanceof Byte;
}
private static List<Field> getAllFields(Class<?> clazz) {
return getAllFields(new LinkedList<>(), clazz);
}
private static List<Field> getAllFields(List<Field> fields, Class<?> clazz) {
fields.addAll(Arrays.asList(clazz.getDeclaredFields()));
final Class<?> superclass = clazz.getSuperclass();
if (Objects.nonNull(superclass) && superclass.getPackage().equals(Oaf.class.getPackage())) {
getAllFields(fields, superclass);
}
return fields;
}
}

View File

@ -39,6 +39,8 @@ import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import scala.Tuple2; import scala.Tuple2;
public class GenerateEntitiesApplication { public class GenerateEntitiesApplication {
@ -71,7 +73,8 @@ public class GenerateEntitiesApplication {
final String isLookupUrl = parser.get("isLookupUrl"); final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl); log.info("isLookupUrl: {}", isLookupUrl);
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupUrl); final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl);
final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService);
final SparkConf conf = new SparkConf(); final SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged, spark -> { runWithSparkSession(conf, isSparkSessionManaged, spark -> {
@ -137,11 +140,11 @@ public class GenerateEntitiesApplication {
final String type = StringUtils.substringAfter(id, ":"); final String type = StringUtils.substringAfter(id, ":");
switch (type.toLowerCase()) { switch (type.toLowerCase()) {
case "oaf-store-claim":
case "oaf-store-cleaned": case "oaf-store-cleaned":
case "oaf-store-claim":
return new OafToOafMapper(vocs, false).processMdRecord(s); return new OafToOafMapper(vocs, false).processMdRecord(s);
case "odf-store-claim":
case "odf-store-cleaned": case "odf-store-cleaned":
case "odf-store-claim":
return new OdfToOafMapper(vocs, false).processMdRecord(s); return new OdfToOafMapper(vocs, false).processMdRecord(s);
case "oaf-store-intersection": case "oaf-store-intersection":
return new OafToOafMapper(vocs, true).processMdRecord(s); return new OafToOafMapper(vocs, true).processMdRecord(s);

View File

@ -50,6 +50,8 @@ import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.DbClient; import eu.dnetlib.dhp.common.DbClient;
import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication; import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication;
@ -71,6 +73,7 @@ import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable { public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable {
@ -151,7 +154,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
super(hdfsPath); super(hdfsPath);
this.dbClient = new DbClient(dbUrl, dbUser, dbPassword); this.dbClient = new DbClient(dbUrl, dbUser, dbPassword);
this.lastUpdateTimestamp = new Date().getTime(); this.lastUpdateTimestamp = new Date().getTime();
this.vocs = VocabularyGroup.loadVocsFromIS(isLookupUrl); this.vocs = VocabularyGroup.loadVocsFromIS(ISLookupClientFactory.getLookUpService(isLookupUrl));
} }
public void execute(final String sqlFile, final Function<ResultSet, List<Oaf>> producer) public void execute(final String sqlFile, final Function<ResultSet, List<Oaf>> producer)
@ -170,7 +173,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
final Datasource ds = new Datasource(); final Datasource ds = new Datasource();
ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true)); ds.setId(createOpenaireId(10, rs.getString("datasourceid"), true));
ds.setOriginalId(Arrays.asList(rs.getString("datasourceid"))); ds.setOriginalId(Arrays.asList((String[]) rs.getArray("identities").getArray()));
ds ds
.setCollectedfrom( .setCollectedfrom(
listKeyValues( listKeyValues(

View File

@ -4,13 +4,7 @@ package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES; import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DATASET;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -139,7 +133,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
instance instance
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES)); .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
instance.setLicense(field(doc.valueOf("//oaf:license"), info)); instance.setLicense(field(doc.valueOf("//oaf:license"), info));
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); instance.setRefereed(prepareQualifier(doc, "//oaf:refereed", DNET_REVIEW_LEVELS));
instance instance
.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); .setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
instance instance
@ -281,12 +275,12 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
res res
.add( .add(
getRelation( getRelation(
docId, otherId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, docId, otherId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, collectedFrom, info,
lastUpdateTimestamp)); lastUpdateTimestamp));
res res
.add( .add(
getRelation( getRelation(
otherId, docId, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO, collectedFrom, info, otherId, docId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, collectedFrom, info,
lastUpdateTimestamp)); lastUpdateTimestamp));
} }
} }

View File

@ -4,19 +4,7 @@ package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty; import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES; import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_DATE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_DATA_CITE_RESOURCE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_LANGUAGES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PID_TYPES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PUBLICATION_RESOURCE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.HAS_PARTS;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PART_OF;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_SUPPLEMENTED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_SUPPLEMENT_TO;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PART;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.SUPPLEMENT;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
@ -129,7 +117,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
instance instance
.setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES)); .setAccessright(prepareQualifier(doc, "//oaf:accessrights", DNET_ACCESS_MODES));
instance.setLicense(field(doc.valueOf("//oaf:license"), info)); instance.setLicense(field(doc.valueOf("//oaf:license"), info));
instance.setRefereed(field(doc.valueOf("//oaf:refereed"), info)); instance.setRefereed(prepareQualifier(doc, "//oaf:refereed", DNET_REVIEW_LEVELS));
instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); instance.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info));
instance instance
.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info));

View File

@ -60,6 +60,10 @@ public class OafMapperUtils {
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
public static Qualifier unknown(final String schemeid, final String schemename) {
return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
}
public static Qualifier qualifier( public static Qualifier qualifier(
final String classid, final String classid,
final String classname, final String classname,

View File

@ -4,14 +4,29 @@ package eu.dnetlib.dhp.oa.graph.raw.common;
import java.io.Serializable; import java.io.Serializable;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
import java.util.Optional;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
public class Vocabulary implements Serializable { public class Vocabulary implements Serializable {
private final String id; private final String id;
private final String name; private final String name;
/**
* Code to Term mappings for this Vocabulary.
*/
private final Map<String, VocabularyTerm> terms = new HashMap<>(); private final Map<String, VocabularyTerm> terms = new HashMap<>();
/**
* Synonym to Code mappings for this Vocabulary.
*/
private final Map<String, String> synonyms = Maps.newHashMap();
public Vocabulary(final String id, final String name) { public Vocabulary(final String id, final String name) {
this.id = id; this.id = id;
this.name = name; this.name = name;
@ -30,7 +45,7 @@ public class Vocabulary implements Serializable {
} }
public VocabularyTerm getTerm(final String id) { public VocabularyTerm getTerm(final String id) {
return terms.get(id.toLowerCase()); return Optional.ofNullable(id).map(s -> s.toLowerCase()).map(s -> terms.get(s)).orElse(null);
} }
protected void addTerm(final String id, final String name) { protected void addTerm(final String id, final String name) {
@ -40,4 +55,32 @@ public class Vocabulary implements Serializable {
protected boolean termExists(final String id) { protected boolean termExists(final String id) {
return terms.containsKey(id.toLowerCase()); return terms.containsKey(id.toLowerCase());
} }
protected void addSynonym(final String syn, final String termCode) {
synonyms.put(syn, termCode.toLowerCase());
}
public VocabularyTerm getTermBySynonym(final String syn) {
return getTerm(synonyms.get(syn));
}
public Qualifier getTermAsQualifier(final String termId) {
if (StringUtils.isBlank(termId)) {
return OafMapperUtils.unknown(getId(), getName());
} else if (termExists(termId)) {
final VocabularyTerm t = getTerm(termId);
return OafMapperUtils.qualifier(t.getId(), t.getName(), getId(), getName());
} else {
return OafMapperUtils.qualifier(termId, termId, getId(), getName());
}
}
public Qualifier getSynonymAsQualifier(final String syn) {
return Optional
.ofNullable(getTermBySynonym(syn))
.map(term -> getTermAsQualifier(term.getId()))
.orElse(null);
// .orElse(OafMapperUtils.unknown(getId(), getName()));
}
} }

View File

@ -1,33 +1,39 @@
package eu.dnetlib.dhp.oa.graph.raw.common; package eu.dnetlib.dhp.oa.graph.raw.common;
import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.util.HashMap; import java.util.*;
import java.util.Map; import java.util.function.Supplier;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication;
import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class VocabularyGroup implements Serializable { public class VocabularyGroup implements Serializable {
public static VocabularyGroup loadVocsFromIS(final String isLookupUrl) throws IOException, ISLookUpException { public static final String VOCABULARIES_XQUERY = "for $x in collection(' /db/DRIVER/VocabularyDSResources/VocabularyDSResourceType') \n"
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); +
"let $vocid := $x//VOCABULARY_NAME/@code\n" +
"let $vocname := $x//VOCABULARY_NAME/text()\n" +
"for $term in ($x//TERM)\n" +
"return concat($vocid,' @=@ ',$vocname,' @=@ ',$term/@code,' @=@ ',$term/@english_name)";
final String xquery = IOUtils public static final String VOCABULARY_SYNONYMS_XQUERY = "for $x in collection('/db/DRIVER/VocabularyDSResources/VocabularyDSResourceType')\n"
.toString( +
GenerateEntitiesApplication.class "let $vocid := $x//VOCABULARY_NAME/@code\n" +
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/xquery/load_vocabularies.xquery")); "let $vocname := $x//VOCABULARY_NAME/text()\n" +
"for $term in ($x//TERM)\n" +
"for $syn in ($term//SYNONYM/@term)\n" +
"return concat($vocid,' @=@ ',$term/@code,' @=@ ', $syn)\n";
public static VocabularyGroup loadVocsFromIS(ISLookUpService isLookUpService) throws ISLookUpException {
final VocabularyGroup vocs = new VocabularyGroup(); final VocabularyGroup vocs = new VocabularyGroup();
for (final String s : isLookUpService.quickSearchProfile(xquery)) { for (final String s : isLookUpService.quickSearchProfile(VOCABULARIES_XQUERY)) {
final String[] arr = s.split("@=@"); final String[] arr = s.split("@=@");
if (arr.length == 4) { if (arr.length == 4) {
final String vocId = arr[0].trim(); final String vocId = arr[0].trim();
@ -40,6 +46,19 @@ public class VocabularyGroup implements Serializable {
} }
vocs.addTerm(vocId, termId, termName); vocs.addTerm(vocId, termId, termName);
vocs.addSynonyms(vocId, termId, termId);
}
}
for (final String s : isLookUpService.quickSearchProfile(VOCABULARY_SYNONYMS_XQUERY)) {
final String[] arr = s.split("@=@");
if (arr.length == 3) {
final String vocId = arr[0].trim();
final String termId = arr[1].trim();
final String syn = arr[2].trim();
vocs.addSynonyms(vocId, termId, syn);
vocs.addSynonyms(vocId, termId, termId);
} }
} }
@ -66,16 +85,37 @@ public class VocabularyGroup implements Serializable {
} }
} }
public Qualifier getTermAsQualifier(final String vocId, final String id) { public Set<String> getTerms(String vocId) {
if (StringUtils.isBlank(id)) { if (!vocabularyExists(vocId)) {
return OafMapperUtils.qualifier("UNKNOWN", "UNKNOWN", vocId, vocId); return new HashSet<>();
} else if (termExists(vocId, id)) {
final Vocabulary v = vocs.get(vocId.toLowerCase());
final VocabularyTerm t = v.getTerm(id);
return OafMapperUtils.qualifier(t.getId(), t.getName(), v.getId(), v.getName());
} else {
return OafMapperUtils.qualifier(id, id, vocId, vocId);
} }
return vocs
.get(vocId.toLowerCase())
.getTerms()
.values()
.stream()
.map(t -> t.getId())
.collect(Collectors.toCollection(HashSet::new));
}
public Qualifier lookup(String vocId, String id) {
return Optional
.ofNullable(getSynonymAsQualifier(vocId, id))
.orElse(getTermAsQualifier(vocId, id));
}
public Qualifier getTermAsQualifier(final String vocId, final String id) {
if (vocabularyExists(vocId)) {
return vocs.get(vocId.toLowerCase()).getTermAsQualifier(id);
}
return OafMapperUtils.qualifier(id, id, "", "");
}
public Qualifier getSynonymAsQualifier(final String vocId, final String syn) {
if (StringUtils.isBlank(vocId)) {
return OafMapperUtils.unknown("", "");
}
return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn);
} }
public boolean termExists(final String vocId, final String id) { public boolean termExists(final String vocId, final String id) {
@ -86,4 +126,16 @@ public class VocabularyGroup implements Serializable {
return vocs.containsKey(vocId.toLowerCase()); return vocs.containsKey(vocId.toLowerCase());
} }
private void addSynonyms(final String vocId, final String termId, final String syn) {
String id = Optional
.ofNullable(vocId)
.map(s -> s.toLowerCase())
.orElseThrow(
() -> new IllegalArgumentException(String.format("empty vocabulary id for [term:%s, synonym:%s]")));
Optional
.ofNullable(vocs.get(id))
.orElseThrow(() -> new IllegalArgumentException("missing vocabulary id: " + vocId))
.addSynonym(syn, termId);
}
} }

View File

@ -0,0 +1,18 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -0,0 +1,281 @@
<workflow-app name="clean graph" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>graphInputPath</name>
<description>the input path to read graph content</description>
</property>
<property>
<name>graphOutputPath</name>
<description>the target path to store cleaned graph</description>
</property>
<property>
<name>isLookupUrl</name>
<description>the address of the lookUp service</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<start to="fork_clean_graph"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<fork name="fork_clean_graph">
<path start="clean_publication"/>
<path start="clean_dataset"/>
<path start="clean_otherresearchproduct"/>
<path start="clean_software"/>
<path start="clean_datasource"/>
<path start="clean_organization"/>
<path start="clean_project"/>
<path start="clean_relation"/>
</fork>
<action name="clean_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean publications</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/publication</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
</action>
<action name="clean_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean datasets</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/dataset</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
</action>
<action name="clean_otherresearchproduct">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean otherresearchproducts</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/otherresearchproduct</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/otherresearchproduct</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
</action>
<action name="clean_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean softwares</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/software</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/software</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
</action>
<action name="clean_datasource">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean datasources</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/datasource</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/datasource</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
</action>
<action name="clean_organization">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean organizations</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/organization</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/organization</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
</action>
<action name="clean_project">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean projects</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/project</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/project</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
</action>
<action name="clean_relation">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Clean relations</name>
<class>eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
</spark-opts>
<arg>--inputPath</arg><arg>${graphInputPath}/relation</arg>
<arg>--outputPath</arg><arg>${graphOutputPath}/relation</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark>
<ok to="wait_clean"/>
<error to="Kill"/>
</action>
<join name="wait_clean" to="End"/>
<end name="End"/>
</workflow-app>

View File

@ -0,0 +1,32 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "in",
"paramLongName": "inputPath",
"paramDescription": "the path to the graph data dump to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path to store the output graph",
"paramRequired": true
},
{
"paramName": "isu",
"paramLongName": "isLookupUrl",
"paramDescription": "url to the ISLookup Service",
"paramRequired": true
},
{
"paramName": "class",
"paramLongName": "graphTableClassName",
"paramDescription": "class name moelling the graph table",
"paramRequired": true
}
]

View File

@ -0,0 +1,6 @@
for $x in collection('/db/DRIVER/VocabularyDSResources/VocabularyDSResourceType')
let $vocid := $x//VOCABULARY_NAME/@code
let $vocname := $x//VOCABULARY_NAME/text()
for $term in ($x//TERM)
for $syn in ($term//SYNONYM/@term)
return concat($vocid,' @=@ ',$term/@code,' @=@ ', $syn)

View File

@ -0,0 +1,106 @@
package eu.dnetlib.dhp.oa.graph.clean;
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.lenient;
import java.io.IOException;
import java.util.List;
import java.util.Set;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ExtendWith(MockitoExtension.class)
public class CleaningFunctionTest {
public static final ObjectMapper MAPPER = new ObjectMapper();
@Mock
private ISLookUpService isLookUpService;
private VocabularyGroup vocabularies;
private CleaningRuleMap mapping;
@BeforeEach
public void setUp() throws ISLookUpException, IOException {
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
lenient()
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
.thenReturn(synonyms());
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
mapping = CleaningRuleMap.create(vocabularies);
}
@Test
public void testCleaning() throws Exception {
assertNotNull(vocabularies);
assertNotNull(mapping);
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
Publication p_in = MAPPER.readValue(json, Publication.class);
Publication p_out = OafCleaner.apply(p_in, mapping);
assertNotNull(p_out);
assertEquals("eng", p_out.getLanguage().getClassid());
assertEquals("English", p_out.getLanguage().getClassname());
assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid());
assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname());
assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid());
assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname());
Set<String> pidTerms = vocabularies.getTerms("dnet:pid_types");
assertTrue(
p_out
.getPid()
.stream()
.map(p -> p.getQualifier())
.allMatch(q -> pidTerms.contains(q.getClassid())));
// TODO add more assertions to verity the cleaned values
System.out.println(MAPPER.writeValueAsString(p_out));
/*
* assertTrue( p_out .getPid() .stream() .allMatch(sp -> StringUtils.isNotBlank(sp.getValue())));
*/
}
private Stream<Qualifier> getAuthorPidTypes(Publication pub) {
return pub
.getAuthor()
.stream()
.map(a -> a.getPid())
.flatMap(p -> p.stream())
.map(s -> s.getQualifier());
}
private List<String> vocs() throws IOException {
return IOUtils
.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
}
private List<String> synonyms() throws IOException {
return IOUtils
.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
}
}

View File

@ -0,0 +1,757 @@
{
"author": [
{
"affiliation": [
],
"fullname": "Brien, Tom",
"name": "Tom",
"pid": [
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "ORCID12",
"classname": "ORCID12",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "0000-0001-9613-6639"
}
],
"rank": 1,
"surname": "Brien"
},
{
"affiliation": [
],
"fullname": "Ade, Peter",
"name": "Peter",
"pid": [
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "xyz",
"classname": "XYZ",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "qwerty"
}
],
"rank": 2,
"surname": "Ade"
},
{
"affiliation": [
],
"fullname": "Barry, Peter S.",
"name": "Peter S.",
"pid": [
],
"rank": 3,
"surname": "Barry"
},
{
"affiliation": [
],
"fullname": "Dunscombe, Chris J.",
"name": "Chris J.",
"pid": [
],
"rank": 4,
"surname": "Dunscombe"
},
{
"affiliation": [
],
"fullname": "Leadley, David R.",
"name": "David R.",
"pid": [
],
"rank": 5,
"surname": "Leadley"
},
{
"affiliation": [
],
"fullname": "Morozov, Dmitry V.",
"name": "Dmitry V.",
"pid": [
],
"rank": 6,
"surname": "Morozov"
},
{
"affiliation": [
],
"fullname": "Myronov, Maksym",
"name": "Maksym",
"pid": [
],
"rank": 7,
"surname": "Myronov"
},
{
"affiliation": [
],
"fullname": "Parker, Evan",
"name": "Evan",
"pid": [
],
"rank": 8,
"surname": "Parker"
},
{
"affiliation": [
],
"fullname": "Prest, Martin J.",
"name": "Martin J.",
"pid": [
],
"rank": 9,
"surname": "Prest"
},
{
"affiliation": [
],
"fullname": "Prunnila, Mika",
"name": "Mika",
"pid": [
],
"rank": 10,
"surname": "Prunnila"
},
{
"affiliation": [
],
"fullname": "Sudiwala, Rashmi V.",
"name": "Rashmi V.",
"pid": [
],
"rank": 11,
"surname": "Sudiwala"
},
{
"affiliation": [
],
"fullname": "Whall, Terry E.",
"name": "Terry E.",
"pid": [
],
"rank": 12,
"surname": "Whall"
},
{
"affiliation": [
],
"fullname": "Mauskopf",
"name": "",
"pid": [
],
"rank": 13,
"surname": ""
},
{
"affiliation": [
],
"fullname": " P. D. ",
"name": "",
"pid": [
],
"rank": 14,
"surname": ""
}
],
"bestaccessright": {
"classid": "CLOSED",
"classname": "Closed Access",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"collectedfrom": [
{
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
"value": "VIRTA"
}
],
"context": [
],
"contributor": [
],
"country": [
],
"coverage": [
],
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"dateofacceptance": {
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"value": "2016-01-01"
},
"dateofcollection": "",
"dateoftransformation": "2020-04-22T12:34:08.009Z",
"description": [
],
"externalReference": [
],
"extraInfo": [
],
"format": [
],
"fulltext": [
],
"id": "50|CSC_________::2250a70c903c6ac6e4c01438259e9375",
"instance": [
{
"accessright": {
"classid": "CLOSED",
"classname": "CLOSED",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"collectedfrom": {
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
"value": "VIRTA"
},
"dateofacceptance": {
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"value": "2016-01-01"
},
"distributionlocation": "",
"hostedby": {
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
"value": "VIRTA"
},
"instancetype": {
"classid": "Comentario",
"classname": "Comentario",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
},
"url": [
"http://juuli.fi/Record/0275158616",
"http://dx.doi.org/10.1007/s109090161569x"
]
}
],
"journal": {
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"edition": "",
"ep": " 7",
"iss": "9 March",
"issnLinking": "",
"issnOnline": "",
"issnPrinted": "0022-2291",
"name": "Journal of Low Temperature Physics - Early Acces",
"sp": "1 ",
"vol": ""
},
"language": {
"classid": "en",
"classname": "en",
"schemeid": "dnet:languages",
"schemename": "dnet:languages"
},
"lastupdatetimestamp": 1591283286319,
"oaiprovenance": {
"originDescription": {
"altered": true,
"baseURL": "https%3A%2F%2Fvirta-jtp.csc.fi%2Fapi%2Fcerif",
"datestamp": "2019-07-30",
"harvestDate": "2020-04-22T11:04:38.685Z",
"identifier": "oai:virta-jtp.csc.fi:Publications/0275158616",
"metadataNamespace": ""
}
},
"originalId": [
"CSC_________::2250a70c903c6ac6e4c01438259e9375"
],
"pid": [
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "doi",
"classname": "doi",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1007/s109090161569x"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "doi",
"classname": "doi",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1007/s109090161569x"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "doi",
"classname": "doi",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": ""
}
],
"relevantdate": [
],
"resourcetype": {
"classid": "0001",
"classname": "0001",
"schemeid": "dnet:dataCite_resource",
"schemename": "dnet:dataCite_resource"
},
"resulttype": {
"classid": "publication",
"classname": "publication",
"schemeid": "dnet:result_typologies",
"schemename": "dnet:result_typologies"
},
"source": [
],
"subject": [
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "ta213"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "infrared detectors"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "lens antennas"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "silicon"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "slot antennas"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "strained silicon"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "cold electron bolometers"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "doped silicon"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "measure noise"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "noise equivalent power"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "optical characterisation"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "optical response"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "photon noise"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "",
"classname": "",
"schemeid": "",
"schemename": ""
},
"value": "silicon absorbers"
}
],
"title": [
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "main title",
"classname": "main title",
"schemeid": "dnet:dataCite_title",
"schemename": "dnet:dataCite_title"
},
"value": "Optical response of strained- and unstrained-silicon cold-electron bolometers"
}
]
}

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -897,7 +897,10 @@ public class XmlRecordFactory implements Serializable {
metadata.add(XmlSerializationUtils.mapQualifier("contracttype", p.getContracttype())); metadata.add(XmlSerializationUtils.mapQualifier("contracttype", p.getContracttype()));
} }
if (p.getOamandatepublications() != null) { if (p.getOamandatepublications() != null) {
metadata.add(XmlSerializationUtils.asXmlElement("oamandatepublications", p.getOamandatepublications().getValue())); metadata
.add(
XmlSerializationUtils
.asXmlElement("oamandatepublications", p.getOamandatepublications().getValue()));
} }
if (p.getEcsc39() != null) { if (p.getEcsc39() != null) {
metadata.add(XmlSerializationUtils.asXmlElement("ecsc39", p.getEcsc39().getValue())); metadata.add(XmlSerializationUtils.asXmlElement("ecsc39", p.getEcsc39().getValue()));
@ -1168,10 +1171,10 @@ public class XmlRecordFactory implements Serializable {
.asXmlElement( .asXmlElement(
"distributionlocation", instance.getDistributionlocation())); "distributionlocation", instance.getDistributionlocation()));
} }
if (instance.getRefereed() != null && isNotBlank(instance.getRefereed().getValue())) { if (instance.getRefereed() != null && !instance.getRefereed().isBlank()) {
fields fields
.add( .add(
XmlSerializationUtils.asXmlElement("refereed", instance.getRefereed().getValue())); XmlSerializationUtils.mapQualifier("refereed", instance.getRefereed()));
} }
if (instance.getProcessingchargeamount() != null if (instance.getProcessingchargeamount() != null
&& isNotBlank(instance.getProcessingchargeamount().getValue())) { && isNotBlank(instance.getProcessingchargeamount().getValue())) {

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<artifactId>dhp-stats-update</artifactId> <artifactId>dhp-stats-update</artifactId>

View File

@ -3,7 +3,7 @@
<parent> <parent>
<artifactId>dhp-workflows</artifactId> <artifactId>dhp-workflows</artifactId>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -6,7 +6,7 @@
<parent> <parent>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId> <artifactId>dhp</artifactId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
<relativePath>../</relativePath> <relativePath>../</relativePath>
</parent> </parent>

View File

@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp</artifactId> <artifactId>dhp</artifactId>
<version>1.2.2-SNAPSHOT</version> <version>1.2.3-SNAPSHOT</version>
<packaging>pom</packaging> <packaging>pom</packaging>
<licenses> <licenses>