From 81e85465d85e04efb92d27b70774abeb9580e755 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Mon, 8 Jun 2020 16:26:16 +0200 Subject: [PATCH 01/17] join simrels --- .../broker/oa/GenerateEventsApplication.java | 74 +++++++++---------- .../dhp/broker/oa/util/EventGroup.java | 32 ++++++++ .../dhp/broker/oa/util/ResultAggregator.java | 50 +++++++++++++ .../dhp/broker/oa/util/ResultGroup.java | 35 +++++++++ 4 files changed, 153 insertions(+), 38 deletions(-) create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventGroup.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ResultAggregator.java create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ResultGroup.java diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java index fede6f8bfb..05fab47f09 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java @@ -12,16 +12,13 @@ import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.tuple.Pair; -import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Column; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.Row; +import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.TypedColumn; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -55,6 +52,9 @@ import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess; import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid; import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject; import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; +import eu.dnetlib.dhp.broker.oa.util.EventGroup; +import eu.dnetlib.dhp.broker.oa.util.ResultAggregator; +import eu.dnetlib.dhp.broker.oa.util.ResultGroup; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.oaf.OafEntity; @@ -63,6 +63,7 @@ import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; +import scala.Tuple2; public class GenerateEventsApplication { @@ -130,20 +131,20 @@ public class GenerateEventsApplication { final SparkConf conf = new SparkConf(); runWithSparkSession(conf, isSparkSessionManaged, spark -> { - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + removeOutputDir(spark, eventsPath); - final JavaRDD eventsRdd = sc.emptyRDD(); + final Dataset all = spark.emptyDataset(Encoders.kryo(Event.class)); for (final Class r1 : BrokerConstants.RESULT_CLASSES) { - eventsRdd.union(generateSimpleEvents(spark, graphPath, r1)); + all.union(generateSimpleEvents(spark, graphPath, r1)); for (final Class r2 : BrokerConstants.RESULT_CLASSES) { - eventsRdd.union(generateRelationEvents(spark, graphPath, r1, r2)); + all.union(generateRelationEvents(spark, graphPath, r1, 
r2)); } } - eventsRdd.saveAsTextFile(eventsPath, GzipCodec.class); + all.write().mode(SaveMode.Overwrite).json(eventsPath); }); } @@ -152,51 +153,48 @@ public class GenerateEventsApplication { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } - private static JavaRDD generateSimpleEvents(final SparkSession spark, + private static Dataset generateSimpleEvents(final SparkSession spark, final String graphPath, final Class resultClazz) { - final Dataset results = readPath(spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), resultClazz) + final Dataset results = readPath(spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), Result.class) .filter(r -> r.getDataInfo().getDeletedbyinference()); final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) .filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); - final Column c = null; // TODO - - final Dataset aa = results - .joinWith(rels, results.col("id").equalTo(rels.col("source")), "inner") - .groupBy(rels.col("target")) - .agg(c) - .filter(x -> x.size() > 1) - // generateSimpleEvents(...) - // flatMap() - // toRdd() - ; - - return null; + final TypedColumn, ResultGroup> aggr = new ResultAggregator().toColumn(); + return results.joinWith(rels, results.col("id").equalTo(rels.col("source")), "inner") + .groupByKey((MapFunction, String>) t -> t._2.getTarget(), Encoders.STRING()) + .agg(aggr) + .map((MapFunction, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class)) + .filter(ResultGroup::isValid) + .map((MapFunction) g -> GenerateEventsApplication.generateSimpleEvents(g), Encoders.kryo(EventGroup.class)) + .flatMap(group -> group.getData().iterator(), Encoders.kryo(Event.class)); } - private List generateSimpleEvents(final Collection children) { + private static EventGroup generateSimpleEvents(final ResultGroup results) { final List> list = new ArrayList<>(); - for (final Result target : children) { - list.addAll(enrichMissingAbstract.searchUpdatesForRecord(target, children)); - list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, children)); - list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, children)); - list.addAll(enrichMissingPid.searchUpdatesForRecord(target, children)); - list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, children)); - list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, children)); - list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, children)); - list.addAll(enrichMorePid.searchUpdatesForRecord(target, children)); - list.addAll(enrichMoreSubject.searchUpdatesForRecord(target, children)); + for (final Result target : results.getData()) { + list.addAll(enrichMissingAbstract.searchUpdatesForRecord(target, results.getData())); + list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, results.getData())); + list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, results.getData())); + list.addAll(enrichMissingPid.searchUpdatesForRecord(target, results.getData())); + list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, results.getData())); + list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, results.getData())); + list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, results.getData())); + list.addAll(enrichMorePid.searchUpdatesForRecord(target, results.getData())); + list.addAll(enrichMoreSubject.searchUpdatesForRecord(target, results.getData())); } - return 
list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); + final EventGroup events = new EventGroup(); + list.stream().map(EventFactory::newBrokerEvent).forEach(events::addElement); + return events; } - private static JavaRDD generateRelationEvents(final SparkSession spark, + private static Dataset generateRelationEvents(final SparkSession spark, final String graphPath, final Class sourceClass, final Class targetClass) { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventGroup.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventGroup.java new file mode 100644 index 0000000000..9c7081c79b --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventGroup.java @@ -0,0 +1,32 @@ +package eu.dnetlib.dhp.broker.oa.util; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +import eu.dnetlib.dhp.broker.model.Event; + +public class EventGroup implements Serializable { + + /** + * + */ + private static final long serialVersionUID = 765977943803533130L; + + private final List data = new ArrayList<>(); + + public List getData() { + return data; + } + + public EventGroup addElement(final Event elem) { + data.add(elem); + return this; + } + + public EventGroup addGroup(final EventGroup group) { + data.addAll(group.getData()); + return this; + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ResultAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ResultAggregator.java new file mode 100644 index 0000000000..94685eeae2 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ResultAggregator.java @@ -0,0 +1,50 @@ +package eu.dnetlib.dhp.broker.oa.util; + +import org.apache.spark.sql.Encoder; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.expressions.Aggregator; + +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Result; +import scala.Tuple2; + +public class ResultAggregator extends Aggregator, ResultGroup, ResultGroup> { + + /** + * + */ + private static final long serialVersionUID = -1492327874705585538L; + + @Override + public ResultGroup zero() { + return new ResultGroup(); + } + + @Override + public ResultGroup reduce(final ResultGroup group, final Tuple2 t) { + return group.addElement(t._1); + } + + @Override + public ResultGroup merge(final ResultGroup g1, final ResultGroup g2) { + return g1.addGroup(g2); + } + + @Override + public ResultGroup finish(final ResultGroup group) { + return group; + } + + @Override + public Encoder bufferEncoder() { + return Encoders.kryo(ResultGroup.class); + + } + + @Override + public Encoder outputEncoder() { + return Encoders.kryo(ResultGroup.class); + + } + +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ResultGroup.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ResultGroup.java new file mode 100644 index 0000000000..8fe7a59392 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ResultGroup.java @@ -0,0 +1,35 @@ +package eu.dnetlib.dhp.broker.oa.util; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +import eu.dnetlib.dhp.schema.oaf.Result; + +public class ResultGroup implements Serializable { + + /** + * + */ + private static final long serialVersionUID = 
-3360828477088669296L;
+
+ private final List data = new ArrayList<>();
+
+ public List getData() {
+ return data;
+ }
+
+ public ResultGroup addElement(final Result elem) {
+ data.add(elem);
+ return this;
+ }
+
+ public ResultGroup addGroup(final ResultGroup group) {
+ data.addAll(group.getData());
+ return this;
+ }
+
+ public boolean isValid() {
+ return data.size() > 1;
+ }
+}
From 16cb073b15e065b763abe37ac9d621b1f7d2355b Mon Sep 17 00:00:00 2001
From: Alessia Bardi
Date: Mon, 8 Jun 2020 19:06:03 +0200
Subject: [PATCH 02/17] set the instance dateofacceptance with the Crossref createdDate in case the issuedDate is blank
---
 .../main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala | 3 +++
 1 file changed, 3 insertions(+)
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
index cc2c9d5860..85997fa364 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala
@@ -179,6 +179,9 @@ case object Crossref2Oaf {
 if (StringUtils.isNotBlank(issuedDate)) {
 instance.setDateofacceptance(asField(issuedDate))
 }
+ else {
+ instance.setDateofacceptance(asField(createdDate.getValue))
+ }
 val s: String = (json \ "URL").extract[String]
 val links: List[String] = ((for {JString(url) <- json \ "link" \ "URL"} yield url) ::: List(s)).filter(p => p != null).distinct
 if (links.nonEmpty)
From 9fd25887f7dfd8033fb3e0c7e9f9b2d6c482a43b Mon Sep 17 00:00:00 2001
From: Alessia Bardi
Date: Mon, 8 Jun 2020 19:32:24 +0200
Subject: [PATCH 03/17] Result identifiers all start with 50|
---
 .../java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
index 90bfacdc91..7b21ecda29 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala
@@ -319,15 +319,7 @@ object DoiBoostMappingUtil {
 def generateIdentifier (oaf: Result, doi: String): String = {
 val id = DHPUtils.md5 (doi.toLowerCase)
- if (oaf.isInstanceOf[Dataset] )
- return s"60|${
- doiBoostNSPREFIX
- }${
- SEPARATOR
- }${
- id
- }"
- s"50|${
+ return s"50|${
 doiBoostNSPREFIX
 }${
 SEPARATOR
From 181f52b9bccfdb1c13858e0eeebfa85d63911c5c Mon Sep 17 00:00:00 2001
From: Alessia Bardi
Date: Mon, 8 Jun 2020 19:33:47 +0200
Subject: [PATCH 04/17] Added mapping table for Crossref
---
 .../crossref_mapping.csv | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/crossref_mapping.csv
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/crossref_mapping.csv b/dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/crossref_mapping.csv
new file mode 100644
index 0000000000..d87ae5da6a
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/crossref_mapping.csv
@@ -0,0 +1,60 @@
+Crossref Field,Type,Required,Description (from Crossref),OAF field,Comments
+publisher,String,Yes,Name of work's publisher,Result/Publisher,
+title,Array of 
String,Yes,"Work titles, including translated titles","Result/Title with Qualifier(""main title"", ""dnet:dataCite_title"")", +original-title,Array of String,No,Work titles in the work's original publication language,"Result/Title with Qualifier(""alternative title"", ""dnet:dataCite_title"")", +short-title,Array of String,No,Short or abbreviated work titles,"Result/Title with Qualifier(""alternative title"", ""dnet:dataCite_title"")", +abstract,XML String,No,Abstract as a JSON string or a JATS XML snippet encoded into a JSON string,Result/description, +reference-count,Number,Yes,Deprecated Same as references-count,"- ", +references-count,Number,Yes,Count of outbound references deposited with Crossref,N/A, +is-referenced-by-count,Number,Yes,Count of inbound references deposited with Crossref,N/A, +source,String,Yes,Currently always Crossref,Result/source, +prefix,String,Yes,DOI prefix identifier of the form http://id.crossref.org/prefix/DOI_PREFIX,N/A, +DOI,String,Yes,DOI of the work,OafEntity/originalId, +,,,,OafEntity/PID, +,,,,"Oaf/id ",Use to generate the OpenAIRE id in the form 50|doiboost____::md5(DOI) +URL,URL,Yes,URL form of the work's DOI,Instance/url, +member,String,Yes,Member identifier of the form http://id.crossref.org/member/MEMBER_ID,N/A, +type,String,Yes,"Enumeration, one of the type ids from https://api.crossref.org/v1/types",Instance/instancetype,Also use to map the record as OAF Publication or Dataset according to the mapping defined in eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +created,Date,Yes,Date on which the DOI was first registered,"Result/relevantDate with Qualifier(""created"", ""dnet:dataCite_date"")", +,,,,"Result/dateofacceptance +Instance/dateofacceptance",If crossref.issued is blank +deposited,Date,Yes,Date on which the work metadata was most recently updated,N/A, +indexed,Date,Yes,"Date on which the work metadata was most recently indexed. Re-indexing does not imply a metadata change, see deposited for the most recent metadata change date",Result/lastupdatetimestamp, +issued,Partial Date,Yes,Earliest of published-print and published-online,Result/dateofacceptance,OAF dateofacceptance is used also for the publishing date. It's the date visualised in the OpenAIRE EXPLORE portal. 
+,,,,Instance/dateofacceptance, +posted,Partial Date,No,Date on which posted content was made available online,"Result/relevantDate with Qualifier(""available"", ""dnet:dataCite_date"")", +accepted,Partial Date,No,"Date on which a work was accepted, after being submitted, during a submission process","Result/relevantDate with Qualifier(""accepted"", ""dnet:dataCite_date"")", +subtitle,Array of String,No,"Work subtitles, including original language and translated","Result/Title with Qualifier(""subtitle"", ""dnet:dataCite_title"")", +container-title,Array of String,No,Full titles of the containing work (usually a book or journal),Publication/Journal/name only in case of Journal title for book title see ISBN Mapping, +short-container-title,Array of String,No,Abbreviated titles of the containing work,N/A, +group-title,String,No,Group title for posted content,N/A, +issue,String,No,Issue number of an article's journal,Publication/Journal/iss, +volume,String,No,Volume number of an article's journal,Publication/Journal/vol, +page,String,No,Pages numbers of an article within its journal,"Publication/Journal/sp +Publication/Journal/ep",Obtain start and end page by splitting by '-' +article-number,String,No,,N/A, +published-print,Partial Date,No,Date on which the work was published in print,"Result/relevantDate with Qualifier(""published-print"", ""dnet:dataCite_date"")", +published-online,Partial Date,No,Date on which the work was published online,"Result/relevantDate with Qualifier(""published-online"", ""dnet:dataCite_date"")", +subject,Array of String,No,"Subject category names, a controlled vocabulary from Sci-Val. Available for most journal articles","Result/subject with Qualifier(""keywords"", ""dnet:subject_classification_typologies""). ","Future improvements: map the controlled vocabulary instead of using the generic ""keywords"" qualifier" +ISSN,Array of String,No,,"Publication/Journal/issn +Publication/Journal/lissn +Publication/Journal/eissn",The mapping depends on the value of issn-type +issn-type,Array of ISSN with Type,No,List of ISSNs with ISSN type information,N/A,Its value guides the setting of the properties in Journal (see row above) +ISBN,Array of String,No,,Publication/source,"In case of Book We can map ISBN and container title on Publication/source using this syntax container-title + ""ISBN: "" + ISBN" +archive,Array of String,No,,N/A, +license,Array of License,No,,Result/Instance/License, +funder,Array of Funder,No,,Relation,Whenever we are able to link to a funder or project integrated into OpenAIRE. 
Mapping to OpenAIRE funders and projects is in eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala.generateSimpleRelationFromAward +assertion,Array of Assertion,No,,N/A, +author,Array of Contributor,No,,Result/author (with orcid if available), +editor,Array of Contributor,No,,N/A, +chair,Array of Contributor,No,,N/A, +translator,Array of Contributor,No,,N/A, +update-to,Array of Update,No,,N/A, +update-policy,URL,No,Link to an update policy covering Crossmark updates for this work,N/A, +link,Array of Resource Link,No,URLs to full-text locations,Result/Instance/url, +clinical-trial-number,Array of Clinical Trial Number,No,,OafEntity/originalId, +alternative-id,String,No,Other identifiers for the work provided by the depositing member,OafEntity/originalId, +reference,Array of Reference,No,List of references made by the work,,Future improvement: map to references +content-domain,Content Domain,No,Information on domains that support Crossmark for this work,N/A, +relation,Relations,No,Relations to other works,Result/Instance/refereed,"if(relation.has-review) instance.refereed = ""peerReviewed"". " +review,Review,No,Peer review metadata,N/A, From b7cb1163eadc208129c2fbe669d50426a28d0af9 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 9 Jun 2020 10:39:11 +0200 Subject: [PATCH 05/17] identifiers always start with 50 --- .../src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala index 1c6e1b0e65..b97fb739c8 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala @@ -190,7 +190,7 @@ case object ConversionUtil { pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", PID_TYPES)).asJava) pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava) - //Set identifier as {50|60} | doiboost____::md5(DOI) + //Set identifier as 50|doiboost____::md5(DOI) pub.setId(generateIdentifier(pub, paper.Doi.toLowerCase)) val mainTitles = createSP(paper.PaperTitle, "main title", "dnet:dataCite_title") @@ -247,7 +247,7 @@ case object ConversionUtil { pub.setPid(List(createSP(paper.Doi.toLowerCase, "doi", PID_TYPES)).asJava) pub.setOriginalId(List(paper.PaperId.toString, paper.Doi.toLowerCase).asJava) - //Set identifier as {50|60} | doiboost____::md5(DOI) + //Set identifier as 50 | doiboost____::md5(DOI) pub.setId(generateIdentifier(pub, paper.Doi.toLowerCase)) val mainTitles = createSP(paper.PaperTitle, "main title", "dnet:dataCite_title") From f072125152ff9dd4c6f98400653571bbe620119a Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 9 Jun 2020 14:32:10 +0200 Subject: [PATCH 06/17] map volume and issue in journal information from MAG --- .../src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala index b97fb739c8..58516202be 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala @@ -229,6 +229,8 @@ case object ConversionUtil { pub.setPublisher(asField(journal.Publisher.get)) if 
(journal.Issn.isDefined) j.setIssnPrinted(journal.Issn.get)
+ j.setVol(paper.Volume)
+ j.setIss(paper.Issue)
 pub.setJournal(j)
 }
 pub.setCollectedfrom(List(createMAGCollectedFrom()).asJava)
From d6de406e1140490a4ad6c4bd8e6dea42f0214514 Mon Sep 17 00:00:00 2001
From: Alessia Bardi
Date: Tue, 9 Jun 2020 14:43:34 +0200
Subject: [PATCH 07/17] fixed classid for subjects
---
 .../main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala
index 58516202be..2419f86a34 100644
--- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala
+++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/mag/MagDataModel.scala
@@ -129,16 +129,16 @@ case object ConversionUtil {
 val fieldOfStudy = item._2
 if (fieldOfStudy != null && fieldOfStudy.subjects != null && fieldOfStudy.subjects.nonEmpty) {
 val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => {
- val s1 = createSP(s.DisplayName, "keywords", "dnet:subject_classification_typologies")
+ val s1 = createSP(s.DisplayName, "keyword", "dnet:subject_classification_typologies")
 val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString)
 var resList: List[StructuredProperty] = List(s1)
 if (s.MainType.isDefined) {
 val maintp = s.MainType.get
- val s2 = createSP(s.MainType.get, "keywords", "dnet:subject_classification_typologies")
+ val s2 = createSP(s.MainType.get, "keyword", "dnet:subject_classification_typologies")
 s2.setDataInfo(di)
 resList = resList ::: List(s2)
 if (maintp.contains(".")) {
- val s3 = createSP(maintp.split("\\.").head, "keywords", "dnet:subject_classification_typologies")
+ val s3 = createSP(maintp.split("\\.").head, "keyword", "dnet:subject_classification_typologies")
 s3.setDataInfo(di)
 resList = resList ::: List(s3)
 }
From 33b130ec430ea464c7c20890e982fc8548486b83 Mon Sep 17 00:00:00 2001
From: Alessia Bardi
Date: Tue, 9 Jun 2020 15:57:15 +0200
Subject: [PATCH 08/17] Mapping instructions for MAG
---
 .../mag_mapping.csv | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/mag_mapping.csv
diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/mag_mapping.csv b/dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/mag_mapping.csv
new file mode 100644
index 0000000000..35ee651770
--- /dev/null
+++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/mag_mapping.csv
@@ -0,0 +1,37 @@
+Microsoft Academic Graph Field,OAF field,Comments
+Papers/DOI,OafEntity/originalId,
+,OafEntity/PID,
+,Oaf/id in the form 50|doiboost____::md5(DOI),
+Papers/PaperId,OafEntity/originalId,
+Papers/PaperTitle,"Result/Title with Qualifier(""main title"", ""dnet:dataCite_title"")",
+Papers/OriginalTitle,"Result/Title with Qualifier(""alternative title"", ""dnet:dataCite_title"")",
+Papers/BookTitle,Publication/Source,
+Papers/Year,N/A,
+Papers/Date,Result/dateofacceptance,
+Papers/Publisher,Result/Publisher,Possibly overridden by Journal/Publisher
+Papers/JournalId,N/A,
+Journal/Rank,N/A,
+Journal/NormalizedName,N/A,
+Journal/DisplayName,Publication/Journal/Name,
+Journal/Issn,Publication/Journal/issnPrinted,
+Journal/Publisher,Result/publisher,"If available, it overrides value in Papers/Publisher" 
+Journal/Webpage,N/A, +Journal/PaperCount,N/A, +Journal/CitationCount,N/A, +Journal/CreatedDate,N/A, +ConferenceInstances/DisplayName,Publication/Journal/Name, +ConferenceInstances/Location,Publication/Journal/Conferenceplace, +ConferenceInstances/StartDate,Publication/Journal/Conferencedate,subjectTo be constructed as StartDate - EndDate (only the first 10 chars ofthe dates are kept) +ConferenceInstances/EndDate,Publication/Journal/Conferencedate,To be constructed as StartDate - EndDate (only the first 10 chars ofthe dates are kept) +Papers/Volume,Publication/Journal/vol, +Papers/Issue,Publication/Journal/iss, +Papers/FirstPage,Publication/Journal/sp, +Papers/LastPage,Publication/Journal/ep, +Papers/ReferenceCount,N/A, +Papers/CitationCount,N/A, +Papers/EstimatedCitation,N/A, +Papers/OriginalVenue,N/A, +Papers/FamilyId,N/A, +Papers/CreatedDate,Result/LastUpdateTimeStamp, +Papers/FieldOfStudy/DisplayName,"Result/subject with Qualifier(""keyword"", ""dnet:subject_classification_typologies"")",Some FieldOfStudy come from the UMLS controlled vocabulary. Future releases of DOIBoost will include such terms with a specific Qualifier (i.e. not as generic keywords) +Papers/FieldOfStudy/MainType,"Result/subject with Qualifier(""keyword"", ""dnet:subject_classification_typologies"")","If MainType is splittable on . (like 'food.type_of_dish', 'aviation.airport', 'soccer.team'), then the first token (i.e. food, aviation, soccer) is also added as additional subject." \ No newline at end of file From baaa55f4a3d75cf0ed7894d152a977fd5f20ea50 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Tue, 9 Jun 2020 16:01:31 +0200 Subject: [PATCH 09/17] use of pace to calculate trusts --- dhp-workflows/dhp-broker-events/pom.xml | 114 +++++++++-------- .../broker/oa/GenerateEventsApplication.java | 118 ++++++++++++------ .../dhp/broker/oa/matchers/UpdateMatcher.java | 14 +-- .../AbstractEnrichMissingDataset.java | 13 +- .../relatedProjects/EnrichMissingProject.java | 12 +- .../relatedProjects/EnrichMoreProject.java | 12 +- .../AbstractEnrichMissingPublication.java | 12 +- .../EnrichMissingSoftware.java | 12 +- .../relatedSoftware/EnrichMoreSoftware.java | 12 +- .../simple/EnrichMissingAbstract.java | 11 +- .../simple/EnrichMissingAuthorOrcid.java | 11 +- .../simple/EnrichMissingOpenAccess.java | 15 ++- .../oa/matchers/simple/EnrichMissingPid.java | 14 +-- .../simple/EnrichMissingPublicationDate.java | 11 +- .../matchers/simple/EnrichMissingSubject.java | 11 +- .../matchers/simple/EnrichMoreOpenAccess.java | 11 +- .../oa/matchers/simple/EnrichMorePid.java | 10 +- .../oa/matchers/simple/EnrichMoreSubject.java | 11 +- .../dhp/broker/oa/util/BrokerConstants.java | 3 + .../dhp/broker/oa/util/TrustUtils.java | 15 +++ .../dhp/broker/oa/util/UpdateInfo.java | 36 ++++-- .../dhp/broker/oa/util/TrustUtilsTest.java | 72 +++++++++++ 22 files changed, 360 insertions(+), 190 deletions(-) create mode 100644 dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java create mode 100644 dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java diff --git a/dhp-workflows/dhp-broker-events/pom.xml b/dhp-workflows/dhp-broker-events/pom.xml index 59ff05ed3f..cc323d1092 100644 --- a/dhp-workflows/dhp-broker-events/pom.xml +++ b/dhp-workflows/dhp-broker-events/pom.xml @@ -1,66 +1,72 @@ - - - dhp-workflows - eu.dnetlib.dhp - 1.2.2-SNAPSHOT - - 4.0.0 + + + dhp-workflows + eu.dnetlib.dhp + 1.2.2-SNAPSHOT + + 4.0.0 - dhp-broker-events + dhp-broker-events - + - - 
commons-io - commons-io - + + commons-io + commons-io + - - org.apache.spark - spark-core_2.11 - - - org.apache.spark - spark-sql_2.11 - - - org.apache.spark - spark-hive_2.11 - test - + + org.apache.spark + spark-core_2.11 + + + org.apache.spark + spark-sql_2.11 + + + org.apache.spark + spark-hive_2.11 + test + - - eu.dnetlib.dhp - dhp-common - ${project.version} - - - eu.dnetlib.dhp - dhp-schemas - ${project.version} - + + eu.dnetlib.dhp + dhp-common + ${project.version} + + + eu.dnetlib.dhp + dhp-schemas + ${project.version} + + + eu.dnetlib + dnet-pace-core + - - com.jayway.jsonpath - json-path - - - dom4j - dom4j - - - jaxen - jaxen - + + com.jayway.jsonpath + json-path + + + dom4j + dom4j + + + jaxen + jaxen + - - eu.dnetlib - dnet-openaire-broker-common - [2.0.1,3.0.0) - + + eu.dnetlib + dnet-openaire-broker-common + [2.0.1,3.0.0) + - + \ No newline at end of file diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java index 05fab47f09..44bc5cb6ee 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java @@ -63,6 +63,9 @@ import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import eu.dnetlib.pace.config.DedupConfig; import scala.Tuple2; public class GenerateEventsApplication { @@ -107,7 +110,10 @@ public class GenerateEventsApplication { private static final UpdateMatcher>, ?> enrichMissingDatasetIsSupplementedBy = new EnrichMissingDatasetIsSupplementedBy(); - public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + // Aggregators + private static final TypedColumn, ResultGroup> resultAggrTypedColumn = new ResultAggregator().toColumn(); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -128,8 +134,16 @@ public class GenerateEventsApplication { final String eventsPath = parser.get("eventsPath"); log.info("eventsPath: {}", eventsPath); + final String isLookupUrl = parser.get("isLookupUrl"); + log.info("isLookupUrl: {}", isLookupUrl); + + final String dedupConfigProfileId = parser.get("dedupConfProfile"); + log.info("dedupConfigProfileId: {}", dedupConfigProfileId); + final SparkConf conf = new SparkConf(); + final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId); + runWithSparkSession(conf, isSparkSessionManaged, spark -> { removeOutputDir(spark, eventsPath); @@ -137,10 +151,10 @@ public class GenerateEventsApplication { final Dataset all = spark.emptyDataset(Encoders.kryo(Event.class)); for (final Class r1 : BrokerConstants.RESULT_CLASSES) { - all.union(generateSimpleEvents(spark, graphPath, r1)); + all.union(generateSimpleEvents(spark, graphPath, r1, dedupConfig)); for (final Class r2 : BrokerConstants.RESULT_CLASSES) { - all.union(generateRelationEvents(spark, graphPath, r1, r2)); + all.union(generateRelationEvents(spark, graphPath, r1, r2, dedupConfig)); } } @@ -155,38 +169,37 @@ public class GenerateEventsApplication { private 
static Dataset generateSimpleEvents(final SparkSession spark, final String graphPath, - final Class resultClazz) { + final Class resultClazz, + final DedupConfig dedupConfig) { final Dataset results = readPath(spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), Result.class) .filter(r -> r.getDataInfo().getDeletedbyinference()); - final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) + final Dataset mergedRels = readPath(spark, graphPath + "/relation", Relation.class) .filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); - final TypedColumn, ResultGroup> aggr = new ResultAggregator().toColumn(); - - return results.joinWith(rels, results.col("id").equalTo(rels.col("source")), "inner") + return results.joinWith(mergedRels, results.col("id").equalTo(mergedRels.col("source")), "inner") .groupByKey((MapFunction, String>) t -> t._2.getTarget(), Encoders.STRING()) - .agg(aggr) + .agg(resultAggrTypedColumn) .map((MapFunction, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class)) .filter(ResultGroup::isValid) - .map((MapFunction) g -> GenerateEventsApplication.generateSimpleEvents(g), Encoders.kryo(EventGroup.class)) + .map((MapFunction) g -> GenerateEventsApplication.generateSimpleEvents(g, dedupConfig), Encoders.kryo(EventGroup.class)) .flatMap(group -> group.getData().iterator(), Encoders.kryo(Event.class)); } - private static EventGroup generateSimpleEvents(final ResultGroup results) { + private static EventGroup generateSimpleEvents(final ResultGroup results, final DedupConfig dedupConfig) { final List> list = new ArrayList<>(); for (final Result target : results.getData()) { - list.addAll(enrichMissingAbstract.searchUpdatesForRecord(target, results.getData())); - list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, results.getData())); - list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, results.getData())); - list.addAll(enrichMissingPid.searchUpdatesForRecord(target, results.getData())); - list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, results.getData())); - list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, results.getData())); - list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, results.getData())); - list.addAll(enrichMorePid.searchUpdatesForRecord(target, results.getData())); - list.addAll(enrichMoreSubject.searchUpdatesForRecord(target, results.getData())); + list.addAll(enrichMissingAbstract.searchUpdatesForRecord(target, results.getData(), dedupConfig)); + list.addAll(enrichMissingAuthorOrcid.searchUpdatesForRecord(target, results.getData(), dedupConfig)); + list.addAll(enrichMissingOpenAccess.searchUpdatesForRecord(target, results.getData(), dedupConfig)); + list.addAll(enrichMissingPid.searchUpdatesForRecord(target, results.getData(), dedupConfig)); + list.addAll(enrichMissingPublicationDate.searchUpdatesForRecord(target, results.getData(), dedupConfig)); + list.addAll(enrichMissingSubject.searchUpdatesForRecord(target, results.getData(), dedupConfig)); + list.addAll(enrichMoreOpenAccess.searchUpdatesForRecord(target, results.getData(), dedupConfig)); + list.addAll(enrichMorePid.searchUpdatesForRecord(target, results.getData(), dedupConfig)); + list.addAll(enrichMoreSubject.searchUpdatesForRecord(target, results.getData(), dedupConfig)); } final EventGroup events = new EventGroup(); @@ -197,9 +210,10 @@ public class GenerateEventsApplication { private static Dataset generateRelationEvents(final SparkSession spark, final String graphPath, 
final Class sourceClass, - final Class targetClass) { + final Class targetClass, + final DedupConfig dedupConfig) { - final Dataset sources = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), sourceClass) + final Dataset sources = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), Result.class) .filter(r -> r.getDataInfo().getDeletedbyinference()); final Dataset targets = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), targetClass); @@ -210,6 +224,12 @@ public class GenerateEventsApplication { final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); + final Dataset duplicates = sources.joinWith(mergedRels, sources.col("id").equalTo(rels.col("source")), "inner") + .groupByKey((MapFunction, String>) t -> t._2.getTarget(), Encoders.STRING()) + .agg(resultAggrTypedColumn) + .map((MapFunction, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class)) + .filter(ResultGroup::isValid); + if (targetClass == Project.class) { // TODO join using: generateProjectsEvents } else if (targetClass == Software.class) { @@ -223,29 +243,30 @@ public class GenerateEventsApplication { return null; } - private List generateProjectsEvents(final Collection>> childrenWithProjects) { + private List generateProjectsEvents(final Collection>> childrenWithProjects, final DedupConfig dedupConfig) { final List> list = new ArrayList<>(); for (final Pair> target : childrenWithProjects) { - list.addAll(enrichMissingProject.searchUpdatesForRecord(target, childrenWithProjects)); - list.addAll(enrichMoreProject.searchUpdatesForRecord(target, childrenWithProjects)); + list.addAll(enrichMissingProject.searchUpdatesForRecord(target, childrenWithProjects, dedupConfig)); + list.addAll(enrichMoreProject.searchUpdatesForRecord(target, childrenWithProjects, dedupConfig)); } return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); } - private List generateSoftwareEvents(final Collection>> childrenWithSoftwares) { + private List generateSoftwareEvents(final Collection>> childrenWithSoftwares, final DedupConfig dedupConfig) { final List> list = new ArrayList<>(); for (final Pair> target : childrenWithSoftwares) { - list.addAll(enrichMissingSoftware.searchUpdatesForRecord(target, childrenWithSoftwares)); - list.addAll(enrichMoreSoftware.searchUpdatesForRecord(target, childrenWithSoftwares)); + list.addAll(enrichMissingSoftware.searchUpdatesForRecord(target, childrenWithSoftwares, dedupConfig)); + list.addAll(enrichMoreSoftware.searchUpdatesForRecord(target, childrenWithSoftwares, dedupConfig)); } return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); } private List generatePublicationRelatedEvents(final String relType, - final Collection>>> childrenWithRels) { + final Collection>>> childrenWithRels, + final DedupConfig dedupConfig) { final List> list = new ArrayList<>(); @@ -258,15 +279,15 @@ public class GenerateEventsApplication { for (final Pair> target : cleanedChildrens) { if (relType.equals("isRelatedTo")) { - list.addAll(enrichMisissingPublicationIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMisissingPublicationIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("references")) { - list.addAll(enrichMissingPublicationReferences.searchUpdatesForRecord(target, cleanedChildrens)); + 
list.addAll(enrichMissingPublicationReferences.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isReferencedBy")) { - list.addAll(enrichMissingPublicationIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMissingPublicationIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isSupplementedTo")) { - list.addAll(enrichMissingPublicationIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMissingPublicationIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isSupplementedBy")) { - list.addAll(enrichMissingPublicationIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMissingPublicationIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } } @@ -275,7 +296,8 @@ public class GenerateEventsApplication { } private List generateDatasetRelatedEvents(final String relType, - final Collection>>> childrenWithRels) { + final Collection>>> childrenWithRels, + final DedupConfig dedupConfig) { final List> list = new ArrayList<>(); @@ -288,15 +310,15 @@ public class GenerateEventsApplication { for (final Pair> target : cleanedChildrens) { if (relType.equals("isRelatedTo")) { - list.addAll(enrichMisissingDatasetIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMisissingDatasetIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("references")) { - list.addAll(enrichMissingDatasetReferences.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMissingDatasetReferences.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isReferencedBy")) { - list.addAll(enrichMissingDatasetIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMissingDatasetIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isSupplementedTo")) { - list.addAll(enrichMissingDatasetIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMissingDatasetIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isSupplementedBy")) { - list.addAll(enrichMissingDatasetIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens)); + list.addAll(enrichMissingDatasetIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } } @@ -313,4 +335,20 @@ public class GenerateEventsApplication { .textFile(inputPath) .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); } + + private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception { + final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); + + final String conf = isLookUpService.getResourceProfileByQuery(String + .format("for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", profId)); + + final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class); + dedupConfig.getPace().initModel(); + dedupConfig.getPace().initTranslationMap(); + // dedupConfig.getWf().setConfigurationId("???"); + + return dedupConfig; + + } + } diff --git 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java index 5bfe108a56..286b40ad51 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -12,6 +12,7 @@ import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.pace.config.DedupConfig; public abstract class UpdateMatcher { @@ -21,16 +22,15 @@ public abstract class UpdateMatcher { this.multipleUpdate = multipleUpdate; } - public Collection> searchUpdatesForRecord(final K res, final Collection others) { + public Collection> searchUpdatesForRecord(final K res, final Collection others, final DedupConfig dedupConfig) { final Map> infoMap = new HashMap<>(); for (final K source : others) { if (source != res) { - for (final UpdateInfo info : findUpdates(source, res)) { + for (final UpdateInfo info : findUpdates(source, res, dedupConfig)) { final String s = DigestUtils.md5Hex(info.getHighlightValueAsString()); - if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) { - } else { + if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {} else { infoMap.put(s, info); } } @@ -51,11 +51,7 @@ public abstract class UpdateMatcher { } } - protected abstract List> findUpdates(K source, K target); - - protected abstract UpdateInfo generateUpdateInfo(final T highlightValue, - final K source, - final K target); + protected abstract List> findUpdates(K source, K target, DedupConfig dedupConfig); protected static boolean isMissing(final List> list) { return list == null || list.isEmpty() || StringUtils.isBlank(list.get(0).getValue()); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java index 321fd43184..3cf7b18f9a 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedDatasets/AbstractEnrichMissingDataset.java @@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public abstract class AbstractEnrichMissingDataset extends UpdateMatcher>, eu.dnetlib.broker.objects.Dataset> { @@ -27,7 +28,8 @@ public abstract class AbstractEnrichMissingDataset @Override protected final List> findUpdates( final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { final Set existingDatasets = target .getRight() @@ -40,21 +42,22 @@ public abstract class AbstractEnrichMissingDataset .stream() .filter(d -> !existingDatasets.contains(d.getId())) .map(ConversionUtils::oafDatasetToBrokerDataset) - .map(i -> generateUpdateInfo(i, source, target)) + .map(i -> generateUpdateInfo(i, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override protected final UpdateInfo generateUpdateInfo( final 
eu.dnetlib.broker.objects.Dataset highlightValue, final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( getTopic(), highlightValue, source.getLeft(), target.getLeft(), (p, rel) -> p.getDatasets().add(rel), - rel -> rel.getInstances().get(0).getUrl()); + rel -> rel.getInstances().get(0).getUrl(), + dedupConfig); } public Topic getTopic() { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java index ed5e71d52d..22817a25d9 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMissingProject.java @@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMissingProject extends UpdateMatcher>, eu.dnetlib.broker.objects.Project> { @@ -23,7 +24,8 @@ public class EnrichMissingProject @Override protected List> findUpdates(final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { if (source.getRight().isEmpty()) { return Arrays.asList(); @@ -32,21 +34,21 @@ public class EnrichMissingProject .getRight() .stream() .map(ConversionUtils::oafProjectToBrokerProject) - .map(p -> generateUpdateInfo(p, source, target)) + .map(p -> generateUpdateInfo(p, source, target, dedupConfig)) .collect(Collectors.toList()); } } - @Override public UpdateInfo generateUpdateInfo( final eu.dnetlib.broker.objects.Project highlightValue, final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MISSING_PROJECT, highlightValue, source.getLeft(), target.getLeft(), (p, prj) -> p.getProjects().add(prj), - prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode()); + prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode(), dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java index 753166a824..016bdd2836 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedProjects/EnrichMoreProject.java @@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMoreProject extends UpdateMatcher>, eu.dnetlib.broker.objects.Project> { @@ -22,7 +23,8 @@ public class EnrichMoreProject extends UpdateMatcher> @Override protected List> findUpdates(final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { final Set existingProjects = source .getRight() @@ -35,20 +37,20 @@ public class EnrichMoreProject extends 
UpdateMatcher> .stream() .filter(p -> !existingProjects.contains(p.getId())) .map(ConversionUtils::oafProjectToBrokerProject) - .map(p -> generateUpdateInfo(p, source, target)) + .map(p -> generateUpdateInfo(p, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override public UpdateInfo generateUpdateInfo( final eu.dnetlib.broker.objects.Project highlightValue, final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MORE_PROJECT, highlightValue, source.getLeft(), target.getLeft(), (p, prj) -> p.getProjects().add(prj), - prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode()); + prj -> prj.getFunder() + "::" + prj.getFundingProgram() + prj.getCode(), dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java index 074b6043a4..ec575e68d7 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedPublications/AbstractEnrichMissingPublication.java @@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public abstract class AbstractEnrichMissingPublication extends UpdateMatcher>, eu.dnetlib.broker.objects.Publication> { @@ -27,7 +28,8 @@ public abstract class AbstractEnrichMissingPublication @Override protected final List> findUpdates( final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { final Set existingPublications = target .getRight() @@ -40,21 +42,21 @@ public abstract class AbstractEnrichMissingPublication .stream() .filter(d -> !existingPublications.contains(d.getId())) .map(ConversionUtils::oafResultToBrokerPublication) - .map(i -> generateUpdateInfo(i, source, target)) + .map(i -> generateUpdateInfo(i, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override protected final UpdateInfo generateUpdateInfo( final eu.dnetlib.broker.objects.Publication highlightValue, final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( getTopic(), highlightValue, source.getLeft(), target.getLeft(), (p, rel) -> p.getPublications().add(rel), - rel -> rel.getInstances().get(0).getUrl()); + rel -> rel.getInstances().get(0).getUrl(), dedupConfig); } public Topic getTopic() { diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java index fd9091dc1a..699d546ec5 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMissingSoftware.java @@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; 
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMissingSoftware extends UpdateMatcher>, eu.dnetlib.broker.objects.Software> { @@ -24,7 +25,8 @@ public class EnrichMissingSoftware @Override protected List> findUpdates( final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { if (source.getRight().isEmpty()) { return Arrays.asList(); @@ -33,21 +35,21 @@ public class EnrichMissingSoftware .getRight() .stream() .map(ConversionUtils::oafSoftwareToBrokerSoftware) - .map(p -> generateUpdateInfo(p, source, target)) + .map(p -> generateUpdateInfo(p, source, target, dedupConfig)) .collect(Collectors.toList()); } } - @Override public UpdateInfo generateUpdateInfo( final eu.dnetlib.broker.objects.Software highlightValue, final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MISSING_SOFTWARE, highlightValue, source.getLeft(), target.getLeft(), (p, s) -> p.getSoftwares().add(s), - s -> s.getName()); + s -> s.getName(), dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java index cb649dffff..45631df20a 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/relatedSoftware/EnrichMoreSoftware.java @@ -13,6 +13,7 @@ import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Software; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMoreSoftware extends UpdateMatcher>, eu.dnetlib.broker.objects.Software> { @@ -24,7 +25,8 @@ public class EnrichMoreSoftware @Override protected List> findUpdates( final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { final Set existingSoftwares = source .getRight() @@ -37,20 +39,20 @@ public class EnrichMoreSoftware .stream() .filter(p -> !existingSoftwares.contains(p.getId())) .map(ConversionUtils::oafSoftwareToBrokerSoftware) - .map(p -> generateUpdateInfo(p, source, target)) + .map(p -> generateUpdateInfo(p, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override public UpdateInfo generateUpdateInfo( final eu.dnetlib.broker.objects.Software highlightValue, final Pair> source, - final Pair> target) { + final Pair> target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MORE_SOFTWARE, highlightValue, source.getLeft(), target.getLeft(), (p, s) -> p.getSoftwares().add(s), - s -> s.getName()); + s -> s.getName(), dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java index a418b633ed..c3b6bda668 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java @@ -9,6 +9,7 @@ import 
eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMissingAbstract extends UpdateMatcher { @@ -17,22 +18,22 @@ public class EnrichMissingAbstract extends UpdateMatcher { } @Override - protected List> findUpdates(final Result source, final Result target) { + protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { if (isMissing(target.getDescription()) && !isMissing(source.getDescription())) { - return Arrays.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target)); + return Arrays.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target, dedupConfig)); } return new ArrayList<>(); } - @Override public UpdateInfo generateUpdateInfo(final String highlightValue, final Result source, - final Result target) { + final Result target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MISSING_ABSTRACT, highlightValue, source, target, (p, s) -> p.getAbstracts().add(s), - s -> s); + s -> s, dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java index b5c2f7e723..89292d3da2 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java @@ -10,6 +10,7 @@ import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMissingAuthorOrcid extends UpdateMatcher> { @@ -18,19 +19,21 @@ public class EnrichMissingAuthorOrcid extends UpdateMatcher>> findUpdates(final Result source, final Result target) { + protected List>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { + // TODO // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); return Arrays.asList(); } - @Override public UpdateInfo> generateUpdateInfo(final Pair highlightValue, final Result source, - final Result target) { + final Result target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MISSING_AUTHOR_ORCID, highlightValue, source, target, (p, pair) -> p.getCreators().add(pair.getLeft() + " - ORCID: " + pair.getRight()), - pair -> pair.getLeft() + "::" + pair.getRight()); + pair -> pair.getLeft() + "::" + pair.getRight(), + dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java index 30638cefbb..7f5a595cc8 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java @@ -12,6 +12,7 @@ import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; 
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMissingOpenAccess extends UpdateMatcher { @@ -20,7 +21,7 @@ public class EnrichMissingOpenAccess extends UpdateMatcher { } @Override - protected List> findUpdates(final Result source, final Result target) { + protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { final long count = target .getInstance() .stream() @@ -28,9 +29,7 @@ public class EnrichMissingOpenAccess extends UpdateMatcher { .filter(right -> right.equals(BrokerConstants.OPEN_ACCESS)) .count(); - if (count > 0) { - return Arrays.asList(); - } + if (count > 0) { return Arrays.asList(); } return source .getInstance() @@ -38,19 +37,19 @@ public class EnrichMissingOpenAccess extends UpdateMatcher { .filter(i -> i.getAccessright().getClassid().equals(BrokerConstants.OPEN_ACCESS)) .map(ConversionUtils::oafInstanceToBrokerInstances) .flatMap(List::stream) - .map(i -> generateUpdateInfo(i, source, target)) + .map(i -> generateUpdateInfo(i, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override public UpdateInfo generateUpdateInfo(final Instance highlightValue, final Result source, - final Result target) { + final Result target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MISSING_OA_VERSION, highlightValue, source, target, (p, i) -> p.getInstances().add(i), - Instance::getUrl); + Instance::getUrl, dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java index 522d46d402..6e106e6695 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java @@ -11,6 +11,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMissingPid extends UpdateMatcher { @@ -19,28 +20,25 @@ public class EnrichMissingPid extends UpdateMatcher { } @Override - protected List> findUpdates(final Result source, final Result target) { + protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { final long count = target.getPid().size(); - if (count > 0) { - return Arrays.asList(); - } + if (count > 0) { return Arrays.asList(); } return source .getPid() .stream() .map(ConversionUtils::oafPidToBrokerPid) - .map(i -> generateUpdateInfo(i, source, target)) + .map(i -> generateUpdateInfo(i, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override - public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) { + public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target, final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MISSING_PID, highlightValue, source, target, (p, pid) -> p.getPids().add(pid), - pid -> pid.getType() + "::" + pid.getValue()); + pid -> pid.getType() + "::" + pid.getValue(), dedupConfig); } } diff --git 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java index 197ace97c8..d2b28d65d0 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java @@ -9,6 +9,7 @@ import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMissingPublicationDate extends UpdateMatcher { @@ -17,22 +18,22 @@ public class EnrichMissingPublicationDate extends UpdateMatcher } @Override - protected List> findUpdates(final Result source, final Result target) { + protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { if (isMissing(target.getDateofacceptance()) && !isMissing(source.getDateofacceptance())) { - return Arrays.asList(generateUpdateInfo(source.getDateofacceptance().getValue(), source, target)); + return Arrays.asList(generateUpdateInfo(source.getDateofacceptance().getValue(), source, target, dedupConfig)); } return new ArrayList<>(); } - @Override public UpdateInfo generateUpdateInfo(final String highlightValue, final Result source, - final Result target) { + final Result target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MISSING_PUBLICATION_DATE, highlightValue, source, target, (p, date) -> p.setPublicationdate(date), - s -> s); + s -> s, dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java index 290bad48b3..de888ff871 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java @@ -14,6 +14,7 @@ import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMissingSubject extends UpdateMatcher> { @@ -22,7 +23,7 @@ public class EnrichMissingSubject extends UpdateMatcher>> findUpdates(final Result source, final Result target) { + protected List>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { final Set existingTypes = target .getSubject() .stream() @@ -35,20 +36,20 @@ public class EnrichMissingSubject extends UpdateMatcher !existingTypes.contains(pid.getQualifier().getClassid())) .map(ConversionUtils::oafSubjectToPair) - .map(i -> generateUpdateInfo(i, source, target)) + .map(i -> generateUpdateInfo(i, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override public UpdateInfo> generateUpdateInfo(final Pair highlightValue, final Result source, - final Result target) { + final Result target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.fromPath("ENRICH/MISSING/SUBJECT/" + highlightValue.getLeft()), highlightValue, 
source, target, (p, pair) -> p.getSubjects().add(pair.getRight()), - pair -> pair.getLeft() + "::" + pair.getRight()); + pair -> pair.getLeft() + "::" + pair.getRight(), dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java index f9c5f81da1..021449797d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java @@ -12,6 +12,7 @@ import eu.dnetlib.dhp.broker.oa.util.BrokerConstants; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMoreOpenAccess extends UpdateMatcher { @@ -20,7 +21,7 @@ public class EnrichMoreOpenAccess extends UpdateMatcher { } @Override - protected List> findUpdates(final Result source, final Result target) { + protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { final Set urls = target .getInstance() .stream() @@ -36,19 +37,19 @@ public class EnrichMoreOpenAccess extends UpdateMatcher { .map(ConversionUtils::oafInstanceToBrokerInstances) .flatMap(List::stream) .filter(i -> !urls.contains(i.getUrl())) - .map(i -> generateUpdateInfo(i, source, target)) + .map(i -> generateUpdateInfo(i, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override public UpdateInfo generateUpdateInfo(final Instance highlightValue, final Result source, - final Result target) { + final Result target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MORE_OA_VERSION, highlightValue, source, target, (p, i) -> p.getInstances().add(i), - Instance::getUrl); + Instance::getUrl, dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java index 2ee327c831..c64ed20eac 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java @@ -11,6 +11,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMorePid extends UpdateMatcher { @@ -19,7 +20,7 @@ public class EnrichMorePid extends UpdateMatcher { } @Override - protected List> findUpdates(final Result source, final Result target) { + protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { final Set existingPids = target .getPid() .stream() @@ -31,17 +32,16 @@ public class EnrichMorePid extends UpdateMatcher { .stream() .filter(pid -> !existingPids.contains(pid.getQualifier().getClassid() + "::" + pid.getValue())) .map(ConversionUtils::oafPidToBrokerPid) - .map(i -> generateUpdateInfo(i, source, target)) + .map(i -> generateUpdateInfo(i, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override - public UpdateInfo 
generateUpdateInfo(final Pid highlightValue, final Result source, final Result target) { + public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target, final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MORE_PID, highlightValue, source, target, (p, pid) -> p.getPids().add(pid), - pid -> pid.getType() + "::" + pid.getValue()); + pid -> pid.getType() + "::" + pid.getValue(), dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java index b38445e885..3f7f5b3d56 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java @@ -12,6 +12,7 @@ import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher; import eu.dnetlib.dhp.broker.oa.util.ConversionUtils; import eu.dnetlib.dhp.broker.oa.util.UpdateInfo; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; public class EnrichMoreSubject extends UpdateMatcher> { @@ -20,7 +21,7 @@ public class EnrichMoreSubject extends UpdateMatcher>> findUpdates(final Result source, final Result target) { + protected List>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { final Set existingSubjects = target .getSubject() .stream() @@ -32,20 +33,20 @@ public class EnrichMoreSubject extends UpdateMatcher !existingSubjects.contains(pid.getQualifier().getClassid() + "::" + pid.getValue())) .map(ConversionUtils::oafSubjectToPair) - .map(i -> generateUpdateInfo(i, source, target)) + .map(i -> generateUpdateInfo(i, source, target, dedupConfig)) .collect(Collectors.toList()); } - @Override public UpdateInfo> generateUpdateInfo(final Pair highlightValue, final Result source, - final Result target) { + final Result target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.fromPath("ENRICH/MORE/SUBJECT/" + highlightValue.getLeft()), highlightValue, source, target, (p, pair) -> p.getSubjects().add(pair.getRight()), - pair -> pair.getLeft() + "::" + pair.getRight()); + pair -> pair.getLeft() + "::" + pair.getRight(), dedupConfig); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java index f4da59518d..0665c69ddd 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/BrokerConstants.java @@ -15,6 +15,9 @@ public class BrokerConstants { public static final String OPEN_ACCESS = "OPEN"; public static final String IS_MERGED_IN_CLASS = "isMergedIn"; + public static final float MIN_TRUST = 0.25f; + public static final float MAX_TRUST = 1.00f; + public static final List> RESULT_CLASSES = Arrays .asList(Publication.class, Dataset.class, Software.class, OtherResearchProduct.class); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java new file mode 100644 index 0000000000..6bf59c1252 --- /dev/null +++ 
b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java @@ -0,0 +1,15 @@ +package eu.dnetlib.dhp.broker.oa.util; + +public class TrustUtils { + + public static float rescale(final double score, final double threshold) { + if (score >= BrokerConstants.MAX_TRUST) { return BrokerConstants.MAX_TRUST; } + + final double val = (score - threshold) * (BrokerConstants.MAX_TRUST - BrokerConstants.MIN_TRUST) / (BrokerConstants.MAX_TRUST - threshold); + + if (val < BrokerConstants.MIN_TRUST) { return BrokerConstants.MIN_TRUST; } + if (val > BrokerConstants.MAX_TRUST) { return BrokerConstants.MAX_TRUST; } + + return (float) val; + } +} diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java index 04b426a1c2..de6a713974 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java @@ -5,6 +5,11 @@ import java.util.List; import java.util.function.BiConsumer; import java.util.function.Function; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + import eu.dnetlib.broker.objects.OpenAireEventPayload; import eu.dnetlib.broker.objects.Provenance; import eu.dnetlib.broker.objects.Publication; @@ -12,6 +17,10 @@ import eu.dnetlib.dhp.broker.model.Topic; import eu.dnetlib.dhp.schema.oaf.Instance; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.tree.support.TreeProcessor; +import eu.dnetlib.pace.util.MapDocumentUtil; public final class UpdateInfo { @@ -29,16 +38,19 @@ public final class UpdateInfo { private final float trust; + private static final Logger log = LoggerFactory.getLogger(UpdateInfo.class); + public UpdateInfo(final Topic topic, final T highlightValue, final Result source, final Result target, final BiConsumer compileHighlight, - final Function highlightToString) { + final Function highlightToString, + final DedupConfig dedupConfig) { this.topic = topic; this.highlightValue = highlightValue; this.source = source; this.target = target; this.compileHighlight = compileHighlight; this.highlightToString = highlightToString; - this.trust = calculateTrust(source, target); + this.trust = calculateTrust(dedupConfig, source, target); } public T getHighlightValue() { @@ -53,9 +65,20 @@ public final class UpdateInfo { return target; } - private float calculateTrust(final Result source, final Result target) { - // TODO - return 0.9f; + private float calculateTrust(final DedupConfig dedupConfig, final Result r1, final Result r2) { + try { + final ObjectMapper objectMapper = new ObjectMapper(); + final MapDocument doc1 = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1)); + final MapDocument doc2 = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2)); + + final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2); + final double threshold = dedupConfig.getWf().getThreshold(); + + return TrustUtils.rescale(score, threshold); + } catch (final Exception e) { + log.error("Error computing score between results", e); + return BrokerConstants.MIN_TRUST; + } } protected Topic getTopic() { @@ -95,8 +118,7 @@ 
public final class UpdateInfo { .map(Instance::getUrl) .flatMap(List::stream) .findFirst() - .orElse(null); - ; + .orElse(null);; final Provenance provenance = new Provenance().setId(provId).setRepositoryName(provRepo).setUrl(provUrl); diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java new file mode 100644 index 0000000000..58f391c24b --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java @@ -0,0 +1,72 @@ +package eu.dnetlib.dhp.broker.oa.util; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +import org.junit.jupiter.api.Test; + +public class TrustUtilsTest { + + private static final double THRESHOLD = 0.95; + + @Test + public void rescaleTest_1() { + verifyValue(-0.3, BrokerConstants.MIN_TRUST); + } + + @Test + public void rescaleTest_2() { + verifyValue(0.0, BrokerConstants.MIN_TRUST); + } + + @Test + public void rescaleTest_3() { + verifyValue(0.5, BrokerConstants.MIN_TRUST); + } + + @Test + public void rescaleTest_4() { + verifyValue(0.95, BrokerConstants.MIN_TRUST); + } + + @Test + public void rescaleTest_5() { + verifyValue(0.96, BrokerConstants.MIN_TRUST); + } + + @Test + public void rescaleTest_6() { + verifyValue(0.97, 0.3f); + } + + @Test + public void rescaleTest_7() { + verifyValue(0.98, 0.45f); + } + + @Test + public void rescaleTest_8() { + verifyValue(0.99, 0.6f); + } + + @Test + public void rescaleTest_9() { + verifyValue(1.00, BrokerConstants.MAX_TRUST); + } + + @Test + public void rescaleTest_10() { + verifyValue(1.01, BrokerConstants.MAX_TRUST); + } + + @Test + public void rescaleTest_11() { + verifyValue(2.00, BrokerConstants.MAX_TRUST); + } + + private void verifyValue(final double originalScore, final float expectedTrust) { + final float trust = TrustUtils.rescale(originalScore, THRESHOLD); + System.out.println(trust); + assertTrue(Math.abs(trust - expectedTrust) < 0.01); + } + +} From fc4d220964cac446ab982d0978665273fd1aa6ce Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 9 Jun 2020 17:05:31 +0200 Subject: [PATCH 10/17] updated function name for SNSF --- .../main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 85997fa364..c1089ec282 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -243,7 +243,7 @@ case object Crossref2Oaf { val queue = new mutable.Queue[Relation] - def snfRule(award:String): String = { + def snsfRule(award:String): String = { var tmp1 = StringUtils.substringAfter(award,"_") val tmp2 = StringUtils.substringBefore(tmp1,"/") logger.debug(s"From $award to $tmp2") @@ -318,7 +318,7 @@ case object Crossref2Oaf { case "10.13039/501100006588" | "10.13039/501100004488" => generateSimpleRelationFromAward(funder, "irb_hr______", a=>a.replaceAll("Project No.", "").replaceAll("HRZZ-","") ) case "10.13039/501100006769"=> generateSimpleRelationFromAward(funder, "rsf_________", a=>a) - case "10.13039/501100001711"=> generateSimpleRelationFromAward(funder, "snsf________", snfRule) + case "10.13039/501100001711"=> 
generateSimpleRelationFromAward(funder, "snsf________", snsfRule) case "10.13039/501100004410"=> generateSimpleRelationFromAward(funder, "tubitakf____", a =>a) case "10.10.13039/100004440"=> generateSimpleRelationFromAward(funder, "wt__________", a =>a) case "10.13039/100004440"=> queue += generateRelation(sourceId,"1e5e62235d094afd01cd56e65112fc63", "wt__________" ) From f3b033cf09abc5045ed404a2c00f91ad4320a709 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 9 Jun 2020 17:08:26 +0200 Subject: [PATCH 11/17] added csv line for funders from Crossref --- .../eu.dnetlib.dhp.doiboost.mappings/crossref_mapping.csv | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/crossref_mapping.csv b/dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/crossref_mapping.csv index d87ae5da6a..6a5fc3f871 100644 --- a/dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/crossref_mapping.csv +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/crossref_mapping.csv @@ -58,3 +58,5 @@ reference,Array of Reference,No,List of references made by the work,,Future impr content-domain,Content Domain,No,Information on domains that support Crossmark for this work,N/A, relation,Relations,No,Relations to other works,Result/Instance/refereed,"if(relation.has-review) instance.refereed = ""peerReviewed"". " review,Review,No,Peer review metadata,N/A, +funder,Array of Funder,No,,Relation between Result and Project,"The mapping between Crossref funder elements and OpenAIRE projects is implemented in eu.dnetlib.doiboost.crossref.Crossref2Oaf.mappingFunderToRelations. +The matching is based on Funder DOIs, Funder names, and grant codes. Different mapping approaches are applied to cover the different cases in Crossref records." From a3a6755d582ff86a13e83c7145f16b0130e0e7f6 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 9 Jun 2020 17:45:44 +0200 Subject: [PATCH 12/17] mapping csv for Unpaywall --- .../unpaywall_mapping.csv | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/unpaywall_mapping.csv diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/unpaywall_mapping.csv b/dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/unpaywall_mapping.csv new file mode 100644 index 0000000000..0c891b35a6 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/unpaywall_mapping.csv @@ -0,0 +1,11 @@ +Unpaywall field,Description,OAF Mapping,Comment +doi,The DOI of this resource.,Result/pid, +is_oa,Is there an OA copy of this resource.,N/A,Add a Result/Instance only if is_oa is true +best_oa_location,The best OA Location Object we could find for this DOI.,,Create one Result/Instance with Open Access right +best_oa_location.url,The url_for_pdf if there is one; otherwise landing page URL.,Result/instance/url, +best_oa_location.license,The license under which this copy is published.,Result/instance/license, +oa_status,"The OA status, or color, of this resource. Classifies OA resources by location and license terms as one of: gold, hybrid, bronze, green or closed.",N/A,New field to be introduced in the OpenAIRE data model +published_date,"The date this resource was published. 
As reported by the publishers, who unfortunately have inconsistent definitions of what counts as officially ""published.""",N/A,"Not used in the mapping. Could be used to improve the coverage of dates, if needed." +title,The title of this resource.,N/A, +oa_locations,List of all the OA Location objects associated with this resource.,N/A,"Could be useful if we plan to ""patch"" existing instances. If the instance URL is a URL of an oa_location, then it's Open. +" \ No newline at end of file From 2d3f7d1eb4be258e029b737fbdc24fb606ca36ca Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 9 Jun 2020 18:07:14 +0200 Subject: [PATCH 13/17] fixed log classes to make the ORCID test run --- .../java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala | 12 +++--------- .../doiboost/orcid/MappingORCIDToOAFTest.scala | 8 +++----- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala index 7b344f4a5c..f230c604f7 100644 --- a/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/java/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala @@ -1,16 +1,10 @@ package eu.dnetlib.doiboost.orcid -import java.io.IOException - import eu.dnetlib.dhp.schema.oaf.{Author, Publication} import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil.{ORCID, PID_TYPES, createSP, generateDataInfo, generateIdentifier} -import eu.dnetlib.doiboost.crossref.Crossref2Oaf import org.apache.commons.lang.StringUtils import org.codehaus.jackson.map.ObjectMapper -import org.json4s -import org.json4s.DefaultFormats -import org.json4s.jackson.JsonMethods.parse import org.slf4j.{Logger, LoggerFactory} import scala.collection.JavaConverters._ @@ -22,7 +16,7 @@ case class ORCIDItem(oid:String,name:String,surname:String,creditName:String,err case class ORCIDElement(doi:String, authors:List[ORCIDItem]) {} object ORCIDToOAF { - val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass) + val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass) val mapper = new ObjectMapper def isJsonValid(inputStr: String): Boolean = { @@ -57,7 +51,7 @@ object ORCIDToOAF { pub.setId(generateIdentifier(pub, doi.toLowerCase)) try{ pub.setAuthor(input.authors.map(a=> { - generateAuhtor(a.name, a.surname, a.creditName, a.oid) + generateAuthor(a.name, a.surname, a.creditName, a.oid) }).asJava) pub.setCollectedfrom(List(DoiBoostMappingUtil.createORIDCollectedFrom()).asJava) pub.setDataInfo(DoiBoostMappingUtil.generateDataInfo()) @@ -69,7 +63,7 @@ object ORCIDToOAF { } } - def generateAuhtor(given: String, family: String, fullName:String, orcid: String): Author = { + def generateAuthor(given: String, family: String, fullName:String, orcid: String): Author = { val a = new Author a.setName(given) a.setSurname(family) diff --git a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala index 158746ab52..5b82409421 100644 --- a/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/java/eu/dnetlib/doiboost/orcid/MappingORCIDToOAFTest.scala @@ -1,16 +1,14 @@ package eu.dnetlib.doiboost.orcid -import eu.dnetlib.dhp.schema.oaf.Publication -import 
eu.dnetlib.doiboost.crossref.Crossref2Oaf -import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig} +import org.codehaus.jackson.map.ObjectMapper +import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Test import org.slf4j.{Logger, LoggerFactory} -import org.junit.jupiter.api.Assertions._ import scala.io.Source class MappingORCIDToOAFTest { - val logger: Logger = LoggerFactory.getLogger(Crossref2Oaf.getClass) + val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass) val mapper = new ObjectMapper() @Test From 4551c1082f29311a486ef5c192e3bfdc5f9fa3c1 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 9 Jun 2020 18:08:47 +0200 Subject: [PATCH 14/17] mapping csv for orcid --- .../eu.dnetlib.dhp.doiboost.mappings/orcid_mapping.csv | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/orcid_mapping.csv diff --git a/dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/orcid_mapping.csv b/dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/orcid_mapping.csv new file mode 100644 index 0000000000..71020d5c34 --- /dev/null +++ b/dhp-workflows/dhp-doiboost/src/main/resources/eu.dnetlib.dhp.doiboost.mappings/orcid_mapping.csv @@ -0,0 +1,6 @@ +ORCID field,OAF mapping,Comment +doi,Result/pid, +oid,Result/Author/pid, +name,Result/Author/name, +surname,Result/Author/surname, +creditName,N/A,Result/Author/fullname is generated by concatenating name and surname \ No newline at end of file From ce12f236bbe88e6c2dab9a1aee2adc905f95750a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 9 Jun 2020 20:07:36 +0200 Subject: [PATCH 15/17] disabled test, need to update the joined_entity.json file --- .../java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java index 9a115bfa6f..992ab26e81 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java @@ -11,6 +11,7 @@ import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.io.SAXReader; import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import com.fasterxml.jackson.databind.ObjectMapper; @@ -19,6 +20,8 @@ import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; +//TODO to enable it we need to update the joined_entity.json test file +@Disabled public class XmlRecordFactoryTest { private static final String otherDsTypeId = "scholarcomminfra,infospace,pubsrepository::mock,entityregistry,entityregistry::projects,entityregistry::repositories,websource"; From 5869cb76b31bde5f764202fcdcf74f879ce7224c Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 10 Jun 2020 12:11:16 +0200 Subject: [PATCH 16/17] reformatting --- .../broker/oa/GenerateEventsApplication.java | 122 ++++++++++++------ .../dhp/broker/oa/matchers/UpdateMatcher.java | 6 +- .../simple/EnrichMissingAbstract.java | 6 +- .../simple/EnrichMissingAuthorOrcid.java | 3 +- 
.../simple/EnrichMissingOpenAccess.java | 7 +- .../oa/matchers/simple/EnrichMissingPid.java | 10 +- .../simple/EnrichMissingPublicationDate.java | 6 +- .../matchers/simple/EnrichMissingSubject.java | 3 +- .../matchers/simple/EnrichMoreOpenAccess.java | 3 +- .../oa/matchers/simple/EnrichMorePid.java | 6 +- .../oa/matchers/simple/EnrichMoreSubject.java | 3 +- .../dhp/broker/oa/util/EventGroup.java | 1 + .../dhp/broker/oa/util/ResultAggregator.java | 1 + .../dhp/broker/oa/util/ResultGroup.java | 1 + .../dhp/broker/oa/util/TrustUtils.java | 16 ++- .../dhp/broker/oa/util/UpdateInfo.java | 9 +- .../dhp/broker/oa/util/TrustUtilsTest.java | 1 + 17 files changed, 138 insertions(+), 66 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java index 44bc5cb6ee..ecf4e3eff4 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/GenerateEventsApplication.java @@ -91,35 +91,29 @@ public class GenerateEventsApplication { private static final UpdateMatcher>, ?> enrichMoreSoftware = new EnrichMoreSoftware(); private static final UpdateMatcher>, ?> enrichMisissingPublicationIsRelatedTo = new EnrichMissingPublicationIsRelatedTo(); - private static final UpdateMatcher>, ?> enrichMissingPublicationIsReferencedBy = - new EnrichMissingPublicationIsReferencedBy(); + private static final UpdateMatcher>, ?> enrichMissingPublicationIsReferencedBy = new EnrichMissingPublicationIsReferencedBy(); private static final UpdateMatcher>, ?> enrichMissingPublicationReferences = new EnrichMissingPublicationReferences(); - private static final UpdateMatcher>, ?> enrichMissingPublicationIsSupplementedTo = - new EnrichMissingPublicationIsSupplementedTo(); - private static final UpdateMatcher>, ?> enrichMissingPublicationIsSupplementedBy = - new EnrichMissingPublicationIsSupplementedBy(); + private static final UpdateMatcher>, ?> enrichMissingPublicationIsSupplementedTo = new EnrichMissingPublicationIsSupplementedTo(); + private static final UpdateMatcher>, ?> enrichMissingPublicationIsSupplementedBy = new EnrichMissingPublicationIsSupplementedBy(); - private static final UpdateMatcher>, ?> enrichMisissingDatasetIsRelatedTo = - new EnrichMissingDatasetIsRelatedTo(); - private static final UpdateMatcher>, ?> enrichMissingDatasetIsReferencedBy = - new EnrichMissingDatasetIsReferencedBy(); - private static final UpdateMatcher>, ?> enrichMissingDatasetReferences = - new EnrichMissingDatasetReferences(); - private static final UpdateMatcher>, ?> enrichMissingDatasetIsSupplementedTo = - new EnrichMissingDatasetIsSupplementedTo(); - private static final UpdateMatcher>, ?> enrichMissingDatasetIsSupplementedBy = - new EnrichMissingDatasetIsSupplementedBy(); + private static final UpdateMatcher>, ?> enrichMisissingDatasetIsRelatedTo = new EnrichMissingDatasetIsRelatedTo(); + private static final UpdateMatcher>, ?> enrichMissingDatasetIsReferencedBy = new EnrichMissingDatasetIsReferencedBy(); + private static final UpdateMatcher>, ?> enrichMissingDatasetReferences = new EnrichMissingDatasetReferences(); + private static final UpdateMatcher>, ?> enrichMissingDatasetIsSupplementedTo = new EnrichMissingDatasetIsSupplementedTo(); + private static final UpdateMatcher>, ?> enrichMissingDatasetIsSupplementedBy = new 
EnrichMissingDatasetIsSupplementedBy(); // Aggregators - private static final TypedColumn, ResultGroup> resultAggrTypedColumn = new ResultAggregator().toColumn(); + private static final TypedColumn, ResultGroup> resultAggrTypedColumn = new ResultAggregator() + .toColumn(); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static void main(final String[] args) throws Exception { final ArgumentApplicationParser parser = new ArgumentApplicationParser( IOUtils - .toString(GenerateEventsApplication.class - .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json"))); + .toString( + GenerateEventsApplication.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json"))); parser.parseArgument(args); final Boolean isSparkSessionManaged = Optional @@ -172,18 +166,23 @@ public class GenerateEventsApplication { final Class resultClazz, final DedupConfig dedupConfig) { - final Dataset results = readPath(spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), Result.class) - .filter(r -> r.getDataInfo().getDeletedbyinference()); + final Dataset results = readPath( + spark, graphPath + "/" + resultClazz.getSimpleName().toLowerCase(), Result.class) + .filter(r -> r.getDataInfo().getDeletedbyinference()); final Dataset mergedRels = readPath(spark, graphPath + "/relation", Relation.class) .filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); - return results.joinWith(mergedRels, results.col("id").equalTo(mergedRels.col("source")), "inner") + return results + .joinWith(mergedRels, results.col("id").equalTo(mergedRels.col("source")), "inner") .groupByKey((MapFunction, String>) t -> t._2.getTarget(), Encoders.STRING()) .agg(resultAggrTypedColumn) .map((MapFunction, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class)) .filter(ResultGroup::isValid) - .map((MapFunction) g -> GenerateEventsApplication.generateSimpleEvents(g, dedupConfig), Encoders.kryo(EventGroup.class)) + .map( + (MapFunction) g -> GenerateEventsApplication + .generateSimpleEvents(g, dedupConfig), + Encoders.kryo(EventGroup.class)) .flatMap(group -> group.getData().iterator(), Encoders.kryo(Event.class)); } @@ -207,16 +206,19 @@ public class GenerateEventsApplication { return events; } - private static Dataset generateRelationEvents(final SparkSession spark, + private static Dataset generateRelationEvents( + final SparkSession spark, final String graphPath, final Class sourceClass, final Class targetClass, final DedupConfig dedupConfig) { - final Dataset sources = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), Result.class) - .filter(r -> r.getDataInfo().getDeletedbyinference()); + final Dataset sources = readPath( + spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), Result.class) + .filter(r -> r.getDataInfo().getDeletedbyinference()); - final Dataset targets = readPath(spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), targetClass); + final Dataset targets = readPath( + spark, graphPath + "/" + sourceClass.getSimpleName().toLowerCase(), targetClass); final Dataset mergedRels = readPath(spark, graphPath + "/relation", Relation.class) .filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); @@ -224,7 +226,8 @@ public class GenerateEventsApplication { final Dataset rels = readPath(spark, graphPath + "/relation", Relation.class) .filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)); - final Dataset duplicates = sources.joinWith(mergedRels, 
sources.col("id").equalTo(rels.col("source")), "inner") + final Dataset duplicates = sources + .joinWith(mergedRels, sources.col("id").equalTo(rels.col("source")), "inner") .groupByKey((MapFunction, String>) t -> t._2.getTarget(), Encoders.STRING()) .agg(resultAggrTypedColumn) .map((MapFunction, ResultGroup>) t -> t._2, Encoders.kryo(ResultGroup.class)) @@ -243,7 +246,8 @@ public class GenerateEventsApplication { return null; } - private List generateProjectsEvents(final Collection>> childrenWithProjects, final DedupConfig dedupConfig) { + private List generateProjectsEvents(final Collection>> childrenWithProjects, + final DedupConfig dedupConfig) { final List> list = new ArrayList<>(); for (final Pair> target : childrenWithProjects) { @@ -254,7 +258,8 @@ public class GenerateEventsApplication { return list.stream().map(EventFactory::newBrokerEvent).collect(Collectors.toList()); } - private List generateSoftwareEvents(final Collection>> childrenWithSoftwares, final DedupConfig dedupConfig) { + private List generateSoftwareEvents(final Collection>> childrenWithSoftwares, + final DedupConfig dedupConfig) { final List> list = new ArrayList<>(); for (final Pair> target : childrenWithSoftwares) { @@ -279,15 +284,30 @@ public class GenerateEventsApplication { for (final Pair> target : cleanedChildrens) { if (relType.equals("isRelatedTo")) { - list.addAll(enrichMisissingPublicationIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); + list + .addAll( + enrichMisissingPublicationIsRelatedTo + .searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("references")) { - list.addAll(enrichMissingPublicationReferences.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); + list + .addAll( + enrichMissingPublicationReferences + .searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isReferencedBy")) { - list.addAll(enrichMissingPublicationIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); + list + .addAll( + enrichMissingPublicationIsReferencedBy + .searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isSupplementedTo")) { - list.addAll(enrichMissingPublicationIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); + list + .addAll( + enrichMissingPublicationIsSupplementedTo + .searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isSupplementedBy")) { - list.addAll(enrichMissingPublicationIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); + list + .addAll( + enrichMissingPublicationIsSupplementedBy + .searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } } @@ -310,15 +330,29 @@ public class GenerateEventsApplication { for (final Pair> target : cleanedChildrens) { if (relType.equals("isRelatedTo")) { - list.addAll(enrichMisissingDatasetIsRelatedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); + list + .addAll( + enrichMisissingDatasetIsRelatedTo + .searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("references")) { - list.addAll(enrichMissingDatasetReferences.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); + list + .addAll( + enrichMissingDatasetReferences.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isReferencedBy")) { - list.addAll(enrichMissingDatasetIsReferencedBy.searchUpdatesForRecord(target, cleanedChildrens, 
dedupConfig)); + list + .addAll( + enrichMissingDatasetIsReferencedBy + .searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isSupplementedTo")) { - list.addAll(enrichMissingDatasetIsSupplementedTo.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); + list + .addAll( + enrichMissingDatasetIsSupplementedTo + .searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } else if (relType.equals("isSupplementedBy")) { - list.addAll(enrichMissingDatasetIsSupplementedBy.searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); + list + .addAll( + enrichMissingDatasetIsSupplementedBy + .searchUpdatesForRecord(target, cleanedChildrens, dedupConfig)); } } @@ -339,8 +373,12 @@ public class GenerateEventsApplication { private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception { final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl); - final String conf = isLookUpService.getResourceProfileByQuery(String - .format("for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", profId)); + final String conf = isLookUpService + .getResourceProfileByQuery( + String + .format( + "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", + profId)); final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class); dedupConfig.getPace().initModel(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java index 286b40ad51..95d43ae686 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/UpdateMatcher.java @@ -22,7 +22,8 @@ public abstract class UpdateMatcher { this.multipleUpdate = multipleUpdate; } - public Collection> searchUpdatesForRecord(final K res, final Collection others, final DedupConfig dedupConfig) { + public Collection> searchUpdatesForRecord(final K res, final Collection others, + final DedupConfig dedupConfig) { final Map> infoMap = new HashMap<>(); @@ -30,7 +31,8 @@ public abstract class UpdateMatcher { if (source != res) { for (final UpdateInfo info : findUpdates(source, res, dedupConfig)) { final String s = DigestUtils.md5Hex(info.getHighlightValueAsString()); - if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {} else { + if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) { + } else { infoMap.put(s, info); } } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java index c3b6bda668..7dc340b3c7 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAbstract.java @@ -18,9 +18,11 @@ public class EnrichMissingAbstract extends UpdateMatcher { } @Override - protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { + protected List> findUpdates(final Result source, final Result target, + final DedupConfig 
dedupConfig) { if (isMissing(target.getDescription()) && !isMissing(source.getDescription())) { - return Arrays.asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target, dedupConfig)); + return Arrays + .asList(generateUpdateInfo(source.getDescription().get(0).getValue(), source, target, dedupConfig)); } return new ArrayList<>(); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java index 89292d3da2..7a1677ae2d 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingAuthorOrcid.java @@ -19,7 +19,8 @@ public class EnrichMissingAuthorOrcid extends UpdateMatcher>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { + protected List>> findUpdates(final Result source, final Result target, + final DedupConfig dedupConfig) { // TODO // return Arrays.asList(new EnrichMissingAbstract("xxxxxxx", 0.9f)); return Arrays.asList(); diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java index 7f5a595cc8..d14490ba85 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingOpenAccess.java @@ -21,7 +21,8 @@ public class EnrichMissingOpenAccess extends UpdateMatcher { } @Override - protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { + protected List> findUpdates(final Result source, final Result target, + final DedupConfig dedupConfig) { final long count = target .getInstance() .stream() @@ -29,7 +30,9 @@ public class EnrichMissingOpenAccess extends UpdateMatcher { .filter(right -> right.equals(BrokerConstants.OPEN_ACCESS)) .count(); - if (count > 0) { return Arrays.asList(); } + if (count > 0) { + return Arrays.asList(); + } return source .getInstance() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java index 6e106e6695..20303ec1b4 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPid.java @@ -20,10 +20,13 @@ public class EnrichMissingPid extends UpdateMatcher { } @Override - protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { + protected List> findUpdates(final Result source, final Result target, + final DedupConfig dedupConfig) { final long count = target.getPid().size(); - if (count > 0) { return Arrays.asList(); } + if (count > 0) { + return Arrays.asList(); + } return source .getPid() @@ -33,7 +36,8 @@ public class EnrichMissingPid extends UpdateMatcher { .collect(Collectors.toList()); } - public UpdateInfo generateUpdateInfo(final Pid highlightValue, final 
Result source, final Result target, final DedupConfig dedupConfig) { + public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MISSING_PID, highlightValue, source, target, diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java index d2b28d65d0..e1de8ce4d3 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingPublicationDate.java @@ -18,9 +18,11 @@ public class EnrichMissingPublicationDate extends UpdateMatcher } @Override - protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { + protected List> findUpdates(final Result source, final Result target, + final DedupConfig dedupConfig) { if (isMissing(target.getDateofacceptance()) && !isMissing(source.getDateofacceptance())) { - return Arrays.asList(generateUpdateInfo(source.getDateofacceptance().getValue(), source, target, dedupConfig)); + return Arrays + .asList(generateUpdateInfo(source.getDateofacceptance().getValue(), source, target, dedupConfig)); } return new ArrayList<>(); } diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java index de888ff871..c51f8991c4 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMissingSubject.java @@ -23,7 +23,8 @@ public class EnrichMissingSubject extends UpdateMatcher>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { + protected List>> findUpdates(final Result source, final Result target, + final DedupConfig dedupConfig) { final Set existingTypes = target .getSubject() .stream() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java index 021449797d..2ac04fd12f 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreOpenAccess.java @@ -21,7 +21,8 @@ public class EnrichMoreOpenAccess extends UpdateMatcher { } @Override - protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { + protected List> findUpdates(final Result source, final Result target, + final DedupConfig dedupConfig) { final Set urls = target .getInstance() .stream() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java index c64ed20eac..e4bf5d2c2e 100644 --- 
a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMorePid.java @@ -20,7 +20,8 @@ public class EnrichMorePid extends UpdateMatcher { } @Override - protected List> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { + protected List> findUpdates(final Result source, final Result target, + final DedupConfig dedupConfig) { final Set existingPids = target .getPid() .stream() @@ -36,7 +37,8 @@ public class EnrichMorePid extends UpdateMatcher { .collect(Collectors.toList()); } - public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target, final DedupConfig dedupConfig) { + public UpdateInfo generateUpdateInfo(final Pid highlightValue, final Result source, final Result target, + final DedupConfig dedupConfig) { return new UpdateInfo<>( Topic.ENRICH_MORE_PID, highlightValue, source, target, diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java index 3f7f5b3d56..d6e607c31c 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java @@ -21,7 +21,8 @@ public class EnrichMoreSubject extends UpdateMatcher>> findUpdates(final Result source, final Result target, final DedupConfig dedupConfig) { + protected List>> findUpdates(final Result source, final Result target, + final DedupConfig dedupConfig) { final Set existingSubjects = target .getSubject() .stream() diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventGroup.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventGroup.java index 9c7081c79b..25c7698a0c 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventGroup.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/EventGroup.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.broker.oa.util; import java.io.Serializable; diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ResultAggregator.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ResultAggregator.java index 94685eeae2..475c768149 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ResultAggregator.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ResultAggregator.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.broker.oa.util; import org.apache.spark.sql.Encoder; diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ResultGroup.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ResultGroup.java index 8fe7a59392..2be673db03 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ResultGroup.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ResultGroup.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.broker.oa.util; import java.io.Serializable; diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java 
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java
index 6bf59c1252..5338d4f3d7 100644
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java
@@ -1,14 +1,22 @@
+
 package eu.dnetlib.dhp.broker.oa.util;
 
 public class TrustUtils {
 
 	public static float rescale(final double score, final double threshold) {
-		if (score >= BrokerConstants.MAX_TRUST) { return BrokerConstants.MAX_TRUST; }
+		if (score >= BrokerConstants.MAX_TRUST) {
+			return BrokerConstants.MAX_TRUST;
+		}
 
-		final double val = (score - threshold) * (BrokerConstants.MAX_TRUST - BrokerConstants.MIN_TRUST) / (BrokerConstants.MAX_TRUST - threshold);
+		final double val = (score - threshold) * (BrokerConstants.MAX_TRUST - BrokerConstants.MIN_TRUST)
+			/ (BrokerConstants.MAX_TRUST - threshold);
 
-		if (val < BrokerConstants.MIN_TRUST) { return BrokerConstants.MIN_TRUST; }
-		if (val > BrokerConstants.MAX_TRUST) { return BrokerConstants.MAX_TRUST; }
+		if (val < BrokerConstants.MIN_TRUST) {
+			return BrokerConstants.MIN_TRUST;
+		}
+		if (val > BrokerConstants.MAX_TRUST) {
+			return BrokerConstants.MAX_TRUST;
+		}
 
 		return (float) val;
 	}
diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java
index de6a713974..893aa2827d 100644
--- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java
+++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/UpdateInfo.java
@@ -68,8 +68,10 @@ public final class UpdateInfo {
 	private float calculateTrust(final DedupConfig dedupConfig, final Result r1, final Result r2) {
 		try {
 			final ObjectMapper objectMapper = new ObjectMapper();
-			final MapDocument doc1 = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1));
-			final MapDocument doc2 = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2));
+			final MapDocument doc1 = MapDocumentUtil
+				.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1));
+			final MapDocument doc2 = MapDocumentUtil
+				.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2));
 
 			final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2);
 			final double threshold = dedupConfig.getWf().getThreshold();
@@ -118,7 +120,8 @@ public final class UpdateInfo {
 			.map(Instance::getUrl)
 			.flatMap(List::stream)
 			.findFirst()
-			.orElse(null);;
+			.orElse(null);
+		;
 
 		final Provenance provenance = new Provenance().setId(provId).setRepositoryName(provRepo).setUrl(provUrl);
 
diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java
index 58f391c24b..bb23d6085e 100644
--- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java
+++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/TrustUtilsTest.java
@@ -1,3 +1,4 @@
+
 package eu.dnetlib.dhp.broker.oa.util;
 
 import static org.junit.jupiter.api.Assertions.assertTrue;
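The reformatted rescale method linearly maps a deduplication similarity score from the [threshold, MAX_TRUST] interval onto [MIN_TRUST, MAX_TRUST], clamping at both ends. A self-contained sketch of the same mapping, assuming MIN_TRUST = 0 and MAX_TRUST = 1 (assumed stand-ins for the BrokerConstants values, which this patch does not show):

public class RescaleSketch {

	private static final float MIN_TRUST = 0f; // assumed value of BrokerConstants.MIN_TRUST
	private static final float MAX_TRUST = 1f; // assumed value of BrokerConstants.MAX_TRUST

	// Linearly rescale a score from [threshold, MAX_TRUST] to [MIN_TRUST, MAX_TRUST],
	// clamping anything outside that range.
	public static float rescale(final double score, final double threshold) {
		if (score >= MAX_TRUST) {
			return MAX_TRUST;
		}
		final double val = (score - threshold) * (MAX_TRUST - MIN_TRUST) / (MAX_TRUST - threshold);
		if (val < MIN_TRUST) {
			return MIN_TRUST;
		}
		if (val > MAX_TRUST) {
			return MAX_TRUST;
		}
		return (float) val;
	}

	public static void main(final String[] args) {
		System.out.println(rescale(0.95, 0.90)); // 0.5: halfway between threshold and max
		System.out.println(rescale(0.85, 0.90)); // 0.0: below threshold, clamped to MIN_TRUST
		System.out.println(rescale(1.20, 0.90)); // 1.0: above max, clamped to MAX_TRUST
	}
}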
From c77fc684849f8a2bcc7423936983599c2b6e5516 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Wed, 10 Jun 2020 14:49:25 +0200
Subject: [PATCH 17/17] restored Saxon-HE dependency definition

---
 pom.xml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pom.xml b/pom.xml
index f4b96fefb3..e0ee189001 100644
--- a/pom.xml
+++ b/pom.xml
@@ -193,7 +193,6 @@
 				<groupId>net.sf.saxon</groupId>
 				<artifactId>Saxon-HE</artifactId>
 				<version>9.9.1-6</version>
-				<scope>provided</scope>
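Dropping the provided scope bundles Saxon-HE into the built artifacts, so code that instantiates the Saxon XSLT implementation directly keeps working at runtime. A hedged sketch of the kind of call that would otherwise fail with a NoClassDefFoundError; the class name is Saxon's public JAXP factory, not a reference to a specific call site in this repository:

import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;

public class SaxonOnClasspathCheck {

	public static void main(final String[] args) throws Exception {
		// Referencing the Saxon implementation class requires Saxon-HE on the runtime
		// classpath; with a 'provided' scope the jar would be missing at run time.
		final TransformerFactory factory = new net.sf.saxon.TransformerFactoryImpl();
		final Transformer identity = factory.newTransformer();
		System.out.println("Saxon XSLT factory in use: " + identity.getClass().getName());
	}
}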