From 4953ae56499e2b11916c8afe0f5af7fdd8a8fb1f Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 11 Jan 2023 08:35:53 +0100 Subject: [PATCH 1/7] fixed an invalid char --- .../test/resources/eu/dnetlib/dhp/transform/cnr_explora_tr.xslt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/cnr_explora_tr.xslt b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/cnr_explora_tr.xslt index 78b167fde..33770ce47 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/cnr_explora_tr.xslt +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/cnr_explora_tr.xslt @@ -130,7 +130,7 @@ - + From d40e20f437b3e89c952f08d5f5d8c4fc51e74ee6 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 11 Jan 2023 09:37:34 +0100 Subject: [PATCH 2/7] Considering instance pids and alteternative identifiers --- .../dhp/broker/oa/util/ConversionUtils.java | 84 +++++++---------- .../broker/oa/util/ConversionUtilsTest.java | 92 +++++++++++++++++++ 2 files changed, 126 insertions(+), 50 deletions(-) create mode 100644 dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtilsTest.java diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java index bc37203d3..1b28c4064 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -2,7 +2,10 @@ package eu.dnetlib.dhp.broker.oa.util; import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.function.Function; import java.util.stream.Collectors; @@ -45,13 +48,10 @@ public class ConversionUtils { private static final Logger log = LoggerFactory.getLogger(ConversionUtils.class); - private ConversionUtils() { - } + private ConversionUtils() {} public static List oafInstanceToBrokerInstances(final Instance i) { - if (i == null) { - return new ArrayList<>(); - } + if (i == null) { return new ArrayList<>(); } return mappedList(i.getUrl(), url -> { final OaBrokerInstance res = new OaBrokerInstance(); @@ -72,30 +72,26 @@ public class ConversionUtils { } public static OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) { - if (d == null) { - return null; - } + if (d == null) { return null; } final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset(); res.setOpenaireId(cleanOpenaireId(d.getId())); res.setOriginalId(first(d.getOriginalId())); res.setTitle(structPropValue(d.getTitle())); - res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid)); + res.setPids(allResultPids(d)); res.setInstances(flatMappedList(d.getInstance(), ConversionUtils::oafInstanceToBrokerInstances)); res.setCollectedFrom(mappedFirst(d.getCollectedfrom(), KeyValue::getValue)); return res; } public static OaBrokerRelatedPublication oafPublicationToBrokerPublication(final Publication p) { - if (p == null) { - return null; - } + if (p == null) { return null; } final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication(); res.setOpenaireId(cleanOpenaireId(p.getId())); res.setOriginalId(first(p.getOriginalId())); res.setTitle(structPropValue(p.getTitle())); - res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid)); + res.setPids(allResultPids(p)); res.setInstances(flatMappedList(p.getInstance(), ConversionUtils::oafInstanceToBrokerInstances)); res.setCollectedFrom(mappedFirst(p.getCollectedfrom(), KeyValue::getValue)); @@ -103,9 +99,7 @@ public class ConversionUtils { } public static OaBrokerMainEntity oafResultToBrokerResult(final Result result) { - if (result == null) { - return null; - } + if (result == null) { return null; } final OaBrokerMainEntity res = new OaBrokerMainEntity(); @@ -122,9 +116,8 @@ public class ConversionUtils { res.setEmbargoenddate(fieldValue(result.getEmbargoenddate())); res.setContributor(fieldList(result.getContributor())); res - .setJournal( - result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null); - res.setPids(mappedList(result.getPid(), ConversionUtils::oafPidToBrokerPid)); + .setJournal(result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null); + res.setPids(allResultPids(result)); res.setInstances(flatMappedList(result.getInstance(), ConversionUtils::oafInstanceToBrokerInstances)); res .setExternalReferences(mappedList(result.getExternalReference(), ConversionUtils::oafExtRefToBrokerExtRef)); @@ -132,14 +125,23 @@ public class ConversionUtils { return res; } + protected static List allResultPids(final Result result) { + final Map map = new HashMap<>(); + result.getPid().forEach(sp -> map.put(sp.getValue(), sp)); + result.getInstance().forEach(i -> { + i.getPid().forEach(sp -> map.put(sp.getValue(), sp)); + i.getAlternateIdentifier().forEach(sp -> map.put(sp.getValue(), sp)); + }); + final List pids = mappedList(map.values(), ConversionUtils::oafPidToBrokerPid); + return pids; + } + public static String cleanOpenaireId(final String id) { return id.contains("|") ? StringUtils.substringAfter(id, "|") : id; } private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) { - if (author == null) { - return null; - } + if (author == null) { return null; } final String pids = author.getPid() != null ? author .getPid() @@ -163,9 +165,7 @@ public class ConversionUtils { } private static OaBrokerJournal oafJournalToBrokerJournal(final Journal journal) { - if (journal == null) { - return null; - } + if (journal == null) { return null; } final OaBrokerJournal res = new OaBrokerJournal(); res.setName(journal.getName()); @@ -177,9 +177,7 @@ public class ConversionUtils { } private static OaBrokerExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) { - if (ref == null) { - return null; - } + if (ref == null) { return null; } final OaBrokerExternalReference res = new OaBrokerExternalReference(); res.setRefidentifier(ref.getRefidentifier()); @@ -190,9 +188,7 @@ public class ConversionUtils { } public static OaBrokerProject oafProjectToBrokerProject(final Project p) { - if (p == null) { - return null; - } + if (p == null) { return null; } final OaBrokerProject res = new OaBrokerProject(); res.setOpenaireId(cleanOpenaireId(p.getId())); @@ -216,9 +212,7 @@ public class ConversionUtils { } public static OaBrokerRelatedSoftware oafSoftwareToBrokerSoftware(final Software sw) { - if (sw == null) { - return null; - } + if (sw == null) { return null; } final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware(); res.setOpenaireId(cleanOpenaireId(sw.getId())); @@ -231,9 +225,7 @@ public class ConversionUtils { } public static OaBrokerRelatedDatasource oafDatasourceToBrokerDatasource(final Datasource ds) { - if (ds == null) { - return null; - } + if (ds == null) { return null; } final OaBrokerRelatedDatasource res = new OaBrokerRelatedDatasource(); res.setName(StringUtils.defaultIfBlank(fieldValue(ds.getOfficialname()), fieldValue(ds.getEnglishname()))); @@ -293,9 +285,7 @@ public class ConversionUtils { } private static List structPropTypedList(final List list) { - if (list == null) { - return new ArrayList<>(); - } + if (list == null) { return new ArrayList<>(); } return list .stream() @@ -304,10 +294,8 @@ public class ConversionUtils { .collect(Collectors.toList()); } - private static List mappedList(final List list, final Function func) { - if (list == null) { - return new ArrayList<>(); - } + private static List mappedList(final Collection list, final Function func) { + if (list == null) { return new ArrayList<>(); } return list .stream() @@ -318,9 +306,7 @@ public class ConversionUtils { } private static List flatMappedList(final List list, final Function> func) { - if (list == null) { - return new ArrayList<>(); - } + if (list == null) { return new ArrayList<>(); } return list .stream() @@ -332,9 +318,7 @@ public class ConversionUtils { } private static T mappedFirst(final List list, final Function func) { - if (list == null) { - return null; - } + if (list == null) { return null; } return list .stream() diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtilsTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtilsTest.java new file mode 100644 index 000000000..fb5454c34 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtilsTest.java @@ -0,0 +1,92 @@ +package eu.dnetlib.dhp.broker.oa.util; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.util.ArrayList; +import java.util.List; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import eu.dnetlib.broker.objects.OaBrokerTypedValue; +import eu.dnetlib.dhp.schema.oaf.Instance; +import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +class ConversionUtilsTest { + + @BeforeEach + void setUp() throws Exception {} + + @Test + void testAllResultPids() { + final Qualifier qf = new Qualifier(); + qf.setClassid("test"); + qf.setClassname("test"); + qf.setSchemeid("test"); + qf.setSchemename("test"); + + final StructuredProperty sp1 = new StructuredProperty(); + sp1.setValue("1"); + sp1.setQualifier(qf); + + final StructuredProperty sp2 = new StructuredProperty(); + sp2.setValue("2"); + sp2.setQualifier(qf); + + final StructuredProperty sp3 = new StructuredProperty(); + sp3.setValue("3"); + sp3.setQualifier(qf); + + final StructuredProperty sp4a = new StructuredProperty(); + sp4a.setValue("4"); + sp4a.setQualifier(qf); + + final StructuredProperty sp4b = new StructuredProperty(); + sp4b.setValue("4"); + sp4b.setQualifier(qf); + + final StructuredProperty sp5 = new StructuredProperty(); + sp5.setValue("5"); + sp5.setQualifier(qf); + + final StructuredProperty sp6a = new StructuredProperty(); + sp6a.setValue("6"); + sp6a.setQualifier(qf); + + final StructuredProperty sp6b = new StructuredProperty(); + sp6b.setValue("6"); + sp6b.setQualifier(qf); + + final Result oaf = new Result(); + oaf.setPid(new ArrayList<>()); + oaf.getPid().add(sp1); + oaf.getPid().add(sp2); + oaf.getPid().add(sp4a); + + final Instance instance1 = new Instance(); + instance1.setPid(new ArrayList<>()); + instance1.setAlternateIdentifier(new ArrayList<>()); + instance1.getPid().add(sp3); + instance1.getPid().add(sp4b); + instance1.getAlternateIdentifier().add(sp5); + instance1.getAlternateIdentifier().add(sp6a); + + final Instance instance2 = new Instance(); + instance2.setPid(new ArrayList<>()); + instance2.setAlternateIdentifier(new ArrayList<>()); + instance2.getPid().add(sp6b); + + oaf.setInstance(new ArrayList<>()); + oaf.getInstance().add(instance1); + oaf.getInstance().add(instance2); + + final List list = ConversionUtils.allResultPids(oaf); + + // list.forEach(x -> System.out.println(x.getValue())); + + assertEquals(6, list.size()); + } + +} From f86e19b282ba2b089a258a22d5907385e377e6d3 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 11 Jan 2023 09:53:19 +0100 Subject: [PATCH 3/7] code formatting --- .../ebi/SparkCreateBaselineDataFrame.scala | 3 +- .../dnetlib/dhp/sx/bio/BioScholixTest.scala | 24 ++++--- .../dhp/broker/oa/util/ConversionUtils.java | 62 ++++++++++++++----- .../broker/oa/util/ConversionUtilsTest.java | 4 +- .../orcid/MappingORCIDToOAFTest.scala | 8 ++- .../resolution/ResolveEntitiesTest.scala | 6 +- .../sx/graph/scholix/ScholixGraphTest.scala | 3 +- 7 files changed, 78 insertions(+), 32 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala index 29e716abb..8ac8b00bf 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala @@ -27,7 +27,8 @@ object SparkCreateBaselineDataFrame { def requestBaseLineUpdatePage(maxFile: String): List[(String, String)] = { val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/") - val result = data.linesWithSeparators.map(l => l.stripLineEnd) + val result = data.linesWithSeparators + .map(l => l.stripLineEnd) .filter(l => l.startsWith("") diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala index 8412e7110..d1611300d 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala @@ -63,7 +63,9 @@ class BioScholixTest extends AbstractVocabularyTest { val records: String = Source .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump")) .mkString - val r: List[Oaf] = records.linesWithSeparators.map(l => l.stripLineEnd).toList + val r: List[Oaf] = records.linesWithSeparators + .map(l => l.stripLineEnd) + .toList .map(s => mapper.readValue(s, classOf[PMArticle])) .map(a => PubMedToOaf.convert(a, vocabularies)) assertEquals(10, r.size) @@ -175,7 +177,8 @@ class BioScholixTest extends AbstractVocabularyTest { .mkString records.linesWithSeparators.map(l => l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty)) - val result: List[Oaf] = records.linesWithSeparators.map(l => l.stripLineEnd).toList.flatMap(o => BioDBToOAF.pdbTOOaf(o)) + val result: List[Oaf] = + records.linesWithSeparators.map(l => l.stripLineEnd).toList.flatMap(o => BioDBToOAF.pdbTOOaf(o)) assertTrue(result.nonEmpty) result.foreach(r => assertNotNull(r)) @@ -196,7 +199,8 @@ class BioScholixTest extends AbstractVocabularyTest { .mkString records.linesWithSeparators.map(l => l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty)) - val result: List[Oaf] = records.linesWithSeparators.map(l => l.stripLineEnd).toList.flatMap(o => BioDBToOAF.uniprotToOAF(o)) + val result: List[Oaf] = + records.linesWithSeparators.map(l => l.stripLineEnd).toList.flatMap(o => BioDBToOAF.uniprotToOAF(o)) assertTrue(result.nonEmpty) result.foreach(r => assertNotNull(r)) @@ -241,7 +245,8 @@ class BioScholixTest extends AbstractVocabularyTest { .mkString records.linesWithSeparators.map(l => l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty)) - val result: List[Oaf] = records.linesWithSeparators.map(l => l.stripLineEnd).map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList + val result: List[Oaf] = + records.linesWithSeparators.map(l => l.stripLineEnd).map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList assertNotNull(result) assertTrue(result.nonEmpty) @@ -280,10 +285,13 @@ class BioScholixTest extends AbstractVocabularyTest { implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats - val l: List[ScholixResolved] = records.linesWithSeparators.map(l => l.stripLineEnd).map { input => - lazy val json = parse(input) - json.extract[ScholixResolved] - }.toList + val l: List[ScholixResolved] = records.linesWithSeparators + .map(l => l.stripLineEnd) + .map { input => + lazy val json = parse(input) + json.extract[ScholixResolved] + } + .toList val result: List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s)) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java index 1b28c4064..cfe9ed61c 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -48,10 +48,13 @@ public class ConversionUtils { private static final Logger log = LoggerFactory.getLogger(ConversionUtils.class); - private ConversionUtils() {} + private ConversionUtils() { + } public static List oafInstanceToBrokerInstances(final Instance i) { - if (i == null) { return new ArrayList<>(); } + if (i == null) { + return new ArrayList<>(); + } return mappedList(i.getUrl(), url -> { final OaBrokerInstance res = new OaBrokerInstance(); @@ -72,7 +75,9 @@ public class ConversionUtils { } public static OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) { - if (d == null) { return null; } + if (d == null) { + return null; + } final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset(); res.setOpenaireId(cleanOpenaireId(d.getId())); @@ -85,7 +90,9 @@ public class ConversionUtils { } public static OaBrokerRelatedPublication oafPublicationToBrokerPublication(final Publication p) { - if (p == null) { return null; } + if (p == null) { + return null; + } final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication(); res.setOpenaireId(cleanOpenaireId(p.getId())); @@ -99,7 +106,9 @@ public class ConversionUtils { } public static OaBrokerMainEntity oafResultToBrokerResult(final Result result) { - if (result == null) { return null; } + if (result == null) { + return null; + } final OaBrokerMainEntity res = new OaBrokerMainEntity(); @@ -116,7 +125,8 @@ public class ConversionUtils { res.setEmbargoenddate(fieldValue(result.getEmbargoenddate())); res.setContributor(fieldList(result.getContributor())); res - .setJournal(result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null); + .setJournal( + result instanceof Publication ? oafJournalToBrokerJournal(((Publication) result).getJournal()) : null); res.setPids(allResultPids(result)); res.setInstances(flatMappedList(result.getInstance(), ConversionUtils::oafInstanceToBrokerInstances)); res @@ -141,7 +151,9 @@ public class ConversionUtils { } private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) { - if (author == null) { return null; } + if (author == null) { + return null; + } final String pids = author.getPid() != null ? author .getPid() @@ -165,7 +177,9 @@ public class ConversionUtils { } private static OaBrokerJournal oafJournalToBrokerJournal(final Journal journal) { - if (journal == null) { return null; } + if (journal == null) { + return null; + } final OaBrokerJournal res = new OaBrokerJournal(); res.setName(journal.getName()); @@ -177,7 +191,9 @@ public class ConversionUtils { } private static OaBrokerExternalReference oafExtRefToBrokerExtRef(final ExternalReference ref) { - if (ref == null) { return null; } + if (ref == null) { + return null; + } final OaBrokerExternalReference res = new OaBrokerExternalReference(); res.setRefidentifier(ref.getRefidentifier()); @@ -188,7 +204,9 @@ public class ConversionUtils { } public static OaBrokerProject oafProjectToBrokerProject(final Project p) { - if (p == null) { return null; } + if (p == null) { + return null; + } final OaBrokerProject res = new OaBrokerProject(); res.setOpenaireId(cleanOpenaireId(p.getId())); @@ -212,7 +230,9 @@ public class ConversionUtils { } public static OaBrokerRelatedSoftware oafSoftwareToBrokerSoftware(final Software sw) { - if (sw == null) { return null; } + if (sw == null) { + return null; + } final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware(); res.setOpenaireId(cleanOpenaireId(sw.getId())); @@ -225,7 +245,9 @@ public class ConversionUtils { } public static OaBrokerRelatedDatasource oafDatasourceToBrokerDatasource(final Datasource ds) { - if (ds == null) { return null; } + if (ds == null) { + return null; + } final OaBrokerRelatedDatasource res = new OaBrokerRelatedDatasource(); res.setName(StringUtils.defaultIfBlank(fieldValue(ds.getOfficialname()), fieldValue(ds.getEnglishname()))); @@ -285,7 +307,9 @@ public class ConversionUtils { } private static List structPropTypedList(final List list) { - if (list == null) { return new ArrayList<>(); } + if (list == null) { + return new ArrayList<>(); + } return list .stream() @@ -295,7 +319,9 @@ public class ConversionUtils { } private static List mappedList(final Collection list, final Function func) { - if (list == null) { return new ArrayList<>(); } + if (list == null) { + return new ArrayList<>(); + } return list .stream() @@ -306,7 +332,9 @@ public class ConversionUtils { } private static List flatMappedList(final List list, final Function> func) { - if (list == null) { return new ArrayList<>(); } + if (list == null) { + return new ArrayList<>(); + } return list .stream() @@ -318,7 +346,9 @@ public class ConversionUtils { } private static T mappedFirst(final List list, final Function func) { - if (list == null) { return null; } + if (list == null) { + return null; + } return list .stream() diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtilsTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtilsTest.java index fb5454c34..fc630df05 100644 --- a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtilsTest.java +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtilsTest.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.broker.oa.util; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -17,7 +18,8 @@ import eu.dnetlib.dhp.schema.oaf.StructuredProperty; class ConversionUtilsTest { @BeforeEach - void setUp() throws Exception {} + void setUp() throws Exception { + } @Test void testAllResultPids() { diff --git a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/orcid/MappingORCIDToOAFTest.scala b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/orcid/MappingORCIDToOAFTest.scala index ea63ec46d..8033f02fb 100644 --- a/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/orcid/MappingORCIDToOAFTest.scala +++ b/dhp-workflows/dhp-doiboost/src/test/scala/eu/dnetlib/dhp/doiboost/orcid/MappingORCIDToOAFTest.scala @@ -25,9 +25,11 @@ class MappingORCIDToOAFTest { .mkString assertNotNull(json) assertFalse(json.isEmpty) - json.linesWithSeparators.map(l => l.stripLineEnd).foreach(s => { - assertNotNull(ORCIDToOAF.extractValueFromInputString(s)) - }) + json.linesWithSeparators + .map(l => l.stripLineEnd) + .foreach(s => { + assertNotNull(ORCIDToOAF.extractValueFromInputString(s)) + }) } @Test diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala index 3b6922e16..619b21ade 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala @@ -53,7 +53,8 @@ class ResolveEntitiesTest extends Serializable { def generateUpdates(spark: SparkSession): Unit = { val template = Source.fromInputStream(this.getClass.getResourceAsStream("updates")).mkString - val pids: List[String] = template.linesWithSeparators.map(l => l.stripLineEnd) + val pids: List[String] = template.linesWithSeparators + .map(l => l.stripLineEnd) .map { id => val r = new Result r.setId(id.toLowerCase.trim) @@ -264,7 +265,8 @@ class ResolveEntitiesTest extends Serializable { Source .fromInputStream(this.getClass.getResourceAsStream(s"publication")) .mkString - .linesWithSeparators.map(l => l.stripLineEnd) + .linesWithSeparators + .map(l => l.stripLineEnd) .next(), classOf[Publication] ) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala index 73bdcead9..b838ae065 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixGraphTest.scala @@ -69,7 +69,8 @@ class ScholixGraphTest extends AbstractVocabularyTest { getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/merge_result_scholix") ) .mkString - val result: List[(Relation, ScholixSummary)] = inputRelations.linesWithSeparators.map(l => l.stripLineEnd) + val result: List[(Relation, ScholixSummary)] = inputRelations.linesWithSeparators + .map(l => l.stripLineEnd) .sliding(2) .map(s => (s.head, s(1))) .map(p => (mapper.readValue(p._1, classOf[Relation]), mapper.readValue(p._2, classOf[ScholixSummary]))) From 699736addc45fa8e18c4877414f462a8c23e68e8 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 11 Jan 2023 13:14:44 +0100 Subject: [PATCH 4/7] NPE prevention --- .../dhp/broker/oa/util/ConversionUtils.java | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java index cfe9ed61c..077ba3b3b 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -137,13 +137,22 @@ public class ConversionUtils { protected static List allResultPids(final Result result) { final Map map = new HashMap<>(); - result.getPid().forEach(sp -> map.put(sp.getValue(), sp)); - result.getInstance().forEach(i -> { - i.getPid().forEach(sp -> map.put(sp.getValue(), sp)); - i.getAlternateIdentifier().forEach(sp -> map.put(sp.getValue(), sp)); - }); - final List pids = mappedList(map.values(), ConversionUtils::oafPidToBrokerPid); - return pids; + + if (result.getPid() != null) { + result.getPid().forEach(sp -> map.put(sp.getValue(), sp)); + } + + if (result.getInstance() != null) { + result.getInstance().forEach(i -> { + if (i.getPid() != null) { + i.getPid().forEach(sp -> map.put(sp.getValue(), sp)); + } + if (i.getAlternateIdentifier() != null) { + i.getAlternateIdentifier().forEach(sp -> map.put(sp.getValue(), sp)); + } + }); + } + return mappedList(map.values(), ConversionUtils::oafPidToBrokerPid); } public static String cleanOpenaireId(final String id) { From 3800361033989b20bc2ee0c5142a2ec3c5132413 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 19 Jan 2023 15:57:43 +0100 Subject: [PATCH 5/7] [country propagation] fixes error 'cannot resolve countrySet given input columns: []' when there is no prepared information driving the propagation process for a given result type --- .../SparkCountryPropagationJob.java | 30 ++++++++++++------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java index d9f6433a0..9d553ded3 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java @@ -14,6 +14,7 @@ import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; @@ -84,19 +85,26 @@ public class SparkCountryPropagationJob { Dataset res = readPath(spark, sourcePath, resultClazz); log.info("Reading prepared info: {}", preparedInfoPath); - Dataset prepared = spark + final Dataset preparedInfoRaw = spark .read() - .json(preparedInfoPath) - .as(Encoders.bean(ResultCountrySet.class)); - - res - .joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer") - .map(getCountryMergeFn(), Encoders.bean(resultClazz)) - .write() - .option("compression", "gzip") - .mode(SaveMode.Overwrite) - .json(outputPath); + .json(preparedInfoPath); + if (!preparedInfoRaw.isEmpty()) { + final Dataset prepared = preparedInfoRaw.as(Encoders.bean(ResultCountrySet.class)); + res + .joinWith(prepared, res.col("id").equalTo(prepared.col("resultId")), "left_outer") + .map(getCountryMergeFn(), Encoders.bean(resultClazz)) + .write() + .option("compression", "gzip") + .mode(SaveMode.Overwrite) + .json(outputPath); + } else { + res + .write() + .option("compression", "gzip") + .mode(SaveMode.Overwrite) + .json(outputPath); + } } private static MapFunction, R> getCountryMergeFn() { From c1e24602939e5e2085935963ad79973bdcfe7d20 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 20 Jan 2023 09:20:26 +0100 Subject: [PATCH 6/7] [cleaning] the datasource master-duplicate fixup should not be brought to production yet --- .../dhp/oa/graph/clean/oozie_app/workflow.xml | 196 +----------------- 1 file changed, 1 insertion(+), 195 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml index 683c2417b..ee79f4f1a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml @@ -582,201 +582,7 @@ - - - - - ${wf:conf('shouldClean') eq true} - - - - - - - eu.dnetlib.dhp.oa.graph.clean.MasterDuplicateAction - --postgresUrl${postgresURL} - --postgresUser${postgresUser} - --postgresPassword${postgresPassword} - --hdfsPath${workingDir}/masterduplicate - --hdfsNameNode${nameNode} - - - - - - - - - - - - - - - yarn - cluster - patch publication cfhb - eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --inputPath${graphOutputPath}/publication - --resolvedPath${workingDir}/cfHbResolved/publication - --outputPath${workingDir}/cfHbPatched/publication - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Publication - --masterDuplicatePath${workingDir}/masterduplicate - - - - - - - - yarn - cluster - patch dataset cfhb - eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --inputPath${graphOutputPath}/dataset - --resolvedPath${workingDir}/cfHbResolved/dataset - --outputPath${workingDir}/cfHbPatched/dataset - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Dataset - --masterDuplicatePath${workingDir}/masterduplicate - - - - - - - - yarn - cluster - patch otherresearchproduct cfhb - eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --inputPath${graphOutputPath}/otherresearchproduct - --resolvedPath${workingDir}/cfHbResolved/otherresearchproduct - --outputPath${workingDir}/cfHbPatched/otherresearchproduct - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --masterDuplicatePath${workingDir}/masterduplicate - - - - - - - - yarn - cluster - patch software cfhb - eu.dnetlib.dhp.oa.graph.clean.cfhb.CleanCfHbSparkJob - dhp-graph-mapper-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.shuffle.partitions=7680 - - --inputPath${graphOutputPath}/software - --resolvedPath${workingDir}/cfHbResolved/software - --outputPath${workingDir}/cfHbPatched/software - --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Software - --masterDuplicatePath${workingDir}/masterduplicate - - - - - - - - - - - - - - - - - - - - ${workingDir}/cfHbPatched/publication - ${graphOutputPath}/publication - - - - - - - - - - - ${workingDir}/cfHbPatched/dataset - ${graphOutputPath}/dataset - - - - - - - - - - - ${workingDir}/cfHbPatched/otherresearchproduct - ${graphOutputPath}/otherresearchproduct - - - - - - - - - - - ${workingDir}/cfHbPatched/software - ${graphOutputPath}/software - - - - - - + From 1b37516578350473f276dd137457868349fa6e00 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 20 Jan 2023 16:11:26 +0100 Subject: [PATCH 7/7] [bulk tagging] better node naming --- .../dhp/bulktag/oozie_app/workflow.xml | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml index 42799419e..8ac8097e8 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml @@ -105,13 +105,13 @@ - - - - + + + + - + yarn-cluster cluster @@ -138,7 +138,7 @@ - + yarn-cluster cluster @@ -165,7 +165,7 @@ - + yarn-cluster cluster @@ -192,7 +192,7 @@ - + yarn-cluster cluster @@ -269,7 +269,7 @@ yarn-cluster cluster - EOSC_tagging + EOSC tagging publication eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag dhp-enrichment-${projectVersion}.jar @@ -296,7 +296,7 @@ yarn-cluster cluster - EOSC_tagging + EOSC tagging dataset eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag dhp-enrichment-${projectVersion}.jar @@ -322,7 +322,7 @@ yarn-cluster cluster - EOSC_tagging + EOSC tagging software eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag dhp-enrichment-${projectVersion}.jar @@ -348,7 +348,7 @@ yarn-cluster cluster - EOSC_tagging + EOSC tagging ORP eu.dnetlib.dhp.bulktag.eosc.SparkEoscBulkTag dhp-enrichment-${projectVersion}.jar