From 359b8ebda81abb5fd82fa26028b65dd0fa7bead0 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 25 Jul 2024 15:22:29 +0200 Subject: [PATCH 1/4] [graph provision] include only FoS L1..L2 in the record serialization --- .../model/ProvisionModelSupport.java | 11 ++++++++ .../oa/provision/utils/XmlRecordFactory.java | 6 ++-- .../utils/XmlSerializationUtils.java | 4 --- .../oa/provision/XmlRecordFactoryTest.java | 2 ++ .../dnetlib/dhp/oa/provision/publication.json | 28 +++++++++++++++++-- 5 files changed, 40 insertions(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java index 1a75deafc..277d0deb6 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java @@ -702,6 +702,7 @@ public class ProvisionModelSupport { .stream() .filter(s -> Objects.nonNull(s.getQualifier())) .filter(s -> Objects.nonNull(s.getQualifier().getClassname())) + .filter(ProvisionModelSupport::filterFosL1L2) .map( s -> Subject .newInstance(s.getValue(), s.getQualifier().getClassid(), s.getQualifier().getClassname())) @@ -709,6 +710,16 @@ public class ProvisionModelSupport { .orElse(null); } + public static boolean filterFosL1L2(StructuredProperty s) { + final String subjectType = Optional.ofNullable(s.getQualifier()).map(Qualifier::getClassid).orElse(""); + if (ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(subjectType)) { + String code = StringUtils.substringBefore(s.getValue(), " "); + return code.matches("^\\d{2}$|^\\d{4}$"); + } + + return true; + } + private static Country asCountry(eu.dnetlib.dhp.schema.oaf.Qualifier country) { return Optional .ofNullable(country) diff --git 
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index 899dad221..44004faf3 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -20,6 +20,7 @@ import javax.xml.transform.*; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; +import eu.dnetlib.dhp.oa.provision.model.*; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; @@ -41,10 +42,6 @@ import com.google.common.collect.Sets; import com.mycila.xmltool.XMLDoc; import com.mycila.xmltool.XMLTag; -import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; -import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; -import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper; -import eu.dnetlib.dhp.oa.provision.model.XmlInstance; import eu.dnetlib.dhp.schema.common.*; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.Result; @@ -389,6 +386,7 @@ public class XmlRecordFactory implements Serializable { .getSubject() .stream() .filter(Objects::nonNull) + .filter(ProvisionModelSupport::filterFosL1L2) .map(s -> XmlSerializationUtils.mapStructuredProperty("subject", s)) .collect(Collectors.toList())); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java index fbd647ae4..b4517002c 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java +++ 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlSerializationUtils.java @@ -5,11 +5,7 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix; import static org.apache.commons.lang3.StringUtils.isBlank; import static org.apache.commons.lang3.StringUtils.isNotBlank; -import java.util.HashSet; import java.util.List; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java index ab4301f9a..dcd021db1 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlRecordFactoryTest.java @@ -97,6 +97,8 @@ public class XmlRecordFactoryTest { assertEquals("bronze", doc.valueOf("//*[local-name() = 'result']/openaccesscolor/text()")); assertEquals("true", doc.valueOf("//*[local-name() = 'result']/isindiamondjournal/text()")); assertEquals("true", doc.valueOf("//*[local-name() = 'result']/publiclyfunded/text()")); + + assertEquals(15, doc.selectNodes("//*[local-name() = 'result']/*[local-name() = 'subject']").size()); } @Test diff --git a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/publication.json b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/publication.json index a89ec62d5..a073fbebd 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/publication.json +++ b/dhp-workflows/dhp-graph-provision/src/test/resources/eu/dnetlib/dhp/oa/provision/publication.json @@ -1886,12 +1886,34 @@ "trust": "" }, "qualifier": { - "classid": "keyword", - "classname": "keyword", + "classid": 
"FOS", + "classname": "Fields of Science and Technology classification", "schemeid": "dnet:subject_classification_typologies", "schemename": "dnet:subject_classification_typologies" }, - "value": "Thermal conductivity" + "value": "0101 mathematics" + }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "", + "classname": "", + "schemeid": "", + "schemename": "" + }, + "trust": "" + }, + "qualifier": { + "classid": "FOS", + "classname": "Fields of Science and Technology classification", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "value": "010101 applied mathematics" } ], "title": [ From a81c555fe6bfa23b7c4108eac2d0415d78c8a630 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 25 Jul 2024 15:26:47 +0200 Subject: [PATCH 2/4] [graph provision] include only FoS L1..L2 in the record serialization --- .../dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java index 277d0deb6..4a2326453 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java @@ -687,6 +687,7 @@ public class ProvisionModelSupport { .stream() .filter(s -> Objects.nonNull(s.getQualifier())) .filter(s -> Objects.nonNull(s.getQualifier().getClassname())) + .filter(ProvisionModelSupport::filterFosL1L2) .map( s -> Subject .newInstance(s.getValue(), s.getQualifier().getClassid(), s.getQualifier().getClassname())) @@ -702,7 +703,6 @@ public class 
ProvisionModelSupport { .stream() .filter(s -> Objects.nonNull(s.getQualifier())) .filter(s -> Objects.nonNull(s.getQualifier().getClassname())) - .filter(ProvisionModelSupport::filterFosL1L2) .map( s -> Subject .newInstance(s.getValue(), s.getQualifier().getClassid(), s.getQualifier().getClassname())) From 985ca15264dbe3e7a3407451b2c883c76d2d87dd Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Mon, 5 Aug 2024 12:10:40 +0200 Subject: [PATCH 3/4] [openaire-affiliation] removes matchings without DOI --- .../bipaffiliations/PrepareAffiliationRelations.java | 3 ++- .../dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java index 8f911e980..633e53d46 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java @@ -129,7 +129,8 @@ public class PrepareAffiliationRelations implements Serializable { Dataset df = spark .read() .schema("`DOI` STRING, `Matchings` ARRAY>") - .json(inputPath); + .json(inputPath) + .where("DOI is not NULL"); // unroll nested arrays df = df diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala index f3a5fe77c..4e5ad5365 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala @@ -31,5 +31,11 @@ class
ORCIDAuthorMatchersTest { assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin")) // assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented } + @Test def testDocumentationNames(): Unit = { + assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones")) + } + @Test def testDocumentationNames2(): Unit = { + assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller Jones")) + } } From 8e7ef79ce09d41d57d9d70f90875563bd2799e40 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 5 Aug 2024 12:13:48 +0200 Subject: [PATCH 4/4] [bip affiliations] considers only DOI based records --- .../bipaffiliations/PrepareAffiliationRelations.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java index 8f911e980..98915bdc5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java @@ -129,7 +129,8 @@ public class PrepareAffiliationRelations implements Serializable { Dataset df = spark .read() .schema("`DOI` STRING, `Matchings` ARRAY>") - .json(inputPath); + .json(inputPath) + .where("DOI is not null"); // unroll nested arrays df = df