From e62018e95d45cb99442d80576527590a19419194 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 3 Aug 2022 12:26:05 +0200 Subject: [PATCH 01/20] [aggregator graph] added more assertions in test --- .../graph/raw/MigrateDbEntitiesApplicationTest.java | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java index 9048a22ea..408196665 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java @@ -18,6 +18,7 @@ import java.util.Objects; import java.util.Optional; import java.util.stream.Collectors; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.BeforeEach; @@ -252,6 +253,18 @@ public class MigrateDbEntitiesApplicationTest { assertValidId(r2.getSource()); assertEquals(r1.getSource(), r2.getTarget()); assertEquals(r2.getSource(), r1.getTarget()); + + assertTrue(r1.getSource().startsWith("10|")); + assertTrue(r1.getTarget().startsWith("20|")); + + assertEquals(ModelConstants.DATASOURCE_ORGANIZATION, r1.getRelType()); + assertEquals(ModelConstants.DATASOURCE_ORGANIZATION, r2.getRelType()); + + assertEquals(ModelConstants.PROVISION, r1.getSubRelType()); + assertEquals(ModelConstants.PROVISION, r2.getSubRelType()); + + assertEquals(ModelConstants.IS_PROVIDED_BY, r1.getRelClass()); + assertEquals(ModelConstants.PROVIDES, r2.getRelClass()); } @Test From b09d7ddc74fc13d3d1e7cfefccbfe1e472e88c41 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Wed, 3 Aug 2022 12:26:59 +0300 Subject: [PATCH 02/20] fixed the datasourceOrganization relations --- .../eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql index 6fa0e6fdf..01bed17cc 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step8.sql @@ -93,7 +93,7 @@ where d.datainfo.deletedbyinference=false and d.datainfo.invisible=false; CREATE TABLE ${stats_db_name}.datasource_organizations STORED AS PARQUET AS SELECT substr(r.target, 4) AS id, substr(r.source, 4) AS organization FROM ${openaire_db_name}.relation r -WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.target like '20|%' and r.datainfo.invisible=false; +WHERE r.reltype = 'datasourceOrganization' and r.datainfo.deletedbyinference = false and r.source like '20|%' and r.datainfo.invisible=false; -- datasource sources: -- where the datasource info have been collected from. From 27a91841e7fa2a1b615b4d1e161d606db5bead96 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 4 Aug 2022 11:39:39 +0200 Subject: [PATCH 03/20] WIP: cleaning of subjects --- .../oaf/utils/GraphCleaningFunctions.java | 5 ++ .../dhp/schema/oaf/utils/OafMapperUtils.java | 25 ++++++++++ .../dnetlib/dhp/actionmanager/Constants.java | 13 ++--- .../PrepareFOSSparkJob.java | 3 +- .../PrepareSDGSparkJob.java | 3 +- .../DataciteToOAFTransformation.scala | 6 +-- .../eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala | 2 +- .../dhp/sx/bio/pubmed/PubMedToOaf.scala | 4 +- .../createunresolvedentities/ProduceTest.java | 10 ++-- .../dhp/broker/oa/util/ConversionUtils.java | 33 +++++++------ .../doiboost/DoiBoostMappingUtil.scala | 22 +++++++++ .../doiboost/crossref/Crossref2Oaf.scala | 2 +- .../dnetlib/doiboost/mag/MagDataModel.scala | 12 ++--- .../dnetlib/dhp/bulktag/EOSCTagJobTest.java | 2 +- .../dhp/oa/graph/clean/CleaningRuleMap.java | 1 + .../dhp/oa/graph/dump/ResultMapper.java | 11 ++--- .../raw/AbstractMdRecordToOafMapper.java | 49 +++++++------------ .../dhp/oa/graph/raw/OafToOafMapper.java | 4 +- .../dhp/oa/graph/raw/OdfToOafMapper.java | 4 +- .../resolution/ResolveEntitiesTest.scala | 4 +- pom.xml | 2 +- 21 files changed, 132 insertions(+), 85 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 351bd2dd5..151c53685 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -520,6 +520,11 @@ public class GraphCleaningFunctions extends CleaningFunctions { return s; } + protected static Subject cleanValue(Subject s) { + s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " ")); + return s; + } + protected static Field cleanValue(Field s) { s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " ")); return s; diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java index 6f452e846..d58b354ab 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java @@ -189,6 +189,17 @@ public class OafMapperUtils { return q; } + public static Subject subject( + final String value, + final String classid, + final String classname, + final String schemeid, + final String schemename, + final DataInfo dataInfo) { + + return subject(value, qualifier(classid, classname, schemeid, schemename), dataInfo); + } + public static StructuredProperty structuredProperty( final String value, final String classid, @@ -200,6 +211,20 @@ public class OafMapperUtils { return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo); } + public static Subject subject( + final String value, + final Qualifier qualifier, + final DataInfo dataInfo) { + if (value == null) { + return null; + } + final Subject s = new Subject(); + s.setValue(value); + s.setQualifier(qualifier); + s.setDataInfo(dataInfo); + return s; + } + public static StructuredProperty structuredProperty( final String value, final Qualifier qualifier, diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java index aa25ca633..bd223d7c9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java @@ -13,6 +13,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.schema.oaf.Subject; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; public class Constants { @@ -58,13 +59,13 @@ public class Constants { .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); } - public static StructuredProperty getSubject(String sbj, String classid, String classname, + public static Subject getSubject(String sbj, String classid, String classname, String diqualifierclassid) { if (sbj.equals(NULL)) return null; - StructuredProperty sp = new StructuredProperty(); - sp.setValue(sbj); - sp + Subject s = new Subject(); + s.setValue(sbj); + s .setQualifier( OafMapperUtils .qualifier( @@ -72,7 +73,7 @@ public class Constants { classname, ModelConstants.DNET_SUBJECT_TYPOLOGIES, ModelConstants.DNET_SUBJECT_TYPOLOGIES)); - sp + s .setDataInfo( OafMapperUtils .dataInfo( @@ -88,7 +89,7 @@ public class Constants { ModelConstants.DNET_PROVENANCE_ACTIONS), "")); - return sp; + return s; } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java index 55e391932..4d2d25215 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareFOSSparkJob.java @@ -24,6 +24,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.schema.oaf.Subject; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.utils.DHPUtils; @@ -79,7 +80,7 @@ public class PrepareFOSSparkJob implements Serializable { HashSet level3 = new HashSet<>(); addLevels(level1, level2, level3, first); it.forEachRemaining(v -> addLevels(level1, level2, level3, v)); - List sbjs = new ArrayList<>(); + List sbjs = new ArrayList<>(); level1.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID))); level2.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID))); level3.forEach(l -> sbjs.add(getSubject(l, FOS_CLASS_ID, FOS_CLASS_NAME, UPDATE_SUBJECT_FOS_CLASS_ID))); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareSDGSparkJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareSDGSparkJob.java index a31e380fe..bfdf14234 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareSDGSparkJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareSDGSparkJob.java @@ -24,6 +24,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.schema.oaf.Subject; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.utils.DHPUtils; @@ -73,7 +74,7 @@ public class PrepareSDGSparkJob implements Serializable { Result r = new Result(); r.setId(DHPUtils.generateUnresolvedIdentifier(k, DOI)); SDGDataModel first = it.next(); - List sbjs = new ArrayList<>(); + List sbjs = new ArrayList<>(); sbjs.add(getSubject(first.getSbj(), SDG_CLASS_ID, SDG_CLASS_NAME, UPDATE_SUBJECT_SDG_CLASS_ID)); it .forEachRemaining( diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala index 088a07427..c29614d33 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala @@ -19,7 +19,7 @@ import java.time.chrono.ThaiBuddhistDate import java.time.format.DateTimeFormatter import java.util.{Date, Locale} import scala.collection.JavaConverters._ -import scala.io.{Codec, Source} +import scala.io.Source object DataciteToOAFTransformation { @@ -252,7 +252,7 @@ object DataciteToOAFTransformation { .exists(i => i.getHostedby != null && "figshare".equalsIgnoreCase(i.getHostedby.getValue)) if (hosted_by_figshare) { r.getInstance().asScala.foreach(i => i.setAccessright(ModelConstants.OPEN_ACCESS_RIGHT())) - val l: List[StructuredProperty] = List() + val l: List[Subject] = List() r.setSubject(l.asJava) } } @@ -492,7 +492,7 @@ object DataciteToOAFTransformation { subjects .filter(s => s.subject.nonEmpty) .map(s => - OafMapperUtils.structuredProperty( + OafMapperUtils.subject( s.subject.get, SUBJ_CLASS, SUBJ_CLASS, diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala index ffdab1799..670323598 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/BioDBToOAF.scala @@ -281,7 +281,7 @@ object BioDBToOAF { d.setSubject( subjects .map(s => - OafMapperUtils.structuredProperty( + OafMapperUtils.subject( s, SUBJ_CLASS, SUBJ_CLASS, diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala index 42bafc93e..410686f97 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala @@ -265,8 +265,8 @@ object PubMedToOaf { result.setLanguage(term) } - val subjects: List[StructuredProperty] = article.getSubjects.asScala.map(s => - OafMapperUtils.structuredProperty( + val subjects: List[Subject] = article.getSubjects.asScala.map(s => + OafMapperUtils.subject( s.getValue, SUBJ_CLASS, SUBJ_CLASS, diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java index a5ecaeabf..c3c110f09 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/ProduceTest.java @@ -72,7 +72,7 @@ public class ProduceTest { JavaRDD tmp = getResultJavaRDD(); - List sbjs = tmp + List sbjs = tmp .filter(row -> row.getSubject() != null && row.getSubject().size() > 0) .flatMap(row -> row.getSubject().iterator()) .collect(); @@ -169,7 +169,7 @@ public class ProduceTest { .getSubject() .size()); - List sbjs = tmp + List sbjs = tmp .filter(row -> row.getId().equals(doi)) .flatMap(row -> row.getSubject().iterator()) .collect(); @@ -396,7 +396,7 @@ public class ProduceTest { .getSubject() .size()); - List sbjs = tmp + List sbjs = tmp .filter(row -> row.getId().equals(doi)) .flatMap(row -> row.getSubject().iterator()) .collect(); @@ -508,7 +508,7 @@ public class ProduceTest { .getSubject() .size()); - List sbjs = tmp + List sbjs = tmp .filter(row -> row.getId().equals(doi)) .flatMap(row -> row.getSubject().iterator()) .collect(); @@ -537,7 +537,7 @@ public class ProduceTest { JavaRDD tmp = getResultJavaRDDPlusSDG(); - List sbjs_sdg = tmp + List sbjs_sdg = tmp .filter(row -> row.getSubject() != null && row.getSubject().size() > 0) .flatMap(row -> row.getSubject().iterator()) .filter(sbj -> sbj.getQualifier().getClassid().equals(Constants.SDG_CLASS_ID)) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java index bc37203d3..5e7adec79 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/ConversionUtils.java @@ -26,20 +26,7 @@ import eu.dnetlib.broker.objects.OaBrokerRelatedPublication; import eu.dnetlib.broker.objects.OaBrokerRelatedSoftware; import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.Author; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.Datasource; -import eu.dnetlib.dhp.schema.oaf.ExternalReference; -import eu.dnetlib.dhp.schema.oaf.Field; -import eu.dnetlib.dhp.schema.oaf.Instance; -import eu.dnetlib.dhp.schema.oaf.Journal; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.Project; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.schema.oaf.Software; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.schema.oaf.*; public class ConversionUtils { @@ -71,6 +58,10 @@ public class ConversionUtils { return sp != null ? new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null; } + public static OaBrokerTypedValue oafSubjectToBrokerTypedValue(final Subject sp) { + return sp != null ? new OaBrokerTypedValue(classId(sp.getQualifier()), sp.getValue()) : null; + } + public static OaBrokerRelatedDataset oafDatasetToBrokerDataset(final Dataset d) { if (d == null) { return null; @@ -115,7 +106,7 @@ public class ConversionUtils { res.setTitles(structPropList(result.getTitle())); res.setAbstracts(fieldList(result.getDescription())); res.setLanguage(classId(result.getLanguage())); - res.setSubjects(structPropTypedList(result.getSubject())); + res.setSubjects(subjectList(result.getSubject())); res.setCreators(mappedList(result.getAuthor(), ConversionUtils::oafAuthorToBrokerAuthor)); res.setPublicationdate(fieldValue(result.getDateofacceptance())); res.setPublisher(fieldValue(result.getPublisher())); @@ -304,6 +295,18 @@ public class ConversionUtils { .collect(Collectors.toList()); } + private static List subjectList(final List list) { + if (list == null) { + return new ArrayList<>(); + } + + return list + .stream() + .map(ConversionUtils::oafSubjectToBrokerTypedValue) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + private static List mappedList(final List list, final Function func) { if (list == null) { return new ArrayList<>(); diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala index 20471973a..4789093cd 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/DoiBoostMappingUtil.scala @@ -391,6 +391,28 @@ object DoiBoostMappingUtil { di } + def createSubject(value: String, classId: String, schemeId: String): Subject = { + val s = new Subject + s.setQualifier(OafMapperUtils.qualifier(classId, classId, schemeId, schemeId)) + s.setValue(value) + s + + } + + def createSubject( + value: String, + classId: String, + className: String, + schemeId: String, + schemeName: String + ): Subject = { + val s = new Subject + s.setQualifier(OafMapperUtils.qualifier(classId, className, schemeId, schemeName)) + s.setValue(value) + s + + } + def createSP( value: String, classId: String, diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index 9eec9e759..7fb10863f 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -201,7 +201,7 @@ case object Crossref2Oaf { if (subjectList.nonEmpty) { result.setSubject( - subjectList.map(s => createSP(s, "keywords", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava + subjectList.map(s => createSubject(s, "keyword", ModelConstants.DNET_SUBJECT_TYPOLOGIES)).asJava ) } diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala index 18ba864ce..9a0b0d845 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/mag/MagDataModel.scala @@ -2,7 +2,7 @@ package eu.dnetlib.doiboost.mag import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory -import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty} +import eu.dnetlib.dhp.schema.oaf.{Instance, Journal, Publication, StructuredProperty, Subject} import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil._ import org.json4s @@ -210,8 +210,8 @@ case object ConversionUtil { val className = "Microsoft Academic Graph classification" val classid = "MAG" - val p: List[StructuredProperty] = fieldOfStudy.subjects.flatMap(s => { - val s1 = createSP( + val p: List[Subject] = fieldOfStudy.subjects.flatMap(s => { + val s1 = createSubject( s.DisplayName, classid, className, @@ -219,10 +219,10 @@ case object ConversionUtil { ModelConstants.DNET_SUBJECT_TYPOLOGIES ) val di = DoiBoostMappingUtil.generateDataInfo(s.Score.toString) - var resList: List[StructuredProperty] = List(s1) + var resList: List[Subject] = List(s1) if (s.MainType.isDefined) { val maintp = s.MainType.get - val s2 = createSP( + val s2 = createSubject( s.MainType.get, classid, className, @@ -232,7 +232,7 @@ case object ConversionUtil { s2.setDataInfo(di) resList = resList ::: List(s2) if (maintp.contains(".")) { - val s3 = createSP( + val s3 = createSubject( maintp.split("\\.").head, classid, className, diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java index bfe4f6448..a640e2009 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/bulktag/EOSCTagJobTest.java @@ -290,7 +290,7 @@ public class EOSCTagJobTest { .stream() .anyMatch(s -> s.getCode().equals("EOSC::Jupyter Notebook"))); - List subjects = tmp + List subjects = tmp .filter(sw -> sw.getId().equals("50|od______1582::6e7a9b21a2feef45673890432af34244")) .collect() .get(0) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java index 7a3583289..6c156edb7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java @@ -27,6 +27,7 @@ public class CleaningRuleMap extends HashMap, SerializableConsumer cleanQualifier(vocabularies, (AccessRight) o)); mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o)); mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o)); + return mapping; } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java index aad3a8706..510f9c3a6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ResultMapper.java @@ -17,6 +17,7 @@ import eu.dnetlib.dhp.schema.dump.oaf.Instance; import eu.dnetlib.dhp.schema.dump.oaf.Measure; import eu.dnetlib.dhp.schema.dump.oaf.OpenAccessRoute; import eu.dnetlib.dhp.schema.dump.oaf.Result; +import eu.dnetlib.dhp.schema.dump.oaf.Subject; import eu.dnetlib.dhp.schema.dump.oaf.community.CfHbKeyValue; import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityInstance; import eu.dnetlib.dhp.schema.dump.oaf.community.CommunityResult; @@ -66,7 +67,7 @@ public class ResultMapper implements Serializable { final List contributorList = new ArrayList<>(); Optional .ofNullable(input.getContributor()) - .ifPresent(value -> value.stream().forEach(c -> contributorList.add(c.getValue()))); + .ifPresent(value -> value.forEach(c -> contributorList.add(c.getValue()))); out.setContributor(contributorList); Optional @@ -103,7 +104,7 @@ public class ResultMapper implements Serializable { final List coverageList = new ArrayList<>(); Optional .ofNullable(input.getCoverage()) - .ifPresent(value -> value.stream().forEach(c -> coverageList.add(c.getValue()))); + .ifPresent(value -> value.forEach(c -> coverageList.add(c.getValue()))); out.setCoverage(coverageList); out.setDateofcollection(input.getDateofcollection()); @@ -114,14 +115,12 @@ public class ResultMapper implements Serializable { .ifPresent(value -> value.forEach(d -> descriptionList.add(d.getValue()))); out.setDescription(descriptionList); Optional> oStr = Optional.ofNullable(input.getEmbargoenddate()); - if (oStr.isPresent()) { - out.setEmbargoenddate(oStr.get().getValue()); - } + oStr.ifPresent(stringField -> out.setEmbargoenddate(stringField.getValue())); final List formatList = new ArrayList<>(); Optional .ofNullable(input.getFormat()) - .ifPresent(value -> value.stream().forEach(f -> formatList.add(f.getValue()))); + .ifPresent(value -> value.forEach(f -> formatList.add(f.getValue()))); out.setFormat(formatList); out.setId(input.getId()); out.setOriginalId(new ArrayList<>()); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index e06327b2a..da1c764e8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -8,15 +8,7 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES; import static eu.dnetlib.dhp.schema.common.ModelConstants.REPOSITORY_PROVENANCE_ACTIONS; import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT; import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN; -import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.createOpenaireId; -import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.dataInfo; -import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.field; -import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.journal; -import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.keyValue; -import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.listFields; -import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.oaiIProvenance; -import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.qualifier; -import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty; +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; import java.util.*; @@ -29,26 +21,7 @@ import com.google.common.collect.Sets; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.AccessRight; -import eu.dnetlib.dhp.schema.oaf.Author; -import eu.dnetlib.dhp.schema.oaf.Context; -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.Dataset; -import eu.dnetlib.dhp.schema.oaf.Field; -import eu.dnetlib.dhp.schema.oaf.GeoLocation; -import eu.dnetlib.dhp.schema.oaf.Instance; -import eu.dnetlib.dhp.schema.oaf.Journal; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.OAIProvenance; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; -import eu.dnetlib.dhp.schema.oaf.Publication; -import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.dhp.schema.oaf.Result; -import eu.dnetlib.dhp.schema.oaf.Software; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; @@ -411,7 +384,7 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract List prepareTitles(Document doc, DataInfo info); - protected abstract List prepareSubjects(Document doc, DataInfo info); + protected abstract List prepareSubjects(Document doc, DataInfo info); protected abstract Qualifier prepareLanguages(Document doc); @@ -559,6 +532,22 @@ public abstract class AbstractMdRecordToOafMapper { return res; } + protected List prepareSubjectList( + final Node node, + final String xpath, + final DataInfo info) { + final List res = new ArrayList<>(); + for (final Object o : node.selectNodes(xpath)) { + final Node n = (Node) o; + res + .add( + subject( + n.getText(), n.valueOf("@classid"), n.valueOf("@classname"), n.valueOf("@schemeid"), + n.valueOf("@schemename"), info)); + } + return res; + } + protected OAIProvenance prepareOAIprovenance(final Document doc) { final Node n = doc.selectSingleNode("//*[local-name()='provenance']/*[local-name()='originDescription']"); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index b7afd3595..9225e174d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -84,8 +84,8 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { } @Override - protected List prepareSubjects(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//dc:subject", info); + protected List prepareSubjects(final Document doc, final DataInfo info) { + return prepareSubjectList(doc, "//dc:subject", info); } @Override diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 1bbeac9fb..101f74607 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -249,8 +249,8 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { } @Override - protected List prepareSubjects(final Document doc, final DataInfo info) { - return prepareListStructProps(doc, "//*[local-name()='subject']", info); + protected List prepareSubjects(final Document doc, final DataInfo info) { + return prepareSubjectList(doc, "//*[local-name()='subject']", info); } @Override diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala index c8e41743f..e333da1aa 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/oa/graph/resolution/ResolveEntitiesTest.scala @@ -59,7 +59,7 @@ class ResolveEntitiesTest extends Serializable { r.setId(id.toLowerCase.trim) r.setSubject( List( - OafMapperUtils.structuredProperty( + OafMapperUtils.subject( FAKE_SUBJECT, OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"), null @@ -250,7 +250,7 @@ class ResolveEntitiesTest extends Serializable { val r = new Result r.setSubject( List( - OafMapperUtils.structuredProperty( + OafMapperUtils.subject( FAKE_SUBJECT, OafMapperUtils.qualifier("fos", "fosCS", "fossSchema", "fossiFIgo"), null diff --git a/pom.xml b/pom.xml index 973bc3773..632ca99b8 100644 --- a/pom.xml +++ b/pom.xml @@ -801,7 +801,7 @@ 3.3.3 3.4.2 [2.12,3.0) - [2.12.2-SNAPSHOT] + [2.13.2-SNAPSHOT] [4.0.3] [6.0.5] [3.1.6] From b78889a0ce27a79c7ab2d8da05b118ee4f1bcb36 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 5 Aug 2022 09:11:37 +0200 Subject: [PATCH 04/20] WIP: cleaning of subjects --- .../dhp/common/vocabulary/Vocabulary.java | 6 ++++ .../common/vocabulary/VocabularyGroup.java | 7 +++++ .../dhp/oa/graph/clean/CleaningRuleMap.java | 29 +++++++++++++++---- 3 files changed, 37 insertions(+), 5 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java index b3eb98d4f..24a30500d 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java @@ -83,4 +83,10 @@ public class Vocabulary implements Serializable { .orElse(null); } + public Qualifier lookup(String id) { + return Optional + .ofNullable(getSynonymAsQualifier(id)) + .orElse(getTermAsQualifier(id)); + } + } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java index 1c129ff9c..8435b8bf3 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java @@ -81,6 +81,13 @@ public class VocabularyGroup implements Serializable { vocs.put(id.toLowerCase(), new Vocabulary(id, name)); } + public Optional find(final String vocId) { + return Optional + .ofNullable(vocId) + .map(String::toLowerCase) + .map(vocs::get); + } + public void addTerm(final String vocId, final String id, final String name) { if (vocabularyExists(vocId)) { vocs.get(vocId.toLowerCase()).addTerm(id, name); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java index 6c156edb7..2a4183f1b 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java @@ -4,15 +4,15 @@ package eu.dnetlib.dhp.oa.graph.clean; import java.io.Serializable; import java.util.HashMap; +import eu.dnetlib.dhp.common.vocabulary.Vocabulary; +import eu.dnetlib.dhp.schema.oaf.*; import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.AccessRight; -import eu.dnetlib.dhp.schema.oaf.Country; -import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.Relation; + +import javax.jws.WebParam; public class CleaningRuleMap extends HashMap, SerializableConsumer> implements Serializable { @@ -27,10 +27,29 @@ public class CleaningRuleMap extends HashMap, SerializableConsumer cleanQualifier(vocabularies, (AccessRight) o)); mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o)); mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o)); - + mapping.put(Subject.class, o -> cleanSubject(vocabularies, (Subject) o)); return mapping; } + private static void cleanSubject(VocabularyGroup vocabularies, Subject s) { + // TODO cleaning based on different subject vocabs can be added here + cleanSubjectForVocabulary(ModelConstants.DNET_SUBJECT_FOS_CLASSID, vocabularies, s); + } + + private static void cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies, Subject s) { + vocabularies.find(vocabularyId).ifPresent(vocabulary -> { + if (!ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(s.getQualifier().getClassid())) { + return; + } + Qualifier newValue = vocabulary.lookup(s.getValue()); + if (!s.getValue().equals(newValue.getClassid())) { + s.setValue(newValue.getClassid()); + s.getQualifier().setClassid(vocabularyId); + s.getQualifier().setClassname(vocabulary.getName()); + } + }); + } + private static void cleanRelation(VocabularyGroup vocabularies, Relation r) { if (vocabularies.vocabularyExists(ModelConstants.DNET_RELATION_SUBRELTYPE)) { Qualifier newValue = vocabularies.lookup(ModelConstants.DNET_RELATION_SUBRELTYPE, r.getSubRelType()); From 32cee1f619eb30d2e2ac6083435b76b1aba7db09 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 5 Aug 2022 12:32:08 +0200 Subject: [PATCH 05/20] WIP: cleaning of subjects --- .../dhp/common/vocabulary/Vocabulary.java | 4 ++-- .../common/vocabulary/VocabularyGroup.java | 6 +++--- .../dhp/oa/graph/clean/CleaningRuleMap.java | 8 ++++---- .../clean/GraphCleaningFunctionsTest.java | 11 ++++++++++ .../eu/dnetlib/dhp/oa/graph/clean/result.json | 20 +++++++++---------- .../dnetlib/dhp/oa/graph/clean/synonyms.txt | 4 +++- .../eu/dnetlib/dhp/oa/graph/clean/terms.txt | 5 ++++- 7 files changed, 37 insertions(+), 21 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java index 24a30500d..3a8df5c9e 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java @@ -85,8 +85,8 @@ public class Vocabulary implements Serializable { public Qualifier lookup(String id) { return Optional - .ofNullable(getSynonymAsQualifier(id)) - .orElse(getTermAsQualifier(id)); + .ofNullable(getSynonymAsQualifier(id)) + .orElse(getTermAsQualifier(id)); } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java index 8435b8bf3..fc7175270 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java @@ -83,9 +83,9 @@ public class VocabularyGroup implements Serializable { public Optional find(final String vocId) { return Optional - .ofNullable(vocId) - .map(String::toLowerCase) - .map(vocs::get); + .ofNullable(vocId) + .map(String::toLowerCase) + .map(vocs::get); } public void addTerm(final String vocId, final String id, final String name) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java index 2a4183f1b..894d5d059 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java @@ -4,15 +4,15 @@ package eu.dnetlib.dhp.oa.graph.clean; import java.io.Serializable; import java.util.HashMap; -import eu.dnetlib.dhp.common.vocabulary.Vocabulary; -import eu.dnetlib.dhp.schema.oaf.*; +import javax.jws.WebParam; + import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer; +import eu.dnetlib.dhp.common.vocabulary.Vocabulary; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; - -import javax.jws.WebParam; +import eu.dnetlib.dhp.schema.oaf.*; public class CleaningRuleMap extends HashMap, SerializableConsumer> implements Serializable { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index 7c39efb40..f4c4581b1 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -251,6 +251,17 @@ public class GraphCleaningFunctionsTest { pid.getQualifier().getClassname())); }); + assertNotNull(p_cleaned.getSubject()); + + List fos_subjects = p_cleaned + .getSubject() + .stream() + .filter(s -> ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())) + .collect(Collectors.toList()); + + assertNotNull(fos_subjects); + assertEquals(2, fos_subjects.size()); + // TODO add more assertions to verity the cleaned values System.out.println(MAPPER.writeValueAsString(p_cleaned)); } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index 78fdc4c9d..ea63bba28 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -743,12 +743,12 @@ "trust": "0.9" }, "qualifier": { - "classid": "", - "classname": "", - "schemeid": "", - "schemename": "" + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" }, - "value": "infrared detectors" + "value": "FOS: Mathematics" }, { "dataInfo": { @@ -765,12 +765,12 @@ "trust": "0.9" }, "qualifier": { - "classid": "", - "classname": "", - "schemeid": "", - "schemename": "" + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" }, - "value": "lens antennas" + "value": "FOS: Computer and information sciences" }, { "dataInfo": { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt index 39ed0cef1..409dfd5dc 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt @@ -1243,4 +1243,6 @@ dnet:relation_relClass @=@ IsSupplementTo @=@ isSupplementTo dnet:relation_relClass @=@ IsSupplementedBy @=@ isSupplementedBy dnet:relation_relClass @=@ IsRelatedTo @=@ isRelatedTo dnet:relation_subRelType @=@ relationship @=@ publicationDataset -dnet:provenanceActions @=@ iis @=@ erroneous label to be cleaned \ No newline at end of file +dnet:provenanceActions @=@ iis @=@ erroneous label to be cleaned +FOS @=@ 0101 mathematics @=@ FOS: Mathematics +FOS @=@ 0102 computer and information sciences @=@ FOS: Computer and information sciences \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt index 59311d5a7..83ca81670 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt @@ -1117,4 +1117,7 @@ dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ relationship @=@ relat dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ review @=@ review dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ similarity @=@ similarity dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ supplement @=@ supplement -dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version \ No newline at end of file +dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version +FOS @=@ Fields of Science and Technology classification @=@ 0101 mathematics @=@ 0101 mathematics +FOS @=@ Fields of Science and Technology classification @=@ 0102 computer and information sciences @=@ 0102 computer and information sciences +FOS @=@ Fields of Science and Technology classification @=@ 0103 physical sciences @=@ 0103 physical sciences \ No newline at end of file From 4eaa063b1f03f3b855ddc6687c905f6b269b1756 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 5 Aug 2022 16:56:09 +0200 Subject: [PATCH 06/20] cleaning of subjects --- .../dhp/oa/graph/clean/CleaningRuleMap.java | 32 +++++++++++-------- .../clean/GraphCleaningFunctionsTest.java | 14 ++++++++ 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java index 894d5d059..c650400af 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java @@ -3,13 +3,12 @@ package eu.dnetlib.dhp.oa.graph.clean; import java.io.Serializable; import java.util.HashMap; +import java.util.concurrent.atomic.AtomicReference; -import javax.jws.WebParam; - +import org.apache.commons.lang3.SerializationUtils; import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer; -import eu.dnetlib.dhp.common.vocabulary.Vocabulary; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; @@ -31,23 +30,30 @@ public class CleaningRuleMap extends HashMap, SerializableConsumer modified = new AtomicReference<>(false); vocabularies.find(vocabularyId).ifPresent(vocabulary -> { - if (!ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(s.getQualifier().getClassid())) { + if (!ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) { return; } - Qualifier newValue = vocabulary.lookup(s.getValue()); - if (!s.getValue().equals(newValue.getClassid())) { - s.setValue(newValue.getClassid()); - s.getQualifier().setClassid(vocabularyId); - s.getQualifier().setClassname(vocabulary.getName()); + Qualifier newValue = vocabulary.lookup(subject.getValue()); + if (!subject.getValue().equals(newValue.getClassid())) { + subject.setValue(newValue.getClassid()); + subject.getQualifier().setClassid(vocabularyId); + subject.getQualifier().setClassname(vocabulary.getName()); + modified.set(true); } }); + return modified.get(); } private static void cleanRelation(VocabularyGroup vocabularies, Relation r) { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index f4c4581b1..0bfee1151 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -262,6 +262,20 @@ public class GraphCleaningFunctionsTest { assertNotNull(fos_subjects); assertEquals(2, fos_subjects.size()); + assertTrue( + fos_subjects + .stream() + .anyMatch( + s -> "0101 mathematics".equals(s.getValue()) & + ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))); + + assertTrue( + fos_subjects + .stream() + .anyMatch( + s -> "0102 computer and information sciences".equals(s.getValue()) & + ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))); + // TODO add more assertions to verity the cleaned values System.out.println(MAPPER.writeValueAsString(p_cleaned)); } From 3418ce50ac9b28fed4fa949919e6c8208738cdcf Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 8 Aug 2022 12:48:47 +0200 Subject: [PATCH 07/20] cleaning of subjects: perform the cleaning when the given value is equivalent to one of the terms in the vocabulary --- .../dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java | 3 ++- .../eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java | 2 +- .../dhp/oa/graph/clean/GraphCleaningFunctionsTest.java | 2 +- .../eu/dnetlib/dhp/oa/graph/clean/result.json | 10 +++++----- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java index d58b354ab..6ba7d70f1 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java @@ -11,6 +11,7 @@ import java.util.function.Function; import java.util.function.Predicate; import java.util.stream.Collectors; +import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.schema.common.AccessRightComparator; @@ -141,7 +142,7 @@ public class OafMapperUtils { } public static Qualifier unknown(final String schemeid, final String schemename) { - return qualifier("UNKNOWN", "Unknown", schemeid, schemename); + return qualifier(UNKNOWN, "Unknown", schemeid, schemename); } public static AccessRight accessRight( diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java index c650400af..147e26699 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java @@ -46,7 +46,7 @@ public class CleaningRuleMap extends HashMap, SerializableConsumer Date: Fri, 12 Aug 2022 15:09:16 +0200 Subject: [PATCH 08/20] cleaning of subjects: avoid duplicated subjects, prioritise collected vs inferred or other sources --- .../oaf/utils/GraphCleaningFunctions.java | 29 +++++++----- .../dhp/schema/oaf/utils/OafMapperUtils.java | 13 +++++- .../utils/SubjectProvenanceComparator.java | 46 +++++++++++++++++++ .../clean/GraphCleaningFunctionsTest.java | 6 ++- .../eu/dnetlib/dhp/oa/graph/clean/result.json | 22 +++++++++ 5 files changed, 101 insertions(+), 15 deletions(-) create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/SubjectProvenanceComparator.java diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 151c53685..775f228eb 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -1,6 +1,8 @@ package eu.dnetlib.dhp.schema.oaf.utils; +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance; + import java.time.LocalDate; import java.time.ZoneId; import java.time.format.DateTimeFormatter; @@ -16,7 +18,6 @@ import com.github.sisyphsu.dateparser.DateParserUtils; import com.google.common.collect.Lists; import com.google.common.collect.Sets; -import eu.dnetlib.dhp.common.vocabulary.Vocabulary; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; @@ -191,8 +192,8 @@ public class GraphCleaningFunctions extends CleaningFunctions { qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES)); } if (Objects.nonNull(r.getSubject())) { - r - .setSubject( + List subjects = Lists + .newArrayList( r .getSubject() .stream() @@ -201,7 +202,18 @@ public class GraphCleaningFunctions extends CleaningFunctions { .filter(sp -> Objects.nonNull(sp.getQualifier())) .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) .map(GraphCleaningFunctions::cleanValue) - .collect(Collectors.toList())); + .collect( + Collectors + .toMap( + s -> Optional + .ofNullable(s.getQualifier()) + .map(q -> q.getClassid() + s.getValue()) + .orElse(s.getValue()), + Function.identity(), + (s1, s2) -> Collections + .min(Lists.newArrayList(s1, s1), new SubjectProvenanceComparator()))) + .values()); + r.setSubject(subjects); } if (Objects.nonNull(r.getTitle())) { r @@ -382,14 +394,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { .filter(p -> StringUtils.isNotBlank(p.getValue())) .map(p -> { // hack to distinguish orcid from orcid_pending - String pidProvenance = Optional - .ofNullable(p.getDataInfo()) - .map( - d -> Optional - .ofNullable(d.getProvenanceaction()) - .map(Qualifier::getClassid) - .orElse("")) - .orElse(""); + String pidProvenance = getProvenance(p.getDataInfo()); if (p .getQualifier() .getClassid() diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java index 6ba7d70f1..c58096d35 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java @@ -11,10 +11,10 @@ import java.util.function.Function; import java.util.function.Predicate; import java.util.stream.Collectors; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.lang3.StringUtils; import eu.dnetlib.dhp.schema.common.AccessRightComparator; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; @@ -503,4 +503,15 @@ public class OafMapperUtils { rel.setProperties(properties); return rel; } + + public static String getProvenance(DataInfo dataInfo) { + return Optional + .ofNullable(dataInfo) + .map( + d -> Optional + .ofNullable(d.getProvenanceaction()) + .map(Qualifier::getClassid) + .orElse("")) + .orElse(""); + } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/SubjectProvenanceComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/SubjectProvenanceComparator.java new file mode 100644 index 000000000..f4e3c8841 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/SubjectProvenanceComparator.java @@ -0,0 +1,46 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance; +import static org.apache.commons.lang3.StringUtils.isBlank; + +import java.util.Comparator; + +import eu.dnetlib.dhp.schema.oaf.Subject; + +public class SubjectProvenanceComparator implements Comparator { + + @Override + public int compare(Subject left, Subject right) { + + String lProv = getProvenance(left.getDataInfo()); + String rProv = getProvenance(right.getDataInfo()); + + if (isBlank(lProv) && isBlank(rProv)) + return 0; + if (isBlank(lProv)) + return 1; + if (isBlank(rProv)) + return -1; + if (lProv.equals(rProv)) + return 0; + if (lProv.toLowerCase().contains("crosswalk")) + return -1; + if (rProv.toLowerCase().contains("crosswalk")) + return 1; + if (lProv.toLowerCase().contains("user")) + return -1; + if (rProv.toLowerCase().contains("user")) + return 1; + if (lProv.toLowerCase().contains("propagation")) + return -1; + if (rProv.toLowerCase().contains("propagation")) + return 1; + if (lProv.toLowerCase().contains("iis")) + return -1; + if (rProv.toLowerCase().contains("iis")) + return 1; + + return 0; + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index 0289d62b4..c6222af14 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -260,14 +260,16 @@ public class GraphCleaningFunctionsTest { .collect(Collectors.toList()); assertNotNull(fos_subjects); - assertEquals(3, fos_subjects.size()); + assertEquals(2, fos_subjects.size()); assertTrue( fos_subjects .stream() .anyMatch( s -> "0101 mathematics".equals(s.getValue()) & - ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))); + ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) & + "sysimport:crosswalk:datasetarchive".equals(s.getDataInfo().getProvenanceaction().getClassid()) + )); assertTrue( fos_subjects diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json index a5ba747c4..8e4fc4545 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/result.json @@ -794,6 +794,28 @@ }, "value": "0101 mathematics" }, + { + "dataInfo": { + "deletedbyinference": false, + "inferenceprovenance": "", + "inferred": false, + "invisible": false, + "provenanceaction": { + "classid": "subject:fos", + "classname": "subject:fos", + "schemeid": "dnet:provenanceActions", + "schemename": "dnet:provenanceActions" + }, + "trust": "0.9" + }, + "qualifier": { + "classid": "keyword", + "classname": "keyword", + "schemeid": "dnet:subject_classification_typologies", + "schemename": "dnet:subject_classification_typologies" + }, + "value": "0101 mathematics" + }, { "dataInfo": { "deletedbyinference": false, From 2b5f8c9c9a3611c57ee5febfe262a455a39ad801 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Tue, 6 Sep 2022 12:27:53 +0300 Subject: [PATCH 09/20] comment out duplicate table creation --- .../scripts/step16-createIndicatorsTables.sql | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 2581f1d64..b13eae0aa 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -179,17 +179,17 @@ from publication_datasources pd compute stats indi_pub_diamond; -create table indi_pub_hybrid stored as parquet as -select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid -from publication_datasources pd - left outer join ( - select pd.id, 1 as is_hybrid from publication_datasources pd - join datasource d on d.id=pd.datasource - join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) - and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp - on pd.id=tmp.id; - -compute stats indi_pub_hybrid; +--create table indi_pub_hybrid stored as parquet as +--select distinct pd.id, coalesce(is_hybrid, 0) as is_hybrid +--from publication_datasources pd +-- left outer join ( +-- select pd.id, 1 as is_hybrid from publication_datasources pd +-- join datasource d on d.id=pd.datasource +-- join stats_ext.plan_s_jn ps where (ps.issn_print=d.issn_printed and ps.issn_online=d.issn_online) +-- and (ps.journal_is_in_doaj=false and ps.journal_is_oa=false)) tmp +-- on pd.id=tmp.id; +-- +--compute stats indi_pub_hybrid; create table indi_pub_in_transformative stored as parquet as select distinct pd.id, coalesce(is_transformative, 0) as is_transformative From 71b069ca90a2f7ec09d64241c60917d3636fc81e Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Fri, 9 Sep 2022 13:15:58 +0300 Subject: [PATCH 10/20] Changes to indicator and monitor scripts --- .../scripts/step16-createIndicatorsTables.sql | 16 ++++++++-------- .../oozie_app/scripts/step20-createMonitorDB.sql | 5 ++++- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index b13eae0aa..0f294145c 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -564,12 +564,12 @@ create table indi_org_fairness stored as parquet as (select ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro join result r on r.id=ro.id --join result_pids rp on r.id=rp.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and year>2003 + where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 group by ro.organization), --return all results group by organization allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro join result r on r.id=ro.id - where year>2003 + where cast(year as int)>2003 group by organization) --return results_fair/all_results select allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness @@ -638,11 +638,11 @@ create table indi_org_fairness_year stored as parquet as (select year, ro.organization organization, count(distinct ro.id) no_result_fair from result_organization ro join result r on r.id=ro.id join result_pids rp on r.id=rp.id - where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and year>2003 + where (title is not null) and (publisher is not null) and (abstract is true) and (year is not null) and (authors>0) and cast(year as int)>2003 group by ro.organization, year), allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro join result r on r.id=ro.id - where year>2003 + where cast(year as int)>2003 group by organization, year) --return results_fair/all_results select allresults.year, allresults.organization, result_fair.no_result_fair/allresults.no_allresults org_fairness @@ -657,12 +657,12 @@ create table indi_org_findable_year stored as parquet as (select year, ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro join result_pids rp on rp.id=ro.id join result r on r.id=rp.id - where year >2003 + where cast(year as int) >2003 group by ro.organization, year), --return all results group by organization,year allresults as (select year, organization, count(distinct ro.id) no_allresults from result_organization ro join result r on r.id=ro.id - where year >2003 + where cast(year as int) >2003 group by organization, year) --return results_with_pid/all_results select allresults.year, allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable @@ -677,12 +677,12 @@ create table indi_org_findable stored as parquet as (select ro.organization organization, count(distinct rp.id) no_result_with_pid from result_organization ro join result_pids rp on rp.id=ro.id join result r on r.id=rp.id - where year >2003 + where cast(year as int) >2003 group by ro.organization), --return all results group by organization allresults as (select organization, count(distinct ro.id) no_allresults from result_organization ro join result r on r.id=ro.id - where year >2003 + where cast(year as int) >2003 group by organization) --return results_with_pid/all_results select allresults.organization, result_with_pid.no_result_with_pid/allresults.no_allresults org_findable diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index d78895198..572b70ef7 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -45,7 +45,10 @@ create table TARGET.result stored as parquet as 'openorgs____::15e7921fc50d9aa1229a82a84429419e', -- University Of Thessaly 'openorgs____::11f7919dadc8f8a7251af54bba60c956', -- Technical University of Crete 'openorgs____::84f0c5f5dbb6daf42748485924efde4b', -- University of Piraeus - 'openorgs____::4ac562f0376fce3539504567649cb373' -- University of Patras + 'openorgs____::4ac562f0376fce3539504567649cb373', -- University of Patras + 'openorgs____::3e8d1f8c3f6cd7f418b09f1f58b4873b', -- Aristotle University of Thessaloniki + 'openorgs____::3fcef6e1c469c10f2a84b281372c9814', -- World Bank + 'openorgs____::1698a2eb1885ef8adb5a4a969e745ad3' -- École des Ponts ParisTech ) )) foo; compute stats TARGET.result; From ff6f789b6d9be0567b6ad72f8a0e75fe3f52726a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 9 Sep 2022 15:16:31 +0200 Subject: [PATCH 11/20] code formatting --- .../java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java | 2 +- .../dhp/oa/graph/clean/GraphCleaningFunctionsTest.java | 4 ++-- .../test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java | 3 ++- .../dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java | 5 +---- pom.xml | 2 +- 5 files changed, 7 insertions(+), 9 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index 24ec5d0ef..d6bfe6714 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -179,7 +179,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim()); } for (final Object o : doc - .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) { + .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) { url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim()); } for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index c6222af14..6c43da832 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -268,8 +268,8 @@ public class GraphCleaningFunctionsTest { .anyMatch( s -> "0101 mathematics".equals(s.getValue()) & ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) & - "sysimport:crosswalk:datasetarchive".equals(s.getDataInfo().getProvenanceaction().getClassid()) - )); + "sysimport:crosswalk:datasetarchive" + .equals(s.getDataInfo().getProvenanceaction().getClassid()))); assertTrue( fos_subjects diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 955ef1d42..67406e794 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -928,7 +928,8 @@ class MappersTest { @Test void testROHub2() throws IOException, DocumentException { - final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("rohub-modified.xml"))); + final String xml = IOUtils + .toString(Objects.requireNonNull(getClass().getResourceAsStream("rohub-modified.xml"))); final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); System.out.println("***************"); System.out.println(new ObjectMapper().writeValueAsString(list)); diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java index be5dda411..e0fbb2a2f 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java @@ -85,7 +85,7 @@ public class IndexRecordTransformerTest { public void testRiunet() throws IOException, TransformerException { final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, - XmlConverterJob.schemaLocation); + XmlConverterJob.schemaLocation); final Publication p = load("riunet.json", Publication.class); @@ -95,7 +95,6 @@ public class IndexRecordTransformerTest { testRecordTransformation(record); } - @Test public void testForEOSCFutureDataTransferPilot() throws IOException, TransformerException { final String record = IOUtils.toString(getClass().getResourceAsStream("eosc-future/data-transfer-pilot.xml")); @@ -129,8 +128,6 @@ public class IndexRecordTransformerTest { testRecordTransformation(record); } - - @Test void testDoiUrlNormalization() throws MalformedURLException { diff --git a/pom.xml b/pom.xml index 632ca99b8..ab59e7be3 100644 --- a/pom.xml +++ b/pom.xml @@ -801,7 +801,7 @@ 3.3.3 3.4.2 [2.12,3.0) - [2.13.2-SNAPSHOT] + [3.14.0] [4.0.3] [6.0.5] [3.1.6] From 27af5122d2d7f7d6d549dc23a5bf79a0c0030d1c Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Mon, 12 Sep 2022 14:25:23 +0200 Subject: [PATCH 12/20] logs for non well formed XML files --- .../raw/AbstractMdRecordToOafMapper.java | 62 +++++++++------- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 12 ++++ .../dhp/oa/graph/raw/oaf_notwellformed.xml | 70 +++++++++++++++++++ .../src/test/resources/log4j.properties | 8 +++ 4 files changed, 125 insertions(+), 27 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_notwellformed.xml create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/log4j.properties diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index a05faab80..e263cffa8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -22,6 +22,8 @@ import java.util.*; import org.apache.commons.lang3.StringUtils; import org.dom4j.*; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import com.google.common.collect.Lists; import com.google.common.collect.Sets; @@ -77,6 +79,8 @@ public abstract class AbstractMdRecordToOafMapper { protected static final Map nsContext = new HashMap<>(); + private static final Logger log = LoggerFactory.getLogger(AbstractMdRecordToOafMapper.class); + static { nsContext.put("dr", "http://www.driver-repository.eu/namespace/dr"); nsContext.put("dri", "http://www.driver-repository.eu/namespace/dri"); @@ -106,37 +110,41 @@ public abstract class AbstractMdRecordToOafMapper { public List processMdRecord(final String xml) throws DocumentException { DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); + try { + final Document doc = DocumentHelper + .parseText( + xml + .replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3) + .replaceAll(DATACITE_SCHEMA_KERNEL_4_SLASH, DATACITE_SCHEMA_KERNEL_3) + .replaceAll(DATACITE_SCHEMA_KERNEL_3_SLASH, DATACITE_SCHEMA_KERNEL_3)); - final Document doc = DocumentHelper - .parseText( - xml - .replaceAll(DATACITE_SCHEMA_KERNEL_4, DATACITE_SCHEMA_KERNEL_3) - .replaceAll(DATACITE_SCHEMA_KERNEL_4_SLASH, DATACITE_SCHEMA_KERNEL_3) - .replaceAll(DATACITE_SCHEMA_KERNEL_3_SLASH, DATACITE_SCHEMA_KERNEL_3)); + final KeyValue collectedFrom = getProvenanceDatasource( + doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); - final KeyValue collectedFrom = getProvenanceDatasource( - doc, "//oaf:collectedFrom/@id", "//oaf:collectedFrom/@name"); + if (collectedFrom == null) { + return Lists.newArrayList(); + } - if (collectedFrom == null) { - return Lists.newArrayList(); + final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) + ? collectedFrom + : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); + + if (hostedBy == null) { + return Lists.newArrayList(); + } + + final DataInfo info = prepareDataInfo(doc, invisible); + final long lastUpdateTimestamp = new Date().getTime(); + + final List instances = prepareInstances(doc, info, collectedFrom, hostedBy); + + final String type = getResultType(doc, instances); + + return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); + } catch (DocumentException e) { + log.error("Error with record:\n" + xml); + throw e; } - - final KeyValue hostedBy = StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id")) - ? collectedFrom - : getProvenanceDatasource(doc, "//oaf:hostedBy/@id", "//oaf:hostedBy/@name"); - - if (hostedBy == null) { - return Lists.newArrayList(); - } - - final DataInfo info = prepareDataInfo(doc, invisible); - final long lastUpdateTimestamp = new Date().getTime(); - - final List instances = prepareInstances(doc, info, collectedFrom, hostedBy); - - final String type = getResultType(doc, instances); - - return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); } protected String getResultType(final Document doc, final List instances) { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index b75f626d6..c4e34a9a8 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -12,6 +12,7 @@ import java.util.Objects; import java.util.Optional; import java.util.stream.Collectors; +import kotlin.jvm.Throws; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.dom4j.DocumentException; @@ -926,6 +927,17 @@ class MappersTest { // assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); } + @Test + void testNotWellFormed() throws IOException, DocumentException { + final String xml = IOUtils + .toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml"))); + final DocumentException generalEx = new DocumentException(); + + DocumentException exception = assertThrows(DocumentException.class, () -> { + new OafToOafMapper(vocs, false, true).processMdRecord(xml); + }); + } + private void assertValidId(final String id) { // System.out.println(id); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_notwellformed.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_notwellformed.xml new file mode 100644 index 000000000..09384054e --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/oaf_notwellformed.xml @@ -0,0 +1,70 @@ + + +
+ jairo_______::000012e58ed836576ef2a0d38b0f726f + oai:irdb.nii.ac.jp:01221:0000010198 + + + + + + 2021-05-10T11:31:09.424Z + 2021-06-03T01:45:42.536Z + jairo_______ +
+ + 多項式GCDを用いた復号法に関する研究 + 上原, 剛 + 甲斐, 博 + 野田, 松太郎 + application/pdf + http://hdl.handle.net/2433/25934 + jpn + 京都大学数理解析研究所 + 410 + Departmental Bulletin Paper + 0014 + 2004-10-01 + + openaire____::554c7c2873 + OPEN + + + 2433/25934 + AN00061013 + http://hdl.handle.net/2433/25934 + http://repository.kulib.kyoto-u.ac.jp/dspace/bitstream/2433/25934/1/1395-16.pdf + 数理解析研究所講究録 + + + + + https%3A%2F%2Firdb.nii.ac.jp%2Foai + oai:irdb.nii.ac.jp:01221:0000010198 + 2021-04-13T13:36:29Z + + + http://repository.kulib.kyoto-u.ac.jp/dspace-oai/request + oai:repository.kulib.kyoto-u.ac.jp:2433/25934 + 2012-07-12T14:15:41Z + http://irdb.nii.ac.jp/oai + + + + + false + false + 0.9 + + + + +
\ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/log4j.properties b/dhp-workflows/dhp-graph-mapper/src/test/resources/log4j.properties new file mode 100644 index 000000000..71255bb77 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/log4j.properties @@ -0,0 +1,8 @@ +# Root logger option +log4j.rootLogger=DEBUG, stdout + +# Direct log messages to stdout +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.Target=System.out +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n \ No newline at end of file From b99a01134545beb75cf3a4114575be61c9c75486 Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Tue, 13 Sep 2022 11:51:55 +0200 Subject: [PATCH 13/20] return empty Oaf list if record cannot be parsed --- .../dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java | 2 +- .../test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index e263cffa8..984254665 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -143,7 +143,7 @@ public abstract class AbstractMdRecordToOafMapper { return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); } catch (DocumentException e) { log.error("Error with record:\n" + xml); - throw e; + return Lists.newArrayList(); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index c4e34a9a8..8165ad757 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -931,11 +931,8 @@ class MappersTest { void testNotWellFormed() throws IOException, DocumentException { final String xml = IOUtils .toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml"))); - final DocumentException generalEx = new DocumentException(); + assertEquals(0, new OafToOafMapper(vocs, false, true).processMdRecord(xml).size()); - DocumentException exception = assertThrows(DocumentException.class, () -> { - new OafToOafMapper(vocs, false, true).processMdRecord(xml); - }); } private void assertValidId(final String id) { From a0919ed495c61da8d4cecbfa4c322d8f9c95176a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 14 Sep 2022 13:27:39 +0200 Subject: [PATCH 14/20] [aggregator graph] save invalid records aside for further inspection --- .../raw/AbstractMdRecordToOafMapper.java | 2 +- .../raw/GenerateEntitiesApplication.java | 37 ++++++++++++++++++- .../graph/generate_entities_parameters.json | 6 +++ .../oa/graph/raw_all/oozie_app/workflow.xml | 2 + .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 2 - .../raw/MigrateDbEntitiesApplicationTest.java | 2 +- 6 files changed, 45 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 984254665..cdc707084 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -107,7 +107,7 @@ public abstract class AbstractMdRecordToOafMapper { this.forceOriginalId = false; } - public List processMdRecord(final String xml) throws DocumentException { + public List processMdRecord(final String xml) { DocumentFactory.getInstance().setXPathNamespaceURIs(nsContext); try { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index 6bb18c375..e9de43f7f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -16,6 +16,9 @@ import org.apache.hadoop.io.compress.GzipCodec; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SparkSession; import org.dom4j.DocumentException; import org.slf4j.Logger; @@ -76,6 +79,9 @@ public class GenerateEntitiesApplication { final String targetPath = parser.get("targetPath"); log.info("targetPath: {}", targetPath); + final String invalidPath = parser.get("invalidPath"); + log.info("invalidPath: {}", invalidPath); + final String isLookupUrl = parser.get("isLookupUrl"); log.info("isLookupUrl: {}", isLookupUrl); @@ -97,7 +103,8 @@ public class GenerateEntitiesApplication { final SparkConf conf = new SparkConf(); runWithSparkSession(conf, isSparkSessionManaged, spark -> { HdfsSupport.remove(targetPath, spark.sparkContext().hadoopConfiguration()); - generateEntities(spark, vocs, sourcePaths, targetPath, shouldHashId, mode); + HdfsSupport.remove(invalidPath, spark.sparkContext().hadoopConfiguration()); + generateEntities(spark, vocs, sourcePaths, targetPath, invalidPath, shouldHashId, mode); }); } @@ -106,6 +113,7 @@ public class GenerateEntitiesApplication { final VocabularyGroup vocs, final String sourcePaths, final String targetPath, + final String invalidPath, final boolean shouldHashId, final Mode mode) { @@ -121,6 +129,19 @@ public class GenerateEntitiesApplication { JavaRDD inputRdd = sc.emptyRDD(); for (final String sp : existingSourcePaths) { + RDD invalidRecords = sc + .sequenceFile(sp, Text.class, Text.class) + .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) + .map(k -> tryApplyMapping(k._1(), k._2(), shouldHashId, vocs)) + .filter(Objects::nonNull) + .rdd(); + spark + .createDataset(invalidRecords, Encoders.STRING()) + .write() + .mode(SaveMode.Append) + .option("compression", "gzip") + .text(invalidPath); + inputRdd = inputRdd .union( sc @@ -159,7 +180,7 @@ public class GenerateEntitiesApplication { final String id, final String s, final boolean shouldHashId, - final VocabularyGroup vocs) throws DocumentException { + final VocabularyGroup vocs) { final String type = StringUtils.substringAfter(id, ":"); switch (type.toLowerCase()) { @@ -196,6 +217,18 @@ public class GenerateEntitiesApplication { } } + private static String tryApplyMapping( + final String id, + final String s, + final boolean shouldHashId, + final VocabularyGroup vocs) { + + if (convertToListOaf(id, s, shouldHashId, vocs).isEmpty()) { + return s; + } + return null; + } + private static Oaf convertFromJson(final String s, final Class clazz) { try { return OBJECT_MAPPER.readValue(s, clazz); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json index 52cbbf45f..da6730fbb 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json @@ -17,6 +17,12 @@ "paramDescription": "the path of the target file", "paramRequired": true }, + { + "paramName": "i", + "paramLongName": "invalidPath", + "paramDescription": "the path of the invalid records file", + "paramRequired": false + }, { "paramName": "isu", "paramLongName": "isLookupUrl", diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml index c6cc46c0f..d00232e9a 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml @@ -468,6 +468,7 @@ --sourcePaths${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims --targetPath${workingDir}/entities_claim + --invalidPath${workingDir}/invalid_records_claim --isLookupUrl${isLookupUrl} --shouldHashId${shouldHashId} --modeclaim @@ -517,6 +518,7 @@ --sourcePaths${contentPath}/db_openaire,${contentPath}/db_openorgs,${contentPath}/oaf_records,${contentPath}/odf_records,${contentPath}/oaf_records_hdfs,${contentPath}/odf_records_hdfs,${contentPath}/oaf_records_invisible --targetPath${workingDir}/entities + --invalidPath${workingDir}/invalid_records --isLookupUrl${isLookupUrl} --shouldHashId${shouldHashId} diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 8165ad757..204649633 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -12,7 +12,6 @@ import java.util.Objects; import java.util.Optional; import java.util.stream.Collectors; -import kotlin.jvm.Throws; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.dom4j.DocumentException; @@ -22,7 +21,6 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; -import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java index 408196665..11947dbe7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java @@ -18,7 +18,6 @@ import java.util.Objects; import java.util.Optional; import java.util.stream.Collectors; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.junit.jupiter.api.BeforeEach; @@ -32,6 +31,7 @@ import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; From 3bf3127251477ea0a1ea5a3d422c55403aaf28b5 Mon Sep 17 00:00:00 2001 From: dimitrispie Date: Wed, 14 Sep 2022 16:36:19 +0300 Subject: [PATCH 15/20] Changes to monitor and indicator scripts --- .../oozie_app/scripts/step16-createIndicatorsTables.sql | 6 +++--- .../stats/oozie_app/scripts/step16_1-definitions.sql | 6 +++--- .../stats/oozie_app/scripts/step20-createMonitorDB.sql | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql index 0f294145c..b8792ae61 100755 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql @@ -76,11 +76,11 @@ compute stats indi_result_with_orcid; ---- Sprint 3 ---- create table indi_funded_result_with_fundref stored as parquet as -select distinct r.id, coalesce(fundref, 0) as fundref +select distinct r.result as id, coalesce(fundref, 0) as fundref from project_results r - left outer join (select distinct id, 1 as fundref from project_results + left outer join (select distinct result, 1 as fundref from project_results where provenance='Harvested') tmp - on r.id= tmp.id; + on r.result= tmp.result; compute stats indi_funded_result_with_fundref; diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql index 88c1ece78..41c95758c 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16_1-definitions.sql @@ -3,20 +3,20 @@ ---------------------------------------------------- -- Peer reviewed: -create table ${stats_db_name}.result_peerreviewed STORED AS PARQUET as +create table IF NOT EXISTS ${stats_db_name}.result_peerreviewed STORED AS PARQUET as select r.id as id, case when doi.doi_from_crossref=1 and grey.grey_lit=0 then true else false end as peer_reviewed from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_doi_from_crossref doi on doi.id=r.id left outer join ${stats_db_name}.indi_pub_grey_lit grey on grey.id=r.id; -- Green OA: -create table ${stats_db_name}.result_greenoa STORED AS PARQUET as +create table IF NOT EXISTS ${stats_db_name}.result_greenoa STORED AS PARQUET as select r.id, case when green.green_oa=1 then true else false end as green from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_green_oa green on green.id=r.id; -- GOLD OA: -create table ${stats_db_name}.result_gold STORED AS PARQUET as +create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as select r.id, case when gold.is_gold=1 then true else false end as gold from ${stats_db_name}.result r left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; \ No newline at end of file diff --git a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql index 572b70ef7..5af2e9edd 100644 --- a/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql +++ b/dhp-workflows/dhp-stats-update/src/main/resources/eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step20-createMonitorDB.sql @@ -162,10 +162,10 @@ create table TARGET.indi_pub_doi_from_crossref stored as parquet as select * fro compute stats TARGET.indi_pub_doi_from_crossref; create table TARGET.indi_pub_gold_oa stored as parquet as select * from SOURCE.indi_pub_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_pub_gold_oa; -create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_datasets_gold_oa; -create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); -compute stats TARGET.indi_software_gold_oa; +--create table TARGET.indi_datasets_gold_oa stored as parquet as select * from SOURCE.indi_datasets_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--compute stats TARGET.indi_datasets_gold_oa; +--create table TARGET.indi_software_gold_oa stored as parquet as select * from SOURCE.indi_software_gold_oa orig where exists (select 1 from TARGET.result r where r.id=orig.id); +--compute stats TARGET.indi_software_gold_oa; create table TARGET.indi_pub_has_abstract stored as parquet as select * from SOURCE.indi_pub_has_abstract orig where exists (select 1 from TARGET.result r where r.id=orig.id); compute stats TARGET.indi_pub_has_abstract; create table TARGET.indi_result_has_cc_licence stored as parquet as select * from SOURCE.indi_result_has_cc_licence orig where exists (select 1 from TARGET.result r where r.id=orig.id); From c48f6e9c57c6b1fe549d59e032bc0cad6fa01f66 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 14 Sep 2022 17:11:26 +0200 Subject: [PATCH 16/20] [aggregator graph] save invalid records aside for further inspection --- .../dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java | 2 +- .../dhp/oa/graph/raw/GenerateEntitiesApplication.java | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index cdc707084..0a32766c9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -143,7 +143,7 @@ public abstract class AbstractMdRecordToOafMapper { return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); } catch (DocumentException e) { log.error("Error with record:\n" + xml); - return Lists.newArrayList(); + return null; } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index e9de43f7f..290a22656 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -126,8 +126,6 @@ public class GenerateEntitiesApplication { log.info("Generate entities from files:"); existingSourcePaths.forEach(log::info); - JavaRDD inputRdd = sc.emptyRDD(); - for (final String sp : existingSourcePaths) { RDD invalidRecords = sc .sequenceFile(sp, Text.class, Text.class) @@ -141,7 +139,11 @@ public class GenerateEntitiesApplication { .mode(SaveMode.Append) .option("compression", "gzip") .text(invalidPath); + } + JavaRDD inputRdd = sc.emptyRDD(); + + for (final String sp : existingSourcePaths) { inputRdd = inputRdd .union( sc @@ -223,7 +225,7 @@ public class GenerateEntitiesApplication { final boolean shouldHashId, final VocabularyGroup vocs) { - if (convertToListOaf(id, s, shouldHashId, vocs).isEmpty()) { + if (Objects.isNull(convertToListOaf(id, s, shouldHashId, vocs))) { return s; } return null; From 9e7ec4198fb99514b2f5e5f1c7817ffa7d0c1f8e Mon Sep 17 00:00:00 2001 From: Alessia Bardi Date: Wed, 14 Sep 2022 18:08:56 +0200 Subject: [PATCH 17/20] fixed test --- .../src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 204649633..390920027 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -929,7 +929,7 @@ class MappersTest { void testNotWellFormed() throws IOException, DocumentException { final String xml = IOUtils .toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml"))); - assertEquals(0, new OafToOafMapper(vocs, false, true).processMdRecord(xml).size()); + assertEquals(null, new OafToOafMapper(vocs, false, true).processMdRecord(xml)); } From 1e42d984e1ead5e1e7519d69f8150f44060f9565 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 15 Sep 2022 10:49:42 +0200 Subject: [PATCH 18/20] [aggregator graph] save invalid records aside for further inspection --- .../dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java | 2 +- .../dhp/oa/graph/raw/GenerateEntitiesApplication.java | 10 +++++----- .../java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java | 7 ++++--- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 0a32766c9..cdc707084 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -143,7 +143,7 @@ public abstract class AbstractMdRecordToOafMapper { return createOafs(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); } catch (DocumentException e) { log.error("Error with record:\n" + xml); - return null; + return Lists.newArrayList(); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index 290a22656..06d5e9acb 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -150,8 +150,8 @@ public class GenerateEntitiesApplication { .sequenceFile(sp, Text.class, Text.class) .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) .map(k -> convertToListOaf(k._1(), k._2(), shouldHashId, vocs)) - .filter(Objects::nonNull) - .flatMap(List::iterator)); + .flatMap(List::iterator) + .filter(Objects::nonNull)); } switch (mode) { @@ -225,7 +225,8 @@ public class GenerateEntitiesApplication { final boolean shouldHashId, final VocabularyGroup vocs) { - if (Objects.isNull(convertToListOaf(id, s, shouldHashId, vocs))) { + final List oaf = convertToListOaf(id, s, shouldHashId, vocs); + if (Optional.ofNullable(oaf).map(List::isEmpty).orElse(false)) { return s; } return null; @@ -235,8 +236,7 @@ public class GenerateEntitiesApplication { try { return OBJECT_MAPPER.readValue(s, clazz); } catch (final Exception e) { - log.error("Error parsing object of class: {}", clazz); - log.error(s); + log.error("Error parsing object of class: {}:\n{}", clazz, s); throw new IllegalArgumentException(e); } } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 390920027..506a69012 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -926,11 +926,12 @@ class MappersTest { } @Test - void testNotWellFormed() throws IOException, DocumentException { + void testNotWellFormed() throws IOException { final String xml = IOUtils .toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml"))); - assertEquals(null, new OafToOafMapper(vocs, false, true).processMdRecord(xml)); - + final List actual = new OafToOafMapper(vocs, false, true).processMdRecord(xml); + assertNotNull(actual); + assertTrue(actual.isEmpty()); } private void assertValidId(final String id) { From e370e940d871b5b9772eff48300ad8d9074641c3 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 16 Sep 2022 14:06:28 +0200 Subject: [PATCH 19/20] [aggregator graph] save invalid records aside for further inspection --- .../raw/GenerateEntitiesApplication.java | 37 +----- .../graph/raw/VerifyRecordsApplication.java | 108 ++++++++++++++++++ .../common/AbstractMigrationApplication.java | 6 +- .../graph/generate_entities_parameters.json | 6 - .../oa/graph/raw_all/oozie_app/workflow.xml | 54 ++++++++- .../oa/graph/verify_records_parameters.json | 26 +++++ 6 files changed, 191 insertions(+), 46 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/VerifyRecordsApplication.java create mode 100644 dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/verify_records_parameters.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index 06d5e9acb..5f9d98073 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -79,9 +79,6 @@ public class GenerateEntitiesApplication { final String targetPath = parser.get("targetPath"); log.info("targetPath: {}", targetPath); - final String invalidPath = parser.get("invalidPath"); - log.info("invalidPath: {}", invalidPath); - final String isLookupUrl = parser.get("isLookupUrl"); log.info("isLookupUrl: {}", isLookupUrl); @@ -103,8 +100,7 @@ public class GenerateEntitiesApplication { final SparkConf conf = new SparkConf(); runWithSparkSession(conf, isSparkSessionManaged, spark -> { HdfsSupport.remove(targetPath, spark.sparkContext().hadoopConfiguration()); - HdfsSupport.remove(invalidPath, spark.sparkContext().hadoopConfiguration()); - generateEntities(spark, vocs, sourcePaths, targetPath, invalidPath, shouldHashId, mode); + generateEntities(spark, vocs, sourcePaths, targetPath, shouldHashId, mode); }); } @@ -113,7 +109,6 @@ public class GenerateEntitiesApplication { final VocabularyGroup vocs, final String sourcePaths, final String targetPath, - final String invalidPath, final boolean shouldHashId, final Mode mode) { @@ -126,21 +121,6 @@ public class GenerateEntitiesApplication { log.info("Generate entities from files:"); existingSourcePaths.forEach(log::info); - for (final String sp : existingSourcePaths) { - RDD invalidRecords = sc - .sequenceFile(sp, Text.class, Text.class) - .map(k -> new Tuple2<>(k._1().toString(), k._2().toString())) - .map(k -> tryApplyMapping(k._1(), k._2(), shouldHashId, vocs)) - .filter(Objects::nonNull) - .rdd(); - spark - .createDataset(invalidRecords, Encoders.STRING()) - .write() - .mode(SaveMode.Append) - .option("compression", "gzip") - .text(invalidPath); - } - JavaRDD inputRdd = sc.emptyRDD(); for (final String sp : existingSourcePaths) { @@ -178,7 +158,7 @@ public class GenerateEntitiesApplication { .saveAsTextFile(targetPath, GzipCodec.class); } - private static List convertToListOaf( + public static List convertToListOaf( final String id, final String s, final boolean shouldHashId, @@ -219,19 +199,6 @@ public class GenerateEntitiesApplication { } } - private static String tryApplyMapping( - final String id, - final String s, - final boolean shouldHashId, - final VocabularyGroup vocs) { - - final List oaf = convertToListOaf(id, s, shouldHashId, vocs); - if (Optional.ofNullable(oaf).map(List::isEmpty).orElse(false)) { - return s; - } - return null; - } - private static Oaf convertFromJson(final String s, final Class clazz) { try { return OBJECT_MAPPER.readValue(s, clazz); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/VerifyRecordsApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/VerifyRecordsApplication.java new file mode 100644 index 000000000..a8eb871c8 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/VerifyRecordsApplication.java @@ -0,0 +1,108 @@ + +package eu.dnetlib.dhp.oa.graph.raw; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import java.util.Optional; +import java.util.stream.Collectors; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.Text; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import scala.Tuple2; + +public class VerifyRecordsApplication { + + private static final Logger log = LoggerFactory.getLogger(VerifyRecordsApplication.class); + + public static void main(final String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser( + IOUtils + .toString( + VerifyRecordsApplication.class + .getResourceAsStream("/eu/dnetlib/dhp/oa/graph/verify_records_parameters.json"))); + + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String sourcePaths = parser.get("sourcePaths"); + log.info("sourcePaths: {}", sourcePaths); + + final String invalidPath = parser.get("invalidPath"); + log.info("invalidPath: {}", invalidPath); + + final String isLookupUrl = parser.get("isLookupUrl"); + log.info("isLookupUrl: {}", isLookupUrl); + + final ISLookUpService isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl); + final VocabularyGroup vocs = VocabularyGroup.loadVocsFromIS(isLookupService); + + final SparkConf conf = new SparkConf(); + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + HdfsSupport.remove(invalidPath, spark.sparkContext().hadoopConfiguration()); + validateRecords(spark, sourcePaths, invalidPath, vocs); + }); + } + + private static void validateRecords(SparkSession spark, String sourcePaths, String invalidPath, + VocabularyGroup vocs) { + + final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + final List existingSourcePaths = Arrays + .stream(sourcePaths.split(",")) + .filter(p -> HdfsSupport.exists(p, sc.hadoopConfiguration())) + .collect(Collectors.toList()); + + log.info("Verify records in files:"); + existingSourcePaths.forEach(log::info); + + for (final String sp : existingSourcePaths) { + RDD invalidRecords = sc + .sequenceFile(sp, Text.class, Text.class) + .map(k -> tryApplyMapping(k._1().toString(), k._2().toString(), true, vocs)) + .filter(Objects::nonNull) + .rdd(); + spark + .createDataset(invalidRecords, Encoders.STRING()) + .write() + .mode(SaveMode.Append) + .option("compression", "gzip") + .text(invalidPath); + } + } + + private static String tryApplyMapping( + final String id, + final String xmlRecord, + final boolean shouldHashId, + final VocabularyGroup vocs) { + + final List oaf = GenerateEntitiesApplication.convertToListOaf(id, xmlRecord, shouldHashId, vocs); + if (Optional.ofNullable(oaf).map(List::isEmpty).orElse(false)) { + return xmlRecord; + } + return null; + } +} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java index cba64899b..6f63e9327 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/common/AbstractMigrationApplication.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.graph.raw.common; import java.io.Closeable; import java.io.IOException; import java.util.Arrays; +import java.util.List; import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; @@ -24,8 +25,11 @@ import org.apache.http.impl.client.HttpClients; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.oa.graph.raw.OafToOafMapper; +import eu.dnetlib.dhp.oa.graph.raw.OdfToOafMapper; import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo; -import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.utils.DHPUtils; public class AbstractMigrationApplication implements Closeable { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json index da6730fbb..52cbbf45f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json @@ -17,12 +17,6 @@ "paramDescription": "the path of the target file", "paramRequired": true }, - { - "paramName": "i", - "paramLongName": "invalidPath", - "paramDescription": "the path of the invalid records file", - "paramRequired": false - }, { "paramName": "isu", "paramLongName": "isLookupUrl", diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml index d00232e9a..8262c6923 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml @@ -446,10 +446,34 @@ - - + + + + + yarn + cluster + VerifyRecords_claim + eu.dnetlib.dhp.oa.graph.raw.VerifyRecordsApplication + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePaths${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims + --invalidPath${workingDir}/invalid_records_claim + --isLookupUrl${isLookupUrl} + + + + + yarn @@ -468,7 +492,6 @@ --sourcePaths${contentPath}/db_claims,${contentPath}/oaf_claims,${contentPath}/odf_claims --targetPath${workingDir}/entities_claim - --invalidPath${workingDir}/invalid_records_claim --isLookupUrl${isLookupUrl} --shouldHashId${shouldHashId} --modeclaim @@ -500,6 +523,30 @@ + + + yarn + cluster + VerifyRecords + eu.dnetlib.dhp.oa.graph.raw.VerifyRecordsApplication + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --sourcePaths${contentPath}/db_openaire,${contentPath}/db_openorgs,${contentPath}/oaf_records,${contentPath}/odf_records,${contentPath}/oaf_records_hdfs,${contentPath}/odf_records_hdfs,${contentPath}/oaf_records_invisible + --invalidPath${workingDir}/invalid_records + --isLookupUrl${isLookupUrl} + + + + + yarn @@ -518,7 +565,6 @@ --sourcePaths${contentPath}/db_openaire,${contentPath}/db_openorgs,${contentPath}/oaf_records,${contentPath}/odf_records,${contentPath}/oaf_records_hdfs,${contentPath}/odf_records_hdfs,${contentPath}/oaf_records_invisible --targetPath${workingDir}/entities - --invalidPath${workingDir}/invalid_records --isLookupUrl${isLookupUrl} --shouldHashId${shouldHashId} diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/verify_records_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/verify_records_parameters.json new file mode 100644 index 000000000..eb00e7609 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/verify_records_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "s", + "paramLongName": "sourcePaths", + "paramDescription": "the HDFS source paths which contains the sequential file (comma separated)", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "invalidPath", + "paramDescription": "the path of the invalid records file", + "paramRequired": false + }, + { + "paramName": "isu", + "paramLongName": "isLookupUrl", + "paramDescription": "the url of the ISLookupService", + "paramRequired": true + } +] \ No newline at end of file From 26e1baddedc95ee93ba7577ef34b29e9b09940ed Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 19 Sep 2022 11:19:10 +0200 Subject: [PATCH 20/20] added instance.url syntactical validation, avoid creating multiple duplicated URLs --- dhp-workflows/dhp-graph-mapper/pom.xml | 5 +++ .../raw/AbstractMdRecordToOafMapper.java | 15 +++++++ .../dhp/oa/graph/raw/OafToOafMapper.java | 35 +++++++++-------- .../dhp/oa/graph/raw/OdfToOafMapper.java | 39 ++++++++++++------- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 2 +- pom.xml | 6 +++ 6 files changed, 71 insertions(+), 31 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/pom.xml b/dhp-workflows/dhp-graph-mapper/pom.xml index 687f0de66..f579a7d2b 100644 --- a/dhp-workflows/dhp-graph-mapper/pom.xml +++ b/dhp-workflows/dhp-graph-mapper/pom.xml @@ -57,6 +57,11 @@ commons-io + + commons-validator + commons-validator + + org.apache.spark spark-core_2.11 diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 5cfb22cb9..a8d09e4a7 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -10,9 +10,13 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT; import static eu.dnetlib.dhp.schema.common.ModelConstants.UNKNOWN; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; +import java.net.MalformedURLException; +import java.net.URL; import java.util.*; +import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.validator.routines.UrlValidator; import org.dom4j.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -617,4 +621,15 @@ public abstract class AbstractMdRecordToOafMapper { return res; } + protected Set validateUrl(Collection url) { + UrlValidator urlValidator = UrlValidator.getInstance(); + if (Objects.isNull(url)) { + return new HashSet<>(); + } + return url + .stream() + .filter(u -> urlValidator.isValid(u)) + .collect(Collectors.toCollection(HashSet::new)); + } + } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 9225e174d..30f3935f5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -159,22 +159,25 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { .setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); final List nodes = Lists.newArrayList(doc.selectNodes("//dc:identifier")); - instance - .setUrl( - nodes - .stream() - .filter(n -> StringUtils.isNotBlank(n.getText())) - .map(n -> n.getText().trim()) - .filter(u -> u.startsWith("http")) - .map(s -> { - try { - return URLDecoder.decode(s, "UTF-8"); - } catch (Throwable t) { - return s; - } - }) - .distinct() - .collect(Collectors.toCollection(ArrayList::new))); + final List url = nodes + .stream() + .filter(n -> StringUtils.isNotBlank(n.getText())) + .map(n -> n.getText().trim()) + .filter(u -> u.startsWith("http")) + .map(s -> { + try { + return URLDecoder.decode(s, "UTF-8"); + } catch (Throwable t) { + return s; + } + }) + .distinct() + .collect(Collectors.toCollection(ArrayList::new)); + final Set validUrl = validateUrl(url); + if (!validUrl.isEmpty()) { + instance.setUrl(new ArrayList<>()); + instance.getUrl().addAll(validUrl); + } return Lists.newArrayList(instance); } diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index d6bfe6714..5781988e6 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -6,11 +6,14 @@ import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.structuredProperty; import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URL; import java.net.URLDecoder; import java.util.*; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.validator.routines.UrlValidator; import org.dom4j.Document; import org.dom4j.Element; import org.dom4j.Node; @@ -171,23 +174,31 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='landingPage']")) { url.add(trimAndDecodeUrl(((Node) o).getText().trim())); } - for (final Object o : doc - .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) { - url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim()); + + Set validUrl = validateUrl(url); + + if (validUrl.stream().noneMatch(s -> s.contains("doi.org"))) { + for (final Object o : doc + .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='DOI']")) { + validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim()); + } + for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) { + validUrl.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim()); + } } - for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='DOI']")) { - url.add(HTTP_DOI_PREIFX + ((Node) o).getText().trim()); + if (validUrl.stream().noneMatch(s -> s.contains("hdl.handle.net"))) { + for (final Object o : doc + .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) { + validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim()); + } + for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) { + validUrl.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim()); + } } - for (final Object o : doc - .selectNodes("//*[local-name()='alternateIdentifier' and ./@alternateIdentifierType='Handle']")) { - url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim()); - } - for (final Object o : doc.selectNodes("//*[local-name()='identifier' and ./@identifierType='Handle']")) { - url.add(HTTP_HANDLE_PREIFX + ((Node) o).getText().trim()); - } - if (!url.isEmpty()) { + + if (!validUrl.isEmpty()) { instance.setUrl(new ArrayList<>()); - instance.getUrl().addAll(url); + instance.getUrl().addAll(validUrl); } return Arrays.asList(instance); } diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 231d5b0ac..64b68e6af 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -950,7 +950,7 @@ class MappersTest { @Test void testNotWellFormed() throws IOException { final String xml = IOUtils - .toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml"))); + .toString(Objects.requireNonNull(getClass().getResourceAsStream("oaf_notwellformed.xml"))); final List actual = new OafToOafMapper(vocs, false, true).processMdRecord(xml); assertNotNull(actual); assertTrue(actual.isEmpty()); diff --git a/pom.xml b/pom.xml index ab59e7be3..a1b26966e 100644 --- a/pom.xml +++ b/pom.xml @@ -200,6 +200,12 @@ ${dhp.commons.lang.version} + + commons-validator + commons-validator + 1.7 + + com.github.sisyphsu dateparser