From 75a11d0ba5a3ff9a362c7c160ace11d0965a57a5 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Thu, 25 Jul 2024 16:34:32 +0200
Subject: [PATCH 001/111] [dedup] avoid NPEs in the countryInference dedup utility

---
 .../dnetlib/pace/common/AbstractPaceFunctions.java |  2 +-
 .../eu/dnetlib/pace/common/PaceFunctionTest.java   | 12 ++++++++++--
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
index 6ef550c50..12a54bade 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java
@@ -90,7 +90,7 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
         inferFrom = normalize(inferFrom);
         inferFrom = filterAllStopWords(inferFrom);
         Set<String> cities = getCities(inferFrom, 4);
-        return citiesToCountry(cities).stream().findFirst().orElse("UNKNOWN");
+        return citiesToCountry(cities).stream().filter(Objects::nonNull).findFirst().orElse("UNKNOWN");
     }
 
     public static String cityInference(String original) {
diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java
index 4ec120f4a..92f7bf6ff 100644
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/common/PaceFunctionTest.java
@@ -1,8 +1,7 @@
 
 package eu.dnetlib.pace.common;
 
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.*;
 
 import org.junit.jupiter.api.*;
 
@@ -54,8 +53,17 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
         System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
     }
 
+    @Test()
+    public void countryInferenceTest_NPE() {
+        assertThrows(
+            NullPointerException.class,
+            () -> countryInference("UNKNOWN", null),
+            "Expected countryInference() to throw an NPE");
+    }
+
     @Test
     public void countryInferenceTest() {
+        assertEquals("UNKNOWN", countryInference("UNKNOWN", ""));
         assertEquals("IT", countryInference("UNKNOWN", "Università di Bologna"));
         assertEquals("UK", countryInference("UK", "Università di Bologna"));
         assertEquals("IT", countryInference("UNKNOWN", "Universiteé de Naples"));

From 9486e21a44f9c8e13919e7d43f34983866e3874f Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Tue, 30 Jul 2024 14:25:31 +0200
Subject: [PATCH 002/111] copy or process the person records throughout the graph pipeline

---
 .../oaf/utils/GraphCleaningFunctions.java     |   2 +
 .../wf/main/oozie_app/import.txt              |   1 +
 .../wf/main/oozie_app/workflow.xml            |  16 +++
 .../wf/person/oozie_app/workflow.xml          | 130 ++++++++++++++++++
 .../dhp/blacklist/oozie_app/workflow.xml      |  10 ++
 .../oozie_app/workflow.xml                    |  12 ++
 .../dhp/enrich/orcid/oozie_app/workflow.xml   |   8 ++
 .../dhp/oa/graph/clean/oozie_app/workflow.xml |  36 +++++
 8 files changed, 215 insertions(+)
 create mode 100644 dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/person/oozie_app/workflow.xml

diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
index 2be4e8e0c..b9dc3253b 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java
+++ 
b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -363,6 +363,8 @@ public class GraphCleaningFunctions extends CleaningFunctions { // nothing to clean here } else if (value instanceof Project) { // nothing to clean here + } else if (value instanceof Person) { + // nothing to clean here } else if (value instanceof Organization) { Organization o = (Organization) value; if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) { diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/import.txt b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/import.txt index dd8f5e14e..14409a42a 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/import.txt +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/import.txt @@ -7,3 +7,4 @@ promote_action_payload_for_project_table classpath eu/dnetlib/dhp/actionmanager/ promote_action_payload_for_publication_table classpath eu/dnetlib/dhp/actionmanager/wf/publication/oozie_app promote_action_payload_for_relation_table classpath eu/dnetlib/dhp/actionmanager/wf/relation/oozie_app promote_action_payload_for_software_table classpath eu/dnetlib/dhp/actionmanager/wf/software/oozie_app +promote_action_payload_for_person_table classpath eu/dnetlib/dhp/actionmanager/wf/person/oozie_app diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml index 65ddd402b..7ccfb342e 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml @@ -148,6 +148,7 @@ + @@ -270,6 +271,21 @@ + + + ${wf:appPath()}/promote_action_payload_for_person_table + + + + inputActionPayloadRootPath + ${workingDir}/action_payload_by_type + + + + + + + diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/person/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/person/oozie_app/workflow.xml new file mode 100644 index 000000000..7c119b305 --- /dev/null +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/person/oozie_app/workflow.xml @@ -0,0 +1,130 @@ + + + + activePromotePersonActionPayload + when true will promote actions with eu.dnetlib.dhp.schema.oaf.Person payload + + + inputGraphRootPath + root location of input materialized graph + + + inputActionPayloadRootPath + root location of action payloads to promote + + + outputGraphRootPath + root location for output materialized graph + + + mergeAndGetStrategy + strategy for merging graph table objects with action payload instances, MERGE_FROM_AND_GET or SELECT_NEWER_AND_GET + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + 
com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + ${(activePromotePersonActionPayload eq "true") and + (fs:exists(concat(concat(concat(concat(wf:conf('nameNode'),'/'),wf:conf('inputGraphRootPath')),'/'),'person')) eq "true") and + (fs:exists(concat(concat(concat(concat(wf:conf('nameNode'),'/'),wf:conf('inputActionPayloadRootPath')),'/'),'clazz=eu.dnetlib.dhp.schema.oaf.Person')) eq "true")} + + + + + + + + yarn-cluster + cluster + PromotePersonActionPayloadForPersonTable + eu.dnetlib.dhp.actionmanager.promote.PromoteActionPayloadForGraphTableJob + dhp-actionmanager-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --inputGraphTablePath${inputGraphRootPath}/person + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Person + --inputActionPayloadPath${inputActionPayloadRootPath}/clazz=eu.dnetlib.dhp.schema.oaf.Person + --actionPayloadClassNameeu.dnetlib.dhp.schema.oaf.Person + --outputGraphTablePath${outputGraphRootPath}/person + --mergeAndGetStrategy${mergeAndGetStrategy} + --promoteActionStrategy${promoteActionStrategy} + + + + + + + + + + + -pb + ${inputGraphRootPath}/person + ${outputGraphRootPath}/person + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml index dd7827da4..563a549f3 100644 --- a/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-blacklist/src/main/resources/eu/dnetlib/dhp/blacklist/oozie_app/workflow.xml @@ -63,6 +63,7 @@ + @@ -120,6 +121,15 @@ + + + ${nameNode}/${sourcePath}/person + ${nameNode}/${outputPath}/person + + + + + ${nameNode}/${sourcePath}/datasource diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app/workflow.xml index ba3633e07..8eaa79c53 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app/workflow.xml @@ -34,6 +34,7 @@ + @@ -80,6 +81,17 @@ + + + ${jobTracker} + ${nameNode} + ${nameNode}/${sourcePath}/person + ${nameNode}/${outputPath}/person + + + + + diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/oozie_app/workflow.xml index 72fc9e338..4031da15a 
100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/oozie_app/workflow.xml @@ -89,6 +89,14 @@ ${nameNode}/${graphPath}/project ${nameNode}/${targetPath}/project + + + + + + ${nameNode}/${graphPath}/person + ${nameNode}/${targetPath}/person + diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml index 4188cb018..2512fc5bc 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml @@ -142,6 +142,7 @@ + @@ -390,6 +391,41 @@ + + + yarn + cluster + Clean person + eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=2000 + + --inputPath${graphInputPath}/person + --outputPath${graphOutputPath}/person + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Person + --isLookupUrl${isLookupUrl} + --contextId${contextId} + --verifyParam${verifyParam} + --country${country} + --verifyCountryParam${verifyCountryParam} + --hostedBy${workingDir}/working/hostedby + --collectedfrom${collectedfrom} + --masterDuplicatePath${workingDir}/masterduplicate + --deepClean${shouldClean} + + + + + yarn From 6bdb8643e6531ea0acf004f14a10a8baf55fa308 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 31 Jul 2024 11:02:22 +0200 Subject: [PATCH 003/111] ActionManager promote: allow to ingest person records in a graph that did not contain them, bumped dhp-schemas version --- .../PromoteActionPayloadForGraphTableJob.java | 21 ++++++++++++------- .../PromoteActionPayloadFunctions.java | 2 +- .../wf/person/oozie_app/workflow.xml | 1 - pom.xml | 2 +- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java index 56cbda4d6..f72fd4269 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadForGraphTableJob.java @@ -151,12 +151,17 @@ public class PromoteActionPayloadForGraphTableJob { SparkSession spark, String path, Class rowClazz) { logger.info("Reading graph table from path: {}", path); - return spark - .read() - .textFile(path) - .map( - (MapFunction) value -> OBJECT_MAPPER.readValue(value, rowClazz), - Encoders.bean(rowClazz)); + if (HdfsSupport.exists(path, spark.sparkContext().hadoopConfiguration())) { + return spark + .read() + .textFile(path) + .map( + (MapFunction) value -> 
OBJECT_MAPPER.readValue(value, rowClazz), + Encoders.bean(rowClazz)); + } else { + logger.info("Found empty graph table from path: {}", path); + return spark.emptyDataset(Encoders.bean(rowClazz)); + } } private static Dataset readActionPayload( @@ -223,7 +228,7 @@ public class PromoteActionPayloadForGraphTableJob { rowClazz, actionPayloadClazz); - if (shouldGroupById) { + if (Boolean.TRUE.equals(shouldGroupById)) { return PromoteActionPayloadFunctions .groupGraphTableByIdAndMerge( joinedAndMerged, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, rowClazz); @@ -250,6 +255,8 @@ public class PromoteActionPayloadForGraphTableJob { return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Relation()); case "eu.dnetlib.dhp.schema.oaf.Software": return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Software()); + case "eu.dnetlib.dhp.schema.oaf.Person": + return () -> clazz.cast(new eu.dnetlib.dhp.schema.oaf.Person()); default: throw new RuntimeException("unknown class: " + clazz.getCanonicalName()); } diff --git a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java index f0b094240..a3b975d0a 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java +++ b/dhp-workflows/dhp-actionmanager/src/main/java/eu/dnetlib/dhp/actionmanager/promote/PromoteActionPayloadFunctions.java @@ -50,7 +50,7 @@ public class PromoteActionPayloadFunctions { PromoteAction.Strategy promoteActionStrategy, Class rowClazz, Class actionPayloadClazz) { - if (!isSubClass(rowClazz, actionPayloadClazz)) { + if (Boolean.FALSE.equals(isSubClass(rowClazz, actionPayloadClazz))) { throw new RuntimeException( "action payload type must be the same or be a super type of table row type"); } diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/person/oozie_app/workflow.xml b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/person/oozie_app/workflow.xml index 7c119b305..1bacd09f1 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/person/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/person/oozie_app/workflow.xml @@ -77,7 +77,6 @@ ${(activePromotePersonActionPayload eq "true") and - (fs:exists(concat(concat(concat(concat(wf:conf('nameNode'),'/'),wf:conf('inputGraphRootPath')),'/'),'person')) eq "true") and (fs:exists(concat(concat(concat(concat(wf:conf('nameNode'),'/'),wf:conf('inputActionPayloadRootPath')),'/'),'clazz=eu.dnetlib.dhp.schema.oaf.Person')) eq "true")} diff --git a/pom.xml b/pom.xml index 666ba2350..175cb9e7c 100644 --- a/pom.xml +++ b/pom.xml @@ -937,7 +937,7 @@ 1.1.3 1.7 1.0.7 - [7.0.1] + [7.0.2] cdh5.9.2 3.5 11.0.2 From 975d44cac7e9fa617c9b00070eba88edafc98c7d Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 2 Aug 2024 16:14:10 +0200 Subject: [PATCH 004/111] [graph provision] added person to the provision workflow --- .../model/ProvisionModelSupport.java | 11 ++++ .../dhp/oa/provision/oozie_app/workflow.xml | 59 +++++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java index 1a75deafc..196faf9ca 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java @@ -5,6 +5,7 @@ import java.io.StringReader; import java.util.*; import java.util.stream.Collectors; +import eu.dnetlib.dhp.schema.solr.Person; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.DocumentException; @@ -89,6 +90,8 @@ public class ProvisionModelSupport { r.setOrganization(mapOrganization((eu.dnetlib.dhp.schema.oaf.Organization) e)); } else if (e instanceof eu.dnetlib.dhp.schema.oaf.Project) { r.setProject(mapProject((eu.dnetlib.dhp.schema.oaf.Project) e, vocs)); + } else if (e instanceof eu.dnetlib.dhp.schema.oaf.Person) { + r.setPerson(mapPerson((eu.dnetlib.dhp.schema.oaf.Person) e)); } r .setLinks( @@ -185,6 +188,14 @@ public class ProvisionModelSupport { return ps; } + private static Person mapPerson(eu.dnetlib.dhp.schema.oaf.Person p) { + Person ps = new Person(); + ps.setFamilyName(p.getFamilyName()); + ps.setGivenName(p.getGivenName()); + ps.setAlternativeNames(p.getAlternativeNames()); + return ps; + } + private static Funding mapFunding(List fundingtree, VocabularyGroup vocs) { SAXReader reader = new SAXReader(); return Optional diff --git a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml index 15d3b6300..879911ccc 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-provision/src/main/resources/eu/dnetlib/dhp/oa/provision/oozie_app/workflow.xml @@ -180,6 +180,7 @@ + @@ -378,6 +379,34 @@ + + + yarn + cluster + Join[relation.target = person.id] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=5000 + --conf spark.network.timeout=${sparkNetworkTimeout} + + --inputRelationsPath${workingDir}/relation + --inputEntityPath${inputGraphRootPath}/person + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Person + --outputPath${workingDir}/join_partial/person + + + + + @@ -388,6 +417,7 @@ + @@ -593,6 +623,35 @@ + + + yarn + cluster + Join[person.id = relatedEntity.source] + eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2 + dhp-graph-provision-${projectVersion}.jar + + --executor-cores=${sparkExecutorCoresForJoining} + --executor-memory=${sparkExecutorMemoryForJoining} + --driver-memory=${sparkDriverMemoryForJoining} + --conf spark.executor.memoryOverhead=${sparkExecutorMemoryForJoining} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf 
spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=5000 + --conf spark.network.timeout=${sparkNetworkTimeout} + + --inputEntityPath${inputGraphRootPath}/person + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Person + --inputRelatedEntitiesPath${workingDir}/join_partial + --outputPath${workingDir}/join_entities/person + --numPartitions10000 + + + + + From 0bf76f2a3401c550dea7da6c1fd4c38ca3903527 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 5 Aug 2024 09:35:07 +0200 Subject: [PATCH 005/111] [graph provision] added person to the graph2hive workflow --- .../dhp/oa/graph/hive/oozie_app/workflow.xml | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml index eec67fc5c..872ef8a2d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hive/oozie_app/workflow.xml @@ -102,6 +102,7 @@ + @@ -308,6 +309,35 @@ + + + yarn + cluster + Import table person + eu.dnetlib.dhp.oa.graph.hive.GraphHiveTableImporterJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.executor.memoryOverhead=${sparkExecutorMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + --conf spark.sql.shuffle.partitions=1000 + + --inputPath${inputPath}/person + --hiveDbName${hiveDbName} + --classNameeu.dnetlib.dhp.schema.oaf.Person + --hiveMetastoreUris${hiveMetastoreUris} + --numPartitions1000 + + + + + yarn From e16616b9646b77622a1a035574f2e8a39932294d Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 5 Aug 2024 15:57:37 +0200 Subject: [PATCH 006/111] added dataInfo to person records --- .../personentity/ExtractPerson.java | 52 +++++++------------ 1 file changed, 20 insertions(+), 32 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java index d381ed176..7e82698f7 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java @@ -32,6 +32,7 @@ import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.oaf.Person; import eu.dnetlib.dhp.schema.oaf.Relation; @@ -62,6 +63,20 @@ public class ExtractPerson implements Serializable { public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid"; public static final String ORCID_AUTHORS_CLASSNAME = 
"Imported from ORCID"; + public static final DataInfo DATAINFO = OafMapperUtils + .dataInfo( + false, + null, + false, + false, + OafMapperUtils + .qualifier( + ORCID_AUTHORS_CLASSID, + ORCID_AUTHORS_CLASSNAME, + ModelConstants.DNET_PROVENANCE_ACTIONS, + ModelConstants.DNET_PROVENANCE_ACTIONS), + "0.91"); + public static void main(final String[] args) throws IOException, ParseException { final ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -193,6 +208,7 @@ public class ExtractPerson implements Serializable { ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, null)); person.setDateofcollection(op.getLastModifiedDate()); person.setOriginalId(Arrays.asList(op.getOrcid())); + person.setDataInfo(DATAINFO); return person; }, Encoders.bean(Person.class)) .write() @@ -307,14 +323,7 @@ public class ExtractPerson implements Serializable { source, target, ModelConstants.ORG_PERSON_RELTYPE, ModelConstants.ORG_PERSON_SUBRELTYPE, ModelConstants.ORG_PERSON_PARTICIPATES, Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)), - OafMapperUtils - .dataInfo( - false, null, false, false, - OafMapperUtils - .qualifier( - ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS, - ModelConstants.DNET_PROVENANCE_ACTIONS), - "0.91"), + DATAINFO, null); if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) { @@ -348,14 +357,7 @@ public class ExtractPerson implements Serializable { ModelConstants.PERSON_PERSON_SUBRELTYPE, ModelConstants.PERSON_PERSON_HASCOAUTHORED, Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)), - OafMapperUtils - .dataInfo( - false, null, false, false, - OafMapperUtils - .qualifier( - ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, - ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS), - "0.91"), + DATAINFO, null), OafMapperUtils .getRelation( @@ -363,14 +365,7 @@ public class ExtractPerson implements Serializable { ModelConstants.PERSON_PERSON_SUBRELTYPE, ModelConstants.PERSON_PERSON_HASCOAUTHORED, Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)), - OafMapperUtils - .dataInfo( - false, null, false, false, - OafMapperUtils - .qualifier( - ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, - ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS), - "0.91"), + DATAINFO, null)); } @@ -424,14 +419,7 @@ public class ExtractPerson implements Serializable { ModelConstants.RESULT_PERSON_SUBRELTYPE, ModelConstants.RESULT_PERSON_HASAUTHORED, Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)), - OafMapperUtils - .dataInfo( - false, null, false, false, - OafMapperUtils - .qualifier( - ORCID_AUTHORS_CLASSID, ORCID_AUTHORS_CLASSNAME, ModelConstants.DNET_PROVENANCE_ACTIONS, - ModelConstants.DNET_PROVENANCE_ACTIONS), - "0.91"), + DATAINFO, null); } } From 5a7ba772717c36a7f5ccbf442b427337e700a0b5 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Mon, 12 Aug 2024 18:01:15 +0200 Subject: [PATCH 007/111] [Person]fix issue in affiliation relation id construction for person (missing ::) --- .../dnetlib/dhp/actionmanager/personentity/ExtractPerson.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java index 7e82698f7..6f61d427d 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java @@ -313,7 +313,7 @@ public class ExtractPerson implements Serializable { } private static Relation getAffiliationRelation(Employment row) { - String source = PERSON_PREFIX + IdentifierFactory.md5(row.getOrcid()); + String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(row.getOrcid()); String target = ROR_PREFIX + IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAffiliationId().getValue())); List properties = new ArrayList<>(); From db03f853660767450ad1d283c1b841c849b0110a Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Wed, 4 Sep 2024 14:25:44 +0300 Subject: [PATCH 008/111] Remove steps for updating BIP! from the impact indicators workflow --- .../oozie_app/get_score_limits.sh | 63 ------- .../oozie_app/map_openaire_ids_to_dois.py | 60 ------- .../oozie_app/map_scores_to_dois.py | 168 ----------------- .../impact_indicators/oozie_app/workflow.xml | 169 ++---------------- 4 files changed, 16 insertions(+), 444 deletions(-) delete mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_score_limits.sh delete mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_openaire_ids_to_dois.py delete mode 100755 dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_score_limits.sh b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_score_limits.sh deleted file mode 100644 index 6d4161d7f..000000000 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_score_limits.sh +++ /dev/null @@ -1,63 +0,0 @@ -#/usr/bin/bash - -# Read log files from ranking scripts and create a two-line file -# with score limits for the various measures. 
To be used by Kleanthis - -attrank_file=$(ls *attrank*.log); -pr_file=$(ls *pagerank*.log) -ram_file=$(ls *ram*.log); -cc_file=$(ls *cc*.log); -impulse_file=$(ls *impulse*.log); - -echo -echo "-----------------------------" -echo "Attrank file:${attrank_file}"; -echo "PageRank file:${pr_file}"; -echo "RAM file:${ram_file}"; -echo "CC file:${cc_file}"; -echo "Impulse file:${impulse_file}"; -echo "-----------------------------" -echo -echo - -# output file will be called score_limits.csv -echo -e "influence_top001\tinfluence_top01\tinfluence_top1\tinfluence_top10\tpopularity_top001\tpopularity_top01\tpopularity_top1\tpopularity_top10\timpulse_top001\timpulse_top01\timpulse_top1\timpulse_top10\tcc_top001\tcc_top01\tcc_top1\tcc_top10" > score_limits.csv -# ---------------------------------------------------- # -# Get respective score limits (we don't need RAM) -inf_001=$(grep "^0.01%" ${pr_file} | cut -f 2); -inf_01=$(grep "^0.1%" ${pr_file} | cut -f 2); -inf_1=$(grep "^1%" ${pr_file} | cut -f 2); -inf_10=$(grep "^10%" ${pr_file} | cut -f 2); -echo "Influnence limits:" -echo -e "${inf_001}\t${inf_01}\t${inf_1}\t${inf_10}"; -# ---------------------------------------------------- # -pop_001=$(grep "^0.01%" ${attrank_file} | cut -f 2); -pop_01=$(grep "^0.1%" ${attrank_file} | cut -f 2); -pop_1=$(grep "^1%" ${attrank_file} | cut -f 2); -pop_10=$(grep "^10%" ${attrank_file} | cut -f 2); -echo "Popularity limits:"; -echo -e "${pop_001}\t${pop_01}\t${pop_1}\t${pop_10}"; -# ---------------------------------------------------- # -imp_001=$(grep "^0.01%" ${impulse_file} | cut -f 2); -imp_01=$(grep "^0.1%" ${impulse_file} | cut -f 2); -imp_1=$(grep "^1%" ${impulse_file} | cut -f 2); -imp_10=$(grep "^10%" ${impulse_file} | cut -f 2); -echo "Popularity limits:"; -echo -e "${imp_001}\t${imp_01}\t${imp_1}\t${imp_10}"; -# ---------------------------------------------------- # -cc_001=$(grep "^0.01%" ${cc_file} | cut -f 2); -cc_01=$(grep "^0.1%" ${cc_file} | cut -f 2); -cc_1=$(grep "^1%" ${cc_file} | cut -f 2); -cc_10=$(grep "^10%" ${cc_file} | cut -f 2); -echo "Popularity limits:"; -echo -e "${cc_001}\t${cc_01}\t${cc_1}\t${cc_10}"; -# ---------------------------------------------------- # - -echo -e "${inf_001}\t${inf_01}\t${inf_1}\t${inf_10}\t${pop_001}\t${pop_01}\t${pop_1}\t${pop_10}\t${imp_001}\t${imp_01}\t${imp_1}\t${imp_10}\t${cc_001}\t${cc_01}\t${cc_1}\t${cc_10}" >> score_limits.csv - -echo -echo "score_limits.csv contents:" -cat score_limits.csv - -echo; -echo; diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_openaire_ids_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_openaire_ids_to_dois.py deleted file mode 100644 index 7997eec82..000000000 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_openaire_ids_to_dois.py +++ /dev/null @@ -1,60 +0,0 @@ -import json -import sys -from pyspark.sql import SparkSession -from pyspark import SparkConf, SparkContext - -if len(sys.argv) != 3: - print("Usage: map_openaire_ids_to_dois.py ") - sys.exit(-1) - -conf = SparkConf().setAppName('BIP!: Map OpenAIRE IDs to DOIs') -sc = SparkContext(conf = conf) -spark = SparkSession.builder.appName('BIP!: Map OpenAIRE IDs to DOIs').getOrCreate() -sc.setLogLevel('OFF') - -src_dir = sys.argv[1] -output = sys.argv[2] - -# src_dir = "/tmp/beta_provision/graph/21_graph_cleaned/" -# output = '/tmp/openaireid_to_dois/' - 
-def transform(doc): - - # get publication year from 'doc.dateofacceptance.value' - dateofacceptance = doc.get('dateofacceptance', {}).get('value') - - year = 0 - - if (dateofacceptance is not None): - year = dateofacceptance.split('-')[0] - - # for each pid get 'pid.value' if 'pid.qualifier.classid' equals to 'doi' - dois = [ pid['value'] for pid in doc.get('pid', []) if (pid.get('qualifier', {}).get('classid') == 'doi' and pid['value'] is not None)] - - num_dois = len(dois) - - # exlcude openaire ids that do not correspond to DOIs - if (num_dois == 0): - return None - - fields = [ doc['id'], str(num_dois), chr(0x02).join(dois), str(year) ] - - return '\t'.join([ v.encode('utf-8') for v in fields ]) - -docs = None - -for result_type in ["publication", "dataset", "software", "otherresearchproduct"]: - - tmp = sc.textFile(src_dir + result_type).map(json.loads) - - if (docs is None): - docs = tmp - else: - # append all result types in one RDD - docs = docs.union(tmp) - -docs = docs.filter(lambda d: d.get('dataInfo', {}).get('deletedbyinference') == False and d.get('dataInfo', {}).get('invisible') == False) - -docs = docs.map(transform).filter(lambda d: d is not None) - -docs.saveAsTextFile(output) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py deleted file mode 100755 index f6a8e9996..000000000 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/python -# This program reads the openaire to doi mapping from the ${synonymFolder} of the workflow -# and uses this mapping to create doi-based score files in the format required by BiP! DB. -# This is done by reading each openaire-id based ranking file and joining the openaire based -# score and classes to all the corresponding dois. 
-################################################################################################# -# Imports -import sys - -# Sparksession lib to communicate with cluster via session object -from pyspark.sql import SparkSession - -# Import sql types to define schemas -from pyspark.sql.types import * - -# Import sql functions with shorthand alias -import pyspark.sql.functions as F - -from pyspark.sql.functions import max -# from pyspark.sql.functions import udf -################################################################################################# -################################################################################################# -# Clean up directory name - no longer needed in final workflow version -''' -def clean_directory_name(dir_name): - # We have a name with the form *_bip_universe_* or *_graph_universe_* - # and we need to keep the parts in * - - - dir_name_parts = dir_name.split('_') - dir_name_parts = [part for part in dir_name_parts if ('bip' not in part and 'graph' not in part and 'universe' not in part and 'from' not in part)] - - dir_name = dir_name.replace("openaire_id_graph", "openaire_ids") - clean_name = dir_name + ".txt.gz" - - # clean_name = '_'.join(dir_name_parts) - - # if '_ids' not in clean_name: - # clean_name = clean_name.replace('id_', 'ids_') - - # clean_name = clean_name.replace('.txt', '') - # clean_name = clean_name.replace('.gz', '') - - # if 'openaire_ids_' in clean_name: - # clean_name = clean_name.replace('openaire_ids_', '') - # clean_name = clean_name + '.txt.gz' - # else: - # clean_name = clean_name + '.txt.gz' - - return clean_name -''' -################################################################################################# -if len(sys.argv) < 3: - print ("Usage: ./map_scores_to_dois.py <...etc...>") - sys.exit(-1) - -# Read arguments -synonyms_folder = sys.argv[1] -num_partitions = int(sys.argv[2]) -input_file_list = [argument.replace("_openaire_id_graph", "").replace("_openaire_id_graph_", "") + "_openaire_ids.txt.gz" for argument in sys.argv[3:]] -# input_file_list = [clean_directory_name(item) for item in input_file_list] - -# Prepare output specific variables -output_file_list = [item.replace("_openaire_ids", "") for item in input_file_list] -output_file_list = [item + ".txt.gz" if not item.endswith(".txt.gz") else item for item in output_file_list] - -# --- INFO MESSAGES --- # -print ("\n\n----------------------------") -print ("Mpping openaire ids to DOIs") -print ("Reading input from: " + synonyms_folder) -print ("Num partitions: " + str(num_partitions)) -print ("Input files:" + " -- ".join(input_file_list)) -print ("Output files: " + " -- ".join(output_file_list)) -print ("----------------------------\n\n") -####################################################################################### -# We weill define the following schemas: -# --> the schema of the openaire - doi mapping file [string - int - doi_list] (the separator of the doi-list is a non printable character) -# --> a schema for floating point ranking scores [string - float - string] (the latter string is the class) -# --> a schema for integer ranking scores [string - int - string] (the latter string is the class) - -float_schema = StructType([ - StructField('id', StringType(), False), - StructField('score', FloatType(), False), - StructField('class', StringType(), False) - ]) - -int_schema = StructType([ - StructField('id', StringType(), False), - StructField('score', IntegerType(), False), - StructField('class', StringType(), False) - ]) - 
-# This schema concerns the output of the file -# containing the number of references of each doi -synonyms_schema = StructType([ - StructField('id', StringType(), False), - StructField('num_synonyms', IntegerType(), False), - StructField('doi_list', StringType(), False), - ]) -####################################################################################### -# Start spark session -spark = SparkSession.builder.appName('Map openaire scores to DOIs').getOrCreate() -# Set Log Level for spark session -spark.sparkContext.setLogLevel('WARN') -####################################################################################### -# MAIN Program - -# Read and repartition the synonym folder - also cache it since we will need to perform multiple joins -synonym_df = spark.read.schema(synonyms_schema).option('delimiter', '\t').csv(synonyms_folder) -synonym_df = synonym_df.select('id', F.split(F.col('doi_list'), chr(0x02)).alias('doi_list')) -synonym_df = synonym_df.select('id', F.explode('doi_list').alias('doi')).repartition(num_partitions, 'id').cache() - -# TESTING -# print ("Synonyms: " + str(synonym_df.count())) -# print ("DF looks like this:" ) -# synonym_df.show(1000, False) - -print ("\n\n-----------------------------") -# Now we need to join the score files on the openaire-id with the synonyms and then keep -# only doi - score - class and write this to the output -for offset, input_file in enumerate(input_file_list): - - print ("Mapping scores from " + input_file) - - # Select correct schema - schema = int_schema - if "attrank" in input_file.lower() or "pr" in input_file.lower() or "ram" in input_file.lower(): - schema = float_schema - - # Load file to dataframe - ranking_df = spark.read.schema(schema).option('delimiter', '\t').csv(input_file).repartition(num_partitions, 'id') - - # Get max score - max_score = ranking_df.select(max('score').alias('max')).collect()[0]['max'] - print ("Max Score for " + str(input_file) + " is " + str(max_score)) - - # TESTING - # print ("Loaded df sample:") - # ranking_df.show(1000, False) - - # Join scores to synonyms and keep required fields - doi_score_df = synonym_df.join(ranking_df, ['id']).select('doi', 'score', 'class').repartition(num_partitions, 'doi').cache() - # Write output - output_file = output_file_list[offset] - print ("Writing to: " + output_file) - doi_score_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip') - - # Creata another file for the bip update process - ranking_df = ranking_df.select('id', 'score', F.lit(F.col('score')/max_score).alias('normalized_score'), 'class', F.col('class').alias('class_dup')) - doi_score_df = synonym_df.join(ranking_df, ['id']).select('doi', 'score', 'normalized_score', 'class', 'class_dup').repartition(num_partitions, 'doi').cache() - output_file = output_file.replace(".txt.gz", "_for_bip_update.txt.gz") - print ("Writing bip update to: " + output_file) - doi_score_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip') - - - # Free memory? 
- ranking_df.unpersist(True) - -print ("-----------------------------") -print ("\n\nFinished!\n\n") - - - - - - - - diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 70f5f8d2a..108cf70b1 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -17,10 +17,6 @@ openaireGraphInputPath ${nameNode}/${workingDir}/openaire_id_graph - - synonymFolder - ${nameNode}/${workingDir}/openaireid_to_dois/ - checkpointDir ${nameNode}/${workingDir}/check/ @@ -32,29 +28,34 @@ - + - - + + + ${wf:conf('resume') eq "start"} + + ${wf:conf('resume') eq "cc"} ${wf:conf('resume') eq "ram"} ${wf:conf('resume') eq "impulse"} ${wf:conf('resume') eq "pagerank"} ${wf:conf('resume') eq "attrank"} - - ${wf:conf('resume') eq "format-results"} - ${wf:conf('resume') eq "map-ids"} - ${wf:conf('resume') eq "map-scores"} - ${wf:conf('resume') eq "start"} - + + ${wf:conf('resume') eq "format-results"} + + ${wf:conf('resume') eq "projects-impact"} + + ${wf:conf('resume') eq "create-actionset"} + + @@ -295,18 +296,11 @@ - + - - - - - - - @@ -345,139 +339,8 @@ ${wfAppPath}/format_ranking_results.py#format_ranking_results.py - - - - - - - - - - - yarn-cluster - cluster - - - Format Ranking Results BiP! DB - - format_ranking_results.py - - - - --executor-memory=${sparkNormalExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkNormalDriverMemory} - --conf spark.executor.memoryOverhead=${sparkNormalExecutorMemory} - --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - - - zenodo - - ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']} - ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']} - ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']} - ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']} - ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']} - - ${sparkShufflePartitions} - - openaire - - ${wfAppPath}/format_ranking_results.py#format_ranking_results.py - - - - - - - - - - - - - - - - - - - yarn-cluster - cluster - Openaire-DOI synonym collection - map_openaire_ids_to_dois.py - - - --executor-memory=${sparkHighExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkHighDriverMemory} - --conf spark.executor.memoryOverhead=${sparkHighExecutorMemory} - --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - - - ${openaireDataInput}/ - - ${synonymFolder} - - ${wfAppPath}/map_openaire_ids_to_dois.py#map_openaire_ids_to_dois.py - - - - - - - - - - - - - - yarn-cluster - cluster - Mapping Openaire Scores to DOIs - 
map_scores_to_dois.py - - - --executor-memory=${sparkHighExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkHighDriverMemory} - --conf spark.executor.memoryOverhead=${sparkHighExecutorMemory} - --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - - - ${synonymFolder} - - ${sparkShufflePartitions} - - ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']} - ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']} - ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']} - ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']} - ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']} - - ${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py - - - - + From b043f8a96370cfdf593fb05c71b119d6175fe240 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Wed, 4 Sep 2024 14:28:43 +0300 Subject: [PATCH 009/111] Remove redundant error messages from impact indicators workflow --- .../graph/impact_indicators/oozie_app/workflow.xml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 108cf70b1..5d8669823 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -457,18 +457,6 @@ Error formatting json files, error message[${wf:errorMessage(wf:lastErrorNode())}] - - Error formatting BIP files, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - Synonym collection failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - Mapping scores to DOIs failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - Deleting output path for actionsets failed, error message[${wf:errorMessage(wf:lastErrorNode())}] From 07e6e7b4d6e3489a55d8dca917c65e28ead21275 Mon Sep 17 00:00:00 2001 From: Alessia Date: Mon, 16 Sep 2024 13:41:56 +0200 Subject: [PATCH 010/111] #9839: include claimed affiliation relationships --- .../raw/MigrateDbEntitiesApplication.java | 22 ++++++++++++ .../raw/MigrateDbEntitiesApplicationTest.java | 35 +++++++++++++++++++ .../raw/claimsrel_resultset_affiliation.json | 27 ++++++++++++++ 3 files changed, 84 insertions(+) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/claimsrel_resultset_affiliation.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java index c9a32cde6..00505fedc 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplication.java @@ -519,6 +519,28 @@ public class MigrateDbEntitiesApplication extends 
AbstractMigrationApplication i
				r1 = setRelationSemantic(r1, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO);
				r2 = setRelationSemantic(r2, RESULT_RESULT, PUBLICATION_DATASET, IS_RELATED_TO);
				break;
+			case "resultOrganization_affiliation_isAuthorInstitutionOf":
+				if (!"organization".equals(sourceType)) {
+					throw new IllegalStateException(
+						String
+							.format(
+								"invalid claim, sourceId: %s, targetId: %s, semantics: %s", sourceId, targetId,
+								semantics));
+				}
+				r1 = setRelationSemantic(r1, RESULT_ORGANIZATION, AFFILIATION, IS_AUTHOR_INSTITUTION_OF);
+				r2 = setRelationSemantic(r2, RESULT_ORGANIZATION, AFFILIATION, HAS_AUTHOR_INSTITUTION);
+				break;
+			case "resultOrganization_affiliation_hasAuthorInstitution":
+				if (!"organization".equals(targetType)) {
+					throw new IllegalStateException(
+						String
+							.format(
+								"invalid claim, sourceId: %s, targetId: %s, semantics: %s", sourceId, targetId,
+								semantics));
+				}
+				r1 = setRelationSemantic(r1, RESULT_ORGANIZATION, AFFILIATION, HAS_AUTHOR_INSTITUTION);
+				r2 = setRelationSemantic(r2, RESULT_ORGANIZATION, AFFILIATION, IS_AUTHOR_INSTITUTION_OF);
+				break;
 			default:
 				throw new IllegalArgumentException("claim semantics not managed: " + semantics);
 		}
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java
index 27304ec06..c4d1b6b58 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java
+++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MigrateDbEntitiesApplicationTest.java
@@ -16,6 +16,8 @@ import java.util.Objects;
 import java.util.Optional;
 import java.util.stream.Collectors;
 
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.common.RelationInverse;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.junit.jupiter.api.BeforeEach;
@@ -364,6 +366,39 @@ class MigrateDbEntitiesApplicationTest {
 		assertValidId(r1.getCollectedfrom().get(0).getKey());
 		assertValidId(r2.getCollectedfrom().get(0).getKey());
 	}
+	@Test
+	void testProcessClaims_affiliation() throws Exception {
+		final List fields = prepareMocks("claimsrel_resultset_affiliation.json");
+
+		final List list = app.processClaims(rs);
+
+		assertEquals(2, list.size());
+		verifyMocks(fields);
+
+		assertTrue(list.get(0) instanceof Relation);
+		assertTrue(list.get(1) instanceof Relation);
+
+		final Relation r1 = (Relation) list.get(0);
+		final Relation r2 = (Relation) list.get(1);
+
+		assertValidId(r1.getSource());
+		assertValidId(r1.getTarget());
+		assertValidId(r2.getSource());
+		assertValidId(r2.getTarget());
+		assertNotNull(r1.getDataInfo());
+		assertNotNull(r2.getDataInfo());
+		assertNotNull(r1.getDataInfo().getTrust());
+		assertNotNull(r2.getDataInfo().getTrust());
+		assertEquals(r1.getSource(), r2.getTarget());
+		assertEquals(r2.getSource(), r1.getTarget());
+		assertTrue(StringUtils.isNotBlank(r1.getRelClass()));
+		assertTrue(StringUtils.isNotBlank(r2.getRelClass()));
+		assertTrue(StringUtils.isNotBlank(r1.getRelType()));
+		assertTrue(StringUtils.isNotBlank(r2.getRelType()));
+
+		assertValidId(r1.getCollectedfrom().get(0).getKey());
+		assertValidId(r2.getCollectedfrom().get(0).getKey());
+	}
 
 	private List prepareMocks(final String jsonFile) throws IOException, SQLException {
 		final String json = IOUtils.toString(getClass().getResourceAsStream(jsonFile));
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/claimsrel_resultset_affiliation.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/claimsrel_resultset_affiliation.json
new file mode 100644
index 000000000..07cc025d6
--- /dev/null
+++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/claimsrel_resultset_affiliation.json
@@ -0,0 +1,27 @@
+[
+  {
+    "field": "source_type",
+    "type": "string",
+    "value": "organization"
+  },
+  {
+    "field": "source_id",
+    "type": "string",
+    "value": "openorgs____::b5ca9d4340e26454e367e2908ef3872f"
+  },
+  {
+    "field": "target_type",
+    "type": "string",
+    "value": "software"
+  },
+  {
+    "field": "target_id",
+    "type": "string",
+    "value": "userclaim___::bde53826d07c8cf47c99222a375cd2e8"
+  },
+  {
+    "field": "semantics",
+    "type": "string",
+    "value": "resultOrganization_affiliation_isAuthorInstitutionOf"
+  }
+]
\ No newline at end of file

From 6df6b4583ebeecda8ff69cd370b8d39d5d8dd7b3 Mon Sep 17 00:00:00 2001
From: miconis
Date: Mon, 16 Sep 2024 14:04:59 +0200
Subject: [PATCH 011/111] blacklist filtering moved before the cleanup phase
 in order to have case sensitive regex

---
 .../NumAuthorsTitleSuffixPrefixChain.java     |   2 +-
 .../java/eu/dnetlib/pace/model/FieldDef.java  |  16 +++
 .../eu/dnetlib/pace/model/SparkDeduper.scala  |  40 +------
 .../eu/dnetlib/pace/model/SparkModel.scala    | 100 ++++++++++++------
 .../clustering/ClusteringFunctionTest.java    |  13 +++
 .../pace/comparators/ComparatorTest.java      |  12 +++
 .../dnetlib/dhp/oa/dedup/SparkBlockStats.java |   1 -
 .../dnetlib/dhp/dedup/conf/pub.curr.conf.json |   2 +-
 8 files changed, 111 insertions(+), 75 deletions(-)

diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NumAuthorsTitleSuffixPrefixChain.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NumAuthorsTitleSuffixPrefixChain.java
index f1d1e17b9..4e6d8231f 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NumAuthorsTitleSuffixPrefixChain.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NumAuthorsTitleSuffixPrefixChain.java
@@ -38,7 +38,7 @@ public class NumAuthorsTitleSuffixPrefixChain extends AbstractClusteringFunction
 
 	@Override
 	protected Collection doApply(Config conf, String s) {
-		return suffixPrefixChain(cleanup(s), param("mod"));
+		return suffixPrefixChain(cleanup(s), paramOrDefault("mod", 10));
 	}
 
 	private Collection suffixPrefixChain(String s, int mod) {
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
index b0dc11656..2e329f690 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java
@@ -54,6 +54,22 @@ public class FieldDef implements Serializable {
 	public FieldDef() {
 	}
 
+	public FieldDef clone() {
+		FieldDef fieldDef = new FieldDef();
+		fieldDef.setName(this.name);
+		fieldDef.setPath(this.path);
+		fieldDef.setType(this.type);
+		fieldDef.setOverrideMatch(this.overrideMatch);
+		fieldDef.setSize(this.size);
+		fieldDef.setLength(this.length);
+		fieldDef.setFilter(this.filter);
+		fieldDef.setSorted(this.sorted);
+		fieldDef.setClean(this.clean);
+		fieldDef.setInfer(this.infer);
+		fieldDef.setInferenceFrom(this.inferenceFrom);
+		return fieldDef;
+	}
+
 	public String getInferenceFrom() {
 		return inferenceFrom;
 	}
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala
index bc702b9e2..a3eb3cba8 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala
@@ -19,48 +19,10 @@ case class SparkDeduper(conf: DedupConfig) extends Serializable {
 	val model: SparkModel = SparkModel(conf)
 
 	val dedup: (Dataset[Row] => Dataset[Row]) = df => {
-		df.transform(filterAndCleanup)
-			.transform(generateClustersWithCollect)
+		df.transform(generateClustersWithCollect)
 			.transform(processBlocks)
 	}
-
-	val filterAndCleanup: (Dataset[Row] => Dataset[Row]) = df => {
-		val df_with_filters = conf.getPace.getModel.asScala.foldLeft(df)((res, fdef) => {
-			if (conf.blacklists.containsKey(fdef.getName)) {
-				res.withColumn(
-					fdef.getName + "_filtered",
-					filterColumnUDF(fdef).apply(new Column(fdef.getName))
-				)
-			} else {
-				res
-			}
-		})
-
-		df_with_filters
-	}
-
-	def filterColumnUDF(fdef: FieldDef): UserDefinedFunction = {
-		val blacklist: Predicate[String] = conf.blacklists().get(fdef.getName)
-
-		if (blacklist == null) {
-			throw new IllegalArgumentException("Column: " + fdef.getName + " does not have any filter")
-		} else {
-			fdef.getType match {
-				case Type.List | Type.JSON =>
-					udf[Array[String], Array[String]](values => {
-						values.filter((v: String) => !blacklist.test(v))
-					})
-
-				case _ =>
-					udf[String, String](v => {
-						if (blacklist.test(v)) ""
-						else v
-					})
-			}
-		}
-	}
-
 	val generateClustersWithCollect: (Dataset[Row] => Dataset[Row]) = df_with_filters => {
 		var df_with_clustering_keys: Dataset[Row] = null
diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
index c6db62339..580a88b7e 100644
--- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
+++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala
@@ -5,12 +5,12 @@ import eu.dnetlib.pace.common.AbstractPaceFunctions
 import eu.dnetlib.pace.config.{DedupConfig, Type}
 import eu.dnetlib.pace.util.{MapDocumentUtil, SparkCompatUtils}
 import org.apache.commons.lang3.StringUtils
-import org.apache.spark.sql.catalyst.encoders.RowEncoder
 import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
 import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
 import org.apache.spark.sql.{Dataset, Row}
 
 import java.util.Locale
+import java.util.function.Predicate
 import java.util.regex.Pattern
 import scala.collection.JavaConverters._
 
@@ -29,8 +29,20 @@ case class SparkModel(conf: DedupConfig) {
 		identifier.setName(identifierFieldName)
 		identifier.setType(Type.String)
 
+		// create fields for blacklist
+		val filtered = conf.getPace.getModel.asScala.flatMap(fdef => {
+			if (conf.blacklists().containsKey(fdef.getName)) {
+				val fdef_filtered = fdef.clone()
+				fdef_filtered.setName(fdef.getName + "_filtered")
+				Seq(fdef, fdef_filtered)
+			}
+			else {
+				Seq(fdef)
+			}
+		})
+
 		// Construct a Spark StructType representing the schema of the model
-		(Seq(identifier) ++ conf.getPace.getModel.asScala)
+		(Seq(identifier) ++ filtered)
 			.foldLeft(
 				new StructType()
 			)((resType, fieldDef) => {
@@ -44,7 +56,6 @@ case class SparkModel(conf: DedupConfig) {
 				})
 			})
-
 	}
 
 	val identityFieldPosition: Int = schema.fieldIndex(identifierFieldName)
@@ -52,7 +63,8 @@
 	val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)
 
 	val parseJsonDataset: (Dataset[String] => Dataset[Row]) = df => {
-		df.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
+		df
+			.map(r => rowFromJson(r))(SparkCompatUtils.encoderFor(schema))
 	}
 
 	def rowFromJson(json: String): Row = {
@@ -64,41 +76,63 @@ case class SparkModel(conf: DedupConfig) {
 
 		schema.fieldNames.zipWithIndex.foldLeft(values) {
 			case ((res, (fname, index))) =>
-				val fdef = conf.getPace.getModelMap.get(fname)
+
+				val fdef = conf.getPace.getModelMap.get(fname.split("_filtered")(0))
+
 				if (fdef != null) {
-					res(index) = fdef.getType match {
-						case Type.String | Type.Int =>
-							MapDocumentUtil.truncateValue(
-								MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
-								fdef.getLength
-							)
+					if (!fname.contains("_filtered")) { //process fields with no blacklist
+						res(index) = fdef.getType match {
+							case Type.String | Type.Int =>
+								MapDocumentUtil.truncateValue(
+									MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
+									fdef.getLength
+								)
 
-						case Type.URL =>
-							var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
-							if (!URL_REGEX.matcher(uv).matches)
-								uv = ""
-							uv
+							case Type.URL =>
+								var uv = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
+								if (!URL_REGEX.matcher(uv).matches)
+									uv = ""
+								uv
 
-						case Type.List | Type.JSON =>
-							MapDocumentUtil.truncateList(
-								MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
-								fdef.getSize
-							).asScala
+							case Type.List | Type.JSON =>
+								MapDocumentUtil.truncateList(
+									MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
+									fdef.getSize
+								).asScala
 
-						case Type.StringConcat =>
-							val jpaths = CONCAT_REGEX.split(fdef.getPath)
+							case Type.StringConcat =>
+								val jpaths = CONCAT_REGEX.split(fdef.getPath)
 
-							MapDocumentUtil.truncateValue(
-								jpaths
-									.map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
-									.mkString(" "),
-								fdef.getLength
-							)
+								MapDocumentUtil.truncateValue(
+									jpaths
+										.map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
+										.mkString(" "),
+									fdef.getLength
+								)
 
-						case Type.DoubleArray =>
-							MapDocumentUtil.getJPathArray(fdef.getPath, json)
+							case Type.DoubleArray =>
+								MapDocumentUtil.getJPathArray(fdef.getPath, json)
+						}
 					}
+					else { //process fields with blacklist
+						val blacklist: Predicate[String] = conf.blacklists().get(fdef.getName)
+
+						res(index) = fdef.getType match {
+							case Type.List | Type.JSON =>
+								MapDocumentUtil.truncateList(
+									MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType),
+									fdef.getSize
+								).asScala.filter((v: String) => !blacklist.test(v))
+
+							case _ =>
+								val value: String = MapDocumentUtil.truncateValue(
+									MapDocumentUtil.getJPathString(fdef.getPath, documentContext),
+									fdef.getLength
+								)
+								if (blacklist.test(value)) "" else value
+						}
+					}
+
 					val filter = fdef.getFilter
@@ -125,13 +159,12 @@ case class SparkModel(conf: DedupConfig) {
 				}
 
 				if (StringUtils.isNotBlank(fdef.getInfer)) {
-					val inferFrom : String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
+					val inferFrom: String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
 					res(index) = res(index) match {
 						case x: Seq[String] => x.map(inference(_, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer))
 						case _ => inference(res(index).toString, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer)
 					}
 				}
-
 			}
 
 			res
 		}
 
 		new GenericRowWithSchema(values, schema)
+
 	}
 
 	def clean(value: String, cleantype: String) : String = {
diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
index e62f742f8..236f17eca 100644
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java
@@ -227,4 +227,17 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
 		System.out.println(cf.apply(conf, Lists.newArrayList(s)));
 	}
 
+	@Test
+	public void testNumAuthorsTitleSuffixPrefixChain() {
+
+		final ClusteringFunction cf = new NumAuthorsTitleSuffixPrefixChain(params);
+		params.put("mod", 10);
+
+		final String title = "PARP-2 Regulates SIRT1 Expression and Whole-Body Energy Expenditure";
+		final String num_authors = "10";
+		System.out.println("title = " + title);
+		System.out.println("num_authors = " + num_authors);
+		System.out.println(cf.apply(conf, Lists.newArrayList(num_authors, title)));
+	}
+
 }
diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
index c008902c4..d2e83e695 100644
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
@@ -327,4 +327,16 @@ public class ComparatorTest extends AbstractPaceTest {
 
 	}
 
+	@Test
+	public void titleVersionMatchTest() {
+
+		TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
+
+		double result = titleVersionMatch
+			.compare(
+				"parp 2 regulates sirt 1 expression and whole body energy expenditure",
+				"parp 2 regulates sirt 1 expression and whole body energy expenditure", conf);
+		assertEquals(1.0, result);
+	}
+
 }
diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java
index 3e5215d42..612a1cb19 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java
+++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkBlockStats.java
@@ -91,7 +91,6 @@ public class SparkBlockStats extends AbstractSparkAction {
 			.read()
 			.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
 			.transform(deduper.model().parseJsonDataset())
-			.transform(deduper.filterAndCleanup())
 			.transform(deduper.generateClustersWithCollect())
 			.filter(functions.size(new Column("block")).geq(1));
diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json
index c3a769874..c5ff1c1fa 100644
--- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json
+++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json
@@ -96,7 +96,7 @@
         "aggregation": "MAX",
         "positive": "layer4",
         "negative": "NO_MATCH",
-        "undefined": "MATCH",
+        "undefined": "layer4",
         "ignoreUndefined": "true"
       },
       "layer4": {

From 23e0ab3a7c301a9f1bf3e10beb1944e08cb14fcb Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Mon, 16 Sep 2024 16:16:23 +0200
Subject: [PATCH 012/111] run mergeResultsOfDifferentTypes only when
 checkDelegatedAuthority is true

---
 .../dhp/schema/oaf/utils/MergeUtils.java      |   3 +-
 .../PromoteResultWithMeasuresTest.java        | 210 ++++++++++++
 .../measures/actionPayloads/part0000.json     |   3 +
 .../promote/measures/graph/part00000.json     |   1 +
 4 files changed, 216 insertions(+), 1 deletion(-)
 create mode 100644 dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteResultWithMeasuresTest.java
 create mode 100644 dhp-workflows/dhp-actionmanager/src/test/resources/eu/dnetlib/dhp/actionmanager/promote/measures/actionPayloads/part0000.json
 create mode 100644 dhp-workflows/dhp-actionmanager/src/test/resources/eu/dnetlib/dhp/actionmanager/promote/measures/graph/part00000.json

diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
index ea402ecbf..ac7694d18 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java
@@ -30,6 +30,7 @@ import eu.dnetlib.dhp.schema.common.ModelSupport;
 import eu.dnetlib.dhp.schema.oaf.*;
 
 public class MergeUtils {
+
 	public static T mergeById(String s, Iterator oafEntityIterator) {
 		return mergeGroup(s, oafEntityIterator, true);
 	}
@@ -88,7 +89,7 @@ public class MergeUtils {
 
 	private static Oaf mergeEntities(Oaf left, Oaf right, boolean checkDelegatedAuthority) {
 		if (sameClass(left, right, Result.class)) {
-			if (!left.getClass().equals(right.getClass()) || checkDelegatedAuthority) {
+			if (checkDelegatedAuthority) {
 				return mergeResultsOfDifferentTypes((Result) left, (Result) right);
 			}
diff --git a/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteResultWithMeasuresTest.java b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteResultWithMeasuresTest.java
new file mode 100644
index 000000000..3eafe7115
--- /dev/null
+++ b/dhp-workflows/dhp-actionmanager/src/test/java/eu/dnetlib/dhp/actionmanager/promote/PromoteResultWithMeasuresTest.java
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2024.
+ * SPDX-FileCopyrightText: © 2023 Consiglio Nazionale delle Ricerche + * SPDX-License-Identifier: AGPL-3.0-or-later + */ + +package eu.dnetlib.dhp.actionmanager.promote; + +import static eu.dnetlib.dhp.common.FunctionalInterfaceSupport.*; +import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass; +import static org.apache.spark.sql.functions.*; +import static org.junit.jupiter.api.Assertions.*; + +import java.io.IOException; +import java.nio.file.DirectoryStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.HashSet; +import java.util.List; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.stream.Collectors; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.*; +import org.apache.spark.sql.Dataset; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; + +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; + +public class PromoteResultWithMeasuresTest { + + private static final Logger log = LoggerFactory.getLogger(PromoteResultWithMeasuresTest.class); + + private static SparkSession spark; + + private static Path tempDir; + + public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + @BeforeAll + public static void beforeAll() throws IOException { + tempDir = Files.createTempDirectory(PromoteResultWithMeasuresTest.class.getSimpleName()); + log.info("using work dir {}", tempDir); + + SparkConf conf = new SparkConf(); + conf.setMaster("local[*]"); + conf.setAppName(PromoteResultWithMeasuresTest.class.getSimpleName()); + conf.set("spark.driver.host", "localhost"); + + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + + conf.set("spark.sql.warehouse.dir", tempDir.toString()); + conf.set("hive.metastore.warehouse.dir", tempDir.resolve("warehouse").toString()); + + spark = SparkSession.builder().config(conf).getOrCreate(); + } + + @AfterAll + public static void afterAll() throws IOException { + spark.stop(); + FileUtils.deleteDirectory(tempDir.toFile()); + } + + @Test + void testPromoteResultWithMeasures_job() throws Exception { + + final String inputGraphTablePath = getClass() + .getResource("/eu/dnetlib/dhp/actionmanager/promote/measures/graph") + .getPath(); + + final String inputActionPayloadPath = getClass() + .getResource("/eu/dnetlib/dhp/actionmanager/promote/measures/actionPayloads") + .getPath(); + + final String actionPayloadsPath = tempDir.resolve("actionPayloads").toString(); + + spark + .read() + .text(inputActionPayloadPath) + .withColumn("payload", col("value")) + .select("payload") + .write() + .parquet(actionPayloadsPath); + + final Path outputGraphTablePath = tempDir.resolve("outputGraphTablePath"); + + PromoteActionPayloadForGraphTableJob + .main(new String[] { + "--isSparkSessionManaged", Boolean.FALSE.toString(), + "--graphTableClassName", Publication.class.getCanonicalName(), + "--inputGraphTablePath", inputGraphTablePath, + "--inputActionPayloadPath", actionPayloadsPath, + "--actionPayloadClassName", Result.class.getCanonicalName(), + "--outputGraphTablePath", outputGraphTablePath.toString(), + "--mergeAndGetStrategy", 
MergeAndGet.Strategy.MERGE_FROM_AND_GET.toString(), + "--promoteActionStrategy", PromoteAction.Strategy.ENRICH.toString(), + "--shouldGroupById", "true" + }); + + assertFalse(isDirEmpty(outputGraphTablePath)); + + final Encoder pubEncoder = Encoders.bean(Publication.class); + List results = spark + .read() + .schema(pubEncoder.schema()) + .json(outputGraphTablePath.toString()) + .as(pubEncoder) + .collectAsList(); + + verify(results); + } + + @Test + void testPromoteResultWithMeasures_internal() throws JsonProcessingException { + + Dataset rowDS = spark + .read() + .schema(Encoders.bean(Publication.class).schema()) + .json("src/test/resources/eu/dnetlib/dhp/actionmanager/promote/measures/graph") + .as(Encoders.bean(Publication.class)); + + Dataset actionPayloadDS = spark + .read() + .schema(Encoders.bean(Result.class).schema()) + .json("src/test/resources/eu/dnetlib/dhp/actionmanager/promote/measures/actionPayloads") + .as(Encoders.bean(Result.class)); + + final MergeAndGet.Strategy mergeFromAndGet = MergeAndGet.Strategy.MERGE_FROM_AND_GET; + + final SerializableSupplier> rowIdFn = ModelSupport::idFn; + final SerializableSupplier> mergeAndGetFn = MergeAndGet + .functionFor(mergeFromAndGet); + final SerializableSupplier zeroFn = () -> Publication.class + .cast(new eu.dnetlib.dhp.schema.oaf.Publication()); + final SerializableSupplier> isNotZeroFn = PromoteResultWithMeasuresTest::isNotZeroFnUsingIdOrSourceAndTarget; + + Dataset joinedResults = PromoteActionPayloadFunctions + .joinGraphTableWithActionPayloadAndMerge( + rowDS, + actionPayloadDS, + rowIdFn, + ModelSupport::idFn, + mergeAndGetFn, + PromoteAction.Strategy.ENRICH, + Publication.class, + Result.class); + + SerializableSupplier> mergeRowsAndGetFn = MergeAndGet + .functionFor(mergeFromAndGet); + + Dataset mergedResults = PromoteActionPayloadFunctions + .groupGraphTableByIdAndMerge( + joinedResults, rowIdFn, mergeRowsAndGetFn, zeroFn, isNotZeroFn, Publication.class); + + verify(mergedResults.collectAsList()); + } + + private static void verify(List results) throws JsonProcessingException { + assertNotNull(results); + assertEquals(1, results.size()); + + Result r = results.get(0); + + log.info(OBJECT_MAPPER.writeValueAsString(r)); + + assertNotNull(r.getMeasures()); + assertFalse(r.getMeasures().isEmpty()); + assertTrue( + r + .getMeasures() + .stream() + .map(Measure::getId) + .collect(Collectors.toCollection(HashSet::new)) + .containsAll( + Lists + .newArrayList( + "downloads", "views", "influence", "popularity", "influence_alt", "popularity_alt", + "impulse"))); + } + + private static Function isNotZeroFnUsingIdOrSourceAndTarget() { + return t -> { + if (isSubClass(t, Relation.class)) { + final Relation rel = (Relation) t; + return StringUtils.isNotBlank(rel.getSource()) && StringUtils.isNotBlank(rel.getTarget()); + } + return StringUtils.isNotBlank(((OafEntity) t).getId()); + }; + } + + private static boolean isDirEmpty(final Path directory) throws IOException { + try (DirectoryStream dirStream = Files.newDirectoryStream(directory)) { + return !dirStream.iterator().hasNext(); + } + } + +} diff --git a/dhp-workflows/dhp-actionmanager/src/test/resources/eu/dnetlib/dhp/actionmanager/promote/measures/actionPayloads/part0000.json b/dhp-workflows/dhp-actionmanager/src/test/resources/eu/dnetlib/dhp/actionmanager/promote/measures/actionPayloads/part0000.json new file mode 100644 index 000000000..806bcf5c8 --- /dev/null +++ 
b/dhp-workflows/dhp-actionmanager/src/test/resources/eu/dnetlib/dhp/actionmanager/promote/measures/actionPayloads/part0000.json @@ -0,0 +1,3 @@ +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"id":"50|doi_dedup___::02317b7093277ec8aa0311d5c6a25b9b","originalId":null,"pid":null,"dateofcollection":null,"dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":[{"id":"downloads","unit":[{"key":"opendoar____::358aee4cc897452c00244351e4d91f69||ZENODO","value":"125","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:usage_counts","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"views","unit":[{"key":"opendoar____::358aee4cc897452c00244351e4d91f69||ZENODO","value":"35","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:usage_counts","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]}],"context":null,"processingchargeamount":null,"processingchargecurrency":null,"author":null,"resulttype":null,"metaResourceType":null,"language":null,"country":null,"subject":null,"title":null,"relevantdate":null,"description":null,"dateofacceptance":null,"publisher":null,"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"externalReference":null,"instance":null,"eoscifguidelines":null,"openAccessColor":null,"publiclyFunded":null,"transformativeAgreement":null,"isGreen":null,"isInDiamondJournal":null} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"id":"50|doi_dedup___::02317b7093277ec8aa0311d5c6a25b9b","originalId":null,"pid":null,"dateofcollection":null,"dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":[{"id":"influence","unit":[{"key":"score","value":"3.1167566E-9","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"key":"class","value":"C5","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"popularity","unit":[{"key":"score","value":"7.335433E-9","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"key":"class","value":"C4","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"influence_alt","unit":[{"key":"score","value":"4","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by 
OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"key":"class","value":"C5","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"popularity_alt","unit":[{"key":"score","value":"2.96","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"key":"class","value":"C4","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]},{"id":"impulse","unit":[{"key":"score","value":"4","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}},{"key":"class","value":"C5","dataInfo":{"invisible":false,"inferred":true,"deletedbyinference":false,"trust":"","inferenceprovenance":"update","provenanceaction":{"classid":"measure:bip","classname":"Inferred by OpenAIRE","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}}}]}],"context":null,"processingchargeamount":null,"processingchargecurrency":null,"author":null,"resulttype":null,"metaResourceType":null,"language":null,"country":null,"subject":null,"title":null,"relevantdate":null,"description":null,"dateofacceptance":null,"publisher":null,"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"externalReference":null,"instance":null,"eoscifguidelines":null,"openAccessColor":null,"publiclyFunded":null,"transformativeAgreement":null,"isGreen":null,"isInDiamondJournal":null} +{"collectedfrom":null,"dataInfo":null,"lastupdatetimestamp":null,"id":"50|doi_dedup___::02317b7093277ec8aa0311d5c6a25b9b","originalId":null,"pid":null,"dateofcollection":null,"dateoftransformation":null,"extraInfo":null,"oaiprovenance":null,"measures":null,"context":null,"processingchargeamount":null,"processingchargecurrency":null,"author":null,"resulttype":null,"metaResourceType":null,"language":null,"country":null,"subject":null,"title":null,"relevantdate":null,"description":null,"dateofacceptance":null,"publisher":null,"embargoenddate":null,"source":null,"fulltext":null,"format":null,"contributor":null,"resourcetype":null,"coverage":null,"bestaccessright":null,"externalReference":null,"instance":null,"eoscifguidelines":null,"openAccessColor":"hybrid","publiclyFunded":false,"transformativeAgreement":null,"isGreen":true,"isInDiamondJournal":false} \ No newline at end of file diff --git a/dhp-workflows/dhp-actionmanager/src/test/resources/eu/dnetlib/dhp/actionmanager/promote/measures/graph/part00000.json b/dhp-workflows/dhp-actionmanager/src/test/resources/eu/dnetlib/dhp/actionmanager/promote/measures/graph/part00000.json new file mode 100644 index 000000000..9f03cebe4 --- /dev/null +++ 
b/dhp-workflows/dhp-actionmanager/src/test/resources/eu/dnetlib/dhp/actionmanager/promote/measures/graph/part00000.json @@ -0,0 +1 @@ +{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:dedup", "classname": "sysimport:dedup", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "dedup-result-decisiontree-v4", "invisible": false, "trust": "0.8"}, "resourcetype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "pid": [{"qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.14778/3415478.3415507"}, {"qualifier": {"classid": "mag_id", "classname": "Microsoft Academic Graph Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "3086187510"}], "bestaccessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "relevantdate": [{"qualifier": {"classid": "created", "classname": "created", "schemename": "dnet:dataCite_date", "schemeid": "dnet:dataCite_date"}, "value": "2020-09-14"}, {"qualifier": {"classid": "published-online", "classname": "published-online", "schemename": "dnet:dataCite_date", "schemeid": "dnet:dataCite_date"}, "value": "2020-09-14"}, {"qualifier": {"classid": "published-print", "classname": "published-print", "schemename": "dnet:dataCite_date", "schemeid": "dnet:dataCite_date"}, "value": "2020-08-01"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "issued", "classname": "issued", "schemename": "dnet:dataCite_date", "schemeid": "dnet:dataCite_date"}, "value": "2020-09-02"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "updated", "classname": "updated", "schemename": "dnet:dataCite_date", "schemeid": "dnet:dataCite_date"}, "value": "2020-09-01"}], "contributor": [], "id": "50|doi_dedup___::02317b7093277ec8aa0311d5c6a25b9b", "description": [{"value": "We present SPHINX, a system for metapath-based entity exploration in Heterogeneous Information Networks (HINs). SPHINX allows users to define different views over a HIN based on both automatically selected and user-defined meta-paths. Then, entity ranking and similarity search can be performed over these views to find and explore entities of interest, taking also into account any spatial or temporal properties of entities. 
A Web-based user interface is provided to facilitate users in performing the various functionalities supported by the system, including metapath-based view definition, index construction, search parameters specification, and visual comparison of the results."}], "lastupdatetimestamp": 1725554400176, "author": [{"fullname": "Thanasis Vergoulis", "pid": [], "rank": 1}, {"fullname": "Kostas Patroumpas", "pid": [], "rank": 2}, {"fullname": "Alexandros Zeakis", "pid": [], "rank": 3}, {"fullname": "Dimitrios Skoutas", "pid": [], "rank": 4}, {"fullname": "Serafeim Chatzopoulos", "pid": [], "rank": 5}], "collectedfrom": [{"value": "ZENODO", "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69"}, {"value": "Crossref", "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"}, {"value": "Microsoft Academic Graph", "key": "10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a"}, {"value": "UnpayWall", "key": "10|openaire____:8ac8380272269217cb09a928c8caa993"}, {"value": "European Union Open Data Portal", "key": "10|re3data_____::c4b2081b224be6b3e79d0e5e5556f631"}], "instance": [{"refereed": {"classid": "0001", "classname": "peerReviewed", "schemename": "dnet:review_levels", "schemeid": "dnet:review_levels"}, "hostedby": {"dataInfo": {"invisible": false, "deletedbyinference": false}, "value": "Proceedings of the VLDB Endowment", "key": "10|issn___print::8e719dcc0c83f87be79812fcf8024e2b"}, "url": ["https://doi.org/10.14778/3415478.3415507"], "pid": [{"qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.14778/3415478.3415507"}], "instanceTypeMapping": [{"originalType": "journal-article", "typeLabel": "research article", "vocabularyName": "openaire::coar_resource_types_3_1", "typeCode": "http://purl.org/coar/resource_type/c_2df8fbb1"}, {"originalType": "http://purl.org/coar/resource_type/c_2df8fbb1", "typeLabel": "Article", "vocabularyName": "openaire::user_resource_types", "typeCode": "Article"}], "dateofacceptance": {"value": "2020-08-01"}, "collectedfrom": {"value": "Crossref", "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"}, "accessright": {"classid": "UNKNOWN", "classname": "not available", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}, {"refereed": {"classid": "0002", "classname": "nonPeerReviewed", "schemename": "dnet:review_levels", "schemeid": "dnet:review_levels"}, "hostedby": {"dataInfo": {"invisible": false, "deletedbyinference": false}, "value": "Proceedings of the VLDB Endowment", "key": "10|issn___print::8e719dcc0c83f87be79812fcf8024e2b"}, "license": {"value": "CC BY"}, "url": ["https://zenodo.org/record/4010307/files/p2913-chatzopoulos.pdf"], "pid": [{"qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.14778/3415478.3415507"}], "instanceTypeMapping": [{"originalType": "journal-article", "typeLabel": "research article", "vocabularyName": "openaire::coar_resource_types_3_1", "typeCode": "http://purl.org/coar/resource_type/c_2df8fbb1"}, {"originalType": "http://purl.org/coar/resource_type/c_2df8fbb1", "typeLabel": "Article", "vocabularyName": "openaire::user_resource_types", "typeCode": "Article"}], "collectedfrom": {"value": "UnpayWall", "key": "10|openaire____:8ac8380272269217cb09a928c8caa993"}, "accessright": 
{"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes", "openAccessRoute": "green"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}, {"refereed": {"classid": "0002", "classname": "nonPeerReviewed", "schemename": "dnet:review_levels", "schemeid": "dnet:review_levels"}, "hostedby": {"value": "Unknown Repository", "key": "10|openaire____::55045bd2a65019fd8e6741a755395c8c"}, "url": ["http://dx.doi.org/10.14778/3415478.3415507"], "pid": [], "instanceTypeMapping": [{"originalType": "CONFERENCE_PROCEEDING", "vocabularyName": "openaire::coar_resource_types_3_1"}], "distributionlocation": "", "alternateIdentifier": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.14778/3415478.3415507"}], "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "value": "2020-01-01"}, "collectedfrom": {"value": "European Union Open Data Portal", "key": "10|re3data_____::c4b2081b224be6b3e79d0e5e5556f631"}, "accessright": {"classid": "UNKNOWN", "classname": "not available", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0004", "classname": "Conference object", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}, {"refereed": {"classid": "0002", "classname": "nonPeerReviewed", "schemename": "dnet:review_levels", "schemeid": "dnet:review_levels"}, "hostedby": {"value": "Unknown Repository", "key": "10|openaire____::55045bd2a65019fd8e6741a755395c8c"}, "url": ["http://dx.doi.org/10.14778/3415478.3415507"], "pid": [], "instanceTypeMapping": [{"originalType": "Conference proceedings", "typeLabel": "conference proceedings", "vocabularyName": "openaire::coar_resource_types_3_1", "typeCode": "http://purl.org/coar/resource_type/c_f744"}], "distributionlocation": "", "alternateIdentifier": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.14778/3415478.3415507"}], "dateofacceptance": {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "value": "2020-01-01"}, "collectedfrom": {"value": "European Union Open Data Portal", "key": "10|re3data_____::c4b2081b224be6b3e79d0e5e5556f631"}, "accessright": {"classid": "UNKNOWN", "classname": "not available", 
"schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0038", "classname": "Other literature type", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}, {"refereed": {"classid": "0002", "classname": "nonPeerReviewed", "schemename": "dnet:review_levels", "schemeid": "dnet:review_levels"}, "hostedby": {"value": "ZENODO", "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69"}, "license": {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "value": "CC BY"}, "url": ["http://dx.doi.org/10.14778/3415478.3415507"], "pid": [], "instanceTypeMapping": [{"originalType": "ConferencePaper", "typeLabel": "conference paper", "vocabularyName": "openaire::coar_resource_types_3_1", "typeCode": "http://purl.org/coar/resource_type/c_5794"}, {"originalType": "http://purl.org/coar/resource_type/c_5794", "typeLabel": "Article", "vocabularyName": "openaire::user_resource_types", "typeCode": "Article"}], "alternateIdentifier": [{"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.14778/3415478.3415507"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.14778/3415478.3415507"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.14778/3415478.3415507"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "oai", "classname": "Open Archives Initiative", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "oai:zenodo.org:4010307"}], "dateofacceptance": {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "value": "2020-09-02"}, "collectedfrom": {"value": "ZENODO", "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": 
{"classid": "0004", "classname": "Conference object", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}, {"refereed": {"classid": "0002", "classname": "nonPeerReviewed", "schemename": "dnet:review_levels", "schemeid": "dnet:review_levels"}, "hostedby": {"dataInfo": {"invisible": false, "deletedbyinference": false}, "value": "Unknown Repository", "key": "10|openaire____::55045bd2a65019fd8e6741a755395c8c"}, "url": ["http://www.vldb.org/pvldb/vol13/p2913-chatzopoulos.pdf", "https://dblp.uni-trier.de/db/journals/pvldb/pvldb13.html#ChatzopoulosPZV20", "https://dl.acm.org/doi/10.14778/3415478.3415507", "https://doi.org/10.14778/3415478.3415507"], "pid": [{"qualifier": {"classid": "mag_id", "classname": "Microsoft Academic Graph Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "3086187510"}, {"qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.14778/3415478.3415507"}], "instanceTypeMapping": [{"originalType": "Conference", "typeLabel": "conference output", "vocabularyName": "openaire::coar_resource_types_3_1", "typeCode": "http://purl.org/coar/resource_type/c_c94f"}], "collectedfrom": {"value": "Microsoft Academic Graph", "key": "10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a"}, "accessright": {"classid": "UNKNOWN", "classname": "not available", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}], "dateofcollection": "2024-09-05T16:53:05.687", "metaResourceType": {"classid": "Research Literature", "classname": "Research Literature", "schemename": "openaire::meta_resource_types", "schemeid": "openaire::meta_resource_types"}, "fulltext": [], "dateofacceptance": {"value": "2020-08-01"}, "format": [], "journal": {"issnPrinted": "2150-8097", "vol": "13", "sp": "2913", "ep": "2916", "name": "Proceedings of the VLDB Endowment"}, "subject": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "subject:fos", "classname": "Inferred by OpenAIRE", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "qualifier": {"classid": "FOS", "classname": "Fields of Science and Technology classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "02 engineering and technology"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "subject:fos", "classname": "Inferred by OpenAIRE", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "qualifier": {"classid": "FOS", "classname": "Fields of Science and Technology classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "0202 electrical engineering, electronic engineering, information engineering"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "subject:fos", "classname": "Inferred by OpenAIRE", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": "0.5467381477355957"}, "qualifier": {"classid": 
"FOS", "classname": "Fields of Science and Technology classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "020201 artificial intelligence & image processing"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "subject:fos", "classname": "Inferred by OpenAIRE", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": "0.4532618224620819"}, "qualifier": {"classid": "FOS", "classname": "Fields of Science and Technology classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "020204 information systems"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "subject:fos", "classname": "Inferred by OpenAIRE", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": "0.5"}, "qualifier": {"classid": "FOS", "classname": "Fields of Science and Technology classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "02020108 Machine learning/Social Info Processing"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "subject:fos", "classname": "Inferred by OpenAIRE", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": "0.5"}, "qualifier": {"classid": "FOS", "classname": "Fields of Science and Technology classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "02020402 Cryptography/Information governance"}], "coverage": [], "externalReference": [], "publisher": {"value": "Association for Computing Machinery (ACM)"}, "eoscifguidelines": [], "language": {"classid": "und", "classname": "Undetermined", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "country": [], "extraInfo": [], "originalId": ["10.14778/3415478.3415507", "50|doiboost____|02317b7093277ec8aa0311d5c6a25b9b", "825041_1260870_PUBLI", "50|r3c4b2081b22::0d0cc9ff8949f9091272abb7a9e083f8", "50|r3c4b2081b22::02317b7093277ec8aa0311d5c6a25b9b", "oai:zenodo.org:4010307", "50|od______2659::de3dfee8ed6f2e53c85690531ab23028", "3086187510"], "source": [{"value": "Crossref"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "value": "International Conference on Very Large Data Bases (VLDB)"}], "context": [{"dataInfo": [{"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}], "id": "https://zenodo.org/communities/smartdatalake-project"}, {"dataInfo": [{"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": 
"sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}], "id": "https://zenodo.org/communities/eu"}], "title": [{"qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "SPHINX"}, {"qualifier": {"classid": "subtitle", "classname": "subtitle", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "a system for metapath-based entity exploration in heterogeneous information networks"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "sysimport:crosswalk", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": false, "inferenceprovenance": "", "invisible": false, "trust": "0.9"}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "SPHINX: A System for Metapath-based Entity Exploration in Heterogeneous Information Networks"}, {"qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "sphinx a system for metapath based entity exploration in heterogeneous information networks"}]} \ No newline at end of file From 5f86c93be6fbe137b417213d49e0dd426b6c7898 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 20 Sep 2024 12:20:00 +0200 Subject: [PATCH 013/111] [graph provision] person serialisation --- .../CreateRelatedEntitiesJob_phase1.java | 8 +++ .../model/ProvisionModelSupport.java | 7 ++- .../dhp/oa/provision/model/RelatedEntity.java | 39 ++++++++++++- .../oa/provision/utils/XmlRecordFactory.java | 57 ++++++++++++++++++- 4 files changed, 107 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java index 63f3c2ead..3fc5893c6 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -231,6 +231,14 @@ public class CreateRelatedEntitiesJob_phase1 { if (!f.isEmpty()) { re.setFundingtree(f.stream().map(Field::getValue).collect(Collectors.toList())); } + break; + case person: + final Person person = (Person) entity; + + re.setGivenName(person.getGivenName()); + re.setFamilyName(person.getFamilyName()); + re.setAlternativeNames(person.getAlternativeNames()); + break; } return re; diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java index 797e84315..de7932a8a 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java @@ -5,7 +5,6 @@ import java.io.StringReader; import java.util.*; import java.util.stream.Collectors; -import eu.dnetlib.dhp.schema.solr.Person; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.DocumentException; @@ -38,6 +37,8 @@ 
import eu.dnetlib.dhp.schema.solr.Measure; import eu.dnetlib.dhp.schema.solr.OpenAccessColor; import eu.dnetlib.dhp.schema.solr.OpenAccessRoute; import eu.dnetlib.dhp.schema.solr.Organization; +import eu.dnetlib.dhp.schema.solr.Person; +import eu.dnetlib.dhp.schema.solr.PersonTopic; import eu.dnetlib.dhp.schema.solr.Pid; import eu.dnetlib.dhp.schema.solr.Project; import eu.dnetlib.dhp.schema.solr.Result; @@ -193,6 +194,10 @@ public class ProvisionModelSupport { ps.setFamilyName(p.getFamilyName()); ps.setGivenName(p.getGivenName()); ps.setAlternativeNames(p.getAlternativeNames()); + ps.setBiography(p.getBiography()); + ps.setConsent(p.getConsent()); + // ps.setSubject(...)); + return ps; } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java index ee010910c..2a6332857 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/RelatedEntity.java @@ -51,6 +51,11 @@ public class RelatedEntity implements Serializable { private Qualifier contracttype; private List fundingtree; + // person + private String givenName; + private String familyName; + private List alternativeNames; + public String getId() { return id; } @@ -251,6 +256,30 @@ public class RelatedEntity implements Serializable { this.fundingtree = fundingtree; } + public String getGivenName() { + return givenName; + } + + public void setGivenName(String givenName) { + this.givenName = givenName; + } + + public String getFamilyName() { + return familyName; + } + + public void setFamilyName(String familyName) { + this.familyName = familyName; + } + + public List getAlternativeNames() { + return alternativeNames; + } + + public void setAlternativeNames(List alternativeNames) { + this.alternativeNames = alternativeNames; + } + @Override public boolean equals(Object o) { if (this == o) @@ -280,7 +309,10 @@ public class RelatedEntity implements Serializable { && Objects.equal(code, that.code) && Objects.equal(acronym, that.acronym) && Objects.equal(contracttype, that.contracttype) - && Objects.equal(fundingtree, that.fundingtree); + && Objects.equal(fundingtree, that.fundingtree) + && Objects.equal(givenName, that.givenName) + && Objects.equal(familyName, that.familyName) + && Objects.equal(alternativeNames, that.alternativeNames); } @Override @@ -309,6 +341,9 @@ public class RelatedEntity implements Serializable { code, acronym, contracttype, - fundingtree); + fundingtree, + familyName, + givenName, + alternativeNames); } } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index 44004faf3..b1f419a7e 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -20,7 +20,6 @@ import javax.xml.transform.*; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; -import eu.dnetlib.dhp.oa.provision.model.*; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; @@ -42,6 +41,7 @@ import 
com.google.common.collect.Sets; import com.mycila.xmltool.XMLDoc; import com.mycila.xmltool.XMLTag; +import eu.dnetlib.dhp.oa.provision.model.*; import eu.dnetlib.dhp.schema.common.*; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.Result; @@ -1035,6 +1035,42 @@ public class XmlRecordFactory implements Serializable { .collect(Collectors.toList())); } + break; + case person: + final Person person = (Person) entity; + + if (person.getGivenName() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("givenname", person.getGivenName())); + } + if (person.getFamilyName() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("familyname", person.getFamilyName())); + } + if (person.getAlternativeNames() != null) { + metadata.addAll(person.getAlternativeNames()); + } + if (person.getBiography() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("biography", person.getBiography())); + } + if (person.getSubject() != null) { + metadata + .addAll( + person + .getSubject() + .stream() + .map(pt -> { + List> attrs = Lists.newArrayList(); + attrs.add(new Tuple2<>("schema", pt.getSchema())); + attrs.add(new Tuple2<>("value", pt.getValue())); + attrs.add(new Tuple2<>("fromYear", String.valueOf(pt.getFromYear()))); + attrs.add(new Tuple2<>("toYear", String.valueOf(pt.getToYear()))); + return XmlSerializationUtils.asXmlElement("subject", attrs); + }) + .collect(Collectors.toList())); + } + if (person.getConsent() != null) { + metadata.add(XmlSerializationUtils.asXmlElement("consent", String.valueOf(person.getConsent()))); + } + break; default: throw new IllegalArgumentException("invalid entity type: " + type); @@ -1240,6 +1276,25 @@ public class XmlRecordFactory implements Serializable { .collect(Collectors.toList())); } break; + + case person: + + if (isNotBlank(re.getGivenName())) { + metadata.add(XmlSerializationUtils.asXmlElement("givenname", re.getGivenName())); + } + if (isNotBlank(re.getFamilyName())) { + metadata.add(XmlSerializationUtils.asXmlElement("familyname", re.getFamilyName())); + } + if (re.getAlternativeNames() != null && !re.getAlternativeNames().isEmpty()) { + metadata + .addAll( + re + .getAlternativeNames() + .stream() + .map(name -> XmlSerializationUtils.asXmlElement("alternativename", name)) + .collect(Collectors.toList())); + } + break; default: throw new IllegalArgumentException("invalid target type: " + targetType); } From e0ff84baf0d8caf88965d13cd45a946a783f229d Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 23 Sep 2024 10:29:46 +0200 Subject: [PATCH 014/111] [graph provision] person serialisation, limit the number of authorships and coauthorships before expanding the payloads --- .../dhp/schema/oaf/utils/ModelHardLimits.java | 12 +++++++ .../dhp/oa/provision/PayloadConverterJob.java | 33 +++++++++++++++++++ .../oa/provision/utils/XmlRecordFactory.java | 8 ++++- 3 files changed, 52 insertions(+), 1 deletion(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java index 36d138ba1..e4b184fa1 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java @@ -1,6 +1,12 @@ package eu.dnetlib.dhp.schema.oaf.utils; +import java.util.Map; + +import com.google.common.collect.Maps; + +import eu.dnetlib.dhp.schema.common.ModelConstants; + public class ModelHardLimits { private ModelHardLimits() { @@ -19,6 +25,12 
@@ public class ModelHardLimits { public static final int MAX_ABSTRACT_LENGTH = 150000; public static final int MAX_RELATED_ABSTRACT_LENGTH = 500; public static final int MAX_INSTANCES = 10; + public static final Map MAX_RELATIONS_BY_RELCLASS = Maps.newHashMap(); + + static { + MAX_RELATIONS_BY_RELCLASS.put(ModelConstants.PERSON_PERSON_HASCOAUTHORED, 500); + MAX_RELATIONS_BY_RELCLASS.put(ModelConstants.RESULT_PERSON_HASAUTHORED, 500); + } public static String getCollectionName(String format) { return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION; diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java index 351526336..cb2d2e799 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java @@ -2,10 +2,12 @@ package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits.MAX_RELATIONS_BY_RELCLASS; import static eu.dnetlib.dhp.utils.DHPUtils.toSeq; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; import org.apache.commons.io.IOUtils; @@ -15,11 +17,13 @@ import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.*; import org.apache.spark.util.LongAccumulator; +import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.fasterxml.jackson.annotation.JsonInclude; import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Lists; import com.google.common.collect.Maps; import eu.dnetlib.dhp.application.ArgumentApplicationParser; @@ -27,11 +31,13 @@ import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport; +import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper; import eu.dnetlib.dhp.oa.provision.model.TupleWrapper; import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits; import eu.dnetlib.dhp.schema.solr.SolrRecord; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -124,6 +130,9 @@ public class PayloadConverterJob { .map(Oaf::getDataInfo) .map(DataInfo::getDeletedbyinference) .orElse(false)) + .map( + (MapFunction) PayloadConverterJob::pruneRelatedEntities, + Encoders.kryo(JoinedEntity.class)) .map( (MapFunction>) je -> new Tuple2<>( recordFactory.build(je, validateXML), @@ -139,6 +148,30 @@ public class PayloadConverterJob { .json(outputPath); } + /** + This function iterates through the RelatedEntityWrapper(s) associated to the JoinedEntity and rules out + those exceeding the maximum allowed frequency defined in eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits#MAX_RELATIONS_BY_RELCLASS + */ + private static JoinedEntity pruneRelatedEntities(JoinedEntity je) { + Map freqs = Maps.newHashMap(); + List rew = Lists.newArrayList(); + + if 
(je.getLinks() != null) { + je.getLinks().forEach(link -> { + final String relClass = link.getRelation().getRelClass(); + Long count = freqs.putIfAbsent(relClass, 0L); + if (Objects.isNull(count) || (MAX_RELATIONS_BY_RELCLASS.containsKey(relClass) + && count <= MAX_RELATIONS_BY_RELCLASS.get(relClass))) { + rew.add(link); + freqs.put(relClass, freqs.get(relClass) + 1); + } + }); + je.setLinks(rew); + } + + return je; + } + private static void removeOutputDir(final SparkSession spark, final String path) { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java index b1f419a7e..97d2d3989 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/utils/XmlRecordFactory.java @@ -1046,7 +1046,13 @@ public class XmlRecordFactory implements Serializable { metadata.add(XmlSerializationUtils.asXmlElement("familyname", person.getFamilyName())); } if (person.getAlternativeNames() != null) { - metadata.addAll(person.getAlternativeNames()); + metadata + .addAll( + person + .getAlternativeNames() + .stream() + .map(altName -> XmlSerializationUtils.asXmlElement("alternativename", altName)) + .collect(Collectors.toList())); } if (person.getBiography() != null) { metadata.add(XmlSerializationUtils.asXmlElement("biography", person.getBiography())); From 7f81673f3ced3fe0f373d041d5ff47ac092d09d6 Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Mon, 23 Sep 2024 13:01:45 +0200 Subject: [PATCH 015/111] removed the deletedByInference=true filter --- .../eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java index 5ff469cd0..236269ff3 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/PrepareRelatedProjectsJob.java @@ -70,9 +70,8 @@ public class PrepareRelatedProjectsJob { final Dataset rels = ClusterUtils .loadRelations(graphPath, spark) - .filter((FilterFunction) r -> r.getDataInfo().getDeletedbyinference()) - .filter((FilterFunction) r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT)) - .filter((FilterFunction) r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS)) + .filter((FilterFunction) r -> ModelConstants.RESULT_PROJECT.equals(r.getRelType())) + .filter((FilterFunction) r -> !BrokerConstants.IS_MERGED_IN_CLASS.equals(r.getRelClass())) .filter((FilterFunction) r -> !ClusterUtils.isDedupRoot(r.getSource())) .filter((FilterFunction) r -> !ClusterUtils.isDedupRoot(r.getTarget())); From 0e89d4a1cfc4c2fbe4db61299cda84718a62a49f Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Mon, 23 Sep 2024 09:47:29 +0200 Subject: [PATCH 016/111] fixed a bug with topic ENRICH/MORE/SUBJECT/ARXIV --- .../oa/matchers/simple/EnrichMoreSubject.java | 2 +- .../simple/EnrichMoreSubjectTest.java | 60 +++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 
dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubjectTest.java diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java index b62b509c7..390357f99 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubject.java @@ -53,7 +53,7 @@ public class EnrichMoreSubject extends UpdateMatcher { .collect(Collectors.toSet()); return source - .getPids() + .getSubjects() .stream() .filter(s -> !existingSubjects.contains(subjectAsString(s))) .collect(Collectors.toList()); diff --git a/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubjectTest.java b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubjectTest.java new file mode 100644 index 000000000..1fb35c0c9 --- /dev/null +++ b/dhp-workflows/dhp-broker-events/src/test/java/eu/dnetlib/dhp/broker/oa/matchers/simple/EnrichMoreSubjectTest.java @@ -0,0 +1,60 @@ + +package eu.dnetlib.dhp.broker.oa.matchers.simple; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Arrays; +import java.util.List; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.broker.objects.OaBrokerTypedValue; + +public class EnrichMoreSubjectTest { + + final EnrichMoreSubject matcher = new EnrichMoreSubject(); + + @BeforeEach + void setUp() throws Exception { + } + + @Test + void testFindDifferences_1() { + final OaBrokerMainEntity source = new OaBrokerMainEntity(); + final OaBrokerMainEntity target = new OaBrokerMainEntity(); + final List list = this.matcher.findDifferences(source, target); + assertTrue(list.isEmpty()); + } + + @Test + void testFindDifferences_2() { + final OaBrokerMainEntity source = new OaBrokerMainEntity(); + final OaBrokerMainEntity target = new OaBrokerMainEntity(); + source.setSubjects(Arrays.asList(new OaBrokerTypedValue("arxiv", "subject_01"))); + final List list = this.matcher.findDifferences(source, target); + assertEquals(1, list.size()); + } + + @Test + void testFindDifferences_3() { + final OaBrokerMainEntity source = new OaBrokerMainEntity(); + final OaBrokerMainEntity target = new OaBrokerMainEntity(); + target.setSubjects(Arrays.asList(new OaBrokerTypedValue("arxiv", "subject_01"))); + final List list = this.matcher.findDifferences(source, target); + assertTrue(list.isEmpty()); + } + + @Test + void testFindDifferences_4() { + final OaBrokerMainEntity source = new OaBrokerMainEntity(); + final OaBrokerMainEntity target = new OaBrokerMainEntity(); + source.setSubjects(Arrays.asList(new OaBrokerTypedValue("arxiv", "subject_01"))); + target.setSubjects(Arrays.asList(new OaBrokerTypedValue("arxiv", "subject_01"))); + final List list = this.matcher.findDifferences(source, target); + assertTrue(list.isEmpty()); + } + +} From d1cadc77c90bd3e6eca9b351b9dc4620cb915c2c Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 24 Sep 2024 10:57:20 +0200 Subject: [PATCH 017/111] [graph provision] person serialisation, limit the number of authorships and coauthorships before expanding 
the payloads --- .../dhp/schema/oaf/utils/ModelHardLimits.java | 6 +++--- .../dhp/oa/provision/PayloadConverterJob.java | 12 +++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java index e4b184fa1..68f60d4d9 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java @@ -25,11 +25,11 @@ public class ModelHardLimits { public static final int MAX_ABSTRACT_LENGTH = 150000; public static final int MAX_RELATED_ABSTRACT_LENGTH = 500; public static final int MAX_INSTANCES = 10; - public static final Map MAX_RELATIONS_BY_RELCLASS = Maps.newHashMap(); + public static final Map MAX_RELATIONS_BY_RELCLASS = Maps.newHashMap(); static { - MAX_RELATIONS_BY_RELCLASS.put(ModelConstants.PERSON_PERSON_HASCOAUTHORED, 500); - MAX_RELATIONS_BY_RELCLASS.put(ModelConstants.RESULT_PERSON_HASAUTHORED, 500); + MAX_RELATIONS_BY_RELCLASS.put(ModelConstants.PERSON_PERSON_HASCOAUTHORED, 500L); + MAX_RELATIONS_BY_RELCLASS.put(ModelConstants.RESULT_PERSON_HASAUTHORED, 500L); } public static String getCollectionName(String format) { diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java index cb2d2e799..58838d047 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java @@ -149,8 +149,8 @@ public class PayloadConverterJob { } /** - This function iterates through the RelatedEntityWrapper(s) associated to the JoinedEntity and rules out - those exceeding the maximum allowed frequency defined in eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits#MAX_RELATIONS_BY_RELCLASS + * This function iterates through the RelatedEntityWrapper(s) associated to the JoinedEntity and rules out + * those exceeding the maximum allowed frequency defined in eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits#MAX_RELATIONS_BY_RELCLASS */ private static JoinedEntity pruneRelatedEntities(JoinedEntity je) { Map freqs = Maps.newHashMap(); @@ -159,9 +159,11 @@ public class PayloadConverterJob { if (je.getLinks() != null) { je.getLinks().forEach(link -> { final String relClass = link.getRelation().getRelClass(); - Long count = freqs.putIfAbsent(relClass, 0L); - if (Objects.isNull(count) || (MAX_RELATIONS_BY_RELCLASS.containsKey(relClass) - && count <= MAX_RELATIONS_BY_RELCLASS.get(relClass))) { + + final Long count = freqs.getOrDefault(relClass, Long.MAX_VALUE); + final Long max = MAX_RELATIONS_BY_RELCLASS.getOrDefault(relClass, Long.MAX_VALUE); + + if (count <= max) { rew.add(link); freqs.put(relClass, freqs.get(relClass) + 1); } From 4f0463d7796ea18e32e94af551d1a951ed574503 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 24 Sep 2024 14:54:34 +0200 Subject: [PATCH 018/111] [graph provision] person serialisation, limit the number of authorships and coauthorships before expanding the payloads --- .../java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java 
b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java index 58838d047..2593ef6fe 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/PayloadConverterJob.java @@ -160,12 +160,12 @@ public class PayloadConverterJob { je.getLinks().forEach(link -> { final String relClass = link.getRelation().getRelClass(); - final Long count = freqs.getOrDefault(relClass, Long.MAX_VALUE); + final Long count = freqs.getOrDefault(relClass, 0L); final Long max = MAX_RELATIONS_BY_RELCLASS.getOrDefault(relClass, Long.MAX_VALUE); if (count <= max) { rew.add(link); - freqs.put(relClass, freqs.get(relClass) + 1); + freqs.put(relClass, freqs.getOrDefault(relClass, 0L) + 1); } }); je.setLinks(rew); From 62c4c3ed29dcf395b2f18c5c6495f6502f011c65 Mon Sep 17 00:00:00 2001 From: miconis Date: Wed, 9 Oct 2024 12:26:03 +0200 Subject: [PATCH 019/111] implementation of new comparators for organization and dataset disambiguation --- .../java/eu/dnetlib/pace/tree/CodeMatch.java | 2 +- .../eu/dnetlib/pace/tree/JsonListMatch.java | 43 +++++++++++++------ .../pace/comparators/ComparatorTest.java | 34 ++++++++++++--- 3 files changed, 60 insertions(+), 19 deletions(-) diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CodeMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CodeMatch.java index 25a12bcdf..5cd21adcd 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CodeMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/CodeMatch.java @@ -21,7 +21,7 @@ public class CodeMatch extends AbstractStringComparator { public CodeMatch(Map params) { super(params); this.params = params; - this.CODE_REGEX = Pattern.compile(params.getOrDefault("codeRegex", "[a-zA-Z]::\\d+")); + this.CODE_REGEX = Pattern.compile(params.getOrDefault("codeRegex", "[a-zA-Z]+::\\d+")); } public Set getRegexList(String input) { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java index 3897e37f8..e95d9206e 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java @@ -41,21 +41,38 @@ public class JsonListMatch extends AbstractListComparator { return -1; } - final Set ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet()); - final Set cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet()); + Set ca = sa.stream().map(this::toComparableString).collect(Collectors.toSet()); + Set cb = sb.stream().map(this::toComparableString).collect(Collectors.toSet()); - int incommon = Sets.intersection(ca, cb).size(); - int simDiff = Sets.symmetricDifference(ca, cb).size(); + switch (MODE) { + case "count": + return Sets.intersection(ca, cb).size(); - if (incommon + simDiff == 0) { - return 0.0; + case "percentage": + int incommon = Sets.intersection(ca, cb).size(); + int simDiff = Sets.symmetricDifference(ca, cb).size(); + if (incommon + simDiff == 0) { + return 0.0; + } + return (double) incommon / (incommon + simDiff); + + case "type": + Set typesA = ca.stream().map(s -> s.split("::")[0]).collect(Collectors.toSet()); + Set typesB = cb.stream().map(s -> s.split("::")[0]).collect(Collectors.toSet()); + + Set types = Sets.intersection(typesA, typesB); + + if (types.isEmpty()) // if no common type, it is impossible to compare 
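					// worked example (cf. the ComparatorTest update below): {grid::grid_1, ror::ror_1} vs
					// {grid::grid_1, ror::ror_2, isni::isni_1} share the types {grid, ror} and have only
					// grid::grid_1 in common, so the score is 1/2 = 0.5; -1 instead marks the pair as not
					// comparable, the same convention used above when one of the two lists is missing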
+ return -1; + + ca = ca.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet()); + cb = cb.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet()); + + return (double) Sets.intersection(ca, cb).size() / types.size(); + + default: + return -1; } - - if (MODE.equals("percentage")) - return (double) incommon / (incommon + simDiff); - else - return incommon; - } // converts every json into a comparable string basing on parameters @@ -69,7 +86,7 @@ public class JsonListMatch extends AbstractListComparator { // for each path in the param list for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) { String path = params.get(key); - String value = MapDocumentUtil.getJPathString(path, documentContext); + String value = MapDocumentUtil.getJPathString(path, documentContext).toLowerCase(); if (value == null || value.isEmpty()) value = ""; st.append(value); diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index c008902c4..635f1fa44 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -65,6 +65,23 @@ public class ComparatorTest extends AbstractPaceTest { } + @Test + public void datasetVersionCodeMatchTest() { + + params.put("codeRegex", "(?=[\\w-]*[a-zA-Z])(?=[\\w-]*\\d)[\\w-]+"); + CodeMatch codeMatch = new CodeMatch(params); + + // names have different codes + assertEquals(0.0, codeMatch.distance("physical oceanography at ctd station june 1998 ev02a", "physical oceanography at ctd station june 1998 ir02", conf)); + + // names have same code + assertEquals(1.0, codeMatch.distance("physical oceanography at ctd station june 1998 ev02a", "physical oceanography at ctd station june 1998 ev02a", conf)); + + // code is not in both names + assertEquals(-1, codeMatch.distance("physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998 ev02a", conf)); + assertEquals(1.0, codeMatch.distance("physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998", conf)); + } + @Test public void listContainsMatchTest() { @@ -257,15 +274,15 @@ public class ComparatorTest extends AbstractPaceTest { List a = createFieldList( Arrays .asList( - "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"), + "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}", + 
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_1\"}"), "authors"); List b = createFieldList( Arrays .asList( - "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}", - "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmid\",\"classname\":\"PubMed ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"27775869\"}", - "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}", - "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"handle\",\"classname\":\"Handle\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"1854/LU-8523529\"}"), + "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}", + "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_2\"}", + 
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"user:claim\",\"classname\":\"Linked by user\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"isni\",\"classname\":\"ISNI Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"isni_1\"}"), "authors"); double result = jsonListMatch.compare(a, b, conf); @@ -277,6 +294,13 @@ public class ComparatorTest extends AbstractPaceTest { result = jsonListMatch.compare(a, b, conf); assertEquals(1.0, result); + + params.put("mode", "type"); + jsonListMatch = new JsonListMatch(params); + result = jsonListMatch.compare(a, b, conf); + + assertEquals(0.5, result); + } @Test From 5015ba10eba66ba12750ff348cea559062f68e7e Mon Sep 17 00:00:00 2001 From: miconis Date: Mon, 14 Oct 2024 10:23:42 +0200 Subject: [PATCH 020/111] addition of date comparator --- dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java new file mode 100644 index 000000000..91b8d1c41 --- /dev/null +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java @@ -0,0 +1,2 @@ +package eu.dnetlib.pace.tree;public class DateRange { +} From eab623ddfa0b01137df729981ef99abe0c129215 Mon Sep 17 00:00:00 2001 From: miconis Date: Mon, 14 Oct 2024 10:24:19 +0200 Subject: [PATCH 021/111] implementation of date matcher --- .../java/eu/dnetlib/pace/tree/DateRange.java | 67 ++++++++++++++++++- .../pace/comparators/ComparatorTest.java | 19 ++++++ 2 files changed, 85 insertions(+), 1 deletion(-) diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java index 91b8d1c41..c913109a4 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java @@ -1,2 +1,67 @@ -package eu.dnetlib.pace.tree;public class DateRange { +package eu.dnetlib.pace.tree; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; +import org.joda.time.DateTime; + +import java.time.DateTimeException; +import java.time.LocalDate; +import java.time.Period; +import java.time.format.DateTimeFormatter; +import java.util.Locale; +import java.util.Map; + +@ComparatorClass("dateRange") +public class DateRange extends AbstractStringComparator { + + int YEAR_RANGE; + + public DateRange(Map params) { + super(params, new com.wcohen.ss.JaroWinkler()); + YEAR_RANGE = Integer.parseInt(params.getOrDefault("year_range", "3")); + } + + public DateRange(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected DateRange(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + public static boolean isNumeric(String str) { + return str.matches("\\d+"); //match a number with optional '-' and decimal. 
+	}
+
+	@Override
+	public double distance(final String a, final String b, final Config conf) {
+		if (a.isEmpty() || b.isEmpty()) {
+			return -1.0; // return -1 if a field is missing
+		}
+
+		try {
+			DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.ENGLISH);
+			LocalDate d1 = LocalDate.parse(a, formatter);
+			LocalDate d2 = LocalDate.parse(b, formatter);
+			Period period = Period.between(d1, d2);
+
+			return period.getYears() <= YEAR_RANGE? 1.0 : 0.0;
+		}
+		catch (DateTimeException e) {
+			return -1.0;
+		}
+
+	}
+
+	@Override
+	public double getWeight() {
+		return super.weight;
+	}
+
+	@Override
+	protected double normalize(final double d) {
+		return d;
+	}
 }
diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
index 635f1fa44..5e1484254 100644
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java
@@ -351,4 +351,23 @@ public class ComparatorTest extends AbstractPaceTest {
 
 	}
 
+	@Test
+	public void dateMatch() {
+
+		DateRange dateRange = new DateRange(params);
+
+		double result = dateRange.distance("2021-05-13", "2023-05-13", conf);
+		assertEquals(1.0, result);
+
+		result = dateRange.distance("2021-05-13", "2025-05-13", conf);
+		assertEquals(0.0, result);
+
+		result = dateRange.distance("", "2020-05-05", conf);
+		assertEquals(-1.0, result);
+
+		result = dateRange.distance("invalid date", "2021-05-02", conf);
+		assertEquals(-1.0, result);
+
+	}
+
 }

From 0e5dd14538fc8b2ba2bc08f3af93793e3b9e19b7 Mon Sep 17 00:00:00 2001
From: Miriam Baglioni
Date: Fri, 18 Oct 2024 16:22:21 +0200
Subject: [PATCH 022/111] [createASfromAffRo] adding the provenance datasource
 used to get the relation (note: the webcrawl tag corresponds to the publisher
 input, while rawaff marks the OpenAlex input)

---
 .../PrepareAffiliationRelations.java          | 47 ++++++++++---------
 .../PrepareAffiliationRelationsTest.java      | 16 +++----
 2 files changed, 34 insertions(+), 29 deletions(-)

diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
index 028fa47dc..61a018a41 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
@@ -104,22 +104,22 @@ public class PrepareAffiliationRelations implements Serializable {
 			.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
 
 		JavaPairRDD crossrefRelations = prepareAffiliationRelationsNewModel(
-			spark, crossrefInputPath, collectedfromOpenAIRE);
+			spark, crossrefInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::crossref");
 
 		JavaPairRDD pubmedRelations = prepareAffiliationRelations(
-			spark, pubmedInputPath, collectedfromOpenAIRE);
+			spark, pubmedInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::pubmed");
 
 		JavaPairRDD openAPCRelations = prepareAffiliationRelationsNewModel(
-			spark, openapcInputPath, collectedfromOpenAIRE);
+			spark, openapcInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::openapc");
 
-		JavaPairRDD dataciteRelations = prepareAffiliationRelations(
-			spark, dataciteInputPath, collectedfromOpenAIRE);
+		JavaPairRDD 
dataciteRelations = prepareAffiliationRelationsNewModel( + spark, dataciteInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::datacite"); - JavaPairRDD webCrawlRelations = prepareAffiliationRelations( - spark, webcrawlInputPath, collectedfromOpenAIRE); + JavaPairRDD webCrawlRelations = prepareAffiliationRelationsNewModel( + spark, webcrawlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::rawaff"); - JavaPairRDD publisherRelations = prepareAffiliationRelationFromPublisher( - spark, publisherlInputPath, collectedfromOpenAIRE); + JavaPairRDD publisherRelations = prepareAffiliationRelationFromPublisherNewModel( + spark, publisherlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::webcrawl"); crossrefRelations .union(pubmedRelations) @@ -133,7 +133,8 @@ public class PrepareAffiliationRelations implements Serializable { private static JavaPairRDD prepareAffiliationRelationFromPublisherNewModel(SparkSession spark, String inputPath, - List collectedfrom) { + List collectedfrom, + String dataprovenance) { Dataset df = spark .read() @@ -142,12 +143,13 @@ public class PrepareAffiliationRelations implements Serializable { .json(inputPath) .where("DOI is not null"); - return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings")); + return getTextTextJavaPairRDDNew( + collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"), dataprovenance); } private static JavaPairRDD prepareAffiliationRelationFromPublisher(SparkSession spark, String inputPath, - List collectedfrom) { + List collectedfrom, String dataprovenance) { Dataset df = spark .read() @@ -155,13 +157,14 @@ public class PrepareAffiliationRelations implements Serializable { .json(inputPath) .where("DOI is not null"); - return getTextTextJavaPairRDD(collectedfrom, df.selectExpr("DOI", "Organizations as Matchings")); + return getTextTextJavaPairRDD( + collectedfrom, df.selectExpr("DOI", "Organizations as Matchings"), dataprovenance); } private static JavaPairRDD prepareAffiliationRelations(SparkSession spark, String inputPath, - List collectedfrom) { + List collectedfrom, String dataprovenance) { // load and parse affiliation relations from HDFS Dataset df = spark @@ -170,12 +173,12 @@ public class PrepareAffiliationRelations implements Serializable { .json(inputPath) .where("DOI is not null"); - return getTextTextJavaPairRDD(collectedfrom, df); + return getTextTextJavaPairRDD(collectedfrom, df, dataprovenance); } private static JavaPairRDD prepareAffiliationRelationsNewModel(SparkSession spark, String inputPath, - List collectedfrom) { + List collectedfrom, String dataprovenance) { // load and parse affiliation relations from HDFS Dataset df = spark .read() @@ -184,10 +187,11 @@ public class PrepareAffiliationRelations implements Serializable { .json(inputPath) .where("DOI is not null"); - return getTextTextJavaPairRDDNew(collectedfrom, df); + return getTextTextJavaPairRDDNew(collectedfrom, df, dataprovenance); } - private static JavaPairRDD getTextTextJavaPairRDD(List collectedfrom, Dataset df) { + private static JavaPairRDD getTextTextJavaPairRDD(List collectedfrom, Dataset df, + String dataprovenance) { // unroll nested arrays df = df .withColumn("matching", functions.explode(new Column("Matchings"))) @@ -219,7 +223,7 @@ public class PrepareAffiliationRelations implements Serializable { DataInfo dataInfo = OafMapperUtils .dataInfo( false, - BIP_INFERENCE_PROVENANCE, + dataprovenance, true, false, qualifier, @@ -235,7 +239,8 @@ public class 
PrepareAffiliationRelations implements Serializable { new Text(OBJECT_MAPPER.writeValueAsString(aa)))); } - private static JavaPairRDD getTextTextJavaPairRDDNew(List collectedfrom, Dataset df) { + private static JavaPairRDD getTextTextJavaPairRDDNew(List collectedfrom, Dataset df, + String dataprovenance) { // unroll nested arrays df = df .withColumn("matching", functions.explode(new Column("Matchings"))) @@ -276,7 +281,7 @@ public class PrepareAffiliationRelations implements Serializable { DataInfo dataInfo = OafMapperUtils .dataInfo( false, - BIP_INFERENCE_PROVENANCE, + dataprovenance, true, false, qualifier, diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java index 179cbecb5..c704bb99b 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java @@ -98,9 +98,9 @@ public class PrepareAffiliationRelationsTest { "-crossrefInputPath", crossrefAffiliationRelationPathNew, "-pubmedInputPath", crossrefAffiliationRelationPath, "-openapcInputPath", crossrefAffiliationRelationPathNew, - "-dataciteInputPath", crossrefAffiliationRelationPath, - "-webCrawlInputPath", crossrefAffiliationRelationPath, - "-publisherInputPath", publisherAffiliationRelationOldPath, + "-dataciteInputPath", crossrefAffiliationRelationPathNew, + "-webCrawlInputPath", crossrefAffiliationRelationPathNew, + "-publisherInputPath", publisherAffiliationRelationPath, "-outputPath", outputPath }); @@ -112,7 +112,7 @@ public class PrepareAffiliationRelationsTest { .map(aa -> ((Relation) aa.getPayload())); // count the number of relations - assertEquals(150, tmp.count());// 18 + 24 *3 + 30 * 2 = + assertEquals(162, tmp.count());// 18 + 24 + 30 * 4 = Dataset dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); dataset.createOrReplaceTempView("result"); @@ -123,7 +123,7 @@ public class PrepareAffiliationRelationsTest { // verify that we have equal number of bi-directional relations Assertions .assertEquals( - 75, execVerification + 81, execVerification .filter( "relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'") .collectAsList() @@ -131,7 +131,7 @@ public class PrepareAffiliationRelationsTest { Assertions .assertEquals( - 75, execVerification + 81, execVerification .filter( "relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'") .collectAsList() @@ -158,7 +158,7 @@ public class PrepareAffiliationRelationsTest { Assertions .assertEquals( - 2, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count()); + 4, execVerification.filter("source = '" + publisherid + "' and target = '" + rorId + "'").count()); Assertions .assertEquals( @@ -173,7 +173,7 @@ public class PrepareAffiliationRelationsTest { Assertions .assertEquals( - 3, execVerification + 1, execVerification .filter( "source = '" + ID_PREFIX + IdentifierFactory From 2b27afaec8d3a6bd7bdaf2ff83040f88c68660e2 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Fri, 18 Oct 2024 16:22:51 +0200 Subject: [PATCH 023/111] [createASfromAffRo] refactoring after compilation --- .../dhp/oa/graph/hive/GraphHiveTableImporterJob.java | 6 +++--- .../dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala | 1 + 2 
files changed, 4 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java index 73243dbc5..d4fec3f52 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java @@ -72,9 +72,9 @@ public class GraphHiveTableImporterJob { final Encoder clazzEncoder = Encoders.bean(clazz); Dataset dataset = spark - .read() - .schema(clazzEncoder.schema()) - .json(inputPath); + .read() + .schema(clazzEncoder.schema()) + .json(inputPath); if (numPartitions > 0) { log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala index 4e5ad5365..eece56b74 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala +++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala @@ -31,6 +31,7 @@ class ORCIDAuthorMatchersTest { assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin")) // assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented } + @Test def testDocumentationNames(): Unit = { assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones")) } From ce4ee1189f4d83ff114c63ddb782621db7a939b2 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Mon, 21 Oct 2024 14:38:15 +0200 Subject: [PATCH 024/111] [personEntity] create entity for each profile in orcid even without works. 
Added validated true to each relation coming from orcid data --- .../personentity/CoAuthorshipIterator.java | 5 ++- .../personentity/ExtractPerson.java | 31 +++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/CoAuthorshipIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/CoAuthorshipIterator.java index 76e4c4851..94ac7ab28 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/CoAuthorshipIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/CoAuthorshipIterator.java @@ -61,7 +61,8 @@ public class CoAuthorshipIterator implements Iterator { private Relation getRelation(String orcid1, String orcid2) { String source = PERSON_PREFIX + IdentifierFactory.md5(orcid1); String target = PERSON_PREFIX + IdentifierFactory.md5(orcid2); - return OafMapperUtils + Relation relation = + OafMapperUtils .getRelation( source, target, ModelConstants.PERSON_PERSON_RELTYPE, ModelConstants.PERSON_PERSON_SUBRELTYPE, @@ -76,5 +77,7 @@ public class CoAuthorshipIterator implements Iterator { ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS), "0.91"), null); + relation.setValidated(true); + return relation; } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java index 6f61d427d..c29c04699 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java @@ -144,18 +144,13 @@ public class ExtractPerson implements Serializable { .parquet(inputPath + "Employments") .as(Encoders.bean(Employment.class)); - Dataset peopleToMap = authors - .joinWith(works, authors.col("orcid").equalTo(works.col("orcid"))) - .map((MapFunction, Author>) t2 -> t2._1(), Encoders.bean(Author.class)) - .groupByKey((MapFunction) a -> a.getOrcid(), Encoders.STRING()) - .mapGroups((MapGroupsFunction) (k, it) -> it.next(), Encoders.bean(Author.class)); - Dataset employment = employmentDataset - .joinWith(peopleToMap, employmentDataset.col("orcid").equalTo(peopleToMap.col("orcid"))) + .joinWith(authors, employmentDataset.col("orcid").equalTo(authors.col("orcid"))) .map((MapFunction, Employment>) t2 -> t2._1(), Encoders.bean(Employment.class)); + //Mapping all the orcid profiles even if the profile has no visible works Dataset people; - peopleToMap.map((MapFunction) op -> { + authors.map((MapFunction) op -> { Person person = new Person(); person.setId(DHPUtils.generateIdentifier(op.getOrcid(), PERSON_PREFIX)); person @@ -325,6 +320,7 @@ public class ExtractPerson implements Serializable { Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)), DATAINFO, null); + relation.setValidated(true); if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) { KeyValue kv = new KeyValue(); @@ -412,14 +408,15 @@ public class ExtractPerson implements Serializable { default: return null; } - - return OafMapperUtils - .getRelation( - source, target, ModelConstants.RESULT_PERSON_RELTYPE, - ModelConstants.RESULT_PERSON_SUBRELTYPE, - ModelConstants.RESULT_PERSON_HASAUTHORED, - 
Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)), - DATAINFO, - null); + Relation relation = OafMapperUtils + .getRelation( + source, target, ModelConstants.RESULT_PERSON_RELTYPE, + ModelConstants.RESULT_PERSON_SUBRELTYPE, + ModelConstants.RESULT_PERSON_HASAUTHORED, + Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)), + DATAINFO, + null); + relation.setValidated(true); + return relation; } } From 09a2c93fc7197e48a8f769a131908e22ed02a78c Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Mon, 21 Oct 2024 16:21:15 +0200 Subject: [PATCH 025/111] [personEntity] added relations with projects extracting the info from the database --- .../personentity/ExtractPerson.java | 166 ++++++++++++++---- 1 file changed, 133 insertions(+), 33 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java index c29c04699..960dfbe44 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java @@ -2,13 +2,18 @@ package eu.dnetlib.dhp.actionmanager.personentity; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import static org.apache.spark.sql.functions.*; +import java.io.BufferedWriter; import java.io.IOException; +import java.io.OutputStreamWriter; import java.io.Serializable; +import java.nio.charset.StandardCharsets; +import java.sql.ResultSet; +import java.sql.SQLException; import java.util.*; import java.util.stream.Collectors; +import eu.dnetlib.dhp.common.DbClient; import org.apache.commons.cli.ParseException; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; @@ -22,6 +27,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.spark_project.jetty.util.StringUtil; + import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; @@ -43,9 +49,14 @@ import eu.dnetlib.dhp.schema.oaf.utils.PidType; import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; + public class ExtractPerson implements Serializable { private static final Logger log = LoggerFactory.getLogger(ExtractPerson.class); - + private static final String QUERY = "SELECT * FROM project_person WHERE pid_type = 'ORCID'"; private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final String OPENAIRE_PREFIX = "openaire____"; private static final String SEPARATOR = "::"; @@ -62,6 +73,12 @@ public class ExtractPerson implements Serializable { private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______"; public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid"; public static final String ORCID_AUTHORS_CLASSNAME = "Imported from ORCID"; + public static final String OPENAIRE_DATASOURCE_ID = "10|infrastruct_::f66f1bd369679b5b077dcdf006089556"; + public static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE"; + + public static List collectedfromOpenAIRE = OafMapperUtils + .listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME); + public static final DataInfo DATAINFO = OafMapperUtils 
.dataInfo( @@ -106,19 +123,130 @@ public class ExtractPerson implements Serializable { final String workingDir = parser.get("workingDir"); log.info("workingDir {}", workingDir); + final String dbUrl = parser.get("postgresUrl"); + final String dbUser = parser.get("postgresUser"); + final String dbPassword = parser.get("postgresPassword"); + + final String hdfsNameNode = parser.get("hdfsNameNode"); + SparkConf conf = new SparkConf(); runWithSparkSession( conf, isSparkSessionManaged, spark -> { HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration()); - createActionSet(spark, inputPath, outputPath, workingDir); + extractInfoForActionSetFromORCID(spark, inputPath, workingDir); + extractInfoForActionSetFromProjects(spark, inputPath, workingDir, dbUrl, dbUser, dbPassword, workingDir + "/project", hdfsNameNode); + createActionSet(spark, outputPath, workingDir); }); } - private static void createActionSet(SparkSession spark, String inputPath, String outputPath, String workingDir) { + private static void extractInfoForActionSetFromProjects(SparkSession spark, String inputPath, String workingDir, + String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode) throws IOException { + Configuration conf = new Configuration(); + conf.set("fs.defaultFS", hdfsNameNode); + + FileSystem fileSystem = FileSystem.get(conf); + Path hdfsWritePath = new Path(hdfsPath); + FSDataOutputStream fos = fileSystem.create(hdfsWritePath); + try (DbClient dbClient = new DbClient(dbUrl, dbUser, dbPassword)) { + try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(fos, StandardCharsets.UTF_8))) { + dbClient.processResults(QUERY, rs -> writeRelation(getRelationWithProject(rs), writer)); + } + + } catch (IOException e) { + e.printStackTrace(); + } + + } + + public static Relation getRelationWithProject(ResultSet rs) { + try { + return getProjectRelation(rs.getString("project"), rs.getString("pid"), + rs.getString("role")); + } catch (final SQLException e) { + throw new RuntimeException(e); + } + } + + private static Relation getProjectRelation(String project, String orcid, String role) { + + String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid); + String target = project.substring(0,14) + + IdentifierFactory.md5(project.substring(15)); + List properties = new ArrayList<>(); + + Relation relation = OafMapperUtils + .getRelation( + source, target, ModelConstants.PROJECT_PERSON_RELTYPE, ModelConstants.PROJECT_PERSON_SUBRELTYPE, + ModelConstants.PROJECT_PERSON_PARTICIPATES, + collectedfromOpenAIRE, + DATAINFO, + null); + relation.setValidated(true); + + if (StringUtil.isNotBlank(role)) { + KeyValue kv = new KeyValue(); + kv.setKey("role"); + kv.setValue(role); + properties.add(kv); + } + + + if (!properties.isEmpty()) + relation.setProperties(properties); + return relation; + + + } + + protected static void writeRelation(final Relation relation, BufferedWriter writer) { + try { + writer.write(OBJECT_MAPPER.writeValueAsString(relation)); + writer.newLine(); + } catch (final IOException e) { + throw new RuntimeException(e); + } + } + + private static void createActionSet(SparkSession spark,String outputPath, String workingDir) { + + Dataset people; + people = spark + .read() + .textFile(workingDir + "/people") + .map( + (MapFunction) value -> OBJECT_MAPPER + .readValue(value, Person.class), + Encoders.bean(Person.class)); + + people + .toJavaRDD() + .map(p -> new AtomicAction(p.getClass(), p)) + .union( + getRelations(spark, workingDir + 
"/authorship").toJavaRDD().map(r -> new AtomicAction(r.getClass(), r))) + .union( + getRelations(spark, workingDir + "/coauthorship") + .toJavaRDD() + .map(r -> new AtomicAction(r.getClass(), r))) + .union( + getRelations(spark, workingDir + "/affiliation") + .toJavaRDD() + .map(r -> new AtomicAction(r.getClass(), r))) + .union( + getRelations(spark, workingDir + "/project") + .toJavaRDD() + .map(r -> new AtomicAction(r.getClass(), r))) + .mapToPair( + aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), + new Text(OBJECT_MAPPER.writeValueAsString(aa)))) + .saveAsHadoopFile( + outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class); + } + + private static void extractInfoForActionSetFromORCID(SparkSession spark, String inputPath, String workingDir) { Dataset authors = spark .read() .parquet(inputPath + "Authors") @@ -149,7 +277,7 @@ public class ExtractPerson implements Serializable { .map((MapFunction, Employment>) t2 -> t2._1(), Encoders.bean(Employment.class)); //Mapping all the orcid profiles even if the profile has no visible works - Dataset people; + authors.map((MapFunction) op -> { Person person = new Person(); person.setId(DHPUtils.generateIdentifier(op.getOrcid(), PERSON_PREFIX)); @@ -257,34 +385,6 @@ public class ExtractPerson implements Serializable { .option("compression", "gzip") .mode(SaveMode.Overwrite) .json(workingDir + "/affiliation"); - - people = spark - .read() - .textFile(workingDir + "/people") - .map( - (MapFunction) value -> OBJECT_MAPPER - .readValue(value, Person.class), - Encoders.bean(Person.class)); - - people.show(false); - people - .toJavaRDD() - .map(p -> new AtomicAction(p.getClass(), p)) - .union( - getRelations(spark, workingDir + "/authorship").toJavaRDD().map(r -> new AtomicAction(r.getClass(), r))) - .union( - getRelations(spark, workingDir + "/coauthorship") - .toJavaRDD() - .map(r -> new AtomicAction(r.getClass(), r))) - .union( - getRelations(spark, workingDir + "/affiliation") - .toJavaRDD() - .map(r -> new AtomicAction(r.getClass(), r))) - .mapToPair( - aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), - new Text(OBJECT_MAPPER.writeValueAsString(aa)))) - .saveAsHadoopFile( - outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class); } private static Dataset getRelations(SparkSession spark, String path) { From 821540f94a0ae055c74cd642ec6921465465e8ae Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Tue, 22 Oct 2024 10:13:30 +0200 Subject: [PATCH 026/111] [personEntity] updated the property file to include also the db parameters. The same for the wf definition. 
Refactoring for compilation --- .../personentity/CoAuthorshipIterator.java | 3 +- .../personentity/ExtractPerson.java | 99 +++++++++---------- .../personentity/as_parameters.json | 25 +++++ .../actionmanager/personentity/job.properties | 5 +- .../personentity/oozie_app/workflow.xml | 16 +++ .../orcid/ORCIDAuthorMatchersTest.scala | 1 + 6 files changed, 95 insertions(+), 54 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/CoAuthorshipIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/CoAuthorshipIterator.java index 94ac7ab28..131f3f466 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/CoAuthorshipIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/CoAuthorshipIterator.java @@ -61,8 +61,7 @@ public class CoAuthorshipIterator implements Iterator { private Relation getRelation(String orcid1, String orcid2) { String source = PERSON_PREFIX + IdentifierFactory.md5(orcid1); String target = PERSON_PREFIX + IdentifierFactory.md5(orcid2); - Relation relation = - OafMapperUtils + Relation relation = OafMapperUtils .getRelation( source, target, ModelConstants.PERSON_PERSON_RELTYPE, ModelConstants.PERSON_PERSON_SUBRELTYPE, diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java index 960dfbe44..fb0621b6e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java @@ -13,9 +13,12 @@ import java.sql.SQLException; import java.util.*; import java.util.stream.Collectors; -import eu.dnetlib.dhp.common.DbClient; import org.apache.commons.cli.ParseException; import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.BZip2Codec; import org.apache.hadoop.mapred.SequenceFileOutputFormat; @@ -27,13 +30,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.spark_project.jetty.util.StringUtil; - import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.collection.orcid.model.Author; import eu.dnetlib.dhp.collection.orcid.model.Employment; import eu.dnetlib.dhp.collection.orcid.model.Work; +import eu.dnetlib.dhp.common.DbClient; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.common.ModelConstants; @@ -49,11 +52,6 @@ import eu.dnetlib.dhp.schema.oaf.utils.PidType; import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; - public class ExtractPerson implements Serializable { private static final Logger log = LoggerFactory.getLogger(ExtractPerson.class); private static final String QUERY = "SELECT * FROM project_person WHERE pid_type = 'ORCID'"; @@ -77,8 +75,7 @@ public class ExtractPerson implements Serializable { public 
static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE"; public static List collectedfromOpenAIRE = OafMapperUtils - .listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME); - + .listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME); public static final DataInfo DATAINFO = OafMapperUtils .dataInfo( @@ -136,14 +133,15 @@ public class ExtractPerson implements Serializable { spark -> { HdfsSupport.remove(outputPath, spark.sparkContext().hadoopConfiguration()); extractInfoForActionSetFromORCID(spark, inputPath, workingDir); - extractInfoForActionSetFromProjects(spark, inputPath, workingDir, dbUrl, dbUser, dbPassword, workingDir + "/project", hdfsNameNode); + extractInfoForActionSetFromProjects( + spark, inputPath, workingDir, dbUrl, dbUser, dbPassword, workingDir + "/project", hdfsNameNode); createActionSet(spark, outputPath, workingDir); }); } private static void extractInfoForActionSetFromProjects(SparkSession spark, String inputPath, String workingDir, - String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode) throws IOException { + String dbUrl, String dbUser, String dbPassword, String hdfsPath, String hdfsNameNode) throws IOException { Configuration conf = new Configuration(); conf.set("fs.defaultFS", hdfsNameNode); @@ -164,41 +162,40 @@ public class ExtractPerson implements Serializable { public static Relation getRelationWithProject(ResultSet rs) { try { - return getProjectRelation(rs.getString("project"), rs.getString("pid"), - rs.getString("role")); + return getProjectRelation( + rs.getString("project"), rs.getString("pid"), + rs.getString("role")); } catch (final SQLException e) { throw new RuntimeException(e); } - } + } private static Relation getProjectRelation(String project, String orcid, String role) { - String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid); - String target = project.substring(0,14) - + IdentifierFactory.md5(project.substring(15)); - List properties = new ArrayList<>(); + String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid); + String target = project.substring(0, 14) + + IdentifierFactory.md5(project.substring(15)); + List properties = new ArrayList<>(); - Relation relation = OafMapperUtils - .getRelation( - source, target, ModelConstants.PROJECT_PERSON_RELTYPE, ModelConstants.PROJECT_PERSON_SUBRELTYPE, - ModelConstants.PROJECT_PERSON_PARTICIPATES, - collectedfromOpenAIRE, - DATAINFO, - null); - relation.setValidated(true); + Relation relation = OafMapperUtils + .getRelation( + source, target, ModelConstants.PROJECT_PERSON_RELTYPE, ModelConstants.PROJECT_PERSON_SUBRELTYPE, + ModelConstants.PROJECT_PERSON_PARTICIPATES, + collectedfromOpenAIRE, + DATAINFO, + null); + relation.setValidated(true); - if (StringUtil.isNotBlank(role)) { - KeyValue kv = new KeyValue(); - kv.setKey("role"); - kv.setValue(role); - properties.add(kv); - } - - - if (!properties.isEmpty()) - relation.setProperties(properties); - return relation; + if (StringUtil.isNotBlank(role)) { + KeyValue kv = new KeyValue(); + kv.setKey("role"); + kv.setValue(role); + properties.add(kv); + } + if (!properties.isEmpty()) + relation.setProperties(properties); + return relation; } @@ -211,7 +208,7 @@ public class ExtractPerson implements Serializable { } } - private static void createActionSet(SparkSession spark,String outputPath, String workingDir) { + private static void createActionSet(SparkSession spark, String outputPath, String workingDir) { Dataset people; people = spark @@ -221,7 +218,7 @@ public class ExtractPerson 
implements Serializable { (MapFunction) value -> OBJECT_MAPPER .readValue(value, Person.class), Encoders.bean(Person.class));
-
+
 people .toJavaRDD() .map(p -> new AtomicAction(p.getClass(), p))
@@ -235,10 +232,10 @@ public class ExtractPerson implements Serializable { getRelations(spark, workingDir + "/affiliation") .toJavaRDD() .map(r -> new AtomicAction(r.getClass(), r)))
- .union(
- getRelations(spark, workingDir + "/project")
- .toJavaRDD()
- .map(r -> new AtomicAction(r.getClass(), r)))
+ .union(
+ getRelations(spark, workingDir + "/project")
+ .toJavaRDD()
+ .map(r -> new AtomicAction(r.getClass(), r)))
 .mapToPair( aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), new Text(OBJECT_MAPPER.writeValueAsString(aa))))
@@ -276,7 +273,7 @@ public class ExtractPerson implements Serializable { .joinWith(authors, employmentDataset.col("orcid").equalTo(authors.col("orcid"))) .map((MapFunction, Employment>) t2 -> t2._1(), Encoders.bean(Employment.class));
- //Mapping all the orcid profiles even if the profile has no visible works
+ // Mapping all the orcid profiles even if the profile has no visible works
 authors.map((MapFunction) op -> { Person person = new Person();
@@ -509,13 +506,13 @@ public class ExtractPerson implements Serializable { return null; } Relation relation = OafMapperUtils
- .getRelation(
- source, target, ModelConstants.RESULT_PERSON_RELTYPE,
- ModelConstants.RESULT_PERSON_SUBRELTYPE,
- ModelConstants.RESULT_PERSON_HASAUTHORED,
- Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
- DATAINFO,
- null);
+ .getRelation(
+ source, target, ModelConstants.RESULT_PERSON_RELTYPE,
+ ModelConstants.RESULT_PERSON_SUBRELTYPE,
+ ModelConstants.RESULT_PERSON_HASAUTHORED,
+ Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
+ DATAINFO,
+ null);
 relation.setValidated(true); return relation; }
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/as_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/as_parameters.json
index 5175552e7..1894a6beb 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/as_parameters.json
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/as_parameters.json
@@ -21,5 +21,30 @@ "paramLongName": "workingDir", "paramDescription": "the hdfs name node", "paramRequired": false
+},
+ {
+ "paramName": "pu",
+ "paramLongName": "postgresUrl",
+ "paramDescription": "the URL of the postgres database",
+ "paramRequired": false
+ },
+
+ {
+ "paramName": "ps",
+ "paramLongName": "postgresUser",
+ "paramDescription": "the postgres database user",
+ "paramRequired": false
+ },
+ {
+ "paramName": "pp",
+ "paramLongName": "postgresPassword",
+ "paramDescription": "the postgres database password",
+ "paramRequired": false
+},{
+ "paramName": "nn",
+ "paramLongName": "hdfsNameNode",
+ "paramDescription": "the hdfs name node",
+ "paramRequired": false }
+
 ]
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/job.properties b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/job.properties
index d2269718c..ac63d8a68 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/job.properties
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/job.properties
@@ -1,2 +1,5 @@
inputPath=/data/orcid_2023/tables/
-outputPath=/user/miriam.baglioni/peopleAS
\ No newline at end of file
+outputPath=/user/miriam.baglioni/peopleAS
+postgresUrl=jdbc:postgresql://beta.services.openaire.eu:5432/dnet_openaireplus
+postgresUser=dnet'
+postgresPassword=dnetPwd
\ No newline at end of file
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/oozie_app/workflow.xml
index 166e7bb9c..5b613a76a 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/oozie_app/workflow.xml
@@ -9,6 +9,18 @@ outputPath the path where to store the actionset
+
+ postgresUrl
+ the URL of the postgres database
+
+
+ postgresUser
+ the postgres database user
+
+
+ postgresPassword
+ the postgres database password
+
 sparkDriverMemory memory for driver process
@@ -102,6 +114,10 @@ --inputPath${inputPath} --outputPath${outputPath} --workingDir${workingDir}
+ --hdfsNameNode${nameNode}
+ --postgresUrl${postgresUrl}
+ --postgresUser${postgresUser}
+ --postgresPassword${postgresPassword}
diff --git a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala
index 4e5ad5365..eece56b74 100644
--- a/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/test/scala/eu/dnetlib/dhp/enrich/orcid/ORCIDAuthorMatchersTest.scala
@@ -31,6 +31,7 @@ class ORCIDAuthorMatchersTest { assertTrue(matchOrderedTokenAndAbbreviations("孙林 Sun Lin", "Sun Lin")) // assertTrue(AuthorsMatchRevised.compare("孙林 Sun Lin", "孙林")); // not yet implemented }
+
 @Test def testDocumentationNames(): Unit = { assertTrue(matchOrderedTokenAndAbbreviations("James C. A. Miller-Jones", "James Antony Miller-Jones")) }
From aac5eb34995b6cf736c5a8dbdc67d66891a89992 Mon Sep 17 00:00:00 2001
From: Miriam Baglioni
Date: Tue, 22 Oct 2024 11:54:16 +0200
Subject: [PATCH 027/111] [personEntity] changed the data info for the relations with projects.
added missing parameters to the job.properties file --- .../personentity/ExtractPerson.java | 51 ++++++++----------- .../actionmanager/personentity/job.properties | 2 +- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java index fb0621b6e..7b04d4d52 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java @@ -71,13 +71,15 @@ public class ExtractPerson implements Serializable { private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______"; public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid"; public static final String ORCID_AUTHORS_CLASSNAME = "Imported from ORCID"; + public static final String FUNDER_AUTHORS_CLASSID = "sysimport:crosswalk:funderdatabase"; + public static final String FUNDER_AUTHORS_CLASSNAME = "Imported from Funder Database"; public static final String OPENAIRE_DATASOURCE_ID = "10|infrastruct_::f66f1bd369679b5b077dcdf006089556"; public static final String OPENAIRE_DATASOURCE_NAME = "OpenAIRE"; public static List collectedfromOpenAIRE = OafMapperUtils .listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME); - public static final DataInfo DATAINFO = OafMapperUtils + public static final DataInfo ORCIDDATAINFO = OafMapperUtils .dataInfo( false, null, @@ -91,6 +93,20 @@ public class ExtractPerson implements Serializable { ModelConstants.DNET_PROVENANCE_ACTIONS), "0.91"); + public static final DataInfo FUNDERDATAINFO = OafMapperUtils + .dataInfo( + false, + null, + false, + false, + OafMapperUtils + .qualifier( + FUNDER_AUTHORS_CLASSID, + FUNDER_AUTHORS_CLASSNAME, + ModelConstants.DNET_PROVENANCE_ACTIONS, + ModelConstants.DNET_PROVENANCE_ACTIONS), + "0.91"); + public static void main(final String[] args) throws IOException, ParseException { final ArgumentApplicationParser parser = new ArgumentApplicationParser( @@ -182,7 +198,7 @@ public class ExtractPerson implements Serializable { source, target, ModelConstants.PROJECT_PERSON_RELTYPE, ModelConstants.PROJECT_PERSON_SUBRELTYPE, ModelConstants.PROJECT_PERSON_PARTICIPATES, collectedfromOpenAIRE, - DATAINFO, + FUNDERDATAINFO, null); relation.setValidated(true); @@ -328,7 +344,7 @@ public class ExtractPerson implements Serializable { ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, null)); person.setDateofcollection(op.getLastModifiedDate()); person.setOriginalId(Arrays.asList(op.getOrcid())); - person.setDataInfo(DATAINFO); + person.setDataInfo(ORCIDDATAINFO); return person; }, Encoders.bean(Person.class)) .write() @@ -415,7 +431,7 @@ public class ExtractPerson implements Serializable { source, target, ModelConstants.ORG_PERSON_RELTYPE, ModelConstants.ORG_PERSON_SUBRELTYPE, ModelConstants.ORG_PERSON_PARTICIPATES, Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)), - DATAINFO, + ORCIDDATAINFO, null); relation.setValidated(true); @@ -438,31 +454,6 @@ public class ExtractPerson implements Serializable { } - private static Collection getCoAuthorshipRelations(String orcid1, String orcid2) { - String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid1); - String target = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid2); - - 
return Arrays
- .asList(
- OafMapperUtils
- .getRelation(
- source, target, ModelConstants.PERSON_PERSON_RELTYPE,
- ModelConstants.PERSON_PERSON_SUBRELTYPE,
- ModelConstants.PERSON_PERSON_HASCOAUTHORED,
- Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
- DATAINFO,
- null),
- OafMapperUtils
- .getRelation(
- target, source, ModelConstants.PERSON_PERSON_RELTYPE,
- ModelConstants.PERSON_PERSON_SUBRELTYPE,
- ModelConstants.PERSON_PERSON_HASCOAUTHORED,
- Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
- DATAINFO,
- null));
-
- }
-
 private static @NotNull Iterator getAuthorshipRelationIterator(Work w) { if (Optional.ofNullable(w.getPids()).isPresent())
@@ -511,7 +502,7 @@ public class ExtractPerson implements Serializable { ModelConstants.RESULT_PERSON_SUBRELTYPE, ModelConstants.RESULT_PERSON_HASAUTHORED, Arrays.asList(OafMapperUtils.keyValue(orcidKey, ModelConstants.ORCID_DS)),
- DATAINFO,
+ ORCIDDATAINFO,
 null); relation.setValidated(true); return relation; }
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/job.properties b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/job.properties
index ac63d8a68..b9325bcb7 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/job.properties
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/personentity/job.properties
@@ -1,5 +1,5 @@ inputPath=/data/orcid_2023/tables/ outputPath=/user/miriam.baglioni/peopleAS postgresUrl=jdbc:postgresql://beta.services.openaire.eu:5432/dnet_openaireplus
-postgresUser=dnet'
+postgresUser=dnet
 postgresPassword=dnetPwd
\ No newline at end of file
From 0e34b0ece13f4d6dfeb5a4f0cb274168327c954a Mon Sep 17 00:00:00 2001
From: Giambattista Bloisi
Date: Mon, 21 Oct 2024 09:05:13 +0200
Subject: [PATCH 028/111] Fix imports: point them to the main distribution packages
---
 .../src/test/java/eu/dnetlib/pace/util/UtilTest.java | 1 -
 .../dhp/actionmanager/personentity/ExtractPerson.java | 6 +++---
 .../eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala | 2 +-
 .../scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala | 2 +-
 .../main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala | 2 +-
 .../src/main/java/eu/dnetlib/dhp/api/Utils.java | 4 ++--
 .../java/eu/dnetlib/dhp/bulktag/community/Constraint.java | 2 +-
 .../dhp/oa/graph/hive/GraphHiveTableImporterJob.java | 6 +++---
 .../main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java | 2 +-
 9 files changed, 13 insertions(+), 14 deletions(-)
diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
index be5c1ebb9..93db552c1 100644
--- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
+++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java
@@ -11,7 +11,6 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import eu.dnetlib.pace.model.Person;
-import jdk.nashorn.internal.ir.annotations.Ignore;
 public class UtilTest {
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
index e63a50984..debf7e38e 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
+++
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java @@ -11,6 +11,7 @@ import java.util.stream.Collectors; import org.apache.commons.cli.ParseException; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.BZip2Codec; import org.apache.hadoop.mapred.SequenceFileOutputFormat; @@ -20,7 +21,6 @@ import org.apache.spark.sql.*; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.spark_project.jetty.util.StringUtil; import com.fasterxml.jackson.databind.ObjectMapper; @@ -317,13 +317,13 @@ public class ExtractPerson implements Serializable { "0.91"), null); - if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) { + if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtils.isNotBlank(row.getStartDate())) { KeyValue kv = new KeyValue(); kv.setKey("startDate"); kv.setValue(row.getStartDate()); properties.add(kv); } - if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtil.isNotBlank(row.getEndDate())) { + if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtils.isNotBlank(row.getEndDate())) { KeyValue kv = new KeyValue(); kv.setKey("endDate"); kv.setValue(row.getEndDate()); diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala index 7c45234f6..e7d68920b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala @@ -14,7 +14,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.{ PidType } import eu.dnetlib.dhp.utils.DHPUtils -import org.apache.commons.lang.StringUtils +import org.apache.commons.lang3.StringUtils import org.apache.spark.sql.Row import org.json4s import org.json4s.DefaultFormats diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index d8292a631..a2c36041d 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -7,7 +7,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactor import eu.dnetlib.dhp.utils.DHPUtils import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil._ -import org.apache.commons.lang.StringUtils +import org.apache.commons.lang3.StringUtils import org.json4s import org.json4s.DefaultFormats import org.json4s.JsonAST._ diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala index 7c58afc09..6ec75f5c3 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala @@ -6,7 +6,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication} import eu.dnetlib.doiboost.DoiBoostMappingUtil import 
eu.dnetlib.doiboost.DoiBoostMappingUtil.{createSP, generateDataInfo} -import org.apache.commons.lang.StringUtils +import org.apache.commons.lang3.StringUtils import org.json4s import org.json4s.DefaultFormats import org.json4s.JsonAST._ diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java index 27fb37e5b..6079da365 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java @@ -6,11 +6,11 @@ import java.io.Serializable; import java.util.*; import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.amazonaws.util.StringUtils; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Maps; @@ -81,7 +81,7 @@ public class Utils implements Serializable { Community c = new Community(); c.setId(cm.getId()); c.setZenodoCommunities(cm.getOtherZenodoCommunities()); - if (!StringUtils.isNullOrEmpty(cm.getZenodoCommunity())) + if (StringUtils.isNotBlank(cm.getZenodoCommunity())) c.getZenodoCommunities().add(cm.getZenodoCommunity()); c.setSubjects(cm.getSubjects()); c.getSubjects().addAll(cm.getFos()); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java index 82a6a3b85..51525e4d3 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java @@ -4,7 +4,7 @@ package eu.dnetlib.dhp.bulktag.community; import java.io.Serializable; import java.lang.reflect.InvocationTargetException; -import org.apache.htrace.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonIgnore; import eu.dnetlib.dhp.bulktag.criteria.Selection; import eu.dnetlib.dhp.bulktag.criteria.VerbResolver; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java index 73243dbc5..d4fec3f52 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java @@ -72,9 +72,9 @@ public class GraphHiveTableImporterJob { final Encoder clazzEncoder = Encoders.bean(clazz); Dataset dataset = spark - .read() - .schema(clazzEncoder.schema()) - .json(inputPath); + .read() + .schema(clazzEncoder.schema()) + .json(inputPath); if (numPartitions > 0) { log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions); diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java index 5e705716c..0461e2f94 100644 --- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java +++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java @@ -3,8 +3,8 @@ package eu.dnetlib.dhp.swh.models; import java.io.Serializable; -import com.cloudera.com.fasterxml.jackson.annotation.JsonProperty; 
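+// NB: the binding annotations must come from the canonical
+// com.fasterxml.jackson.annotation package: com.cloudera.com.fasterxml.* is a
+// shaded copy relocated into the CDH distribution, and the standard
+// ObjectMapper matches annotations by class identity, so a @JsonProperty
+// loaded from the shaded package is silently ignored.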
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonProperty;
 @JsonIgnoreProperties(ignoreUnknown = true) public class LastVisitData implements Serializable {
From aa7b8fd014f8b1a3855330806efa40cec1fc11d6 Mon Sep 17 00:00:00 2001
From: Giambattista Bloisi
Date: Mon, 21 Oct 2024 18:05:01 +0200
Subject: [PATCH 029/111] Use workingDir parameter for temporary data of ORCID enrichment
---
 .../enrich/orcid/enrich_graph_orcid_parameters.json | 6 ++++++
 .../orcid/SparkEnrichGraphWithOrcidAuthors.scala | 12 +++++++-----
 2 files changed, 13 insertions(+), 5 deletions(-)
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/enrich_graph_orcid_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/enrich_graph_orcid_parameters.json
index 765c0e8ff..772e1381f 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/enrich_graph_orcid_parameters.json
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/enrich_graph_orcid_parameters.json
@@ -22,5 +22,11 @@ "paramLongName": "targetPath", "paramDescription": "the output path of the graph enriched", "paramRequired": true
+ },
+ {
+ "paramName": "wp",
+ "paramLongName": "workingDir",
+ "paramDescription": "the working dir",
+ "paramRequired": true
 }
 ]
\ No newline at end of file
diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
index 0824c2a71..847a5f090 100644
--- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
+++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala
@@ -47,13 +47,15 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String] log.info(s"orcidPath is '$orcidPath'") val targetPath = parser.get("targetPath") log.info(s"targetPath is '$targetPath'")
+ val workingDir = parser.get("workingDir")
+ log.info(s"workingDir is '$workingDir'")
- createTemporaryData(graphPath, orcidPath, targetPath)
- analisys(targetPath)
- generateGraph(graphPath, targetPath)
+ createTemporaryData(graphPath, orcidPath, workingDir)
+ analisys(workingDir)
+ generateGraph(graphPath, workingDir, targetPath)
 }
- private def generateGraph(graphPath: String, targetPath: String): Unit = {
+ private def generateGraph(graphPath: String, workingDir: String, targetPath: String): Unit = {
 ModelSupport.entityTypes.asScala .filter(e => ModelSupport.isResult(e._1))
@@ -63,7 +65,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String] val matched = spark.read .schema(Encoders.bean(classOf[ORCIDAuthorEnricherResult]).schema)
- .parquet(s"${targetPath}/${resultType}_matched")
+ .parquet(s"${workingDir}/${resultType}_matched")
 .selectExpr("id", "enriched_author") spark.read
From 6bc741715c08a4b71e3b737b3d8befdbfa743aab Mon Sep 17 00:00:00 2001
From: Giambattista Bloisi
Date: Wed, 23 Oct 2024 14:01:12 +0200
Subject: [PATCH 030/111] Fix OafMapperUtilsTest.testMergePubs
---
 .../eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java
b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java index 9317c0ce4..1ee8e52de 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java @@ -179,7 +179,7 @@ class OafMapperUtilsTest { assertEquals( ModelConstants.DATASET_RESULTTYPE_CLASSID, ((Result) MergeUtils - .merge(p2, d1)) + .merge(p2, d1, true)) .getResulttype() .getClassid()); } From c921cf7ee033053eaf00ccf876e3bb2edc8bfa3e Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Thu, 24 Oct 2024 09:57:20 +0200 Subject: [PATCH 031/111] [personEntity] removed the deletedbyinference results (not indexed, but still in the graph). Changed the writing mode: append instead of overwrite --- .../common/person}/CoAuthorshipIterator.java | 2 +- .../dnetlib/dhp/common/person}/Coauthors.java | 5 +- .../personentity/ExtractPerson.java | 2 + dhp-workflows/dhp-enrichment/pom.xml | 7 +- .../input_personpropagation_parameters.json | 21 ++++++ .../dhp/wf/subworkflows/person/job.properties | 1 + .../person/oozie_app/config-default.xml | 58 ++++++++++++++++ .../person/oozie_app/workflow.xml | 68 +++++++++++++++++++ 8 files changed, 153 insertions(+), 11 deletions(-) rename {dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity => dhp-common/src/main/java/eu/dnetlib/dhp/common/person}/CoAuthorshipIterator.java (98%) rename {dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity => dhp-common/src/main/java/eu/dnetlib/dhp/common/person}/Coauthors.java (70%) create mode 100644 dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/input_personpropagation_parameters.json create mode 100644 dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/job.properties create mode 100644 dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/workflow.xml diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/CoAuthorshipIterator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/person/CoAuthorshipIterator.java similarity index 98% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/CoAuthorshipIterator.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/common/person/CoAuthorshipIterator.java index 131f3f466..853f223d0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/CoAuthorshipIterator.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/person/CoAuthorshipIterator.java @@ -1,5 +1,5 @@ -package eu.dnetlib.dhp.actionmanager.personentity; +package eu.dnetlib.dhp.common.person; import java.util.Arrays; import java.util.Iterator; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/Coauthors.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/person/Coauthors.java similarity index 70% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/Coauthors.java rename to dhp-common/src/main/java/eu/dnetlib/dhp/common/person/Coauthors.java index 17f46d5c7..ff9324d2e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/Coauthors.java +++ 
b/dhp-common/src/main/java/eu/dnetlib/dhp/common/person/Coauthors.java @@ -1,12 +1,9 @@ -package eu.dnetlib.dhp.actionmanager.personentity; +package eu.dnetlib.dhp.common.person; import java.io.Serializable; -import java.util.ArrayList; import java.util.List; -import eu.dnetlib.dhp.schema.oaf.Relation; - public class Coauthors implements Serializable { private List coauthors; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java index 7b04d4d52..3ee89e772 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java @@ -13,6 +13,8 @@ import java.sql.SQLException; import java.util.*; import java.util.stream.Collectors; +import eu.dnetlib.dhp.common.person.CoAuthorshipIterator; +import eu.dnetlib.dhp.common.person.Coauthors; import org.apache.commons.cli.ParseException; import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; diff --git a/dhp-workflows/dhp-enrichment/pom.xml b/dhp-workflows/dhp-enrichment/pom.xml index 9698dee03..41f57e6df 100644 --- a/dhp-workflows/dhp-enrichment/pom.xml +++ b/dhp-workflows/dhp-enrichment/pom.xml @@ -48,12 +48,7 @@ io.github.classgraph classgraph - - eu.dnetlib.dhp - dhp-aggregation - 1.2.5-SNAPSHOT - compile - + diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/input_personpropagation_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/input_personpropagation_parameters.json new file mode 100644 index 000000000..df65d5320 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/input_personpropagation_parameters.json @@ -0,0 +1,21 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the path used to store temporary output files", + "paramRequired": true + }, + + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "true if the spark session is managed, false otherwise", + "paramRequired": false + } +] diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/job.properties b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/job.properties new file mode 100644 index 000000000..61bd3d121 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/job.properties @@ -0,0 +1 @@ +sourcePath=/tmp/miriam/13_graph_copy \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/config-default.xml new file mode 100644 index 000000000..1cb0b8a5e --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/config-default.xml @@ -0,0 +1,58 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + 
spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + spark2EventLogDir + /user/spark/spark2ApplicationHistory + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + + + sparkExecutorNumber + 4 + + + sparkDriverMemory + 15G + + + sparkExecutorMemory + 5G + + + sparkExecutorCores + 4 + + + spark2MaxExecutors + 50 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/workflow.xml new file mode 100644 index 000000000..c9b914384 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/person/oozie_app/workflow.xml @@ -0,0 +1,68 @@ + + + + sourcePath + the source path + + + + + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + yarn + cluster + personPropagation + eu.dnetlib.dhp.person.SparkExtractPersonRelations + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.speculation=false + --conf spark.hadoop.mapreduce.map.speculative=false + --conf spark.hadoop.mapreduce.reduce.speculative=false + --conf spark.sql.shuffle.partitions=7680 + + --sourcePath${sourcePath}/ + --outputPath${workingDir}/relation + + + + + + + + + + \ No newline at end of file From cf07ed90584b8d792f7868f325404eb35fbd3fbf Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Thu, 24 Oct 2024 14:35:14 +0200 Subject: [PATCH 032/111] [person] refactoring --- .../person/SparkExtractPersonRelations.java | 271 ++++++++++++++++++ 1 file changed, 271 insertions(+) create mode 100644 dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java new file mode 100644 index 000000000..34bd976ea --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java @@ -0,0 +1,271 @@ + +package eu.dnetlib.dhp.person; + +import static com.ibm.icu.text.PluralRules.Operand.w; +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.function.FilterFunction; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.function.MapGroupsFunction; 
+import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.person.CoAuthorshipIterator; +import eu.dnetlib.dhp.common.person.Coauthors; +import eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import scala.Tuple2; + +public class SparkExtractPersonRelations { + + private static final Logger log = LoggerFactory.getLogger(SparkCountryPropagationJob.class); + private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______"; + + public static final DataInfo DATAINFO = OafMapperUtils + .dataInfo( + false, + "openaire", + true, + false, + OafMapperUtils + .qualifier( + ModelConstants.SYSIMPORT_CROSSWALK_REPOSITORY, + ModelConstants.SYSIMPORT_CROSSWALK_REPOSITORY, + ModelConstants.DNET_PROVENANCE_ACTIONS, + ModelConstants.DNET_PROVENANCE_ACTIONS), + "0.85"); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + SparkCountryPropagationJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/wf/subworkflows/person/input_personpropagation_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String sourcePath = parser.get("sourcePath"); + log.info("sourcePath: {}", sourcePath); + + final String workingPath = parser.get("outputPath"); + log.info("workingPath: {}", workingPath); + + SparkConf conf = new SparkConf(); + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + + extractRelations( + spark, + sourcePath, + workingPath); + }); + } + + private static void extractRelations(SparkSession spark, String sourcePath, String workingPath) { + + Dataset> relationDataset = spark + .read() + .schema(Encoders.bean(Relation.class).schema()) + .json(sourcePath + "relation") + .as(Encoders.bean(Relation.class)) + .map( + (MapFunction>) r -> new Tuple2<>( + r.getSource() + r.getRelClass() + r.getTarget(), r), + Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class))); + + ModelSupport.entityTypes + .keySet() + .stream() + .filter(ModelSupport::isResult) + .forEach( + e -> { + // 1. search for results having orcid_pending and orcid in the set of pids for the authors + Dataset resultWithOrcids = spark + .read() + .schema(Encoders.bean(Result.class).schema()) + .json(sourcePath + e.name()) + .as(Encoders.bean(Result.class)) + .filter( + (FilterFunction) r -> !r.getDataInfo().getDeletedbyinference() && + !r.getDataInfo().getInvisible() && + Optional + .ofNullable(r.getAuthor()) + .isPresent()) + .filter( + (FilterFunction) r -> r + .getAuthor() + .stream() + .anyMatch( + a -> Optional + .ofNullable( + a + .getPid()) + .isPresent() && + a + .getPid() + .stream() + .anyMatch( + p -> Arrays + .asList("orcid", "orcid_pending") + .contains(p.getQualifier().getClassid().toLowerCase())))); + // 2. 
create authorship relations between the result identifier and the person entity with + // orcid_pending. + Dataset> newRelations = resultWithOrcids + .flatMap( + (FlatMapFunction) r -> getAuthorshipRelations(r), + Encoders.bean(Relation.class)) + .map( + (MapFunction>) r -> new Tuple2<>( + r.getSource() + r.getRelClass() + r.getTarget(), r), + Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class))); + newRelations + .joinWith(relationDataset, newRelations.col("_1").equalTo(relationDataset.col("_1")), "left") + .map((MapFunction, Tuple2>, Relation>) t2 -> { + if (t2._2() == null) + return t2._1()._2(); + return null; + }, Encoders.bean(Relation.class)) + .filter((FilterFunction) r -> r != null) + .write() + .mode(SaveMode.Append) + .option("compression", "gzip") + .json(workingPath); + + // 2.1 store in a separate location the relation between the person and the pids for the result? + + // 3. create co_authorship relations between the pairs of authors with orcid/orcid_pending pids + newRelations = resultWithOrcids + .map((MapFunction) r -> getAuthorsPidList(r), Encoders.bean(Coauthors.class)) + .flatMap( + (FlatMapFunction) c -> new CoAuthorshipIterator(c.getCoauthors()), + Encoders.bean(Relation.class)) + .groupByKey( + (MapFunction) r -> r.getSource() + r.getTarget(), Encoders.STRING()) + .mapGroups( + (MapGroupsFunction) (k, it) -> it.next(), + Encoders.bean(Relation.class)) + .map( + (MapFunction>) r -> new Tuple2<>( + r.getSource() + r.getRelClass() + r.getTarget(), r), + Encoders.tuple(Encoders.STRING(), Encoders.bean(Relation.class))); + newRelations + .joinWith(relationDataset, newRelations.col("_1").equalTo(relationDataset.col("_1")), "left") + .map((MapFunction, Tuple2>, Relation>) t2 -> { + if (t2._2() == null) + return t2._1()._2(); + return null; + }, Encoders.bean(Relation.class)) + .filter((FilterFunction) r -> r != null) + .write() + .mode(SaveMode.Append) + .option("compression", "gzip") + .json(workingPath); + + }); + spark + .read() + .schema(Encoders.bean(Relation.class).schema()) + .json(workingPath) + .write() + .mode(SaveMode.Append) + .option("compression", "gzip") + .json(sourcePath + "relation"); + + } + + private static Coauthors getAuthorsPidList(Result r) { + Coauthors coauth = new Coauthors(); + coauth + .setCoauthors( + r + .getAuthor() + .stream() + .filter( + a -> a + .getPid() + .stream() + .anyMatch( + p -> Arrays.asList("orcid", "orcid_pending").contains(p.getQualifier().getClassid()))) + .map(a -> { + Optional tmp = a + .getPid() + .stream() + .filter(p -> p.getQualifier().getClassid().equalsIgnoreCase("orcid")) + .findFirst(); + if (tmp.isPresent()) + return tmp.get().getValue(); + tmp = a + .getPid() + .stream() + .filter(p -> p.getQualifier().getClassid().equalsIgnoreCase("orcid_pending")) + .findFirst(); + if (tmp.isPresent()) + return tmp.get().getValue(); + + return null; + }) + .filter(Objects::nonNull) + .collect(Collectors.toList())); + return coauth; + + } + + private static Iterator getAuthorshipRelations(Result r) { + List relationList = new ArrayList<>(); + for (Author a : r.getAuthor()) + + relationList.addAll(a.getPid().stream().map(p -> { + + if (p.getQualifier().getClassid().equalsIgnoreCase("orcid_pending")) + return getRelation(p.getValue(), r.getId()); + return null; + }) + .filter(Objects::nonNull) + .collect(Collectors.toList())); + + return relationList.iterator(); + } + + private static Relation getRelation(String orcid, String resultId) { + + String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid); + + 
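+			// The source identifier is built with the same scheme used by ExtractPerson
+			// when the Person entities are created from ORCID: the "30|orcid_______"
+			// prefix followed by "::" and the md5 of the ORCID iD, so the relations
+			// propagated here land on the very same Person records produced by the
+			// actionset. For a (hypothetical) ORCID such as 0000-0002-1825-0097 the
+			// source would be 30|orcid_______::md5("0000-0002-1825-0097").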
Relation relation = OafMapperUtils
+ .getRelation(
+ source, resultId, ModelConstants.RESULT_PERSON_RELTYPE,
+ ModelConstants.RESULT_PERSON_SUBRELTYPE,
+ ModelConstants.RESULT_PERSON_HASAUTHORED,
+ null, // collectedfrom = null
+ DATAINFO,
+ null);
+
+ return relation;
+ }
+
+}
From c773421cc73531dce34ee869f5dce44296ea425c Mon Sep 17 00:00:00 2001
From: Miriam Baglioni
Date: Thu, 24 Oct 2024 14:44:13 +0200
Subject: [PATCH 033/111] [person] added new substep in propagation workflow main
---
 .../dnetlib/dhp/wf/main/oozie_app/import.txt | 3 ++-
 .../dhp/wf/main/oozie_app/workflow.xml | 19 ++++++++++++++++++-
 2 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/import.txt b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/import.txt
index b20259414..8922b6ac6 100644
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/import.txt
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/import.txt
@@ -7,4 +7,5 @@ community_organization classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunit result_project classpath eu/dnetlib/dhp/wf/subworkflows/projecttoresult/oozie_app community_project classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/oozie_app community_sem_rel classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/oozie_app
-country_propagation classpath eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app
\ No newline at end of file
+country_propagation classpath eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app
+person_propagation classpath eu/dnetlib/dhp/wf/subworkflows/person/oozie_app
\ No newline at end of file
diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/workflow.xml
index 8e91707b6..4351cd595 100644
--- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/workflow.xml
@@ -122,6 +122,7 @@ ${wf:conf('resumeFrom') eq 'CommunityProject'} ${wf:conf('resumeFrom') eq 'CommunitySemanticRelation'} ${wf:conf('resumeFrom') eq 'CountryPropagation'}
+ ${wf:conf('resumeFrom') eq 'PersonPropagation'}
@@ -291,10 +292,24 @@
+
+
+
+
+
+ ${wf:appPath()}/person_propagation
+
+
+
+
+ sourcePath
+ ${outputPath}
+
+
+
-
 ${wf:appPath()}/country_propagation
@@ -319,6 +334,8 @@
+
+
From 01679c935a96c796a45eac96a7e15e5ea44f312d Mon Sep 17 00:00:00 2001
From: Miriam Baglioni
Date: Thu, 24 Oct 2024 15:27:06 +0200
Subject: [PATCH 034/111] [person] added test class to be implemented
---
 .../dhp/person/PersonPropagationJobTest.java | 95 +++++++++++++++++++
 .../dhp/person/graph/dataset/part-00000 | 0
 .../graph/otherresearchproduct/part-00000 | 0
 .../dhp/person/graph/publication/part-00000 | 0
 .../dhp/person/graph/relation/part-00000 | 1 +
 .../dhp/person/graph/software/part-00000 | 0
 6 files changed, 96 insertions(+)
 create mode 100644 dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/person/PersonPropagationJobTest.java
 create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/dataset/part-00000
 create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/otherresearchproduct/part-00000
 create mode 100644
dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/publication/part-00000
 create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/relation/part-00000
 create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/software/part-00000
diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/person/PersonPropagationJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/person/PersonPropagationJobTest.java
new file mode 100644
index 000000000..43f913d3d
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/person/PersonPropagationJobTest.java
@@ -0,0 +1,95 @@
+
+package eu.dnetlib.dhp.person;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+import eu.dnetlib.dhp.schema.oaf.*;
+import org.apache.commons.io.FileUtils;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+import org.apache.spark.api.java.function.FlatMapFunction;
+import org.apache.spark.api.java.function.MapFunction;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Encoders;
+import org.apache.spark.sql.SparkSession;
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+
+import eu.dnetlib.dhp.countrypropagation.SparkCountryPropagationJob;
+import scala.Tuple2;
+
+public class PersonPropagationJobTest {
+
+ private static final Logger log = LoggerFactory.getLogger(PersonPropagationJobTest.class);
+
+ private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+ private static SparkSession spark;
+
+ private static Path workingDir;
+
+ @BeforeAll
+ public static void beforeAll() throws IOException {
+ workingDir = Files.createTempDirectory(PersonPropagationJobTest.class.getSimpleName());
+ log.info("using work dir {}", workingDir);
+
+ SparkConf conf = new SparkConf();
+ conf.setAppName(PersonPropagationJobTest.class.getSimpleName());
+
+ conf.setMaster("local[*]");
+ conf.set("spark.driver.host", "localhost");
+ conf.set("hive.metastore.local", "true");
+ conf.set("spark.ui.enabled", "false");
+ conf.set("spark.sql.warehouse.dir", workingDir.toString());
+ conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
+
+ spark = SparkSession
+ .builder()
+ .appName(PersonPropagationJobTest.class.getSimpleName())
+ .config(conf)
+ .getOrCreate();
+ }
+
+ @AfterAll
+ public static void afterAll() throws IOException {
+ FileUtils.deleteDirectory(workingDir.toFile());
+ spark.stop();
+ }
+
+ @Test
+ void testPersonPropagation() throws Exception {
+ final String sourcePath = getClass()
+ .getResource("/eu/dnetlib/dhp/person/graph")
+ .getPath();
+
+ SparkExtractPersonRelations
+ .main(
+ new String[] {
+ "--isSparkSessionManaged", Boolean.FALSE.toString(),
+ "--sourcePath", sourcePath,
+ "--outputPath", workingDir.toString()
+ });
+
+ final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
+
+ JavaRDD tmp = sc
+ .textFile(workingDir.toString() + "/relation")
+ .map(item -> OBJECT_MAPPER.readValue(item, Relation.class));
+
+ //TODO write assertions and find relevant information for the resource files
+ }
+
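+	// A possible shape for the assertions of testPersonPropagation, sketched as
+	// comments while the resource files are still being populated (the checks
+	// below are placeholders, not values verified against the test data, and
+	// ModelConstants would need to be imported):
+	// Assertions.assertTrue(tmp.count() > 0);
+	// Assertions
+	//	.assertTrue(
+	//		tmp
+	//			.filter(r -> ModelConstants.RESULT_PERSON_HASAUTHORED.equals(r.getRelClass()))
+	//			.count() > 0);
+	// Assertions.assertTrue(tmp.filter(r -> r.getSource().startsWith("30|orcid_______")).count() > 0);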
+ +} diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/dataset/part-00000 b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/dataset/part-00000 new file mode 100644 index 000000000..e69de29bb diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/otherresearchproduct/part-00000 b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/otherresearchproduct/part-00000 new file mode 100644 index 000000000..e69de29bb diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/publication/part-00000 b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/publication/part-00000 new file mode 100644 index 000000000..e69de29bb diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/relation/part-00000 b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/relation/part-00000 new file mode 100644 index 000000000..a17560e55 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/relation/part-00000 @@ -0,0 +1 @@ +{"clazz":"eu.dnetlib.dhp.schema.oaf.Relation","payload":{"collectedfrom":[{"key":"10|openaire____::806360c771262b4d6770e7cdf04b5c5a","value":"ORCID","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.91","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:orcid","classname":"Imported from ORCID","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":null,"relType":"personPerson","subRelType":"coAuthorship","relClass":"hasCoAuthor","source":"30|orcid_______::028da52095190c6573d6bf9dba4c8ede","target":"30|orcid_______::8791a84ea413592878d6fe191f0ed35f","validated":true,"validationDate":null,"properties":[]}} \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/software/part-00000 b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/software/part-00000 new file mode 100644 index 000000000..e69de29bb From a7699558ed38382618911b1d7bca62f9e738d36d Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Thu, 24 Oct 2024 16:15:12 +0200 Subject: [PATCH 035/111] [person] - --- .../java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java | 2 ++ .../eu/dnetlib/dhp/person/graph/otherresearchproduct/part-00000 | 1 + 2 files changed, 3 insertions(+) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java index 34bd976ea..6caeef478 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java @@ -140,6 +140,8 @@ public class SparkExtractPersonRelations { .flatMap( (FlatMapFunction) r -> getAuthorshipRelations(r), Encoders.bean(Relation.class)) +// .groupByKey((MapFunction) r-> r.getSource()+r.getTarget(), Encoders.STRING() ) +// .mapGroups((MapGroupsFunction) (k,it) -> it.next(), Encoders.bean(Relation.class) ) .map( (MapFunction>) r -> new Tuple2<>( r.getSource() + r.getRelClass() + r.getTarget(), r), diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/otherresearchproduct/part-00000 
b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/otherresearchproduct/part-00000 index e69de29bb..47a3fdccb 100644 --- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/otherresearchproduct/part-00000 +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/otherresearchproduct/part-00000 @@ -0,0 +1 @@ +{"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "resourcetype": {"classid": "Taxonomic treatment", "classname": "Taxonomic treatment", "schemename": "dnet:dataCite_resource", "schemeid": "dnet:dataCite_resource"}, "pid": [{"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.5281/zenodo.10249277"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.5281/zenodo.10249277"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.5281/zenodo.10249277"}], "bestaccessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "relevantdate": [{"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "issued", "classname": "issued", "schemename": "dnet:dataCite_date", "schemeid": "dnet:dataCite_date"}, "value": "2023-11-28"}], "contributor": [], "id": "50|doi_________::fa6db8629c4a8d13ec21e445b309d1c8", "description": [{"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "value": "11.1 Saltia papposa (Forrsk.) Moq., Prodr. [A. P. de Candolle] 13(2): 325. 1849 \\u2261 Achyranthes papposa Forssk., Fl. Aegypt.-Arab.: 48. 1775. Lectotype (designate here): \\u2014 YEMEN. Zabid, s.d., Forssk\\u00e5l 205 (C10001569!, image of the lectotype available at https://plants.jstor. org/stable/viewer/10.5555/al.ap.specimen.c10001569?page=1); isolectotypes: C10001570! 
(image of the isolectotype available at https://plants.jstor.org/stable/viewer/10.5555/al.ap.specimen.c10001570?page=1) and BM000950560! (image of the isolectotype available at https://data.nhm.ac.uk/object/c634a45c-983a-42f3-9c4d-1d1b06f5f88b/1691539200000). Typification of the name Achyranthes papposa:\\u2014 Forssk\\u00e5l (1775: 48) published Achyranthes papposa by giving a short diagnosis (\\u201c foliis alternis; crassiusculis; lineari-cuneatis, obtusis \\u201d) and a detailed description; the provenance [\\u201c Zeb\\u00edd \\u201d (currently Zabid), a city of W-Yemen] is reported [see also Forssk\\u00e5l (1775: CVII) who indicated \\u201cMi.\\u201d as provenance of A. papposa, \\u201cMi.\\u201d meaning \\u201cMontium Regionis Inferior\\u201d (Forssk\\u00e5l 1775: CI)]. We traced two specimens at C, where Forsskal\\u2019s herbarium and types are mostly preserved (HUH Index of Botanists 2013c), i.e. viz. C10001569 and C10001570, both collected at Zabid; a further specimen is kept at BM (BM000950560) and it was annotated by Frank Nigel Hepper as an isotype. These three specimens are part of the original material for A. papposa (see also Hepper & Friis 1994). C10001569 bears a branch of a plant with more flowers than in C10001570. Since the morphology of the flowers is important to identify Saltia papposa (Townsend 1993), we here designate C10001569 as the lectotype of A. papposa. C10001570 and BM000950560 are isolectotypes. Chorology:\\u2015 Endemic to the Arabian Peninsula (Saudi Arabia and Yemen; POWO 2023). Occurrence in Saudi Arabia:\\u2015 Doubtfully in Makkah (Miller & Cope 1996). We did not trace any specimen collected in Saudi Arabia, but it is not impossible that Saltia papposa occurs in the country, probably in the south-eastern coastal area (Jazan?) (see also Ghazanfar & Fisher 2013: 178\\u2013179)."}], "eoscifguidelines": [], "author": [{"surname": "Hassan", "name": "Walaa A.", "pid": [{"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "orcid_pending", "classname": "Open Researcher and Contributor ID", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "0000-0001-7605-9058"}], "rank": 1, "affiliation": [{"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "value": "Botany and Microbiology Department, Faculty of Science, Beni-Suef University, Beni-Suef, Egypt & azmeyw @ gmail. com; https: // orcid. 
org / 0000 - 0001 - 7605 - 9058"}], "fullname": "Hassan, Walaa A."}, {"surname": "Al-Shaye", "name": "Najla A.", "pid": [{"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "orcid_pending", "classname": "Open Researcher and Contributor ID", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "0000-0002-0447-8613"}], "rank": 2, "affiliation": [{"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "value": "Department of Biology, College of Science, Princess Nourah bint Abdulrahman University, Riyadh, Saudi Arabia & naaalshaye @ pnu. edu. sa; https: // orcid. org / 0000 - 0002 - 0447 - 8613"}], "fullname": "Al-Shaye, Najla A."}, {"surname": "Iamonico", "name": "Duilio", "pid": [{"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "orcid_pending", "classname": "Open Researcher and Contributor ID", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "0000-0001-5491-7568"}], "rank": 3, "affiliation": [{"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "value": "Department of Environmental Biology, Univeristy of Rome Sapienza, 00185 Rome, Italy & duilio. iamonico @ uniroma 1. it; https: // orcid. 
org / 0000 - 0001 - 5491 - 7568"}], "fullname": "Iamonico, Duilio"}], "contactgroup": [], "collectedfrom": [{"value": "ZENODO", "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69"}], "instance": [{"refereed": {"classid": "0002", "classname": "nonPeerReviewed", "schemename": "dnet:review_levels", "schemeid": "dnet:review_levels"}, "hostedby": {"value": "ZENODO", "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69"}, "license": {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "value": "CC 0"}, "url": ["http://dx.doi.org/10.5281/zenodo.10249277", "http://treatment.plazi.org/id/97224201FFE29001FF4C6AB685F912EB"], "pid": [{"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.5281/zenodo.10249277"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.5281/zenodo.10249277"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.5281/zenodo.10249277"}], "instanceTypeMapping": [{"originalType": "Taxonomic treatment", "vocabularyName": "openaire::coar_resource_types_3_1"}], "alternateIdentifier": [{"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "oai", "classname": "Open Archives Initiative", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "oai:zenodo.org:10249277"}], "dateofacceptance": {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "value": "2023-11-28"}, "collectedfrom": {"value": "ZENODO", "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69"}, "accessright": {"classid": "OPEN", "classname": "Open Access", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0020", "classname": "Other ORP type", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}], "dateofcollection": "2023-12-21T22:24:48+0000", "fulltext": [], 
"dateoftransformation": "2024-01-18T06:50:15.691Z", "dateofacceptance": {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "value": "2023-11-28"}, "format": [], "tool": [], "subject": [{"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "keyword", "classname": "keyword", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Tracheophyta"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "keyword", "classname": "keyword", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Magnoliopsida"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "keyword", "classname": "keyword", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Amaranthaceae"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "keyword", "classname": "keyword", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Saltia"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "keyword", "classname": "keyword", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Saltia papposa"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "keyword", "classname": "keyword", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Biodiversity"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "keyword", "classname": "keyword", "schemename": 
"dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Plantae"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "keyword", "classname": "keyword", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Caryophyllales"}, {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "keyword", "classname": "keyword", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "Taxonomy"}], "coverage": [], "externalReference": [], "publisher": {"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "value": "Zenodo"}, "lastupdatetimestamp": 1721832280654, "language": {"classid": "und", "classname": "Undetermined", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "resulttype": {"classid": "other", "classname": "other", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "country": [], "extraInfo": [], "originalId": ["oai:zenodo.org:10249277", "50|od______2659::42fc9730cd6f5de3b0e3bfacdc347177"], "contactperson": [], "source": [], "context": [], "title": [{"dataInfo": {"invisible": false, "trust": "0.9", "deletedbyinference": false, "inferred": false, "provenanceaction": {"classid": "sysimport:crosswalk:repository", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Saltia papposa Moq., Prodr."}]} \ No newline at end of file From c93bf824875fb3ffdb2220ac62fc52495e17fa01 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Thu, 24 Oct 2024 17:34:34 +0200 Subject: [PATCH 036/111] [affroNewModel] extended wf definition --- .../actionmanager/bipaffiliations/oozie_app/workflow.xml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml index 2e89c07fd..88ff42dc2 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml @@ -21,6 +21,10 @@ webCrawlInputPath the path where to find the inferred affiliation relations from webCrawl + + publisherInputPath + the path where to find the inferred affiliation relations from publisher websites + outputPath the path where to store the actionset @@ -99,7 +103,7 @@ yarn cluster - Produces the atomic action with the inferred by 
BIP! affiliation relations (from Crossref and Pubmed) + Produces the atomic action with the inferred by OpenAIRE affiliation relations eu.dnetlib.dhp.actionmanager.bipaffiliations.PrepareAffiliationRelations dhp-aggregation-${projectVersion}.jar @@ -117,6 +121,7 @@ --openapcInputPath${openapcInputPath} --dataciteInputPath${dataciteInputPath} --webCrawlInputPath${webCrawlInputPath} + --publisherInputPath${publisherInputPath} --outputPath${outputPath} From cab8f1135fd82d2721cb8051d44550ac24b0b3eb Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Thu, 24 Oct 2024 17:44:33 +0200 Subject: [PATCH 037/111] [affroNewModel] - --- .../bipaffiliations/PrepareAffiliationRelations.java | 2 +- .../dhp/actionmanager/bipaffiliations/job.properties | 6 ++++-- .../bipaffiliations/PrepareAffiliationRelationsTest.java | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java index 61a018a41..15c1cc376 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java @@ -34,7 +34,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import scala.Tuple2; /** - * Creates action sets for Crossref affiliation relations inferred by BIP! + * Creates action sets for Crossref affiliation relations inferred by OpenAIRE */ public class PrepareAffiliationRelations implements Serializable { diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties index ded4fe409..58124c9d1 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties @@ -32,8 +32,10 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen oozie.wf.application.path=${oozieTopWfApplicationPath} crossrefInputPath=/data/bip-affiliations/crossref-data.json -pubmedInputPath=/data/bip-affiliations/pubmed-data.json +pubmedInputPath=/data/bip-affiliations/pubmed-data-v4.json openapcInputPath=/data/bip-affiliations/openapc-data.json dataciteInputPath=/data/bip-affiliations/datacite-data.json +webCrawlInputPath=/data/bip-affiliations/webCrawl +publisherInputPath=/data/bip-affiliations/publishers -outputPath=/tmp/crossref-affiliations-output-v5 +outputPath=/tmp/affRoAS diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java index c704bb99b..16d60f7da 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelationsTest.java @@ -112,7 +112,7 @@ public class PrepareAffiliationRelationsTest { .map(aa -> ((Relation) aa.getPayload())); // count the 
number of relations - assertEquals(162, tmp.count());// 18 + 24 + 30 * 4 = + assertEquals(162, tmp.count());// 18 + 24 + 30 * 4 = Dataset dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); dataset.createOrReplaceTempView("result"); From 32f444984e3a6277c3ffc32f45408e4033b7ba3b Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Thu, 24 Oct 2024 17:51:42 +0200 Subject: [PATCH 038/111] [person] - --- .../resources/eu/dnetlib/dhp/person/graph/publication/part-00000 | 1 + .../resources/eu/dnetlib/dhp/person/graph/relation/part-00000 | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/publication/part-00000 b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/publication/part-00000 index e69de29bb..af1b5d55c 100644 --- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/publication/part-00000 +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/publication/part-00000 @@ -0,0 +1 @@ +{"dataInfo": {"invisible": false, "trust": "0.91", "deletedbyinference": true, "inferred": false, "provenanceaction": {"classid": "sysimport:actionset", "classname": "Harvested", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}}, "resourcetype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "pid": [{"qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.11646/phytotaxa.379.3.5"}], "bestaccessright": {"classid": "UNKNOWN", "classname": "not available", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "relevantdate": [{"qualifier": {"classid": "created", "classname": "created", "schemename": "dnet:dataCite_date", "schemeid": "dnet:dataCite_date"}, "value": "2018-11-29"}, {"qualifier": {"classid": "published-online", "classname": "published-online", "schemename": "dnet:dataCite_date", "schemeid": "dnet:dataCite_date"}, "value": "2018-11-29"}], "collectedfrom": [{"value": "Crossref", "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"}], "id": "50|doi_________::b2eae15cfe9b0d7f416b6dcfc84c09f9", "description": [{"value": "As part of the ongoing studies on the genus Polycarpon Linnaeus (1759: 859, 881) (see e.g., Iamonico 2015a, 2015b, 2015c, Iamonico & Domina 2015), and on the Italian loci classici (see e.g., Peruzzi et al. 2015, Brundu et al. 2015, Domina et al. 2016, Di Gristina et al. 2017, Domina et al. 2017, 2018a, 2018b), we present here a note regarding Hagaea alsinifolia Bivona-Bernardi (1815: 7\\u20138) [currently accepted (see Bartolucci et al. 2018) as Polycarpon tetraphyllum Linnaeus (1759: 881) subsp. alsinifolium (Biv.) 
Ball (1877: 370)]."}], "lastupdatetimestamp": 1648743612067, "author": [{"surname": "IAMONICO", "fullname": "DUILIO IAMONICO", "pid": [], "name": "DUILIO", "rank": 1}, {"surname": "DOMINA", "fullname": "GIANNIANTONIO DOMINA", "pid": [], "name": "GIANNIANTONIO", "rank": 2}], "instance": [{"refereed": {"classid": "0001", "classname": "peerReviewed", "schemename": "dnet:review_levels", "schemeid": "dnet:review_levels"}, "hostedby": {"dataInfo": {"invisible": false, "deletedbyinference": false}, "value": "Phytotaxa", "key": "10|issn___print::9336d3bbf63c241b54726a55fa38c0ef"}, "url": ["https://doi.org/10.11646/phytotaxa.379.3.5"], "pid": [{"qualifier": {"classid": "doi", "classname": "Digital Object Identifier", "schemename": "dnet:pid_types", "schemeid": "dnet:pid_types"}, "value": "10.11646/phytotaxa.379.3.5"}], "instanceTypeMapping": [{"originalType": "journal-article", "typeLabel": "research article", "vocabularyName": "openaire::coar_resource_types_3_1", "typeCode": "http://purl.org/coar/resource_type/c_2df8fbb1"}, {"originalType": "http://purl.org/coar/resource_type/c_2df8fbb1", "typeLabel": "Article", "vocabularyName": "openaire::user_resource_types", "typeCode": "Article"}], "dateofacceptance": {"value": "2018-11-29"}, "collectedfrom": {"value": "Crossref", "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"}, "accessright": {"classid": "UNKNOWN", "classname": "not available", "schemename": "dnet:access_modes", "schemeid": "dnet:access_modes"}, "instancetype": {"classid": "0001", "classname": "Article", "schemename": "dnet:publication_resource", "schemeid": "dnet:publication_resource"}}], "dateofcollection": "2024-07-26T02:32:47.105", "metaResourceType": {"classid": "Research Literature", "classname": "Research Literature", "schemename": "openaire::meta_resource_types", "schemeid": "openaire::meta_resource_types"}, "context": [], "journal": {"issnPrinted": "1179-3155", "vol": "379", "sp": "267", "issnOnline": "1179-3163", "name": "Phytotaxa"}, "subject": [{"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "subject:fos", "classname": "Inferred by OpenAIRE", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "qualifier": {"classid": "FOS", "classname": "Fields of Science and Technology classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "0106 biological sciences"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "subject:fos", "classname": "Inferred by OpenAIRE", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "qualifier": {"classid": "FOS", "classname": "Fields of Science and Technology classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "0301 basic medicine"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "subject:fos", "classname": "Inferred by OpenAIRE", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "qualifier": {"classid": "FOS", "classname": "Fields of Science and Technology classification", "schemename": "dnet:subject_classification_typologies", "schemeid": 
"dnet:subject_classification_typologies"}, "value": "03 medical and health sciences"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "subject:fos", "classname": "Inferred by OpenAIRE", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": "0.5"}, "qualifier": {"classid": "FOS", "classname": "Fields of Science and Technology classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "03010801 Mycology/Symbiosis"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "subject:fos", "classname": "Inferred by OpenAIRE", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": "0.5"}, "qualifier": {"classid": "FOS", "classname": "Fields of Science and Technology classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "030108 mycology & parasitology"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "subject:fos", "classname": "Inferred by OpenAIRE", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": "0.5"}, "qualifier": {"classid": "FOS", "classname": "Fields of Science and Technology classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "010603 evolutionary biology"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "subject:fos", "classname": "Inferred by OpenAIRE", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": ""}, "qualifier": {"classid": "FOS", "classname": "Fields of Science and Technology classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "01 natural sciences"}, {"dataInfo": {"deletedbyinference": false, "provenanceaction": {"classid": "subject:fos", "classname": "Inferred by OpenAIRE", "schemename": "dnet:provenanceActions", "schemeid": "dnet:provenanceActions"}, "inferred": true, "inferenceprovenance": "update", "invisible": false, "trust": "0.5"}, "qualifier": {"classid": "FOS", "classname": "Fields of Science and Technology classification", "schemename": "dnet:subject_classification_typologies", "schemeid": "dnet:subject_classification_typologies"}, "value": "01060304 Pollination/Angiosperms"}], "externalReference": [], "publisher": {"value": "Magnolia Press"}, "eoscifguidelines": [], "language": {"classid": "und", "classname": "Undetermined", "schemename": "dnet:languages", "schemeid": "dnet:languages"}, "resulttype": {"classid": "publication", "classname": "publication", "schemename": "dnet:result_typologies", "schemeid": "dnet:result_typologies"}, "country": [], "title": [{"qualifier": {"classid": "main title", "classname": "main title", "schemename": "dnet:dataCite_title", "schemeid": "dnet:dataCite_title"}, "value": "Epitypification of Hagaea alsinifolia (Polyycarpon tetraphyllum subsp. 
Alsinifolium, Caryophyllaceae)"}], "originalId": ["10.11646/phytotaxa.379.3.5", "50|doiboost____|b2eae15cfe9b0d7f416b6dcfc84c09f9"], "source": [{"value": "Crossref"}], "dateofacceptance": {"value": "2018-11-29"}} \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/relation/part-00000 b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/relation/part-00000 index a17560e55..e69de29bb 100644 --- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/relation/part-00000 +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/person/graph/relation/part-00000 @@ -1 +0,0 @@ -{"clazz":"eu.dnetlib.dhp.schema.oaf.Relation","payload":{"collectedfrom":[{"key":"10|openaire____::806360c771262b4d6770e7cdf04b5c5a","value":"ORCID","dataInfo":null}],"dataInfo":{"invisible":false,"inferred":false,"deletedbyinference":false,"trust":"0.91","inferenceprovenance":null,"provenanceaction":{"classid":"sysimport:crosswalk:orcid","classname":"Imported from ORCID","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"}},"lastupdatetimestamp":null,"relType":"personPerson","subRelType":"coAuthorship","relClass":"hasCoAuthor","source":"30|orcid_______::028da52095190c6573d6bf9dba4c8ede","target":"30|orcid_______::8791a84ea413592878d6fe191f0ed35f","validated":true,"validationDate":null,"properties":[]}} \ No newline at end of file From e75326d6ec712d534c34efd50a48c5a18a7a358a Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Fri, 25 Oct 2024 09:13:54 +0200 Subject: [PATCH 039/111] [FundersMatchFromCrossref] added match from CrossRef to DFG unidentified project --- .../scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index d8292a631..824c7ff52 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -560,7 +560,11 @@ case object Crossref2Oaf { "10.13039/501100000266" | "10.13039/501100006041" | "10.13039/501100000265" | "10.13039/501100000270" | "10.13039/501100013589" | "10.13039/501100000271" => generateSimpleRelationFromAward(funder, "ukri________", a => a) - + //DFG + case "10.13039/501100001659" => + val targetId = getProjectId("dfgf________", "1e5e62235d094afd01cd56e65112fc63") + queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) + queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) case _ => logger.debug("no match for " + funder.DOI.get) } From 842cc75dae0b11bcc0f4974cf6fe199813f7696d Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Fri, 25 Oct 2024 09:42:52 +0200 Subject: [PATCH 040/111] [AffRo] fix name --- .../dhp/actionmanager/bipaffiliations/job.properties | 12 ++++++------ .../bipaffiliations/oozie_app/workflow.xml | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties index 58124c9d1..c61830cba 100644 --- 
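The Crossref2Oaf hunk above routes the generic DFG funder DOI (10.13039/501100001659) to a single catch-all project record and emits the link in both directions. The same pattern, rendered in Java for consistency with the other examples (the relType/subRelType constants are assumptions based on how result-project links are modelled elsewhere in the codebase):

    import java.util.ArrayList;
    import java.util.List;

    import eu.dnetlib.dhp.schema.common.ModelConstants;
    import eu.dnetlib.dhp.schema.oaf.Relation;

    public class FunderMatchSketch {
        // Emit the symmetric pair of links between a result and the catch-all project.
        public static List<Relation> linkToUnidentifiedProject(String resultId, String projectId) {
            List<Relation> queue = new ArrayList<>();
            queue.add(rel(resultId, projectId, ModelConstants.IS_PRODUCED_BY));
            queue.add(rel(projectId, resultId, ModelConstants.PRODUCES));
            return queue;
        }

        private static Relation rel(String source, String target, String relClass) {
            Relation r = new Relation();
            r.setSource(source);
            r.setTarget(target);
            r.setRelType(ModelConstants.RESULT_PROJECT); // assumption: result-project rel type
            r.setSubRelType(ModelConstants.OUTCOME);     // assumption: outcome sub-relation
            r.setRelClass(relClass);
            return r;
        }
    }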
a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/job.properties @@ -31,11 +31,11 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen # The following is needed as a property of a workflow oozie.wf.application.path=${oozieTopWfApplicationPath} -crossrefInputPath=/data/bip-affiliations/crossref-data.json -pubmedInputPath=/data/bip-affiliations/pubmed-data-v4.json -openapcInputPath=/data/bip-affiliations/openapc-data.json -dataciteInputPath=/data/bip-affiliations/datacite-data.json -webCrawlInputPath=/data/bip-affiliations/webCrawl -publisherInputPath=/data/bip-affiliations/publishers +crossrefInputPath=/data/openaire-affiliations/crossref-data.json +pubmedInputPath=/data/openaire-affiliations/pubmed-data-v4.json +openapcInputPath=/data/openaire-affiliations/openapc-data.json +dataciteInputPath=/data/openaire-affiliations/datacite-data.json +webCrawlInputPath=/data/openaire-affiliations/webCrawl +publisherInputPath=/data/openaire-affiliations/publishers outputPath=/tmp/affRoAS diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml index 88ff42dc2..2e65aaa5e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipaffiliations/oozie_app/workflow.xml @@ -1,4 +1,4 @@ - + From 1fce7d5a0f467b478993d408c1103dbd7d895acf Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Fri, 25 Oct 2024 10:05:17 +0200 Subject: [PATCH 041/111] [Person] remove the isolated nodes from the person set --- .../person/SparkExtractPersonRelations.java | 28 +++++++++++++++++-- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java index 6caeef478..3892498df 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/person/SparkExtractPersonRelations.java @@ -16,10 +16,8 @@ import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapGroupsFunction; +import org.apache.spark.sql.*; import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -84,9 +82,33 @@ public class SparkExtractPersonRelations { spark, sourcePath, workingPath); + removeIsolatedPerson(spark,sourcePath, workingPath); }); } + private static void removeIsolatedPerson(SparkSession spark, String sourcePath, String workingPath) { + Dataset personDataset = spark.read().schema(Encoders.bean(Person.class).schema()) + .json(sourcePath + "person") + .as(Encoders.bean(Person.class)); + + Dataset relationDataset = spark.read().schema(Encoders.bean(Relation.class).schema()) + .json(sourcePath + "relation") + 
.as(Encoders.bean(Relation.class)); + + personDataset.join(relationDataset, personDataset.col("id").equalTo(relationDataset.col("source")), "left_semi") + .write() + .option("compression","gzip") + .mode(SaveMode.Overwrite) + .json(workingPath + "person"); + + spark.read().schema(Encoders.bean(Person.class).schema()) + .json(workingPath + "person") + .write() + .mode(SaveMode.Overwrite) + .option("compression","gzip") + .json(sourcePath + "person"); + } + private static void extractRelations(SparkSession spark, String sourcePath, String workingPath) { Dataset> relationDataset = spark From 0fb6af5586ac0532745bd9fde1347b1c972fcf8a Mon Sep 17 00:00:00 2001 From: "miriam.baglioni" Date: Mon, 29 Jan 2024 18:12:33 +0100 Subject: [PATCH 042/111] Updated main pom dependency against dhp-schema, from 8.0.1 to 9.0.0. The new fields included in the updated schema module are populated by the Solr JSON payload mapping, which also limits the number of authors serialised to 200. --- .../dhp/schema/oaf/utils/ModelHardLimits.java | 1 + .../model/ProvisionModelSupport.java | 19 ++++++++++++++++++- pom.xml | 2 +- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java index 36d138ba1..74cd1b42a 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java @@ -12,6 +12,7 @@ public class ModelHardLimits { public static final int MAX_EXTERNAL_ENTITIES = 50; public static final int MAX_AUTHORS = 200; + public static final int MAX_RELATED_AUTHORS = 20; public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000; public static final int MAX_TITLE_LENGTH = 5000; public static final int MAX_TITLES = 10; diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java index 4a2326453..bc02b595f 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java @@ -5,6 +5,7 @@ import java.io.StringReader; import java.util.*; import java.util.stream.Collectors; +import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.DocumentException; @@ -150,6 +151,12 @@ public class ProvisionModelSupport { rr.setPublisher(re.getPublisher()); rr.setResulttype(mapQualifier(re.getResulttype())); rr.setTitle(Optional.ofNullable(re.getTitle()).map(StructuredProperty::getValue).orElse(null)); + rr.setDescription(StringUtils.left(re.getDescription(), ModelHardLimits.MAX_RELATED_ABSTRACT_LENGTH)); + rr.setAuthor(Optional.ofNullable(re.getAuthor()) + .map(aa -> aa.stream() + .limit(ModelHardLimits.MAX_RELATED_AUTHORS) + .collect(Collectors.toList())) + .orElse(null)); if (relation.getValidated() == null) { relation.setValidated(false); @@ -378,6 +385,7 @@ public class ProvisionModelSupport { rs.setPubliclyFunded(r.getPubliclyFunded()); rs.setTransformativeAgreement(r.getTransformativeAgreement()); rs.setExternalReference(mapExternalReference(r.getExternalReference())); + rs.setBestinstancetype(mapQualifier(r.getBestInstancetype())); 
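The removeIsolatedPerson step introduced above combines two Spark idioms: a left_semi join keeps only the person records whose id occurs as the source of at least one relation (without pulling any columns from the relation side), and the filtered set is written to a working path first, then copied back, because Spark cannot overwrite a path it is still reading from. A condensed sketch of the same flow (paths and class name are illustrative):

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SaveMode;
    import org.apache.spark.sql.SparkSession;

    import eu.dnetlib.dhp.schema.oaf.Person;
    import eu.dnetlib.dhp.schema.oaf.Relation;

    public class IsolatedPersonSketch {
        public static void run(SparkSession spark, String sourcePath, String workingPath) {
            Dataset<Person> persons = spark.read()
                .schema(Encoders.bean(Person.class).schema())
                .json(sourcePath + "person")
                .as(Encoders.bean(Person.class));
            Dataset<Relation> rels = spark.read()
                .schema(Encoders.bean(Relation.class).schema())
                .json(sourcePath + "relation")
                .as(Encoders.bean(Relation.class));

            // left_semi join: acts as an existence filter on persons.
            persons.join(rels, persons.col("id").equalTo(rels.col("source")), "left_semi")
                .write().mode(SaveMode.Overwrite).option("compression", "gzip")
                .json(workingPath + "person");

            // Copy back once the first read/write cycle has completed.
            spark.read().schema(Encoders.bean(Person.class).schema())
                .json(workingPath + "person")
                .write().mode(SaveMode.Overwrite).option("compression", "gzip")
                .json(sourcePath + "person");
        }
    }

Note that the join keys on source alone, which matches how the person relations are oriented in this graph (the person id sits on the source side).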
rs.setInstance(mapInstances(r.getInstance())); if (r instanceof Publication) { @@ -667,14 +675,23 @@ public class ProvisionModelSupport { } private static List asAuthor(List authorList) { + return asAuthor(authorList, ModelHardLimits.MAX_AUTHORS); + } + + private static List asAuthor(List authorList, int maxAuthors) { return Optional .ofNullable(authorList) .map( authors -> authors .stream() + .limit(maxAuthors) .map( a -> Author - .newInstance(a.getFullname(), a.getName(), a.getSurname(), a.getRank(), asPid(a.getPid()))) + .newInstance( + StringUtils.left(a.getFullname(), ModelHardLimits.MAX_AUTHOR_FULLNAME_LENGTH), + a.getName(), + a.getSurname(), + a.getRank(), asPid(a.getPid()))) .collect(Collectors.toList())) .orElse(null); } diff --git a/pom.xml b/pom.xml index e1d99f25b..9480ddfc0 100644 --- a/pom.xml +++ b/pom.xml @@ -937,7 +937,7 @@ 1.1.3 1.7 1.0.7 - [8.0.1] + [9.0.0] cdh5.9.2 3.5 11.0.2 From 32fa579b809138f9ae5d52d661423b1cebf4fbe4 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 28 Oct 2024 10:03:02 +0100 Subject: [PATCH 043/111] [graph provision] select the longest abstract --- .../dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java index 63f3c2ead..add1c80fa 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/CreateRelatedEntitiesJob_phase1.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.oa.provision; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import java.util.Comparator; import java.util.List; import java.util.Objects; import java.util.Optional; @@ -167,8 +168,9 @@ public class CreateRelatedEntitiesJob_phase1 { result .getDescription() .stream() - .findFirst() + .filter(d -> Objects.nonNull(d.getValue())) .map(Field::getValue) + .max(Comparator.comparingInt(String::length)) .ifPresent( d -> re.setDescription(StringUtils.left(d, ModelHardLimits.MAX_RELATED_ABSTRACT_LENGTH))); } From 6fd50266f11c39d7b71600d8da9fc010da4037e5 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 28 Oct 2024 10:42:46 +0100 Subject: [PATCH 044/111] translate 'otherresearchproduct' into 'other' when setting the related record type --- .../dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java index bc02b595f..69aa940c9 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java @@ -109,7 +109,7 @@ public class ProvisionModelSupport { RelatedRecord rr = new RelatedRecord(); final RelatedEntity re = rew.getTarget(); - final RecordType relatedRecordType = RecordType.valueOf(re.getType()); + final RecordType relatedRecordType = RecordType.fromString(re.getType()); final Relation relation = rew.getRelation(); final String relationProvenance = Optional 
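Two of the hardening patterns in the provision commits above are worth isolating: related authors are truncated with Stream.limit before serialisation, and the related-record description now picks the longest non-null abstract instead of whatever came first. Stripped of the surrounding model classes (field types simplified to String):

    import java.util.Comparator;
    import java.util.List;
    import java.util.Objects;
    import java.util.Optional;
    import java.util.stream.Collectors;

    public class HardLimitsSketch {
        static final int MAX_RELATED_AUTHORS = 20;

        // Keep at most MAX_RELATED_AUTHORS entries, preserving order; null stays null.
        static List<String> capAuthors(List<String> fullnames) {
            return Optional.ofNullable(fullnames)
                .map(a -> a.stream().limit(MAX_RELATED_AUTHORS).collect(Collectors.toList()))
                .orElse(null);
        }

        // Select the longest of the available abstracts, ignoring nulls.
        static Optional<String> longestDescription(List<String> descriptions) {
            return descriptions.stream()
                .filter(Objects::nonNull)
                .max(Comparator.comparingInt(String::length));
        }
    }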
.ofNullable(relation.getDataInfo()) From 5916346ba17cfcbc8a9ba378e4f6f1972c1b729e Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Mon, 28 Oct 2024 12:18:50 +0100 Subject: [PATCH 045/111] [TransformativeAgreement] fix to remove the file downloaded from a previous run of the workflow --- .../transformativeagreement/oozie_app/workflow.xml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/oozie_app/workflow.xml index 0c5b1c119..a11e3350e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/transformativeagreement/oozie_app/workflow.xml @@ -24,7 +24,7 @@ - ${wf:conf('resumeFrom') eq 'DownloadDump'} + ${wf:conf('resumeFrom') eq 'DownloadDump'} @@ -33,6 +33,14 @@ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + ${jobTracker} From 56224e034ab1b19bfc21d5dab38ea49d6da62529 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 28 Oct 2024 12:05:56 +0100 Subject: [PATCH 046/111] Fill the new mergedIds field when generating dedup records Filter out dedup records composed of invisible records only Filter out mergerels that have not been used when creating the dedup record (ungrouping of cliques) --- .../dhp/oa/dedup/DedupRecordFactory.java | 15 ++++++-- .../dhp/oa/dedup/SparkCreateDedupRecord.java | 36 +++++++++++++++++-- pom.xml | 2 +- 3 files changed, 47 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java index 36ed4d7c1..44482cfdb 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java @@ -2,14 +2,13 @@ package eu.dnetlib.dhp.oa.dedup; import java.util.*; +import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.beanutils.BeanUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.spark.api.java.function.FlatMapFunction; import org.apache.spark.api.java.function.FlatMapGroupsFunction; import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.ReduceFunction; import org.apache.spark.sql.*; import eu.dnetlib.dhp.oa.dedup.model.Identifier; @@ -107,6 +106,8 @@ public class DedupRecordFactory { final HashSet acceptanceDate = new HashSet<>(); + boolean isVisible = false; + while (it.hasNext()) { Tuple3 t = it.next(); OafEntity entity = t._3(); @@ -114,6 +115,7 @@ public class DedupRecordFactory { if (entity == null) { aliases.add(t._2()); } else { + isVisible = isVisible || !entity.getDataInfo().getInvisible(); cliques.add(entity); if (acceptanceDate.size() < MAX_ACCEPTANCE_DATE) { @@ -129,13 +131,20 @@ public class DedupRecordFactory { } - if (acceptanceDate.size() >= MAX_ACCEPTANCE_DATE || cliques.isEmpty()) { + if (!isVisible || acceptanceDate.size() >= MAX_ACCEPTANCE_DATE || cliques.isEmpty()) { return Collections.emptyIterator(); } OafEntity mergedEntity = MergeUtils.mergeGroup(dedupId, cliques.iterator()); // dedup 
records do not have date of transformation attribute mergedEntity.setDateoftransformation(null); + mergedEntity + .setMergedIds( + Stream + .concat(cliques.stream().map(OafEntity::getId), aliases.stream()) + .distinct() + .sorted() + .collect(Collectors.toList())); return Stream .concat( diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java index 6989ec54b..6f5f40e43 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java @@ -5,11 +5,11 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTION import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVENANCE_DEDUP; import java.io.IOException; +import java.util.Arrays; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.*; import org.dom4j.DocumentException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,6 +17,7 @@ import org.xml.sax.SAXException; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.schema.common.EntityType; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.OafEntity; @@ -25,6 +26,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; +import scala.collection.JavaConverters; public class SparkCreateDedupRecord extends AbstractSparkAction { @@ -85,6 +87,36 @@ public class SparkCreateDedupRecord extends AbstractSparkAction { .mode(SaveMode.Overwrite) .option("compression", "gzip") .json(outputPath); + + log.info("Updating mergerels for: '{}'", subEntity); + final Dataset dedupIds = spark + .read() + .schema("`id` STRING, `mergedIds` ARRAY") + .json(outputPath) + .selectExpr("id as source", "explode(mergedIds) as target"); + spark + .read() + .load(mergeRelPath) + .where("relClass == 'merges'") + .join(dedupIds, JavaConverters.asScalaBuffer(Arrays.asList("source", "target")).toSeq(), "left_semi") + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .save(workingPath + "/mergerel_filtered"); + + final Dataset validRels = spark.read().load(workingPath + "/mergerel_filtered"); + + final Dataset filteredMergeRels = validRels + .union( + validRels + .withColumnRenamed("source", "source_tmp") + .withColumnRenamed("target", "target_tmp") + .withColumn("relClass", functions.lit(ModelConstants.IS_MERGED_IN)) + .withColumnRenamed("target_tmp", "source") + .withColumnRenamed("source_tmp", "target")); + + saveParquet(filteredMergeRels, mergeRelPath, SaveMode.Overwrite); + removeOutputDir(spark, workingPath + "/mergerel_filtered"); } } diff --git a/pom.xml b/pom.xml index e1d99f25b..9480ddfc0 100644 --- a/pom.xml +++ b/pom.xml @@ -937,7 +937,7 @@ 1.1.3 1.7 1.0.7 - [8.0.1] + [9.0.0] cdh5.9.2 3.5 11.0.2 From e6ca382debc8c9ee96e008bfa71a6ebd4083a76a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 28 Oct 2024 13:52:06 +0100 Subject: [PATCH 047/111] using scala 2.11 converters --- 
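Stepping back to the dedup hunks above: a merge group whose members are all invisible is now skipped outright (the isVisible flag), the surviving members are recorded in the new mergedIds field, and the merge relations are pruned to the pairs actually used, with the inverse isMergedIn edges rebuilt by swapping source and target. A simplified rendering of that last step, swapping direction explicitly; Dataset.union resolves columns by position, so the explicit selects keep the swap unambiguous (the sketch reduces the schema to the three relevant columns):

    import static org.apache.spark.sql.functions.lit;

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;

    public class MergeRelSketch {
        // 'merges' edges point dedup id -> merged id; add the inverse edges.
        static Dataset<Row> symmetrize(Dataset<Row> merges) {
            Dataset<Row> forward = merges.selectExpr("source", "target", "relClass");
            Dataset<Row> inverse = merges
                .selectExpr("target as source", "source as target")
                .withColumn("relClass", lit("isMergedIn"));
            return forward.union(inverse.select("source", "target", "relClass"));
        }
    }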
.../java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java index 6f5f40e43..29394cb12 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java @@ -98,7 +98,9 @@ public class SparkCreateDedupRecord extends AbstractSparkAction { .read() .load(mergeRelPath) .where("relClass == 'merges'") - .join(dedupIds, JavaConverters.asScalaBuffer(Arrays.asList("source", "target")).toSeq(), "left_semi") + .join( + dedupIds, JavaConverters.asScalaBufferConverter(Arrays.asList("source", "target")).asScala(), + "left_semi") .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") From 9b4415cb674c5fedfa8644a035c2bfc66414dfe2 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 28 Oct 2024 13:56:25 +0100 Subject: [PATCH 048/111] using _the right_ scala 2.11 converters --- .../java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java index 29394cb12..bd17d05eb 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java @@ -26,6 +26,7 @@ import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; +import scala.collection.JavaConversions; import scala.collection.JavaConverters; public class SparkCreateDedupRecord extends AbstractSparkAction { @@ -98,9 +99,7 @@ public class SparkCreateDedupRecord extends AbstractSparkAction { .read() .load(mergeRelPath) .where("relClass == 'merges'") - .join( - dedupIds, JavaConverters.asScalaBufferConverter(Arrays.asList("source", "target")).asScala(), - "left_semi") + .join(dedupIds, JavaConversions.asScalaBuffer(Arrays.asList("source", "target")), "left_semi") .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") From e4504fd98de52f20b201ff3985a18792241751f1 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 28 Oct 2024 15:32:09 +0100 Subject: [PATCH 049/111] [Person] fixed project identifier creation --- .../personentity/ExtractPerson.java | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java index 1131f85e9..bf2c19c3d 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java @@ -25,6 +25,7 @@ import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.spark.SparkConf; import org.apache.spark.api.java.function.*; import 
org.apache.spark.sql.*; +import org.apache.spark.sql.Dataset; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -43,10 +44,7 @@ import eu.dnetlib.dhp.common.person.Coauthors; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.Person; -import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.PidCleaner; @@ -70,7 +68,11 @@ public class ExtractPerson implements Serializable { private static final String PMCID_PREFIX = "50|pmcid_______::"; private static final String ROR_PREFIX = "20|ror_________::"; - private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + "|orcid_______"; + private static final String PERSON_PREFIX = ModelSupport.getIdPrefix(Person.class) + + IdentifierFactory.ID_PREFIX_SEPARATOR + ModelConstants.ORCID + "_______"; + private static final String PROJECT_ID_PREFIX = ModelSupport.getIdPrefix(Project.class) + + IdentifierFactory.ID_PREFIX_SEPARATOR; + public static final String ORCID_AUTHORS_CLASSID = "sysimport:crosswalk:orcid"; public static final String ORCID_AUTHORS_CLASSNAME = "Imported from ORCID"; public static final String FUNDER_AUTHORS_CLASSID = "sysimport:crosswalk:funderdatabase"; @@ -173,7 +175,7 @@ public class ExtractPerson implements Serializable { } } catch (IOException e) { - e.printStackTrace(); + throw new RuntimeException(e); } } @@ -191,7 +193,7 @@ public class ExtractPerson implements Serializable { private static Relation getProjectRelation(String project, String orcid, String role) { String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid); - String target = project.substring(0, 14) + String target = PROJECT_ID_PREFIX + project.substring(0, 14) + IdentifierFactory.md5(project.substring(15)); List properties = new ArrayList<>(); From 499892b67c4549bd0cebe6bc07873a4e553eda3a Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 29 Oct 2024 09:51:30 +0100 Subject: [PATCH 050/111] [graph raw] rule out empty PIDs --- .../graph/raw/AbstractMdRecordToOafMapper.java | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index a85f47d99..2436a272c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -657,13 +657,21 @@ public abstract class AbstractMdRecordToOafMapper { final Node n = (Node) o; final String classId = n.valueOf(xpathClassId).trim(); if (this.vocs.termExists(schemeId, classId)) { - res - .add( - HashableStructuredProperty - .newInstance(n.getText(), this.vocs.getTermAsQualifier(schemeId, classId), info)); + final String value = n.getText(); + if (StringUtils.isNotBlank(value)) { + res + .add( + HashableStructuredProperty + .newInstance(value, this.vocs.getTermAsQualifier(schemeId, classId), info)); + } } } - return Lists.newArrayList(res); + return 
res + .stream() + .filter(Objects::nonNull) + .filter(p -> StringUtils.isNotBlank(p.getValue())) + .filter(p -> StringUtils.isNotBlank(p.getValue().trim())) + .collect(Collectors.toList()); } protected List<StructuredProperty> prepareListStructProps( From 5ca031c8d6a780b46adc2b1b0b4a7a5307cf9c73 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 29 Oct 2024 13:48:41 +0100 Subject: [PATCH 051/111] [graph raw] rule out empty PIDs --- .../java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java | 1 + .../main/java/eu/dnetlib/dhp/schema/oaf/utils/PidCleaner.java | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java index 2c77c3b37..da245d67c 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java @@ -204,6 +204,7 @@ public class IdentifierFactory implements Serializable { .map( pp -> pp .stream() + .filter(p -> StringUtils.isNotBlank(p.getValue())) // filter away PIDs provided by a DS that is not considered an authority for the // given PID Type .filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles)) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidCleaner.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidCleaner.java index 114c2b3af..678ed71dd 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidCleaner.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidCleaner.java @@ -26,7 +26,7 @@ public class PidCleaner { String value = Optional .ofNullable(pidValue) .map(String::trim) - .orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty")); + .orElseThrow(() -> new IllegalArgumentException("PID (" + pidType + ") value cannot be empty")); switch (pidType) {
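Patches 050 and 051 above apply the same defensive rule at two different layers: a structured property whose value is null or blank is dropped before it can reach identifier generation. A minimal, self-contained sketch of the filtering pattern follows; the Pid class is a simplified stand-in for the actual eu.dnetlib.dhp.schema.oaf.StructuredProperty, not the real type:

import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

import org.apache.commons.lang3.StringUtils;

public class BlankPidFilterSketch {

	// simplified stand-in for StructuredProperty: only the value field matters here
	static class Pid {
		private final String value;
		Pid(String value) { this.value = value; }
		String getValue() { return value; }
	}

	public static void main(String[] args) {
		List<Pid> pids = Arrays.asList(new Pid("10.1177/27527530231193972"), new Pid("   "), new Pid(null), null);

		// same chain as the patched mapper: non-null property, non-blank value
		List<Pid> clean = pids
			.stream()
			.filter(Objects::nonNull)
			.filter(p -> StringUtils.isNotBlank(p.getValue()))
			.collect(Collectors.toList());

		System.out.println(clean.size()); // 1 -> only the DOI-like value survives
	}
}

StringUtils.isNotBlank covers both the null and the whitespace-only case, which is why a single filter on the value is enough to rule out every empty PID.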
From 69aee609ef32c6e8121e08042b25d779f8a40dcb Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Tue, 29 Oct 2024 15:53:04 +0100 Subject: [PATCH 052/111] [bulktag] align type to community api --- .../eu/dnetlib/dhp/api/model/CommunityContentprovider.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/model/CommunityContentprovider.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/model/CommunityContentprovider.java index 9fab5a80c..8e0ea598c 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/model/CommunityContentprovider.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/model/CommunityContentprovider.java @@ -13,13 +13,13 @@ public class CommunityContentprovider { private String openaireId; private SelectionConstraints selectioncriteria; - private String enabled; + private Boolean enabled; - public String getEnabled() { + public Boolean getEnabled() { return enabled; } - public void setEnabled(String enabled) { + public void setEnabled(Boolean enabled) { this.enabled = enabled; } From 323c76eafca0e992c7b7dd749fd31fdc2eb840e2 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 30 Oct 2024 07:35:30 +0100 Subject: [PATCH 053/111] patch relations job: removed unnecessary logging --- .../eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java index 615b4a824..dce6cd39d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/PatchRelationsApplication.java @@ -80,9 +80,6 @@ public class PatchRelationsApplication { final Dataset<Relation> rels = readPath(spark, relationPath, Relation.class); final Dataset<RelationIdMapping> idMapping = readPath(spark, idMappingPath, RelationIdMapping.class); - log.info("relations: {}", rels.count()); - log.info("idMapping: {}", idMapping.count()); - final Dataset<Relation> bySource = rels .joinWith(idMapping, rels.col("source").equalTo(idMapping.col("oldId")), "left") .map((MapFunction<Tuple2<Relation, RelationIdMapping>, Relation>) t -> { From 26cdc7e439e8035a31a21f4895a4d521aa85d1e0 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 30 Oct 2024 07:35:47 +0100 Subject: [PATCH 054/111] Avoid NPEs in MergeUtils --- .../main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java index e01813110..79a12d630 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java @@ -715,7 +715,9 @@ public class MergeUtils { private static String spKeyExtractor(StructuredProperty sp) { return Optional .ofNullable(sp) - .map(s -> Joiner.on("||").join(qualifierKeyExtractor(s.getQualifier()), s.getValue())) + .map(s -> Joiner.on("||") + .useForNull("") + .join(qualifierKeyExtractor(s.getQualifier()), s.getValue())) .orElse(null); } From a877c76d70a10afdd888f84a33cbe683c2f78755 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 30 Oct 2024 11:24:25 +0100 Subject: [PATCH 055/111] make MergeUtils.selectOldestDate less prone to errors when receiving invalid date formats --- .../schema/oaf/utils/GraphCleaningFunctions.java | 2 +- .../dnetlib/dhp/schema/oaf/utils/MergeUtils.java | 15 +++++---------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index fdfd63a15..b6574da16 100--- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -810,7 +810,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { return author; } - private static Optional<String> cleanDateField(Field<String> dateofacceptance) { + public static Optional<String> cleanDateField(Field<String> dateofacceptance) { return Optional .ofNullable(dateofacceptance) .map(Field::getValue) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java index 79a12d630..4c411a155 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java @@ -654,16 +654,9 @@ public class MergeUtils { } private static Field<String> selectOldestDate(Field<String> d1, Field<String> d2) { - if (d1 == null || StringUtils.isBlank(d1.getValue())) { + if (!GraphCleaningFunctions.cleanDateField(d1).isPresent()) { return d2; - } else
if (d2 == null || StringUtils.isBlank(d2.getValue())) { - return d1; - } - - if (StringUtils.contains(d1.getValue(), "null")) { - return d2; - } - if (StringUtils.contains(d2.getValue(), "null")) { + } else if (!GraphCleaningFunctions.cleanDateField(d2).isPresent()) { return d1; } @@ -715,7 +708,9 @@ public class MergeUtils { private static String spKeyExtractor(StructuredProperty sp) { return Optional .ofNullable(sp) - .map(s -> Joiner.on("||") + .map( + s -> Joiner + .on("||") .useForNull("") .join(qualifierKeyExtractor(s.getQualifier()), s.getValue())) .orElse(null); }
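The intent of the selectOldestDate change above: instead of piecemeal checks for null, blank, or the literal string "null", each candidate is passed through the shared date validator, and an invalid side simply loses. The sketch below reproduces that decision logic in isolation; validateDate is a hypothetical stand-in for GraphCleaningFunctions.cleanDateField, not the actual implementation:

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.Optional;

public class OldestDateSketch {

	// stand-in for cleanDateField: empty Optional when the value cannot be parsed
	static Optional<String> validateDate(String s) {
		try {
			return Optional.of(LocalDate.parse(s, DateTimeFormatter.ISO_LOCAL_DATE).toString());
		} catch (Exception e) { // also covers the NPE raised by a null input
			return Optional.empty();
		}
	}

	static String selectOldestDate(String d1, String d2) {
		if (!validateDate(d1).isPresent()) return d2;
		if (!validateDate(d2).isPresent()) return d1;
		return d1.compareTo(d2) <= 0 ? d1 : d2; // ISO dates order lexicographically
	}

	public static void main(String[] args) {
		System.out.println(selectOldestDate("2019-03-01", "2018-12-31")); // 2018-12-31
		System.out.println(selectOldestDate("null", "2018-12-31"));      // invalid left side loses
	}
}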
From a42c8b7c8541d3225d5d087469af9b7ff5fb9aa7 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 30 Oct 2024 11:25:17 +0100 Subject: [PATCH 056/111] person table directory produced by the workflows raw_all and merge graphs --- .../dhp/oa/graph/merge/oozie_app/workflow.xml | 28 +++++++++++++++++++ .../oa/graph/raw_all/oozie_app/workflow.xml | 27 ++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml index a8d0d5068..3444e3afe 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/merge/oozie_app/workflow.xml @@ -68,6 +68,7 @@ + @@ -260,6 +261,33 @@ + + + yarn + cluster + Merge person + eu.dnetlib.dhp.oa.graph.merge.MergeGraphTableSparkJob + dhp-graph-mapper-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=7680 + + --betaInputPath${betaInputGraphPath}/person + --prodInputPath${prodInputGraphPath}/person + --outputPath${graphOutputPath}/person + --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Person + --priority${priority} + + + + + yarn diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml index ff927fe52..1b3cb1111 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_all/oozie_app/workflow.xml @@ -649,6 +649,7 @@ + @@ -860,6 +861,32 @@ + + + yarn + cluster + MergeClaims_person + eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication + dhp-graph-mapper-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=200 + + --rawGraphPath${workingDir}/graph_raw + --claimsGraphPath${workingDir}/graph_claims + --outputRawGaphPath${graphOutputPath} --graphTableClassNameeu.dnetlib.dhp.schema.oaf.Person + + + + + From a8ed5a3b048b560a6c2b794834e27e4adcdcac97 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 4 Nov 2024 17:45:28 +0100 Subject: [PATCH 057/111] Organized getters and setters in the PMArticle class for better readability and maintainability. --- dhp-shade-package/pom.xml | 122 +++---- .../personentity/ExtractPerson.java | 8 +- .../dnetlib/dhp/sx/bio/pubmed/PMArticle.java | 312 +++++------------- .../sx/bio/ebi/baseline_to_oaf_params.json | 7 +- .../sx/bio/ebi/SparkCreatePubmedDump.scala | 90 +++++ .../dnetlib/dhp/sx/bio/pubmed/PMParser2.scala | 264 +++++++++++++++ .../dhp/sx/graph/bio/single_pubmed.xml | 222 +++++++++++++ .../dnetlib/dhp/sx/bio/BioScholixTest.scala | 28 +- 8 files changed, 754 insertions(+), 299 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala create mode 100644 dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml diff --git a/dhp-shade-package/pom.xml b/dhp-shade-package/pom.xml index d8e17ed46..c4f9b262e 100644 --- a/dhp-shade-package/pom.xml +++ b/dhp-shade-package/pom.xml @@ -26,16 +26,16 @@ - - eu.dnetlib.dhp - dhp-actionmanager - ${project.version} - - + + + eu.dnetlib.dhp + dhp-aggregation + ${project.version} + @@ -56,61 +56,61 @@ - - eu.dnetlib.dhp - dhp-graph-mapper - ${project.version} - - - eu.dnetlib.dhp - dhp-graph-provision - ${project.version} - - - eu.dnetlib.dhp - dhp-impact-indicators - ${project.version} - - - eu.dnetlib.dhp - dhp-stats-actionsets - ${project.version} - - - eu.dnetlib.dhp - dhp-stats-hist-snaps - ${project.version} - - - eu.dnetlib.dhp - dhp-stats-monitor-irish - ${project.version} - - - eu.dnetlib.dhp - dhp-stats-promote - ${project.version} - - - eu.dnetlib.dhp - dhp-stats-update - ${project.version} - - - eu.dnetlib.dhp - dhp-swh - ${project.version} - - - eu.dnetlib.dhp - dhp-usage-raw-data-update - ${project.version} - - - eu.dnetlib.dhp - dhp-usage-stats-build - ${project.version} - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java index bf2c19c3d..db31bb43f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java @@ -15,6 +15,7 @@ import java.util.stream.Collectors; import org.apache.commons.cli.ParseException; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileSystem; @@ -29,7 +30,6 @@ import org.apache.spark.sql.Dataset; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.spark_project.jetty.util.StringUtil; import com.fasterxml.jackson.databind.ObjectMapper; @@ -206,7 +206,7 @@ public class ExtractPerson implements Serializable { null); relation.setValidated(true); - if (StringUtil.isNotBlank(role)) { + if (StringUtils.isNotBlank(role)) { KeyValue kv = new
KeyValue(); kv.setKey("role"); kv.setValue(role); @@ -439,13 +439,13 @@ public class ExtractPerson implements Serializable { null); relation.setValidated(true); - if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) { + if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtils.isNotBlank(row.getStartDate())) { KeyValue kv = new KeyValue(); kv.setKey("startDate"); kv.setValue(row.getStartDate()); properties.add(kv); } - if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtil.isNotBlank(row.getEndDate())) { + if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtils.isNotBlank(row.getEndDate())) { KeyValue kv = new KeyValue(); kv.setKey("endDate"); kv.setValue(row.getEndDate()); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java index 3fb814606..6191f6446 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMArticle.java @@ -8,259 +8,115 @@ import java.util.List; /** * This class represents an instance of a Pubmed Article extracted from the native XML * - * @author Sandro La Bruzzo */ - public class PMArticle implements Serializable { - /** - * the Pubmed Identifier - */ private String pmid; - private String pmcId; - - /** - * the DOI - */ private String doi; - /** - * the Pubmed Date extracted from <PubMedPubDate> Specifies a date significant to either the article's history or the citation's processing. - * All dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s). - */ private String date; - /** - * This is an 'envelop' element that contains various elements describing the journal cited; i.e., ISSN, Volume, Issue, and PubDate and author name(s), however, it does not contain data itself. - */ private PMJournal journal; - /** - * The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. The NLM journal title abbreviation is exported in the <MedlineTA> element. - */ private String title; - /** - * English-language abstracts are taken directly from the published article. - * If the article does not have a published abstract, the National Library of Medicine does not create one, - * thus the record lacks the <Abstract> and <AbstractText> elements. However, in the absence of a formally - * labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used. - */ private String description; - /** - * the language in which an article was published is recorded in <Language>. - * All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single - * record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value. - * Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined. - */ private String language; - - /** - * NLM controlled vocabulary, Medical Subject Headings (MeSH®), is used to characterize the content of the articles represented by MEDLINE citations.
* - */ - private final List<PMSubject> subjects = new ArrayList<>(); - /** - * This element is used to identify the type of article indexed for MEDLINE; - * it characterizes the nature of the information or the manner in which it is conveyed as well as the type of - * research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural). - */ - private final List<PMSubject> publicationTypes = new ArrayList<>(); - /** - * Personal and collective (corporate) author names published with the article are found in <AuthorList>. - */ + private List<PMSubject> subjects; + private List<PMSubject> publicationTypes = new ArrayList<>(); private List<PMAuthor> authors = new ArrayList<>(); + private List<PMGrant> grants = new ArrayList<>(); - /** - * <GrantID> contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service - * or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations. - */ - private final List<PMGrant> grants = new ArrayList<>(); - - /** - * get the DOI - * @return a DOI - */ - public String getDoi() { - return doi; - } - - /** - * Set the DOI - * @param doi a DOI - */ - public void setDoi(String doi) { - this.doi = doi; - } - - /** - * get the Pubmed Identifier - * @return the PMID - */ public String getPmid() { return pmid; } - /** - * set the Pubmed Identifier - * @param pmid the Pubmed Identifier - */ public void setPmid(String pmid) { this.pmid = pmid; } - /** - * the Pubmed Date extracted from <PubMedPubDate> Specifies a date significant to either the article's history or the citation's processing. - * All dates will have a <Year>, <Month>, and <Day> elements. Some may have an <Hour>, <Minute>, and <Second> element(s). - * - * @return the Pubmed Date - */ - public String getDate() { - return date; - } - - /** - * Set the pubmed Date - * @param date - */ - public void setDate(String date) { - this.date = date; - } - - /** - * The full journal title (taken from NLM cataloging data following NLM rules for how to compile a serial name) is exported in this element. - * Some characters that are not part of the NLM MEDLINE/PubMed Character Set reside in a relatively small number of full journal titles. - * The NLM journal title abbreviation is exported in the <MedlineTA> element. - * - * @return the pubmed Journal Extracted - */ - public PMJournal getJournal() { - return journal; - } - - /** - * Set the mapped pubmed Journal - * @param journal - */ - public void setJournal(PMJournal journal) { - this.journal = journal; - } - - /** - * <ArticleTitle> contains the entire title of the journal article. <ArticleTitle> is always in English; - * those titles originally published in a non-English language and translated for <ArticleTitle> are enclosed in square brackets. - * All titles end with a period unless another punctuation mark such as a question mark or bracket is present. - * Explanatory information about the title itself is enclosed in parentheses, e.g.: (author's transl). - * Corporate/collective authors may appear at the end of <ArticleTitle> for citations up to about the year 2000. - * - * @return the extracted pubmed Title - */ - public String getTitle() { - return title; - } - - /** - * set the pubmed title - * @param title - */ - public void setTitle(String title) { - this.title = title; - } - - /** - * English-language abstracts are taken directly from the published article. - * If the article does not have a published abstract, the National Library of Medicine does not create one, - * thus the record lacks the <Abstract> and <AbstractText> elements.
However, in the absence of a formally - * labeled abstract in the published article, text from a substantive "summary", "summary and conclusions" or "conclusions and summary" may be used. - * - * @return the Mapped Pubmed Article Abstracts - */ - public String getDescription() { - return description; - } - - /** - * Set the Mapped Pubmed Article Abstracts - * @param description - */ - public void setDescription(String description) { - this.description = description; - } - - /** - * Personal and collective (corporate) author names published with the article are found in . - * - * @return get the Mapped Authors lists - */ - public List getAuthors() { - return authors; - } - - /** - * Set the Mapped Authors lists - * @param authors - */ - public void setAuthors(List authors) { - this.authors = authors; - } - - /** - * This element is used to identify the type of article indexed for MEDLINE; - * it characterizes the nature of the information or the manner in which it is conveyed as well as the type of - * research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural). - * - * @return the mapped Subjects - */ - public List getSubjects() { - return subjects; - } - - /** - * - * the language in which an article was published is recorded in . - * All entries are three letter abbreviations stored in lower case, such as eng, fre, ger, jpn, etc. When a single - * record contains more than one language value the XML export program extracts the languages in alphabetic order by the 3-letter language value. - * Some records provided by collaborating data producers may contain the value und to identify articles whose language is undetermined. - * - * @return The mapped Language - */ - public String getLanguage() { - return language; - } - - /** - * - * Set The mapped Language - * - * @param language the mapped Language - */ - public void setLanguage(String language) { - this.language = language; - } - - /** - * This element is used to identify the type of article indexed for MEDLINE; - * it characterizes the nature of the information or the manner in which it is conveyed as well as the type of - * research support received (e.g., Review, Letter, Retracted Publication, Clinical Conference, Research Support, N.I.H., Extramural). - * - * @return the mapped Publication Type - */ - public List getPublicationTypes() { - return publicationTypes; - } - - /** - * contains the research grant or contract number (or both) that designates financial support by any agency of the United States Public Health Service - * or any institute of the National Institutes of Health. Additionally, beginning in late 2005, grant numbers are included for many other US and non-US funding agencies and organizations. 
- * @return the mapped grants - */ - - public List<PMGrant> getGrants() { - return grants; - } - public String getPmcId() { return pmcId; } - public PMArticle setPmcId(String pmcId) { + public void setPmcId(String pmcId) { this.pmcId = pmcId; - return this; + } + + public String getDoi() { + return doi; + } + + public void setDoi(String doi) { + this.doi = doi; + } + + public String getDate() { + return date; + } + + public void setDate(String date) { + this.date = date; + } + + public PMJournal getJournal() { + return journal; + } + + public void setJournal(PMJournal journal) { + this.journal = journal; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getDescription() { + return description; + } + + public void setDescription(String description) { + this.description = description; + } + + public String getLanguage() { + return language; + } + + public void setLanguage(String language) { + this.language = language; + } + + public List<PMSubject> getSubjects() { + return subjects; + } + + public void setSubjects(List<PMSubject> subjects) { + this.subjects = subjects; + } + + public List<PMSubject> getPublicationTypes() { + return publicationTypes; + } + + public void setPublicationTypes(List<PMSubject> publicationTypes) { + this.publicationTypes = publicationTypes; + } + + public List<PMAuthor> getAuthors() { + return authors; + } + + public void setAuthors(List<PMAuthor> authors) { + this.authors = authors; + } + + public List<PMGrant> getGrants() { + return grants; + } + + public void setGrants(List<PMGrant> grants) { + this.grants = grants; } }
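Reducing PMArticle to a plain bean, with no final collection fields and a conventional get/set pair for every property, is what lets downstream code derive Spark and Jackson bindings automatically. A sketch of the two usages this enables, assuming a bean-style PMArticle on the classpath; the calls shown are the standard Spark SQL and Jackson APIs:

import com.fasterxml.jackson.databind.ObjectMapper;

import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;

import eu.dnetlib.dhp.sx.bio.pubmed.PMArticle;

public class BeanBindingSketch {

	public static void main(String[] args) throws Exception {
		// Encoders.bean builds a typed Spark encoder by reflecting over get/set pairs,
		// so it needs a setter for every field it is expected to deserialize
		Encoder<PMArticle> enc = Encoders.bean(PMArticle.class);

		// the same convention gives Jackson round-trip serialization for free
		ObjectMapper mapper = new ObjectMapper();
		PMArticle article = new PMArticle();
		article.setPmid("37885214");
		System.out.println(mapper.writeValueAsString(article));
	}
}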
diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json index 3ba83764d..8326fab0f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json @@ -1,8 +1,7 @@ [ {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, {"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true}, - {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true}, - {"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the oaf path ", "paramRequired": true}, - {"paramName":"s", "paramLongName":"skipUpdate", "paramDescription": "skip update ", "paramRequired": false}, - {"paramName":"h", "paramLongName":"hdfsServerUri", "paramDescription": "the working path ", "paramRequired": true} + {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the baseline path", "paramRequired": true}, + {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the mdstore path to save", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala new file mode 100644 index 000000000..c21bfd7c3 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala @@ -0,0 +1,90 @@ +package eu.dnetlib.dhp.sx.bio.ebi + +import com.fasterxml.jackson.databind.ObjectMapper +import eu.dnetlib.dhp.application.{AbstractScalaApplication, ArgumentApplicationParser} +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup +import eu.dnetlib.dhp.schema.oaf.Oaf +import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PMParser2, PubMedToOaf} +import eu.dnetlib.dhp.utils.ISLookupClientFactory +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} +import org.slf4j.{Logger, LoggerFactory} + +import java.io.ByteArrayInputStream +import javax.xml.stream.XMLInputFactory + +class SparkCreatePubmedDump(propertyPath: String, args: Array[String], log: Logger) + extends AbstractScalaApplication(propertyPath, args, log: Logger) { + + /** Here all the spark applications runs this method + * where the whole logic of the spark node is defined + */ + override def run(): Unit = { + val isLookupUrl: String = parser.get("isLookupUrl") + log.info("isLookupUrl: {}", isLookupUrl) + val sourcePath = parser.get("sourcePath") + log.info(s"SourcePath is '$sourcePath'") + val targetPath = parser.get("targetPath") + log.info(s"TargetPath is '$targetPath'") + + val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl) + val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService) + + createPubmedDump(spark, sourcePath, targetPath, vocabularies) + + } + + def createPubmedDump( + spark: SparkSession, + sourcePath: String, + targetPath: String, + vocabularies: VocabularyGroup + ): Unit = { + require(spark != null) + + implicit val PMEncoder: Encoder[PMArticle] = Encoders.bean(classOf[PMArticle]) + + import spark.implicits._ + val df = spark.read.option("lineSep", "</PubmedArticle>").text(sourcePath) + val mapper = new ObjectMapper() + df.as[String] + .map(s => { + val id = s.indexOf("<PubmedArticle>") + if (id >= 0) s"${s.substring(id)}</PubmedArticle>" else null + }) + .filter(s => s != null) + .map { i => + try { + new PMParser2().parse(i) + } catch { + case _: Exception => { + throw new RuntimeException(s"Error parsing article: $i") + } + } + } + .dropDuplicates("pmid") + .map { a => + val oaf = PubMedToOaf.convert(a, vocabularies) + if (oaf != null) + mapper.writeValueAsString(oaf) + else + null + } + .as[String] + .filter(s => s != null) + .write + .option("compression", "gzip") + .mode("overwrite") + .text(targetPath) + } +} + +object SparkCreatePubmedDump { + + def main(args: Array[String]): Unit = { + val log: Logger = LoggerFactory.getLogger(getClass) + + new SparkCreatePubmedDump("/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json", args, log).initialize().run() + + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala new file mode 100644 index 000000000..c9e868185 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala @@ -0,0 +1,264 @@ +package eu.dnetlib.dhp.sx.bio.pubmed + +import org.apache.commons.lang3.StringUtils + +import javax.xml.stream.XMLEventReader +import scala.collection.JavaConverters._ +import scala.xml.{MetaData, NodeSeq} +import scala.xml.pull.{EvElemEnd, EvElemStart, EvText} + +class PMParser2 { + + /** Extracts the value of an attribute from a MetaData object.
+ * @param attrs the MetaData object + * @param key the key of the attribute + * @return the value of the attribute or null if the attribute is not found + */ + private def extractAttributes(attrs: MetaData, key: String): String = { + + val res = attrs.get(key) + if (res.isDefined) { + val s = res.get + if (s != null && s.nonEmpty) + s.head.text + else + null + } else null + } + + /** Validates and formats a date given the year, month, and day as strings. + * + * @param year the year as a string + * @param month the month as a string + * @param day the day as a string + * @return the formatted date as "YYYY-MM-DD" or null if the date is invalid + */ + private def validate_Date(year: String, month: String, day: String): String = { + try { + f"${year.toInt}-${month.toInt}%02d-${day.toInt}%02d" + + } catch { + case _: Throwable => null + } + } + + /** Extracts the grant information from a NodeSeq object. + * + * @param gNode the NodeSeq object + * @return the grant information or an empty list if the grant information is not found + */ + private def extractGrant(gNode: NodeSeq): List[PMGrant] = { + gNode + .map(node => { + val grantId = (node \ "GrantID").text + val agency = (node \ "Agency").text + val country = (node \ "Country").text + new PMGrant(grantId, agency, country) + }) + .toList + } + + /** Extracts the journal information from a NodeSeq object. + * + * @param jNode the NodeSeq object + * @return the journal information or null if the journal information is not found + */ + private def extractJournal(jNode: NodeSeq): PMJournal = { + val journal = new PMJournal + journal.setTitle((jNode \ "Title").text) + journal.setIssn((jNode \ "ISSN").text) + journal.setVolume((jNode \ "JournalIssue" \ "Volume").text) + journal.setIssue((jNode \ "JournalIssue" \ "Issue").text) + if (journal.getTitle != null && StringUtils.isNotEmpty(journal.getTitle)) + journal + else + null + } + + private def extractAuthors(aNode: NodeSeq): List[PMAuthor] = { + aNode + .map(author => { + val a = new PMAuthor + a.setLastName((author \ "LastName").text) + a.setForeName((author \ "ForeName").text) + a + }) + .toList + } + + def parse(input: String): PMArticle = { + val xml = scala.xml.XML.loadString(input) + val article = new PMArticle + + val grantNodes = xml \ "MedlineCitation" \\ "Grant" + article.setGrants(extractGrant(grantNodes).asJava) + + val journal = xml \ "MedlineCitation" \ "Article" \ "Journal" + article.setJournal(extractJournal(journal)) + + val authors = xml \ "MedlineCitation" \ "Article" \ "AuthorList" \ "Author" + + article.setAuthors( + authors + .map(author => { + val a = new PMAuthor + a.setLastName((author \ "LastName").text) + a.setForeName((author \ "ForeName").text) + a + }) + .toList + .asJava + ) + + val pmId = xml \ "MedlineCitation" \ "PMID" + + val articleIds = xml \ "PubmedData" \ "ArticleIdList" \ "ArticleId" + articleIds.foreach(articleId => { + val idType = (articleId \ "@IdType").text + val id = articleId.text + if ("doi".equalsIgnoreCase(idType)) article.setDoi(id) + if ("pmc".equalsIgnoreCase(idType)) article.setPmcId(id) + }) + article.setPmid(pmId.text) + + val pubMedPubDate = xml \ "MedlineCitation" \ "DateCompleted" + val currentDate = + validate_Date((pubMedPubDate \ "Year").text, (pubMedPubDate \ "Month").text, (pubMedPubDate \ "Day").text) + if (currentDate != null) article.setDate(currentDate) + + val articleTitle = xml \ "MedlineCitation" \ "Article" \ "ArticleTitle" + article.setTitle(articleTitle.text) + + val abstractText = xml \ "MedlineCitation" \ "Article" \ 
"Abstract" \ "AbstractText" + if (abstractText != null && abstractText.text != null && abstractText.text.nonEmpty) + article.setDescription(abstractText.text.split("\n").map(s => s.trim).mkString(" ").trim) + + val language = xml \ "MedlineCitation" \ "Article" \ "Language" + article.setLanguage(language.text) + + val subjects = xml \ "MedlineCitation" \ "MeshHeadingList" \ "MeshHeading" + article.setSubjects( + subjects + .take(20) + .map(subject => { + val descriptorName = (subject \ "DescriptorName").text + val ui = (subject \ "DescriptorName" \ "@UI").text + val s = new PMSubject + s.setValue(descriptorName) + s.setMeshId(ui) + s + }) + .toList + .asJava + ) + val publicationTypes = xml \ "MedlineCitation" \ "Article" \ "PublicationTypeList" \ "PublicationType" + article.setPublicationTypes( + publicationTypes + .map(pt => { + val s = new PMSubject + s.setValue(pt.text) + s + }) + .toList + .asJava + ) + + article + } + + def parse2(xml: XMLEventReader): PMArticle = { + var currentArticle: PMArticle = null + var currentSubject: PMSubject = null + var currentAuthor: PMAuthor = null + var currentJournal: PMJournal = null + var currentGrant: PMGrant = null + var currNode: String = null + var currentYear = "0" + var currentMonth = "01" + var currentDay = "01" + var currentArticleType: String = null + + while (xml.hasNext) { + val ne = xml.next + ne match { + case EvElemStart(_, label, attrs, _) => + currNode = label + + label match { + case "PubmedArticle" => currentArticle = new PMArticle + case "Author" => currentAuthor = new PMAuthor + case "Journal" => currentJournal = new PMJournal + case "Grant" => currentGrant = new PMGrant + case "PublicationType" | "DescriptorName" => + currentSubject = new PMSubject + currentSubject.setMeshId(extractAttributes(attrs, "UI")) + case "ArticleId" => currentArticleType = extractAttributes(attrs, "IdType") + case _ => + } + case EvElemEnd(_, label) => + label match { + case "PubmedArticle" => return currentArticle + case "Author" => currentArticle.getAuthors.add(currentAuthor) + case "Journal" => currentArticle.setJournal(currentJournal) + case "Grant" => currentArticle.getGrants.add(currentGrant) + case "PubMedPubDate" => + if (currentArticle.getDate == null) + currentArticle.setDate(validate_Date(currentYear, currentMonth, currentDay)) + case "PubDate" => currentJournal.setDate(s"$currentYear-$currentMonth-$currentDay") + case "DescriptorName" => currentArticle.getSubjects.add(currentSubject) + case "PublicationType" => currentArticle.getPublicationTypes.add(currentSubject) + case _ => + } + case EvText(text) => + if (currNode != null && text.trim.nonEmpty) + currNode match { + case "ArticleTitle" => { + if (currentArticle.getTitle == null) + currentArticle.setTitle(text.trim) + else + currentArticle.setTitle(currentArticle.getTitle + text.trim) + } + case "AbstractText" => { + if (currentArticle.getDescription == null) + currentArticle.setDescription(text.trim) + else + currentArticle.setDescription(currentArticle.getDescription + text.trim) + } + case "PMID" => currentArticle.setPmid(text.trim) + case "ArticleId" => + if ("doi".equalsIgnoreCase(currentArticleType)) currentArticle.setDoi(text.trim) + if ("pmc".equalsIgnoreCase(currentArticleType)) currentArticle.setPmcId(text.trim) + case "Language" => currentArticle.setLanguage(text.trim) + case "ISSN" => currentJournal.setIssn(text.trim) + case "GrantID" => currentGrant.setGrantID(text.trim) + case "Agency" => currentGrant.setAgency(text.trim) + case "Country" => if (currentGrant != null) 
currentGrant.setCountry(text.trim) + case "Year" => currentYear = text.trim + case "Month" => currentMonth = text.trim + case "Day" => currentDay = text.trim + case "Volume" => currentJournal.setVolume(text.trim) + case "Issue" => currentJournal.setIssue(text.trim) + case "PublicationType" | "DescriptorName" => currentSubject.setValue(text.trim) + case "LastName" => { + if (currentAuthor != null) + currentAuthor.setLastName(text.trim) + } + case "ForeName" => + if (currentAuthor != null) + currentAuthor.setForeName(text.trim) + case "Title" => + if (currentJournal.getTitle == null) + currentJournal.setTitle(text.trim) + else + currentJournal.setTitle(currentJournal.getTitle + text.trim) + case _ => + + } + case _ => + } + + } + null + } + +}
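A minimal usage sketch for the new parser, mirroring what the testParsingPubmed2 case further below does in Scala: feed one complete PubmedArticle document to parse() and read the extracted fields. The XML literal here is abbreviated; in practice a full record such as the single_pubmed.xml fixture is expected:

import eu.dnetlib.dhp.sx.bio.pubmed.PMArticle;
import eu.dnetlib.dhp.sx.bio.pubmed.PMParser2;

public class PMParser2Sketch {

	public static void main(String[] args) {
		// abbreviated record; a real input also carries journal, authors, MeSH headings, etc.
		String xml = "<PubmedArticle><MedlineCitation><PMID>37885214</PMID>"
			+ "<Article><ArticleTitle>Care Needs of Parents of Children With Cancer</ArticleTitle>"
			+ "<Language>eng</Language></Article>"
			+ "</MedlineCitation></PubmedArticle>";

		PMArticle article = new PMParser2().parse(xml);
		System.out.println(article.getPmid());     // 37885214
		System.out.println(article.getTitle());    // Care Needs of Parents of Children With Cancer
		System.out.println(article.getLanguage()); // eng
	}
}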
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml new file mode 100644 index 000000000..4b4d860d7 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml @@ -0,0 +1,222 @@ + + 37885214 + + 2024 + 02 + 14 + + + 2024 + 02 + 14 + + + 2752-7549 + + 40 + 5 + + 2023 Sep-Oct + + + Journal of pediatric hematology/oncology nursing + J Pediatr Hematol Oncol Nurs + + Care Needs of Parents of Children With Cancer in a Low-Middle-Income Country. + + 295-304 + + 10.1177/27527530231193972 + + Background: Mapping out actual supportive care needs assists nurses in providing holistic individualized care. This study aimed to explore the care needs of parents of children with cancer in the Philippines. Method: Guided by the Supportive Care Needs Framework (SCNF), this study used an embedded mixed-method design with the quantitative revised Cancer Patient Needs Questionnaire and qualitative semistructured interviews to describe parents' care needs and priorities. Results: Filipino parents (N = 156) of children with cancer have various care needs which could be classified along the SCNF categories-practical, informational, spiritual, physical, emotional, and physical needs as ranked from highest to lowest. A number of variables were significantly associated with care needs. Solid tumor diagnosis was associated with greater practical, emotional, and psychosocial care needs; having a child who had undergone surgery was associated with more practical and spiritual care needs; and being within one year of the child's diagnosis was associated with practical, psychosocial, and spiritual care needs. Parent priority needs included (a) addressing financial needs; (b) access to temporary housing to minimize treatment-related costs; (c) support groups among parents of children with cancer as a source of information; (d) financial and social support between members of family and partners of parents of children with cancer; and (e) using prayer to facilitate acceptance. Conclusions: Supportive care needs of parents of children with cancer are important components of care that should be given recognition to enhance holistic individualized care throughout the childhood cancer experience. + + + + Banayat + Aprille Campos + AC + 0000-0001-9339-9871 + + College of Nursing, University of the Philippines Manila, Manila, Philippines. + + + + Abad + Peter James B + PJB + + College of Nursing, University of the Philippines Manila, Manila, Philippines. + + + + Bonito + Sheila R + SR + + College of Nursing, University of the Philippines Manila, Manila, Philippines. + + + + Manahan + Lydia T + LT + + College of Nursing, University of the Philippines Manila, Manila, Philippines. + + + + Peralta + Arnold B + AB + + College of Nursing, University of the Philippines Manila, Manila, Philippines. + + + + eng + + Journal Article + + + 2023 + 10 + 26 +
+ + United States + J Pediatr Hematol Oncol Nurs + 9918282681506676 + 2752-7530 + + IM + + + Child + + + Humans + + + Parents + psychology + + + Social Support + + + Spirituality + + + Religion + + + Neoplasms + therapy + + + + cancer + mixed methods + parent + pediatric + research + supportive care + + Declaration of Conflicting InterestsThe author(s) declared no potential conflicts of interest with respect to the research, authorship, and/or publication of this article. +
+ + + + 2024 + 2 + 12 + 18 + 42 + + + 2023 + 10 + 27 + 6 + 42 + + + 2023 + 10 + 27 + 3 + 43 + + + ppublish + + 37885214 + 10.1177/27527530231193972 + + +
+ +30522158 +32769323 +34061701 +34661197 +34837091 +35035475 +35211699 +35557982 +35782783 +35795240 +35832688 +35847411 +36081602 +36081858 +36468085 +36468934 +36580086 +36589526 +36619609 +36649460 +36654909 +36655054 +36700856 +36705625 +36713939 +36714172 +36741203 +36741905 +36743825 +36788221 +36844926 +36846546 +36935776 +36946757 +36972191 +37034422 +37124311 +37152108 +37171968 +37273889 +37333905 +37387733 +37431449 +37576947 +37601162 +37711214 +37901290 +37981909 +37981945 +37982005 +38037601 +38037602 +38150730 +38274640 +38332671 +38334184 +38335456 +38349506 +38349576 +38353676 + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala index c4af14c40..1374b741d 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala @@ -5,7 +5,10 @@ import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest import eu.dnetlib.dhp.schema.oaf.utils.PidType import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result} import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved -import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser, PMSubject, PubMedToOaf} +import eu.dnetlib.dhp.sx.bio.ebi.SparkCreatePubmedDump +import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PMParser2, PMSubject, PubMedToOaf} +import org.apache.commons.io.IOUtils +import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} import org.json4s.DefaultFormats import org.json4s.JsonAST.{JField, JObject, JString} import org.json4s.jackson.JsonMethods.parse @@ -13,8 +16,9 @@ import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.extension.ExtendWith import org.junit.jupiter.api.{BeforeEach, Test} import org.mockito.junit.jupiter.MockitoExtension +import org.slf4j.LoggerFactory -import java.io.{BufferedReader, InputStream, InputStreamReader} +import java.io.{BufferedReader, ByteArrayInputStream, InputStream, InputStreamReader} import java.util.zip.GZIPInputStream import javax.xml.stream.XMLInputFactory import scala.collection.JavaConverters._ @@ -48,6 +52,17 @@ class BioScholixTest extends AbstractVocabularyTest { } } + @Test + def testParsingPubmed2(): Unit = { + val mapper = new ObjectMapper() + val xml = IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml")) + val parser = new PMParser2() + val article = parser.parse(xml) + + println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(article)) + + } + @Test def testEBIData() = { val inputFactory = XMLInputFactory.newInstance @@ -124,6 +139,15 @@ class BioScholixTest extends AbstractVocabularyTest { } } + @Test + def testPubmedSplitting(): Unit = { + + val spark: SparkSession = SparkSession.builder().appName("test").master("local").getOrCreate() + new SparkCreatePubmedDump("", Array.empty, LoggerFactory.getLogger(getClass)) + .createPubmedDump(spark, "/home/sandro/Downloads/pubmed", "/home/sandro/Downloads/pubmed_mapped", vocabularies) + + } + @Test def testPubmedOriginalID(): Unit = { val article: PMArticle = new PMArticle From c1cef5d685373ad28dac341b56fc459cd43ff606 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 5 Nov 2024 10:38:40 +0100 Subject: [PATCH 058/111] removed old library joda time replaced with standard java.time introduced in java 8 --- 
.../plugin/gtr2/Gtr2PublicationsIterator.java | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gtr2/Gtr2PublicationsIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gtr2/Gtr2PublicationsIterator.java index 5b8f48680..779c43712 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gtr2/Gtr2PublicationsIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gtr2/Gtr2PublicationsIterator.java @@ -1,6 +1,8 @@ package eu.dnetlib.dhp.collection.plugin.gtr2; +import java.nio.charset.StandardCharsets; +import java.time.LocalDate; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; @@ -16,9 +18,7 @@ import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.DocumentHelper; import org.dom4j.Element; -import org.joda.time.DateTime; -import org.joda.time.format.DateTimeFormat; -import org.joda.time.format.DateTimeFormatter; +import java.time.format.DateTimeFormatter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -33,7 +33,7 @@ public class Gtr2PublicationsIterator implements Iterator<String> { private static final Logger log = LoggerFactory.getLogger(Gtr2PublicationsIterator.class); private final HttpConnector2 connector; - private static final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormat.forPattern("yyyy-MM-dd"); + private static final DateTimeFormatter simpleDateTimeFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd"); private static final int MAX_ATTEMPTS = 10; @@ -41,7 +41,7 @@ public class Gtr2PublicationsIterator implements Iterator<String> { private int currPage; private int endPage; private boolean incremental = false; - private DateTime fromDate; + private LocalDate fromDate; private final Map<String, String> cache = new HashMap<>(); @@ -188,28 +188,28 @@ public class Gtr2PublicationsIterator implements Iterator<String> { private Document loadURL(final String cleanUrl, final int attempt) { try { - log.debug(" * Downloading Url: " + cleanUrl); - final byte[] bytes = this.connector.getInputSource(cleanUrl).getBytes("UTF-8"); + log.debug(" * Downloading Url: {}", cleanUrl); + final byte[] bytes = this.connector.getInputSource(cleanUrl).getBytes(StandardCharsets.UTF_8); return DocumentHelper.parseText(new String(bytes)); } catch (final Throwable e) { - log.error("Error dowloading url: " + cleanUrl + ", attempt = " + attempt, e); + log.error("Error downloading url: {}, attempt = {}", cleanUrl, attempt, e); if (attempt >= MAX_ATTEMPTS) { - throw new RuntimeException("Error dowloading url: " + cleanUrl, e); + throw new RuntimeException("Error downloading url: " + cleanUrl, e); } try { Thread.sleep(60000); // I wait for a minute } catch (final InterruptedException e1) { - throw new RuntimeException("Error dowloading url: " + cleanUrl, e); + throw new RuntimeException("Error downloading url: " + cleanUrl, e); } return loadURL(cleanUrl, attempt + 1); } } - private DateTime parseDate(final String s) { - return DateTime.parse(s.contains("T") ? s.substring(0, s.indexOf("T")) : s, simpleDateTimeFormatter); + private LocalDate parseDate(final String s) { + return LocalDate.parse(s.contains("T") ?
s.substring(0, s.indexOf("T")) : s, simpleDateTimeFormatter); } - private boolean isAfter(final String d, final DateTime fromDate) { + private boolean isAfter(final String d, final LocalDate fromDate) { return StringUtils.isNotBlank(d) && parseDate(d).isAfter(fromDate); } }
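The joda-time to java.time migration above is mechanical: DateTimeFormat.forPattern becomes DateTimeFormatter.ofPattern, DateTime becomes LocalDate, and the parse/isAfter call shapes stay the same. A self-contained sketch of the replacement pattern, including the same timestamp-trimming trick the iterator uses:

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;

public class JavaTimeMigrationSketch {

	private static final DateTimeFormatter FMT = DateTimeFormatter.ofPattern("yyyy-MM-dd");

	// cut an ISO timestamp down to its date part before parsing, as the iterator does
	static LocalDate parseDate(String s) {
		return LocalDate.parse(s.contains("T") ? s.substring(0, s.indexOf("T")) : s, FMT);
	}

	public static void main(String[] args) {
		LocalDate from = parseDate("2024-01-01");
		System.out.println(parseDate("2024-07-25T16:34:32Z").isAfter(from)); // true
	}
}

java.time objects and formatters are immutable and thread-safe, so keeping a single static formatter instance remains safe; the practical gain is dropping the extra joda-time dependency.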
From 973aa7dca6508e2c1e82fa7c775b3d40c18fe45b Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 6 Nov 2024 12:29:06 +0100 Subject: [PATCH 059/111] [dedup] force the Relation schema when reading the merge rels --- .../java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java | 1 + 1 file changed, 1 insertion(+) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java index c7efce4d7..b0bc314e2 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkPropagateRelation.java @@ -69,6 +69,7 @@ public class SparkPropagateRelation extends AbstractSparkAction { Dataset<Relation> mergeRels = spark .read() + .schema(REL_BEAN_ENC.schema()) .load(DedupUtility.createMergeRelPath(workingPath, "*", "*")) .as(REL_BEAN_ENC); From f7bb53fe7895105f4a7a73b7de14cd6b4121589e Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 7 Nov 2024 01:04:43 +0100 Subject: [PATCH 060/111] [orcid enrichment] added missing workflow parameter: workingDir --- .../resources/eu/dnetlib/dhp/enrich/orcid/oozie_app/workflow.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/oozie_app/workflow.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/oozie_app/workflow.xml index 4031da15a..1ece2c0be 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/oozie_app/workflow.xml @@ -51,6 +51,7 @@ --orcidPath${orcidPath} --targetPath${targetPath} --graphPath${graphPath} + --workingDir${workingDir} --masteryarn From 8f5171557e20ed58f69f7abe3af1ad0a85b10ba3 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Thu, 7 Nov 2024 12:22:34 +0100 Subject: [PATCH 061/111] Remove ORCID information when the same ORCID ID is used multiple times in the same result for different authors --- .../oaf/utils/GraphCleaningFunctions.java | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index b6574da16..9153a6476 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -2,7 +2,6 @@ package eu.dnetlib.dhp.schema.oaf.utils; import static eu.dnetlib.dhp.schema.common.ModelConstants.*; -import static eu.dnetlib.dhp.schema.common.ModelConstants.OPENAIRE_META_RESOURCE_TYPE; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance; import java.net.MalformedURLException; @@ -696,6 +695,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { } } + // set ORCID_PENDING to all orcid values that are not coming from ORCID provenance for (Author a : r.getAuthor()) { if (Objects.isNull(a.getPid())) { a.setPid(Lists.newArrayList()); @@ -752,6 +752,40 @@ public class GraphCleaningFunctions extends CleaningFunctions { .collect(Collectors.toList())); } } + + // Identify clashing ORCIDs: that is, the same ORCID associated with multiple authors in this result + Map<String, Integer> clashing_orcid = new HashMap<>(); + + for (Author a : r.getAuthor()) { + a + .getPid() + .stream() + .filter( + p -> StringUtils + .contains(StringUtils.lowerCase(p.getQualifier().getClassid()), ORCID_PENDING)) + .map(StructuredProperty::getValue) + .distinct() + .forEach(orcid -> clashing_orcid.compute(orcid, (k, v) -> (v == null) ? 1 : v + 1)); + } + + Set<String> clashing = clashing_orcid + .entrySet() + .stream() + .filter(ee -> ee.getValue() > 1) + .map(Map.Entry::getKey) + .collect(Collectors.toSet()); + + // filter out clashing orcids + for (Author a : r.getAuthor()) { + a + .setPid( + a + .getPid() + .stream() + .filter(p -> !clashing.contains(p.getValue())) + .collect(Collectors.toList())); + } + } if (value instanceof Publication) {
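The clashing-ORCID cleanup above is a two-pass frequency count: first tally how many authors of the result carry each ORCID_PENDING value, then strip every value seen on more than one author. The core of that pass, reduced to plain strings (author name -> ORCID values) as a sketch; the real code walks Author/StructuredProperty objects instead:

import java.util.*;
import java.util.stream.Collectors;

public class ClashingOrcidSketch {

	public static void main(String[] args) {
		Map<String, List<String>> authorOrcids = new LinkedHashMap<>();
		authorOrcids.put("author1", Arrays.asList("0000-0001-0000-0001"));
		authorOrcids.put("author2", Arrays.asList("0000-0001-0000-0001")); // clash
		authorOrcids.put("author3", Arrays.asList("0000-0002-0000-0002"));

		// pass 1: count the authors carrying each ORCID (distinct per author)
		Map<String, Integer> counts = new HashMap<>();
		authorOrcids.values().forEach(orcids -> orcids.stream().distinct()
			.forEach(o -> counts.compute(o, (k, v) -> v == null ? 1 : v + 1)));

		Set<String> clashing = counts.entrySet().stream()
			.filter(e -> e.getValue() > 1)
			.map(Map.Entry::getKey)
			.collect(Collectors.toSet());

		// pass 2: drop the clashing values from every author
		authorOrcids.replaceAll((a, orcids) -> orcids.stream()
			.filter(o -> !clashing.contains(o))
			.collect(Collectors.toList()));

		System.out.println(authorOrcids); // the shared ORCID is removed from both authors
	}
}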
From 6fd9ec856608c3ca9baeedfa3677a64287483d92 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Thu, 7 Nov 2024 13:55:31 +0100 Subject: [PATCH 062/111] [danishfunders] added link for danish funders versus the unidentified project for IRFD (501100004836) CF (501100002808) and NNF(501100009708) --- .../doiboost/crossref/Crossref2Oaf.scala | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index f284a063e..bf11ed0a8 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -566,8 +566,23 @@ case object Crossref2Oaf { queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) - case _ => logger.debug("no match for " + funder.DOI.get) - + //Add for Danish funders + //Independent Research Fund Denmark (IRFD) + case "10.13039/501100004836" => + val targetId = getProjectId("irfd________", "1e5e62235d094afd01cd56e65112fc63") + queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) + queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) + //Carlsberg Foundation (CF) + case "10.13039/501100002808" => + val targetId = getProjectId("cf__________", "1e5e62235d094afd01cd56e65112fc63") + queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) + queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) + //Novo Nordisk Foundation (NNF) + case "10.13039/501100009708" => + val targetId = getProjectId("nnf_________", "1e5e62235d094afd01cd56e65112fc63") + queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) + queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) + case _ => logger.debug("no match for " + funder.DOI.get) } } else { From 0d0904f4ec2b1d4424d5d8d5d6d782049c0544cd Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Mon, 11 Nov 2024 10:27:23 +0100 Subject: [PATCH 063/111] updated workflow baseline to direct transform on OAF --- .../sx/bio/ebi/baseline_to_oaf_params.json | 8 ++--- .../dhp/sx/bio/pubmed/oozie_app/workflow.xml | 17 ++++------- .../sx/bio/ebi/SparkCreatePubmedDump.scala | 29 ++++++++++++------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json index 8326fab0f..0fcc03266 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/ebi/baseline_to_oaf_params.json @@ -1,7 +1,7 @@ [ - {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, - {"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true}, - {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the baseline path", "paramRequired": true}, - {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the mdstore path to save", "paramRequired": true} + {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, + {"paramName":"i", "paramLongName":"isLookupUrl", "paramDescription": "isLookupUrl", "paramRequired": true}, + {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the baseline path", "paramRequired": true}, + {"paramName":"mo", "paramLongName":"mdstoreOutputVersion", "paramDescription": "the mdstore path to save", "paramRequired": true} ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml index 30eb41469..0f4c5c249 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml @@ -16,11 +16,6 @@ mdStoreManagerURI the path of the cleaned mdstore
- - skipUpdate - false - The request block size -
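For context on the parameter threaded through the hunk below: the transformation step now receives the mdstore descriptor via --mdstoreOutputVersion, whose value is the JSON serialization of an MDStoreVersion, and, as the Scala changes at the end of this patch show, the job only reads its hdfsPath to use as the output base path. A minimal sketch of that deserialization step, where VersionView is an illustrative stand-in for eu.dnetlib.dhp.schema.mdstore.MDStoreVersion and the payload is invented:

    import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
    import com.fasterxml.jackson.databind.ObjectMapper;

    public class MdStoreVersionSketch {

        // Stand-in for eu.dnetlib.dhp.schema.mdstore.MDStoreVersion:
        // only the field the job actually reads is modelled here.
        @JsonIgnoreProperties(ignoreUnknown = true)
        public static class VersionView {
            public String hdfsPath;
        }

        public static void main(String[] args) throws Exception {
            // hypothetical payload, as it would arrive through --mdstoreOutputVersion
            String json = "{\"id\":\"md-1234-MDSTORE-v1\",\"hdfsPath\":\"/data/mdstore/md-1234/v1\"}";
            String outputBasePath = new ObjectMapper().readValue(json, VersionView.class).hdfsPath;
            System.out.println(outputBasePath); // /data/mdstore/md-1234/v1
        }
    }

Passing the whole version descriptor rather than a bare target path ties the Spark output to the mdstore version opened by the StartTransaction action referenced in the hunk below.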
@@ -44,16 +39,16 @@ --mdStoreManagerURI${mdStoreManagerURI} - + - + yarn cluster - Convert Baseline to OAF Dataset - eu.dnetlib.dhp.sx.bio.ebi.SparkCreateBaselineDataFrame + Convert Baseline Pubmed to OAF Dataset + eu.dnetlib.dhp.sx.bio.ebi.SparkCreatePubmedDump dhp-aggregation-${projectVersion}.jar --executor-memory=${sparkExecutorMemory} @@ -65,12 +60,10 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --workingPath${baselineWorkingPath} + --sourcePath${baselineWorkingPath} --mdstoreOutputVersion${wf:actionData('StartTransaction')['mdStoreVersion']} --masteryarn --isLookupUrl${isLookupUrl} - --hdfsServerUri${nameNode} - --skipUpdate${skipUpdate} diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala index c21bfd7c3..1bdd2a4bc 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala @@ -1,18 +1,14 @@ package eu.dnetlib.dhp.sx.bio.ebi import com.fasterxml.jackson.databind.ObjectMapper -import eu.dnetlib.dhp.application.{AbstractScalaApplication, ArgumentApplicationParser} +import eu.dnetlib.dhp.application.AbstractScalaApplication import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup -import eu.dnetlib.dhp.schema.oaf.Oaf -import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PMParser2, PubMedToOaf} +import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion +import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser2, PubMedToOaf} import eu.dnetlib.dhp.utils.ISLookupClientFactory -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession} +import org.apache.spark.sql.{Encoder, Encoders, SparkSession} import org.slf4j.{Logger, LoggerFactory} -import java.io.ByteArrayInputStream -import javax.xml.stream.XMLInputFactory - class SparkCreatePubmedDump(propertyPath: String, args: Array[String], log: Logger) extends AbstractScalaApplication(propertyPath, args, log: Logger) { @@ -24,16 +20,26 @@ class SparkCreatePubmedDump(propertyPath: String, args: Array[String], log: Logg log.info("isLookupUrl: {}", isLookupUrl) val sourcePath = parser.get("sourcePath") log.info(s"SourcePath is '$sourcePath'") - val targetPath = parser.get("targetPath") - log.info(s"TargetPath is '$targetPath'") + val mdstoreOutputVersion = parser.get("mdstoreOutputVersion") + log.info(s"mdstoreOutputVersion is '$mdstoreOutputVersion'") + val mapper = new ObjectMapper() + val cleanedMdStoreVersion = mapper.readValue(mdstoreOutputVersion, classOf[MDStoreVersion]) + val outputBasePath = cleanedMdStoreVersion.getHdfsPath + log.info(s"outputBasePath is '$outputBasePath'") val isLookupService = ISLookupClientFactory.getLookUpService(isLookupUrl) val vocabularies = VocabularyGroup.loadVocsFromIS(isLookupService) - createPubmedDump(spark, sourcePath, targetPath, vocabularies) + createPubmedDump(spark, sourcePath, outputBasePath, vocabularies) } + /** This method creates a dump of the pubmed articles + * @param spark the spark session + * @param sourcePath the path of the source file + * @param targetPath the path of the target file + * @param vocabularies the vocabularies + */ def createPubmedDump( spark: SparkSession, sourcePath: String, @@ -54,6 +60,7 @@ class 
SparkCreatePubmedDump(propertyPath: String, args: Array[String], log: Logg
 			})
 			.filter(s => s != null)
 			.map { i =>
+				// TODO: remove this try/catch
 				try {
 					new PMParser2().parse(i)
 				} catch {

From 19ce783e58f4f9176f4dc9a98d9bb250dc615e0d Mon Sep 17 00:00:00 2001
From: "sandro.labruzzo"
Date: Mon, 11 Nov 2024 12:28:02 +0100
Subject: [PATCH 064/111] renamed workflow

---
 .../eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml
index 0f4c5c249..2a51b4050 100644
--- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/sx/bio/pubmed/oozie_app/workflow.xml
@@ -1,4 +1,4 @@
-
+
 
     baselineWorkingPath

From 474f365286b7f83b8d58e4969277b3e67ebdd0cc Mon Sep 17 00:00:00 2001
From: "sandro.labruzzo"
Date: Mon, 11 Nov 2024 12:37:27 +0100
Subject: [PATCH 065/111] removed wrong test

---
 .../test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
index 1374b741d..c942ca132 100644
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala
@@ -6,9 +6,9 @@ import eu.dnetlib.dhp.schema.oaf.utils.PidType
 import eu.dnetlib.dhp.schema.oaf.{Oaf, Publication, Relation, Result}
 import eu.dnetlib.dhp.sx.bio.BioDBToOAF.ScholixResolved
 import eu.dnetlib.dhp.sx.bio.ebi.SparkCreatePubmedDump
-import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMAuthor, PMJournal, PMParser, PMParser2, PMSubject, PubMedToOaf}
+import eu.dnetlib.dhp.sx.bio.pubmed._
 import org.apache.commons.io.IOUtils
-import org.apache.spark.sql.{Dataset, Encoder, Encoders, SparkSession}
+import org.apache.spark.sql.SparkSession
 import org.json4s.DefaultFormats
 import org.json4s.JsonAST.{JField, JObject, JString}
 import org.json4s.jackson.JsonMethods.parse
@@ -18,13 +18,12 @@ import org.junit.jupiter.api.{BeforeEach, Test}
 import org.mockito.junit.jupiter.MockitoExtension
 import org.slf4j.LoggerFactory
 
-import java.io.{BufferedReader, ByteArrayInputStream, InputStream, InputStreamReader}
+import java.io.{BufferedReader, InputStream, InputStreamReader}
 import java.util.zip.GZIPInputStream
 import javax.xml.stream.XMLInputFactory
 import scala.collection.JavaConverters._
 import scala.collection.mutable.ListBuffer
 import scala.io.Source
-import scala.xml.pull.XMLEventReader
 
 @ExtendWith(Array(classOf[MockitoExtension]))
 class BioScholixTest extends AbstractVocabularyTest {
@@ -139,7 +138,6 @@ class BioScholixTest extends AbstractVocabularyTest {
     }
   }
 
-  @Test
   def testPubmedSplitting(): Unit = {
 
     val spark: SparkSession = SparkSession.builder().appName("test").master("local").getOrCreate()

From b0283fe94c168b87176f283f414ef2c4dfd3cdab Mon Sep 17 00:00:00 2001
From: Miriam Baglioni
Date: Mon, 11 Nov 2024 14:57:57 +0100
Subject: [PATCH 066/111] [person] fix the provenance of the person PID when it
 is an ORCID (use the entityregistry classid so that the cleaning does not
 turn it into orcid_pending)

---
 .../dhp/actionmanager/personentity/ExtractPerson.java | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
index bf2c19c3d..6830f2291 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
@@ -345,7 +345,16 @@ public class ExtractPerson implements Serializable {
 				OafMapperUtils
 					.structuredProperty(
 						op.getOrcid(), ModelConstants.ORCID, ModelConstants.ORCID_CLASSNAME,
-						ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, null));
+						ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES,
+						OafMapperUtils.dataInfo(false,
+							null,
+							false,
+							false,
+							OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
+								ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY,
+								ModelConstants.DNET_PID_TYPES,
+								ModelConstants.DNET_PID_TYPES),
+							"0.91")));
 		person.setDateofcollection(op.getLastModifiedDate());
 		person.setOriginalId(Arrays.asList(op.getOrcid()));
 		person.setDataInfo(ORCIDDATAINFO);
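The gist of the hunk above: the ORCID PID on the person record now carries an explicit dataInfo, whose provenance action is sysimport:crosswalk:entityregistry, instead of a null one, so the graph cleaning no longer downgrades the classid to orcid_pending. A compact sketch of the object being built; the record types are simplified stand-ins (Java 16+), the ORCID value is invented, and note that the patch itself reuses ModelConstants.DNET_PID_TYPES as the scheme of the provenance qualifier:

    // Simplified stand-ins for the eu.dnetlib.dhp.schema.oaf model (illustration only).
    record Qualifier(String classid, String classname, String schemeid, String schemename) {}

    record DataInfo(boolean invisible, boolean inferred, boolean deletedbyinference,
        Qualifier provenanceaction, String trust) {}

    record StructuredProperty(String value, Qualifier qualifier, DataInfo dataInfo) {}

    public class OrcidProvenanceSketch {
        public static void main(String[] args) {
            Qualifier orcid = new Qualifier(
                "orcid", "Open Researcher and Contributor ID", "dnet:pid_types", "dnet:pid_types");
            // Declaring the PID as imported from an entity registry is what keeps
            // the cleaning step from rewriting it to orcid_pending.
            Qualifier provenance = new Qualifier(
                "sysimport:crosswalk:entityregistry", "sysimport:crosswalk:entityregistry",
                "dnet:provenanceActions", "dnet:provenanceActions");
            StructuredProperty pid = new StructuredProperty(
                "0000-0001-2345-6789", // hypothetical ORCID
                orcid,
                new DataInfo(false, false, false, provenance, "0.91"));
            System.out.println(pid);
        }
    }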
From f1ea9da5bcda277451416253982c24233e40d87b Mon Sep 17 00:00:00 2001
From: Miriam Baglioni
Date: Mon, 11 Nov 2024 15:37:56 +0100
Subject: [PATCH 067/111] [person] fixed the separator in the inference
 provenance values (':' instead of '::')

---
 .../bipaffiliations/PrepareAffiliationRelations.java | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
index 15c1cc376..75e58e665 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipaffiliations/PrepareAffiliationRelations.java
@@ -104,22 +104,22 @@ public class PrepareAffiliationRelations implements Serializable {
 			.listKeyValues(OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME);
 
 		JavaPairRDD crossrefRelations = prepareAffiliationRelationsNewModel(
-			spark, crossrefInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::crossref");
+			spark, crossrefInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":crossref");
 
 		JavaPairRDD pubmedRelations = prepareAffiliationRelations(
-			spark, pubmedInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::pubmed");
+			spark, pubmedInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":pubmed");
 
 		JavaPairRDD openAPCRelations = prepareAffiliationRelationsNewModel(
-			spark, openapcInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::openapc");
+			spark, openapcInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":openapc");
 
 		JavaPairRDD dataciteRelations = prepareAffiliationRelationsNewModel(
-			spark, dataciteInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::datacite");
+			spark, dataciteInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":datacite");
 
 		JavaPairRDD webCrawlRelations = prepareAffiliationRelationsNewModel(
-			spark, webcrawlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::rawaff");
+			spark, webcrawlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":rawaff");
 
 		JavaPairRDD publisherRelations = prepareAffiliationRelationFromPublisherNewModel(
-			spark, publisherlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + "::webcrawl");
+			spark, publisherlInputPath, collectedfromOpenAIRE, BIP_INFERENCE_PROVENANCE + ":webcrawl");
 
 		crossrefRelations
 			.union(pubmedRelations)

From 250f101779a16ffbec1c16d3e0dc1050d6533c87 Mon Sep 17 00:00:00 2001
From: Miriam Baglioni
Date: Mon, 11 Nov 2024 16:04:06 +0100
Subject: [PATCH 068/111] [person] fixed the creation of the project identifier
 used in person->project relations

---
 .../dhp/actionmanager/personentity/ExtractPerson.java | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
index 6830f2291..6976def4c 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java
@@ -15,6 +15,7 @@ import java.util.stream.Collectors;
 
 import org.apache.commons.cli.ParseException;
 import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
@@ -193,8 +194,8 @@ public class ExtractPerson implements Serializable {
 	private static Relation getProjectRelation(String project, String orcid, String role) {
 
 		String source = PERSON_PREFIX + "::" + IdentifierFactory.md5(orcid);
-		String target = PROJECT_ID_PREFIX + project.substring(0, 14)
-			+ IdentifierFactory.md5(project.substring(15));
+		String target = PROJECT_ID_PREFIX + StringUtils.substringBefore(project, "::") + "::"
+			+ IdentifierFactory.md5(StringUtils.substringAfter(project, "::"));
 		List properties = new ArrayList<>();
 
 		Relation relation = OafMapperUtils

From 6c5df761e21d5aea6c203cb2ca2374b33a9219e5 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Tue, 12 Nov 2024 14:18:04 +0100
Subject: [PATCH 069/111] enforce resulttype based on the dnet:result_typologies
 vocabulary and upon merge

---
 .../dhp/oa/merge/GroupEntitiesSparkJob.java   |  7 ++-
 .../dhp/schema/oaf/utils/MergeUtils.java      | 62 ++++++++++++++++---
 .../dhp/oa/dedup/DedupRecordFactory.java      |  2 +-
 .../dhp/oa/dedup/DatasetMergerTest.java       |  4 +-
 .../raw/AbstractMdRecordToOafMapper.java      | 11 ++--
 .../raw/GenerateEntitiesApplication.java      |  2 +-
 .../dhp/sx/graph/SparkCreateInputGraph.scala  |  2 +-
 7 files changed, 66 insertions(+), 24 deletions(-)

diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/GroupEntitiesSparkJob.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/GroupEntitiesSparkJob.java
index 24de1a787..98ec09277 100644
--- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/GroupEntitiesSparkJob.java
+++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/GroupEntitiesSparkJob.java
@@ -2,8 +2,7 @@
 package eu.dnetlib.dhp.oa.merge;
 
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-import static org.apache.spark.sql.functions.col;
-import static org.apache.spark.sql.functions.when;
+import static org.apache.spark.sql.functions.*;
 
 import java.util.Map;
 import java.util.Optional;
@@ -135,7 +134,9 @@ public class GroupEntitiesSparkJob {
 					.applyCoarVocabularies(entity, vocs),
 				OAFENTITY_KRYO_ENC)
 			.groupByKey((MapFunction) OafEntity::getId,
Encoders.STRING()) - .mapGroups((MapGroupsFunction) MergeUtils::mergeById, OAFENTITY_KRYO_ENC) + .mapGroups( + (MapGroupsFunction) (key, group) -> MergeUtils.mergeById(group, vocs), + OAFENTITY_KRYO_ENC) .map( (MapFunction>) t -> new Tuple2<>( t.getClass().getName(), t), diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java index 4c411a155..d7e08fca7 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java @@ -23,24 +23,30 @@ import org.apache.commons.lang3.tuple.Pair; import com.github.sisyphsu.dateparser.DateParserUtils; import com.google.common.base.Joiner; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.oa.merge.AuthorMerger; import eu.dnetlib.dhp.schema.common.AccessRightComparator; +import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; public class MergeUtils { - public static T mergeById(String s, Iterator oafEntityIterator) { - return mergeGroup(s, oafEntityIterator, true); + public static T mergeById(Iterator oafEntityIterator, VocabularyGroup vocs) { + return mergeGroup(oafEntityIterator, true, vocs); } - public static T mergeGroup(String s, Iterator oafEntityIterator) { - return mergeGroup(s, oafEntityIterator, false); + public static T mergeGroup(Iterator oafEntityIterator) { + return mergeGroup(oafEntityIterator, false); } - public static T mergeGroup(String s, Iterator oafEntityIterator, - boolean checkDelegateAuthority) { + public static T mergeGroup(Iterator oafEntityIterator, boolean checkDelegateAuthority) { + return mergeGroup(oafEntityIterator, checkDelegateAuthority, null); + } + + public static T mergeGroup(Iterator oafEntityIterator, + boolean checkDelegateAuthority, VocabularyGroup vocs) { ArrayList sortedEntities = new ArrayList<>(); oafEntityIterator.forEachRemaining(sortedEntities::add); @@ -49,13 +55,49 @@ public class MergeUtils { Iterator it = sortedEntities.iterator(); T merged = it.next(); - while (it.hasNext()) { - merged = checkedMerge(merged, it.next(), checkDelegateAuthority); + if (!it.hasNext() && merged instanceof Result && vocs != null) { + return enforceResultType(vocs, (Result) merged); + } else { + while (it.hasNext()) { + merged = checkedMerge(merged, it.next(), checkDelegateAuthority); + } } - return merged; } + private static T enforceResultType(VocabularyGroup vocs, Result mergedResult) { + if (Optional.ofNullable(mergedResult.getInstance()).map(List::isEmpty).orElse(true)) { + return (T) mergedResult; + } else { + final Instance i = mergedResult.getInstance().get(0); + + if (!vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) { + return (T) mergedResult; + } else { + final Qualifier expectedResultType = vocs + .getSynonymAsQualifier( + ModelConstants.DNET_RESULT_TYPOLOGIES, + i.getInstancetype().getClassid()); + + // there is a clash among the result types + if (!expectedResultType.getClassid().equals(mergedResult.getResulttype().getClassid())) { + try { + String resulttype = expectedResultType.getClassid(); + if (EntityType.otherresearchproduct.toString().equals(resulttype)) { + resulttype = "other"; + } + Result result = (Result) ModelSupport.oafTypes.get(resulttype).newInstance(); + return (T) mergeResultFields(result, mergedResult); + } catch 
(InstantiationException | IllegalAccessException e) { + throw new IllegalStateException(e); + } + } else { + return (T) mergedResult; + } + } + } + } + public static T checkedMerge(final T left, final T right, boolean checkDelegateAuthority) { return (T) merge(left, right, checkDelegateAuthority); } @@ -106,7 +148,7 @@ public class MergeUtils { return mergeSoftware((Software) left, (Software) right); } - return mergeResultFields((Result) left, (Result) right); + return left; } else if (sameClass(left, right, Datasource.class)) { // TODO final int trust = compareTrust(left, right); diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java index 44482cfdb..f6a436543 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java @@ -135,7 +135,7 @@ public class DedupRecordFactory { return Collections.emptyIterator(); } - OafEntity mergedEntity = MergeUtils.mergeGroup(dedupId, cliques.iterator()); + OafEntity mergedEntity = MergeUtils.mergeGroup(cliques.iterator()); // dedup records do not have date of transformation attribute mergedEntity.setDateoftransformation(null); mergedEntity diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java index 726814c43..a79047590 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/DatasetMergerTest.java @@ -46,8 +46,8 @@ class DatasetMergerTest implements Serializable { } @Test - void datasetMergerTest() throws InstantiationException, IllegalAccessException, InvocationTargetException { - Dataset pub_merged = MergeUtils.mergeGroup(dedupId, datasets.stream().map(Tuple2::_2).iterator()); + void datasetMergerTest() { + Dataset pub_merged = MergeUtils.mergeGroup(datasets.stream().map(Tuple2::_2).iterator()); // verify id assertEquals(dedupId, pub_merged.getId()); diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index 2436a272c..ba6887a2e 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -155,7 +155,7 @@ public abstract class AbstractMdRecordToOafMapper { final List instances = prepareInstances(doc, entityInfo, collectedFrom, hostedBy); - final String type = getResultType(doc, instances); + final String type = getResultType(instances); return createOafs(doc, type, instances, collectedFrom, entityInfo, lastUpdateTimestamp); } catch (final DocumentException e) { @@ -164,10 +164,9 @@ public abstract class AbstractMdRecordToOafMapper { } } - protected String getResultType(final Document doc, final List instances) { - final String type = doc.valueOf("//dr:CobjCategory/@type"); + protected String getResultType(final List instances) { - if (StringUtils.isBlank(type) && this.vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) { + if 
(this.vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) { final String instanceType = instances .stream() .map(i -> i.getInstancetype().getClassid()) @@ -178,9 +177,9 @@ public abstract class AbstractMdRecordToOafMapper { .ofNullable(this.vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType)) .map(Qualifier::getClassid) .orElse("0000"); + } else { + throw new IllegalStateException("Missing vocabulary: " + ModelConstants.DNET_RESULT_TYPOLOGIES); } - - return type; } private KeyValue getProvenanceDatasource(final Document doc, final String xpathId, final String xpathName) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java index c3806c211..357fae470 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/GenerateEntitiesApplication.java @@ -133,7 +133,7 @@ public class GenerateEntitiesApplication extends AbstractMigrationApplication { inputRdd .keyBy(oaf -> ModelSupport.idFn().apply(oaf)) .groupByKey() - .map(t -> MergeUtils.mergeGroup(t._1, t._2.iterator())), + .map(t -> MergeUtils.mergeGroup(t._2.iterator())), // .mapToPair(oaf -> new Tuple2<>(ModelSupport.idFn().apply(oaf), oaf)) // .reduceByKey(MergeUtils::merge) // .map(Tuple2::_2), diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala index d94a23947..42299cd34 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/sx/graph/SparkCreateInputGraph.scala @@ -133,7 +133,7 @@ object SparkCreateInputGraph { val ds: Dataset[T] = spark.read.load(sourcePath).as[T] ds.groupByKey(_.getId) - .mapGroups { (id, it) => MergeUtils.mergeGroup(id, it.asJava).asInstanceOf[T] } + .mapGroups { (id, it) => MergeUtils.mergeGroup(it.asJava).asInstanceOf[T] } // .reduceGroups { (x: T, y: T) => MergeUtils.merge(x, y).asInstanceOf[T] } // .map(_) .write From 07f267bb10911d62e30a2b299db3c50fcd1746a2 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 13 Nov 2024 08:14:26 +0100 Subject: [PATCH 070/111] fix vocabulary lookup in mergeutils --- .../dhp/schema/oaf/utils/MergeUtils.java | 425 +++++++++--------- 1 file changed, 215 insertions(+), 210 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java index d7e08fca7..dc76860f8 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java @@ -16,6 +16,8 @@ import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; +import eu.dnetlib.dhp.schema.common.EntityType; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; @@ -23,10 +25,8 @@ import org.apache.commons.lang3.tuple.Pair; import com.github.sisyphsu.dateparser.DateParserUtils; import com.google.common.base.Joiner; -import 
eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.oa.merge.AuthorMerger; import eu.dnetlib.dhp.schema.common.AccessRightComparator; -import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; @@ -46,7 +46,7 @@ public class MergeUtils { } public static T mergeGroup(Iterator oafEntityIterator, - boolean checkDelegateAuthority, VocabularyGroup vocs) { + boolean checkDelegateAuthority, VocabularyGroup vocs) { ArrayList sortedEntities = new ArrayList<>(); oafEntityIterator.forEachRemaining(sortedEntities::add); @@ -74,11 +74,16 @@ public class MergeUtils { if (!vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) { return (T) mergedResult; } else { - final Qualifier expectedResultType = vocs - .getSynonymAsQualifier( + final Qualifier expectedResultType = vocs.lookupTermBySynonym( ModelConstants.DNET_RESULT_TYPOLOGIES, i.getInstancetype().getClassid()); + if (Objects.isNull(expectedResultType)) { + throw new IllegalArgumentException( + "instance type not bound to any result type in dnet:result_typologies: " + + i.getInstancetype().getClassid()); + } + // there is a clash among the result types if (!expectedResultType.getClassid().equals(mergedResult.getResulttype().getClassid())) { try { @@ -117,10 +122,10 @@ public class MergeUtils { return mergeRelation((Relation) left, (Relation) right); } else { throw new RuntimeException( - String - .format( - "MERGE_FROM_AND_GET incompatible types: %s, %s", - left.getClass().getCanonicalName(), right.getClass().getCanonicalName())); + String + .format( + "MERGE_FROM_AND_GET incompatible types: %s, %s", + left.getClass().getCanonicalName(), right.getClass().getCanonicalName())); } } @@ -159,10 +164,10 @@ public class MergeUtils { return mergeProject((Project) left, (Project) right); } else { throw new RuntimeException( - String - .format( - "MERGE_FROM_AND_GET incompatible types: %s, %s", - left.getClass().getCanonicalName(), right.getClass().getCanonicalName())); + String + .format( + "MERGE_FROM_AND_GET incompatible types: %s, %s", + left.getClass().getCanonicalName(), right.getClass().getCanonicalName())); } } @@ -253,7 +258,7 @@ public class MergeUtils { } private static List mergeLists(final List left, final List right, int trust, - Function keyExtractor, BinaryOperator merger) { + Function keyExtractor, BinaryOperator merger) { if (left == null || left.isEmpty()) { return right != null ? right : new ArrayList<>(); } else if (right == null || right.isEmpty()) { @@ -264,11 +269,11 @@ public class MergeUtils { List l = trust >= 0 ? right : left; return new ArrayList<>(Stream - .concat(h.stream(), l.stream()) - .filter(Objects::nonNull) - .distinct() - .collect(Collectors.toMap(keyExtractor, v -> v, merger, LinkedHashMap::new)) - .values()); + .concat(h.stream(), l.stream()) + .filter(Objects::nonNull) + .distinct() + .collect(Collectors.toMap(keyExtractor, v -> v, merger, LinkedHashMap::new)) + .values()); } private static List unionDistinctLists(final List left, final List right, int trust) { @@ -282,10 +287,10 @@ public class MergeUtils { List l = trust >= 0 ? 
right : left; return Stream - .concat(h.stream(), l.stream()) - .filter(Objects::nonNull) - .distinct() - .collect(Collectors.toList()); + .concat(h.stream(), l.stream()) + .filter(Objects::nonNull) + .distinct() + .collect(Collectors.toList()); } private static List unionDistinctListOfString(final List l, final List r) { @@ -296,10 +301,10 @@ public class MergeUtils { } return Stream - .concat(l.stream(), r.stream()) - .filter(StringUtils::isNotBlank) - .distinct() - .collect(Collectors.toList()); + .concat(l.stream(), r.stream()) + .filter(StringUtils::isNotBlank) + .distinct() + .collect(Collectors.toList()); } // TODO review @@ -325,7 +330,7 @@ public class MergeUtils { } private static List unionTitle(List left, List right, - int trust) { + int trust) { if (left == null) { return right; } else if (right == null) { @@ -336,10 +341,10 @@ public class MergeUtils { List l = trust >= 0 ? right : left; return Stream - .concat(h.stream(), l.stream()) - .filter(Objects::isNull) - .distinct() - .collect(Collectors.toList()); + .concat(h.stream(), l.stream()) + .filter(Objects::isNull) + .distinct() + .collect(Collectors.toList()); } /** @@ -374,8 +379,8 @@ public class MergeUtils { merged.setPid(mergeLists(merged.getPid(), enrich.getPid(), trust, MergeUtils::spKeyExtractor, (p1, p2) -> p1)); merged.setDateofcollection(LocalDateTime.now().toString()); merged - .setDateoftransformation( - chooseString(merged.getDateoftransformation(), enrich.getDateoftransformation(), trust)); + .setDateoftransformation( + chooseString(merged.getDateoftransformation(), enrich.getDateoftransformation(), trust)); merged.setExtraInfo(unionDistinctLists(merged.getExtraInfo(), enrich.getExtraInfo(), trust)); // When merging records OAI provenance becomes null merged.setOaiprovenance(null); @@ -392,7 +397,7 @@ public class MergeUtils { checkArgument(Objects.equals(merge.getTarget(), enrich.getTarget()), "target ids must be equal"); checkArgument(Objects.equals(merge.getRelType(), enrich.getRelType()), "relType(s) must be equal"); checkArgument( - Objects.equals(merge.getSubRelType(), enrich.getSubRelType()), "subRelType(s) must be equal"); + Objects.equals(merge.getSubRelType(), enrich.getSubRelType()), "subRelType(s) must be equal"); checkArgument(Objects.equals(merge.getRelClass(), enrich.getRelClass()), "relClass(es) must be equal"); // merge.setProvenance(mergeLists(merge.getProvenance(), enrich.getProvenance())); @@ -403,10 +408,10 @@ public class MergeUtils { merge.setValidationDate(ModelSupport.oldest(merge.getValidationDate(), enrich.getValidationDate())); } catch (ParseException e) { throw new IllegalArgumentException(String - .format( - "invalid validation date format in relation [s:%s, t:%s]: %s", merge.getSource(), - merge.getTarget(), - merge.getValidationDate())); + .format( + "invalid validation date format in relation [s:%s, t:%s]: %s", merge.getSource(), + merge.getTarget(), + merge.getValidationDate())); } // TODO keyvalue merge @@ -420,7 +425,7 @@ public class MergeUtils { T merge = mergeOafEntityFields(original, enrich, trust); if (merge.getProcessingchargeamount() == null - || StringUtils.isBlank(merge.getProcessingchargeamount().getValue())) { + || StringUtils.isBlank(merge.getProcessingchargeamount().getValue())) { merge.setProcessingchargeamount(enrich.getProcessingchargeamount()); merge.setProcessingchargecurrency(enrich.getProcessingchargecurrency()); } @@ -452,8 +457,8 @@ public class MergeUtils { } merge - .setDateofacceptance( - mergeDateOfAcceptance(merge.getDateofacceptance(), 
enrich.getDateofacceptance(), trust)); + .setDateofacceptance( + mergeDateOfAcceptance(merge.getDateofacceptance(), enrich.getDateofacceptance(), trust)); merge.setPublisher(coalesce(merge.getPublisher(), enrich.getPublisher())); merge.setEmbargoenddate(coalesce(merge.getEmbargoenddate(), enrich.getEmbargoenddate())); @@ -468,7 +473,7 @@ public class MergeUtils { merge.setCoverage(unionDistinctLists(merge.getCoverage(), enrich.getCoverage(), trust)); if (enrich.getBestaccessright() != null - && new AccessRightComparator<>() + && new AccessRightComparator<>() .compare(enrich.getBestaccessright(), merge.getBestaccessright()) < 0) { merge.setBestaccessright(enrich.getBestaccessright()); } @@ -481,8 +486,8 @@ public class MergeUtils { // ok merge - .setExternalReference( - mergeExternalReference(merge.getExternalReference(), enrich.getExternalReference(), trust)); + .setExternalReference( + mergeExternalReference(merge.getExternalReference(), enrich.getExternalReference(), trust)); // instance enrichment or union // review instance equals => add pid to comparision @@ -490,17 +495,17 @@ public class MergeUtils { merge.setInstance(mergeInstances(merge.getInstance(), enrich.getInstance(), trust)); } else { final List enrichmentInstances = isAnEnrichment(merge) ? merge.getInstance() - : enrich.getInstance(); + : enrich.getInstance(); final List enrichedInstances = isAnEnrichment(merge) ? enrich.getInstance() - : merge.getInstance(); + : merge.getInstance(); if (isAnEnrichment(merge)) merge.setDataInfo(enrich.getDataInfo()); merge.setInstance(enrichInstances(enrichedInstances, enrichmentInstances)); } merge - .setEoscifguidelines( - mergeEosciifguidelines(merge.getEoscifguidelines(), enrich.getEoscifguidelines(), trust)); + .setEoscifguidelines( + mergeEosciifguidelines(merge.getEoscifguidelines(), enrich.getEoscifguidelines(), trust)); merge.setIsGreen(booleanOR(merge.getIsGreen(), enrich.getIsGreen())); // OK but should be list of values merge.setOpenAccessColor(coalesce(merge.getOpenAccessColor(), enrich.getOpenAccessColor())); @@ -526,7 +531,7 @@ public class MergeUtils { LocalDate enrich_date = LocalDate.parse(enrich.getValue(), DateTimeFormatter.ISO_DATE); if (enrich_date.getYear() > 1300 - && (merge_date.getYear() < 1300 || merge_date.isAfter(enrich_date))) { + && (merge_date.getYear() < 1300 || merge_date.isAfter(enrich_date))) { return enrich; } } catch (NullPointerException | DateTimeParseException e) { @@ -544,56 +549,56 @@ public class MergeUtils { private static List mergeInstances(List v1, List v2, int trust) { return mergeLists( - v1, v2, trust, - MergeUtils::instanceKeyExtractor, - MergeUtils::instanceMerger); + v1, v2, trust, + MergeUtils::instanceKeyExtractor, + MergeUtils::instanceMerger); } private static List mergeEosciifguidelines(List v1, List v2, - int trust) { + int trust) { return mergeLists( - v1, v2, trust, er -> Joiner - .on("||") - .useForNull("") - .join(er.getCode(), er.getLabel(), er.getUrl(), er.getSemanticRelation()), - (r, l) -> r); + v1, v2, trust, er -> Joiner + .on("||") + .useForNull("") + .join(er.getCode(), er.getLabel(), er.getUrl(), er.getSemanticRelation()), + (r, l) -> r); } private static List mergeExternalReference(List v1, - List v2, int trust) { + List v2, int trust) { return mergeLists( - v1, v2, trust, er -> Joiner - .on(',') - .useForNull("") - .join( - er.getSitename(), er.getLabel(), - er.getUrl(), toString(er.getQualifier()), er.getRefidentifier(), - er.getQuery(), toString(er.getDataInfo())), - (r, l) -> r); + v1, v2, trust, er -> Joiner + 
.on(',') + .useForNull("") + .join( + er.getSitename(), er.getLabel(), + er.getUrl(), toString(er.getQualifier()), er.getRefidentifier(), + er.getQuery(), toString(er.getDataInfo())), + (r, l) -> r); } private static String toString(DataInfo di) { return Joiner - .on(',') - .useForNull("") - .join( - di.getInvisible(), di.getInferred(), di.getDeletedbyinference(), di.getTrust(), - di.getInferenceprovenance(), toString(di.getProvenanceaction())); + .on(',') + .useForNull("") + .join( + di.getInvisible(), di.getInferred(), di.getDeletedbyinference(), di.getTrust(), + di.getInferenceprovenance(), toString(di.getProvenanceaction())); } private static String toString(Qualifier q) { return Joiner - .on(',') - .useForNull("") - .join(q.getClassid(), q.getClassname(), q.getSchemeid(), q.getSchemename()); + .on(',') + .useForNull("") + .join(q.getClassid(), q.getClassname(), q.getSchemeid(), q.getSchemename()); } private static String toString(StructuredProperty sp) { return Joiner - .on(',') - .useForNull("") - .join(toString(sp.getQualifier()), sp.getValue()); + .on(',') + .useForNull("") + .join(toString(sp.getQualifier()), sp.getValue()); } private static List mergeStructuredProperties(List v1, List v2, int trust) { @@ -632,17 +637,17 @@ public class MergeUtils { // 2. @@ // 3. || return String - .join( - "::", - kvKeyExtractor(i.getHostedby()), - kvKeyExtractor(i.getCollectedfrom()), - qualifierKeyExtractor(i.getAccessright()), - qualifierKeyExtractor(i.getInstancetype()), - Optional.ofNullable(i.getUrl()).map(u -> String.join("@@", u)).orElse(null), - Optional - .ofNullable(i.getPid()) - .map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("@@"))) - .orElse(null)); + .join( + "::", + kvKeyExtractor(i.getHostedby()), + kvKeyExtractor(i.getCollectedfrom()), + qualifierKeyExtractor(i.getAccessright()), + qualifierKeyExtractor(i.getInstancetype()), + Optional.ofNullable(i.getUrl()).map(u -> String.join("@@", u)).orElse(null), + Optional + .ofNullable(i.getPid()) + .map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("@@"))) + .orElse(null)); } private static Instance instanceMerger(Instance i1, Instance i2) { @@ -653,30 +658,30 @@ public class MergeUtils { i.setInstancetype(i1.getInstancetype()); i.setPid(mergeLists(i1.getPid(), i2.getPid(), 0, MergeUtils::spKeyExtractor, (sp1, sp2) -> sp1)); i - .setAlternateIdentifier( - mergeLists( - i1.getAlternateIdentifier(), i2.getAlternateIdentifier(), 0, MergeUtils::spKeyExtractor, - (sp1, sp2) -> sp1)); + .setAlternateIdentifier( + mergeLists( + i1.getAlternateIdentifier(), i2.getAlternateIdentifier(), 0, MergeUtils::spKeyExtractor, + (sp1, sp2) -> sp1)); i - .setRefereed( - Collections - .min( - Stream.of(i1.getRefereed(), i2.getRefereed()).collect(Collectors.toList()), - new RefereedComparator())); + .setRefereed( + Collections + .min( + Stream.of(i1.getRefereed(), i2.getRefereed()).collect(Collectors.toList()), + new RefereedComparator())); i - .setInstanceTypeMapping( - mergeLists( - i1.getInstanceTypeMapping(), i2.getInstanceTypeMapping(), 0, - MergeUtils::instanceTypeMappingKeyExtractor, (itm1, itm2) -> itm1)); + .setInstanceTypeMapping( + mergeLists( + i1.getInstanceTypeMapping(), i2.getInstanceTypeMapping(), 0, + MergeUtils::instanceTypeMappingKeyExtractor, (itm1, itm2) -> itm1)); i.setFulltext(selectFulltext(i1.getFulltext(), i2.getFulltext())); i.setDateofacceptance(selectOldestDate(i1.getDateofacceptance(), i2.getDateofacceptance())); i.setLicense(coalesce(i1.getLicense(), 
i2.getLicense())); i.setProcessingchargeamount(coalesce(i1.getProcessingchargeamount(), i2.getProcessingchargeamount())); i.setProcessingchargecurrency(coalesce(i1.getProcessingchargecurrency(), i2.getProcessingchargecurrency())); i - .setMeasures( - mergeLists(i1.getMeasures(), i2.getMeasures(), 0, MergeUtils::measureKeyExtractor, (m1, m2) -> m1)); + .setMeasures( + mergeLists(i1.getMeasures(), i2.getMeasures(), 0, MergeUtils::measureKeyExtractor, (m1, m2) -> m1)); i.setUrl(unionDistinctListOfString(i1.getUrl(), i2.getUrl())); @@ -685,14 +690,14 @@ public class MergeUtils { private static String measureKeyExtractor(Measure m) { return String - .join( - "::", - m.getId(), - m - .getUnit() - .stream() - .map(KeyValue::getKey) - .collect(Collectors.joining("::"))); + .join( + "::", + m.getId(), + m + .getUnit() + .stream() + .map(KeyValue::getKey) + .collect(Collectors.joining("::"))); } private static Field selectOldestDate(Field d1, Field d2) { @@ -703,16 +708,16 @@ public class MergeUtils { } return Stream - .of(d1, d2) - .min( - Comparator - .comparing( - f -> DateParserUtils - .parseDate(f.getValue()) - .toInstant() - .atZone(ZoneId.systemDefault()) - .toLocalDate())) - .orElse(d1); + .of(d1, d2) + .min( + Comparator + .comparing( + f -> DateParserUtils + .parseDate(f.getValue()) + .toInstant() + .atZone(ZoneId.systemDefault()) + .toLocalDate())) + .orElse(d1); } private static String selectFulltext(String ft1, String ft2) { @@ -727,12 +732,12 @@ public class MergeUtils { private static String instanceTypeMappingKeyExtractor(InstanceTypeMapping itm) { return String - .join( - "::", - itm.getOriginalType(), - itm.getTypeCode(), - itm.getTypeLabel(), - itm.getVocabularyName()); + .join( + "::", + itm.getOriginalType(), + itm.getTypeCode(), + itm.getTypeLabel(), + itm.getVocabularyName()); } private static String kvKeyExtractor(KeyValue kv) { @@ -749,13 +754,13 @@ public class MergeUtils { private static String spKeyExtractor(StructuredProperty sp) { return Optional - .ofNullable(sp) - .map( - s -> Joiner - .on("||") - .useForNull("") - .join(qualifierKeyExtractor(s.getQualifier()), s.getValue())) - .orElse(null); + .ofNullable(sp) + .map( + s -> Joiner + .on("||") + .useForNull("") + .join(qualifierKeyExtractor(s.getQualifier()), s.getValue())) + .orElse(null); } private static T mergeORP(T original, T enrich) { @@ -777,8 +782,8 @@ public class MergeUtils { merge.setLicense(unionDistinctLists(merge.getLicense(), enrich.getLicense(), trust)); merge.setCodeRepositoryUrl(chooseReference(merge.getCodeRepositoryUrl(), enrich.getCodeRepositoryUrl(), trust)); merge - .setProgrammingLanguage( - chooseReference(merge.getProgrammingLanguage(), enrich.getProgrammingLanguage(), trust)); + .setProgrammingLanguage( + chooseReference(merge.getProgrammingLanguage(), enrich.getProgrammingLanguage(), trust)); return merge; } @@ -792,11 +797,11 @@ public class MergeUtils { merge.setSize(chooseReference(merge.getSize(), enrich.getSize(), trust)); merge.setVersion(chooseReference(merge.getVersion(), enrich.getVersion(), trust)); merge - .setLastmetadataupdate( - chooseReference(merge.getLastmetadataupdate(), enrich.getLastmetadataupdate(), trust)); + .setLastmetadataupdate( + chooseReference(merge.getLastmetadataupdate(), enrich.getLastmetadataupdate(), trust)); merge - .setMetadataversionnumber( - chooseReference(merge.getMetadataversionnumber(), enrich.getMetadataversionnumber(), trust)); + .setMetadataversionnumber( + chooseReference(merge.getMetadataversionnumber(), enrich.getMetadataversionnumber(), 
trust)); merge.setGeolocation(unionDistinctLists(merge.getGeolocation(), enrich.getGeolocation(), trust)); return merge; @@ -818,26 +823,26 @@ public class MergeUtils { merged.setLegalshortname(chooseReference(merged.getLegalshortname(), enrich.getLegalshortname(), trust)); merged.setLegalname(chooseReference(merged.getLegalname(), enrich.getLegalname(), trust)); merged - .setAlternativeNames(unionDistinctLists(enrich.getAlternativeNames(), merged.getAlternativeNames(), trust)); + .setAlternativeNames(unionDistinctLists(enrich.getAlternativeNames(), merged.getAlternativeNames(), trust)); merged.setWebsiteurl(chooseReference(merged.getWebsiteurl(), enrich.getWebsiteurl(), trust)); merged.setLogourl(chooseReference(merged.getLogourl(), enrich.getLogourl(), trust)); merged.setEclegalbody(chooseReference(merged.getEclegalbody(), enrich.getEclegalbody(), trust)); merged.setEclegalperson(chooseReference(merged.getEclegalperson(), enrich.getEclegalperson(), trust)); merged.setEcnonprofit(chooseReference(merged.getEcnonprofit(), enrich.getEcnonprofit(), trust)); merged - .setEcresearchorganization( - chooseReference(merged.getEcresearchorganization(), enrich.getEcresearchorganization(), trust)); + .setEcresearchorganization( + chooseReference(merged.getEcresearchorganization(), enrich.getEcresearchorganization(), trust)); merged - .setEchighereducation(chooseReference(merged.getEchighereducation(), enrich.getEchighereducation(), trust)); + .setEchighereducation(chooseReference(merged.getEchighereducation(), enrich.getEchighereducation(), trust)); merged - .setEcinternationalorganizationeurinterests( - chooseReference( - merged.getEcinternationalorganizationeurinterests(), - enrich.getEcinternationalorganizationeurinterests(), trust)); + .setEcinternationalorganizationeurinterests( + chooseReference( + merged.getEcinternationalorganizationeurinterests(), + enrich.getEcinternationalorganizationeurinterests(), trust)); merged - .setEcinternationalorganization( - chooseReference( - merged.getEcinternationalorganization(), enrich.getEcinternationalorganization(), trust)); + .setEcinternationalorganization( + chooseReference( + merged.getEcinternationalorganization(), enrich.getEcinternationalorganization(), trust)); merged.setEcenterprise(chooseReference(merged.getEcenterprise(), enrich.getEcenterprise(), trust)); merged.setEcsmevalidated(chooseReference(merged.getEcsmevalidated(), enrich.getEcsmevalidated(), trust)); merged.setEcnutscode(chooseReference(merged.getEcnutscode(), enrich.getEcnutscode(), trust)); @@ -861,8 +866,8 @@ public class MergeUtils { merged.setDuration(chooseReference(merged.getDuration(), enrich.getDuration(), trust)); merged.setEcsc39(chooseReference(merged.getEcsc39(), enrich.getEcsc39(), trust)); merged - .setOamandatepublications( - chooseReference(merged.getOamandatepublications(), enrich.getOamandatepublications(), trust)); + .setOamandatepublications( + chooseReference(merged.getOamandatepublications(), enrich.getOamandatepublications(), trust)); merged.setEcarticle29_3(chooseReference(merged.getEcarticle29_3(), enrich.getEcarticle29_3(), trust)); merged.setSubjects(unionDistinctLists(merged.getSubjects(), enrich.getSubjects(), trust)); merged.setFundingtree(unionDistinctLists(merged.getFundingtree(), enrich.getFundingtree(), trust)); @@ -888,8 +893,8 @@ public class MergeUtils { } merged - .setH2020classification( - unionDistinctLists(merged.getH2020classification(), enrich.getH2020classification(), trust)); + .setH2020classification( + 
unionDistinctLists(merged.getH2020classification(), enrich.getH2020classification(), trust)); return merged; } @@ -916,7 +921,7 @@ public class MergeUtils { * @return list of instances possibly enriched */ private static List enrichInstances(final List toEnrichInstances, - final List enrichmentInstances) { + final List enrichmentInstances) { final List enrichmentResult = new ArrayList<>(); if (toEnrichInstances == null) { @@ -954,42 +959,42 @@ public class MergeUtils { */ private static Map toInstanceMap(final List ri) { return ri - .stream() - .filter(i -> i.getPid() != null || i.getAlternateIdentifier() != null) - .flatMap(i -> { - final List> result = new ArrayList<>(); - if (i.getPid() != null) - i - .getPid() - .stream() - .filter(MergeUtils::validPid) - .forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i))); - if (i.getAlternateIdentifier() != null) - i - .getAlternateIdentifier() - .stream() - .filter(MergeUtils::validPid) - .forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i))); - return result.stream(); - }) - .collect( - Collectors - .toMap( - Pair::getLeft, - Pair::getRight, - (a, b) -> a)); + .stream() + .filter(i -> i.getPid() != null || i.getAlternateIdentifier() != null) + .flatMap(i -> { + final List> result = new ArrayList<>(); + if (i.getPid() != null) + i + .getPid() + .stream() + .filter(MergeUtils::validPid) + .forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i))); + if (i.getAlternateIdentifier() != null) + i + .getAlternateIdentifier() + .stream() + .filter(MergeUtils::validPid) + .forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i))); + return result.stream(); + }) + .collect( + Collectors + .toMap( + Pair::getLeft, + Pair::getRight, + (a, b) -> a)); } private static boolean isFromDelegatedAuthority(Result r) { return Optional - .ofNullable(r.getInstance()) - .map( - instance -> instance - .stream() - .filter(i -> Objects.nonNull(i.getCollectedfrom())) - .map(i -> i.getCollectedfrom().getKey()) - .anyMatch(cfId -> IdentifierFactory.delegatedAuthorityDatasourceIds().contains(cfId))) - .orElse(false); + .ofNullable(r.getInstance()) + .map( + instance -> instance + .stream() + .filter(i -> Objects.nonNull(i.getCollectedfrom())) + .map(i -> i.getCollectedfrom().getKey()) + .anyMatch(cfId -> IdentifierFactory.delegatedAuthorityDatasourceIds().contains(cfId))) + .orElse(false); } /** @@ -1025,15 +1030,15 @@ public class MergeUtils { * @return the list */ private static List findEnrichmentsByPID(final List pids, - final Map enrichments) { + final Map enrichments) { if (pids == null || enrichments == null) return null; return pids - .stream() - .map(MergeUtils::extractKeyFromPid) - .map(enrichments::get) - .filter(Objects::nonNull) - .collect(Collectors.toList()); + .stream() + .map(MergeUtils::extractKeyFromPid) + .map(enrichments::get) + .filter(Objects::nonNull) + .collect(Collectors.toList()); } /** @@ -1044,8 +1049,8 @@ public class MergeUtils { */ private static boolean isAnEnrichment(OafEntity e) { return e.getDataInfo() != null && - e.getDataInfo().getProvenanceaction() != null - && ModelConstants.PROVENANCE_ENRICH.equalsIgnoreCase(e.getDataInfo().getProvenanceaction().getClassid()); + e.getDataInfo().getProvenanceaction() != null + && ModelConstants.PROVENANCE_ENRICH.equalsIgnoreCase(e.getDataInfo().getProvenanceaction().getClassid()); } /** @@ -1068,17 +1073,17 @@ public class MergeUtils { merge.setHostedby(firstNonNull(merge.getHostedby(), enrichment.getHostedby())); 
merge.setUrl(unionDistinctLists(merge.getUrl(), enrichment.getUrl(), 0)); merge - .setDistributionlocation( - firstNonNull(merge.getDistributionlocation(), enrichment.getDistributionlocation())); + .setDistributionlocation( + firstNonNull(merge.getDistributionlocation(), enrichment.getDistributionlocation())); merge.setCollectedfrom(firstNonNull(merge.getCollectedfrom(), enrichment.getCollectedfrom())); // pid and alternateId are used for matching merge.setDateofacceptance(firstNonNull(merge.getDateofacceptance(), enrichment.getDateofacceptance())); merge - .setProcessingchargeamount( - firstNonNull(merge.getProcessingchargeamount(), enrichment.getProcessingchargeamount())); + .setProcessingchargeamount( + firstNonNull(merge.getProcessingchargeamount(), enrichment.getProcessingchargeamount())); merge - .setProcessingchargecurrency( - firstNonNull(merge.getProcessingchargecurrency(), enrichment.getProcessingchargecurrency())); + .setProcessingchargecurrency( + firstNonNull(merge.getProcessingchargecurrency(), enrichment.getProcessingchargecurrency())); merge.setRefereed(firstNonNull(merge.getRefereed(), enrichment.getRefereed())); merge.setMeasures(unionDistinctLists(merge.getMeasures(), enrichment.getMeasures(), 0)); merge.setFulltext(firstNonNull(merge.getFulltext(), enrichment.getFulltext())); @@ -1086,14 +1091,14 @@ public class MergeUtils { private static int compareTrust(Oaf a, Oaf b) { String left = Optional - .ofNullable(a.getDataInfo()) - .map(DataInfo::getTrust) - .orElse("0.0"); + .ofNullable(a.getDataInfo()) + .map(DataInfo::getTrust) + .orElse("0.0"); String right = Optional - .ofNullable(b.getDataInfo()) - .map(DataInfo::getTrust) - .orElse("0.0"); + .ofNullable(b.getDataInfo()) + .map(DataInfo::getTrust) + .orElse("0.0"); return left.compareTo(right); } From a1d5ad5c2609c91b60b97600163072d551dcc440 Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Wed, 13 Nov 2024 09:51:13 +0100 Subject: [PATCH 071/111] code formatted --- .../dhp/schema/oaf/utils/MergeUtils.java | 423 +++++++++--------- .../personentity/ExtractPerson.java | 22 +- .../plugin/gtr2/Gtr2PublicationsIterator.java | 6 +- 3 files changed, 228 insertions(+), 223 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java index dc76860f8..cd8506583 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java @@ -16,8 +16,6 @@ import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; -import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; -import eu.dnetlib.dhp.schema.common.EntityType; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; @@ -25,8 +23,10 @@ import org.apache.commons.lang3.tuple.Pair; import com.github.sisyphsu.dateparser.DateParserUtils; import com.google.common.base.Joiner; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.oa.merge.AuthorMerger; import eu.dnetlib.dhp.schema.common.AccessRightComparator; +import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; @@ -46,7 +46,7 @@ public class MergeUtils { } public static T mergeGroup(Iterator oafEntityIterator, - boolean checkDelegateAuthority, 
VocabularyGroup vocs) { + boolean checkDelegateAuthority, VocabularyGroup vocs) { ArrayList sortedEntities = new ArrayList<>(); oafEntityIterator.forEachRemaining(sortedEntities::add); @@ -74,14 +74,15 @@ public class MergeUtils { if (!vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) { return (T) mergedResult; } else { - final Qualifier expectedResultType = vocs.lookupTermBySynonym( + final Qualifier expectedResultType = vocs + .lookupTermBySynonym( ModelConstants.DNET_RESULT_TYPOLOGIES, i.getInstancetype().getClassid()); if (Objects.isNull(expectedResultType)) { throw new IllegalArgumentException( - "instance type not bound to any result type in dnet:result_typologies: " + - i.getInstancetype().getClassid()); + "instance type not bound to any result type in dnet:result_typologies: " + + i.getInstancetype().getClassid()); } // there is a clash among the result types @@ -122,10 +123,10 @@ public class MergeUtils { return mergeRelation((Relation) left, (Relation) right); } else { throw new RuntimeException( - String - .format( - "MERGE_FROM_AND_GET incompatible types: %s, %s", - left.getClass().getCanonicalName(), right.getClass().getCanonicalName())); + String + .format( + "MERGE_FROM_AND_GET incompatible types: %s, %s", + left.getClass().getCanonicalName(), right.getClass().getCanonicalName())); } } @@ -164,10 +165,10 @@ public class MergeUtils { return mergeProject((Project) left, (Project) right); } else { throw new RuntimeException( - String - .format( - "MERGE_FROM_AND_GET incompatible types: %s, %s", - left.getClass().getCanonicalName(), right.getClass().getCanonicalName())); + String + .format( + "MERGE_FROM_AND_GET incompatible types: %s, %s", + left.getClass().getCanonicalName(), right.getClass().getCanonicalName())); } } @@ -258,7 +259,7 @@ public class MergeUtils { } private static List mergeLists(final List left, final List right, int trust, - Function keyExtractor, BinaryOperator merger) { + Function keyExtractor, BinaryOperator merger) { if (left == null || left.isEmpty()) { return right != null ? right : new ArrayList<>(); } else if (right == null || right.isEmpty()) { @@ -269,11 +270,11 @@ public class MergeUtils { List l = trust >= 0 ? right : left; return new ArrayList<>(Stream - .concat(h.stream(), l.stream()) - .filter(Objects::nonNull) - .distinct() - .collect(Collectors.toMap(keyExtractor, v -> v, merger, LinkedHashMap::new)) - .values()); + .concat(h.stream(), l.stream()) + .filter(Objects::nonNull) + .distinct() + .collect(Collectors.toMap(keyExtractor, v -> v, merger, LinkedHashMap::new)) + .values()); } private static List unionDistinctLists(final List left, final List right, int trust) { @@ -287,10 +288,10 @@ public class MergeUtils { List l = trust >= 0 ? 
right : left; return Stream - .concat(h.stream(), l.stream()) - .filter(Objects::nonNull) - .distinct() - .collect(Collectors.toList()); + .concat(h.stream(), l.stream()) + .filter(Objects::nonNull) + .distinct() + .collect(Collectors.toList()); } private static List unionDistinctListOfString(final List l, final List r) { @@ -301,10 +302,10 @@ public class MergeUtils { } return Stream - .concat(l.stream(), r.stream()) - .filter(StringUtils::isNotBlank) - .distinct() - .collect(Collectors.toList()); + .concat(l.stream(), r.stream()) + .filter(StringUtils::isNotBlank) + .distinct() + .collect(Collectors.toList()); } // TODO review @@ -330,7 +331,7 @@ public class MergeUtils { } private static List unionTitle(List left, List right, - int trust) { + int trust) { if (left == null) { return right; } else if (right == null) { @@ -341,10 +342,10 @@ public class MergeUtils { List l = trust >= 0 ? right : left; return Stream - .concat(h.stream(), l.stream()) - .filter(Objects::isNull) - .distinct() - .collect(Collectors.toList()); + .concat(h.stream(), l.stream()) + .filter(Objects::isNull) + .distinct() + .collect(Collectors.toList()); } /** @@ -379,8 +380,8 @@ public class MergeUtils { merged.setPid(mergeLists(merged.getPid(), enrich.getPid(), trust, MergeUtils::spKeyExtractor, (p1, p2) -> p1)); merged.setDateofcollection(LocalDateTime.now().toString()); merged - .setDateoftransformation( - chooseString(merged.getDateoftransformation(), enrich.getDateoftransformation(), trust)); + .setDateoftransformation( + chooseString(merged.getDateoftransformation(), enrich.getDateoftransformation(), trust)); merged.setExtraInfo(unionDistinctLists(merged.getExtraInfo(), enrich.getExtraInfo(), trust)); // When merging records OAI provenance becomes null merged.setOaiprovenance(null); @@ -397,7 +398,7 @@ public class MergeUtils { checkArgument(Objects.equals(merge.getTarget(), enrich.getTarget()), "target ids must be equal"); checkArgument(Objects.equals(merge.getRelType(), enrich.getRelType()), "relType(s) must be equal"); checkArgument( - Objects.equals(merge.getSubRelType(), enrich.getSubRelType()), "subRelType(s) must be equal"); + Objects.equals(merge.getSubRelType(), enrich.getSubRelType()), "subRelType(s) must be equal"); checkArgument(Objects.equals(merge.getRelClass(), enrich.getRelClass()), "relClass(es) must be equal"); // merge.setProvenance(mergeLists(merge.getProvenance(), enrich.getProvenance())); @@ -408,10 +409,10 @@ public class MergeUtils { merge.setValidationDate(ModelSupport.oldest(merge.getValidationDate(), enrich.getValidationDate())); } catch (ParseException e) { throw new IllegalArgumentException(String - .format( - "invalid validation date format in relation [s:%s, t:%s]: %s", merge.getSource(), - merge.getTarget(), - merge.getValidationDate())); + .format( + "invalid validation date format in relation [s:%s, t:%s]: %s", merge.getSource(), + merge.getTarget(), + merge.getValidationDate())); } // TODO keyvalue merge @@ -425,7 +426,7 @@ public class MergeUtils { T merge = mergeOafEntityFields(original, enrich, trust); if (merge.getProcessingchargeamount() == null - || StringUtils.isBlank(merge.getProcessingchargeamount().getValue())) { + || StringUtils.isBlank(merge.getProcessingchargeamount().getValue())) { merge.setProcessingchargeamount(enrich.getProcessingchargeamount()); merge.setProcessingchargecurrency(enrich.getProcessingchargecurrency()); } @@ -457,8 +458,8 @@ public class MergeUtils { } merge - .setDateofacceptance( - mergeDateOfAcceptance(merge.getDateofacceptance(), 
enrich.getDateofacceptance(), trust)); + .setDateofacceptance( + mergeDateOfAcceptance(merge.getDateofacceptance(), enrich.getDateofacceptance(), trust)); merge.setPublisher(coalesce(merge.getPublisher(), enrich.getPublisher())); merge.setEmbargoenddate(coalesce(merge.getEmbargoenddate(), enrich.getEmbargoenddate())); @@ -473,7 +474,7 @@ public class MergeUtils { merge.setCoverage(unionDistinctLists(merge.getCoverage(), enrich.getCoverage(), trust)); if (enrich.getBestaccessright() != null - && new AccessRightComparator<>() + && new AccessRightComparator<>() .compare(enrich.getBestaccessright(), merge.getBestaccessright()) < 0) { merge.setBestaccessright(enrich.getBestaccessright()); } @@ -486,8 +487,8 @@ public class MergeUtils { // ok merge - .setExternalReference( - mergeExternalReference(merge.getExternalReference(), enrich.getExternalReference(), trust)); + .setExternalReference( + mergeExternalReference(merge.getExternalReference(), enrich.getExternalReference(), trust)); // instance enrichment or union // review instance equals => add pid to comparision @@ -495,17 +496,17 @@ public class MergeUtils { merge.setInstance(mergeInstances(merge.getInstance(), enrich.getInstance(), trust)); } else { final List enrichmentInstances = isAnEnrichment(merge) ? merge.getInstance() - : enrich.getInstance(); + : enrich.getInstance(); final List enrichedInstances = isAnEnrichment(merge) ? enrich.getInstance() - : merge.getInstance(); + : merge.getInstance(); if (isAnEnrichment(merge)) merge.setDataInfo(enrich.getDataInfo()); merge.setInstance(enrichInstances(enrichedInstances, enrichmentInstances)); } merge - .setEoscifguidelines( - mergeEosciifguidelines(merge.getEoscifguidelines(), enrich.getEoscifguidelines(), trust)); + .setEoscifguidelines( + mergeEosciifguidelines(merge.getEoscifguidelines(), enrich.getEoscifguidelines(), trust)); merge.setIsGreen(booleanOR(merge.getIsGreen(), enrich.getIsGreen())); // OK but should be list of values merge.setOpenAccessColor(coalesce(merge.getOpenAccessColor(), enrich.getOpenAccessColor())); @@ -531,7 +532,7 @@ public class MergeUtils { LocalDate enrich_date = LocalDate.parse(enrich.getValue(), DateTimeFormatter.ISO_DATE); if (enrich_date.getYear() > 1300 - && (merge_date.getYear() < 1300 || merge_date.isAfter(enrich_date))) { + && (merge_date.getYear() < 1300 || merge_date.isAfter(enrich_date))) { return enrich; } } catch (NullPointerException | DateTimeParseException e) { @@ -549,56 +550,56 @@ public class MergeUtils { private static List mergeInstances(List v1, List v2, int trust) { return mergeLists( - v1, v2, trust, - MergeUtils::instanceKeyExtractor, - MergeUtils::instanceMerger); + v1, v2, trust, + MergeUtils::instanceKeyExtractor, + MergeUtils::instanceMerger); } private static List mergeEosciifguidelines(List v1, List v2, - int trust) { + int trust) { return mergeLists( - v1, v2, trust, er -> Joiner - .on("||") - .useForNull("") - .join(er.getCode(), er.getLabel(), er.getUrl(), er.getSemanticRelation()), - (r, l) -> r); + v1, v2, trust, er -> Joiner + .on("||") + .useForNull("") + .join(er.getCode(), er.getLabel(), er.getUrl(), er.getSemanticRelation()), + (r, l) -> r); } private static List mergeExternalReference(List v1, - List v2, int trust) { + List v2, int trust) { return mergeLists( - v1, v2, trust, er -> Joiner - .on(',') - .useForNull("") - .join( - er.getSitename(), er.getLabel(), - er.getUrl(), toString(er.getQualifier()), er.getRefidentifier(), - er.getQuery(), toString(er.getDataInfo())), - (r, l) -> r); + v1, v2, trust, er -> Joiner + 
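/*
   The Joiner chains being re-indented here all serve one purpose: building a
   composite string key so that list elements without a natural identity, such as
   external references and EOSC IF guidelines, can be de-duplicated field by
   field. The pattern in isolation, using Guava's Joiner with a reduced field set
   chosen for illustration:

   import com.google.common.base.Joiner;

   static String externalRefKey(String sitename, String label, String url) {
       // useForNull keeps a null field from collapsing otherwise distinct keys
       return Joiner.on(',').useForNull("").join(sitename, label, url);
   }

   // externalRefKey("zenodo", null, "https://example.org") -> "zenodo,,https://example.org"
*/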
.on(',') + .useForNull("") + .join( + er.getSitename(), er.getLabel(), + er.getUrl(), toString(er.getQualifier()), er.getRefidentifier(), + er.getQuery(), toString(er.getDataInfo())), + (r, l) -> r); } private static String toString(DataInfo di) { return Joiner - .on(',') - .useForNull("") - .join( - di.getInvisible(), di.getInferred(), di.getDeletedbyinference(), di.getTrust(), - di.getInferenceprovenance(), toString(di.getProvenanceaction())); + .on(',') + .useForNull("") + .join( + di.getInvisible(), di.getInferred(), di.getDeletedbyinference(), di.getTrust(), + di.getInferenceprovenance(), toString(di.getProvenanceaction())); } private static String toString(Qualifier q) { return Joiner - .on(',') - .useForNull("") - .join(q.getClassid(), q.getClassname(), q.getSchemeid(), q.getSchemename()); + .on(',') + .useForNull("") + .join(q.getClassid(), q.getClassname(), q.getSchemeid(), q.getSchemename()); } private static String toString(StructuredProperty sp) { return Joiner - .on(',') - .useForNull("") - .join(toString(sp.getQualifier()), sp.getValue()); + .on(',') + .useForNull("") + .join(toString(sp.getQualifier()), sp.getValue()); } private static List mergeStructuredProperties(List v1, List v2, int trust) { @@ -637,17 +638,17 @@ public class MergeUtils { // 2. @@ // 3. || return String - .join( - "::", - kvKeyExtractor(i.getHostedby()), - kvKeyExtractor(i.getCollectedfrom()), - qualifierKeyExtractor(i.getAccessright()), - qualifierKeyExtractor(i.getInstancetype()), - Optional.ofNullable(i.getUrl()).map(u -> String.join("@@", u)).orElse(null), - Optional - .ofNullable(i.getPid()) - .map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("@@"))) - .orElse(null)); + .join( + "::", + kvKeyExtractor(i.getHostedby()), + kvKeyExtractor(i.getCollectedfrom()), + qualifierKeyExtractor(i.getAccessright()), + qualifierKeyExtractor(i.getInstancetype()), + Optional.ofNullable(i.getUrl()).map(u -> String.join("@@", u)).orElse(null), + Optional + .ofNullable(i.getPid()) + .map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("@@"))) + .orElse(null)); } private static Instance instanceMerger(Instance i1, Instance i2) { @@ -658,30 +659,30 @@ public class MergeUtils { i.setInstancetype(i1.getInstancetype()); i.setPid(mergeLists(i1.getPid(), i2.getPid(), 0, MergeUtils::spKeyExtractor, (sp1, sp2) -> sp1)); i - .setAlternateIdentifier( - mergeLists( - i1.getAlternateIdentifier(), i2.getAlternateIdentifier(), 0, MergeUtils::spKeyExtractor, - (sp1, sp2) -> sp1)); + .setAlternateIdentifier( + mergeLists( + i1.getAlternateIdentifier(), i2.getAlternateIdentifier(), 0, MergeUtils::spKeyExtractor, + (sp1, sp2) -> sp1)); i - .setRefereed( - Collections - .min( - Stream.of(i1.getRefereed(), i2.getRefereed()).collect(Collectors.toList()), - new RefereedComparator())); + .setRefereed( + Collections + .min( + Stream.of(i1.getRefereed(), i2.getRefereed()).collect(Collectors.toList()), + new RefereedComparator())); i - .setInstanceTypeMapping( - mergeLists( - i1.getInstanceTypeMapping(), i2.getInstanceTypeMapping(), 0, - MergeUtils::instanceTypeMappingKeyExtractor, (itm1, itm2) -> itm1)); + .setInstanceTypeMapping( + mergeLists( + i1.getInstanceTypeMapping(), i2.getInstanceTypeMapping(), 0, + MergeUtils::instanceTypeMappingKeyExtractor, (itm1, itm2) -> itm1)); i.setFulltext(selectFulltext(i1.getFulltext(), i2.getFulltext())); i.setDateofacceptance(selectOldestDate(i1.getDateofacceptance(), i2.getDateofacceptance())); i.setLicense(coalesce(i1.getLicense(), 
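/*
   Two small helpers carry most of instanceMerger: coalesce, which keeps the
   first non-null operand, and selectOldestDate, shown in full below, which
   parses both values with DateParserUtils and keeps the earlier one. The date
   selection restated on plain Strings (the real code wraps them in
   Field<String>):

   import java.time.LocalDate;
   import java.time.ZoneId;
   import com.github.sisyphsu.dateparser.DateParserUtils;

   static String olderOf(String d1, String d2) {
       if (d1 == null) return d2;
       if (d2 == null) return d1;
       LocalDate a = DateParserUtils.parseDate(d1).toInstant()
           .atZone(ZoneId.systemDefault()).toLocalDate();
       LocalDate b = DateParserUtils.parseDate(d2).toInstant()
           .atZone(ZoneId.systemDefault()).toLocalDate();
       return a.isAfter(b) ? d2 : d1;
   }
*/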
i2.getLicense())); i.setProcessingchargeamount(coalesce(i1.getProcessingchargeamount(), i2.getProcessingchargeamount())); i.setProcessingchargecurrency(coalesce(i1.getProcessingchargecurrency(), i2.getProcessingchargecurrency())); i - .setMeasures( - mergeLists(i1.getMeasures(), i2.getMeasures(), 0, MergeUtils::measureKeyExtractor, (m1, m2) -> m1)); + .setMeasures( + mergeLists(i1.getMeasures(), i2.getMeasures(), 0, MergeUtils::measureKeyExtractor, (m1, m2) -> m1)); i.setUrl(unionDistinctListOfString(i1.getUrl(), i2.getUrl())); @@ -690,14 +691,14 @@ public class MergeUtils { private static String measureKeyExtractor(Measure m) { return String - .join( - "::", - m.getId(), - m - .getUnit() - .stream() - .map(KeyValue::getKey) - .collect(Collectors.joining("::"))); + .join( + "::", + m.getId(), + m + .getUnit() + .stream() + .map(KeyValue::getKey) + .collect(Collectors.joining("::"))); } private static Field selectOldestDate(Field d1, Field d2) { @@ -708,16 +709,16 @@ public class MergeUtils { } return Stream - .of(d1, d2) - .min( - Comparator - .comparing( - f -> DateParserUtils - .parseDate(f.getValue()) - .toInstant() - .atZone(ZoneId.systemDefault()) - .toLocalDate())) - .orElse(d1); + .of(d1, d2) + .min( + Comparator + .comparing( + f -> DateParserUtils + .parseDate(f.getValue()) + .toInstant() + .atZone(ZoneId.systemDefault()) + .toLocalDate())) + .orElse(d1); } private static String selectFulltext(String ft1, String ft2) { @@ -732,12 +733,12 @@ public class MergeUtils { private static String instanceTypeMappingKeyExtractor(InstanceTypeMapping itm) { return String - .join( - "::", - itm.getOriginalType(), - itm.getTypeCode(), - itm.getTypeLabel(), - itm.getVocabularyName()); + .join( + "::", + itm.getOriginalType(), + itm.getTypeCode(), + itm.getTypeLabel(), + itm.getVocabularyName()); } private static String kvKeyExtractor(KeyValue kv) { @@ -754,13 +755,13 @@ public class MergeUtils { private static String spKeyExtractor(StructuredProperty sp) { return Optional - .ofNullable(sp) - .map( - s -> Joiner - .on("||") - .useForNull("") - .join(qualifierKeyExtractor(s.getQualifier()), s.getValue())) - .orElse(null); + .ofNullable(sp) + .map( + s -> Joiner + .on("||") + .useForNull("") + .join(qualifierKeyExtractor(s.getQualifier()), s.getValue())) + .orElse(null); } private static T mergeORP(T original, T enrich) { @@ -782,8 +783,8 @@ public class MergeUtils { merge.setLicense(unionDistinctLists(merge.getLicense(), enrich.getLicense(), trust)); merge.setCodeRepositoryUrl(chooseReference(merge.getCodeRepositoryUrl(), enrich.getCodeRepositoryUrl(), trust)); merge - .setProgrammingLanguage( - chooseReference(merge.getProgrammingLanguage(), enrich.getProgrammingLanguage(), trust)); + .setProgrammingLanguage( + chooseReference(merge.getProgrammingLanguage(), enrich.getProgrammingLanguage(), trust)); return merge; } @@ -797,11 +798,11 @@ public class MergeUtils { merge.setSize(chooseReference(merge.getSize(), enrich.getSize(), trust)); merge.setVersion(chooseReference(merge.getVersion(), enrich.getVersion(), trust)); merge - .setLastmetadataupdate( - chooseReference(merge.getLastmetadataupdate(), enrich.getLastmetadataupdate(), trust)); + .setLastmetadataupdate( + chooseReference(merge.getLastmetadataupdate(), enrich.getLastmetadataupdate(), trust)); merge - .setMetadataversionnumber( - chooseReference(merge.getMetadataversionnumber(), enrich.getMetadataversionnumber(), trust)); + .setMetadataversionnumber( + chooseReference(merge.getMetadataversionnumber(), enrich.getMetadataversionnumber(), 
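/*
   Nearly every scalar field in these entity mergers flows through
   chooseReference(current, enrichment, trust), whose body is not part of this
   hunk. A minimal version consistent with its call sites, and with compareTrust
   further down (which compares the dataInfo trust values as strings, defaulting
   to "0.0"), might look like the following; this is an assumption, not the
   project's actual implementation:

   static <T> T chooseReference(T current, T enrichment, int trust) {
       if (trust > 0) {
           // the enrichment record ranked higher: prefer its value when present
           return enrichment != null ? enrichment : current;
       }
       return current != null ? current : enrichment;
   }
*/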
trust)); merge.setGeolocation(unionDistinctLists(merge.getGeolocation(), enrich.getGeolocation(), trust)); return merge; @@ -823,26 +824,26 @@ public class MergeUtils { merged.setLegalshortname(chooseReference(merged.getLegalshortname(), enrich.getLegalshortname(), trust)); merged.setLegalname(chooseReference(merged.getLegalname(), enrich.getLegalname(), trust)); merged - .setAlternativeNames(unionDistinctLists(enrich.getAlternativeNames(), merged.getAlternativeNames(), trust)); + .setAlternativeNames(unionDistinctLists(enrich.getAlternativeNames(), merged.getAlternativeNames(), trust)); merged.setWebsiteurl(chooseReference(merged.getWebsiteurl(), enrich.getWebsiteurl(), trust)); merged.setLogourl(chooseReference(merged.getLogourl(), enrich.getLogourl(), trust)); merged.setEclegalbody(chooseReference(merged.getEclegalbody(), enrich.getEclegalbody(), trust)); merged.setEclegalperson(chooseReference(merged.getEclegalperson(), enrich.getEclegalperson(), trust)); merged.setEcnonprofit(chooseReference(merged.getEcnonprofit(), enrich.getEcnonprofit(), trust)); merged - .setEcresearchorganization( - chooseReference(merged.getEcresearchorganization(), enrich.getEcresearchorganization(), trust)); + .setEcresearchorganization( + chooseReference(merged.getEcresearchorganization(), enrich.getEcresearchorganization(), trust)); merged - .setEchighereducation(chooseReference(merged.getEchighereducation(), enrich.getEchighereducation(), trust)); + .setEchighereducation(chooseReference(merged.getEchighereducation(), enrich.getEchighereducation(), trust)); merged - .setEcinternationalorganizationeurinterests( - chooseReference( - merged.getEcinternationalorganizationeurinterests(), - enrich.getEcinternationalorganizationeurinterests(), trust)); + .setEcinternationalorganizationeurinterests( + chooseReference( + merged.getEcinternationalorganizationeurinterests(), + enrich.getEcinternationalorganizationeurinterests(), trust)); merged - .setEcinternationalorganization( - chooseReference( - merged.getEcinternationalorganization(), enrich.getEcinternationalorganization(), trust)); + .setEcinternationalorganization( + chooseReference( + merged.getEcinternationalorganization(), enrich.getEcinternationalorganization(), trust)); merged.setEcenterprise(chooseReference(merged.getEcenterprise(), enrich.getEcenterprise(), trust)); merged.setEcsmevalidated(chooseReference(merged.getEcsmevalidated(), enrich.getEcsmevalidated(), trust)); merged.setEcnutscode(chooseReference(merged.getEcnutscode(), enrich.getEcnutscode(), trust)); @@ -866,8 +867,8 @@ public class MergeUtils { merged.setDuration(chooseReference(merged.getDuration(), enrich.getDuration(), trust)); merged.setEcsc39(chooseReference(merged.getEcsc39(), enrich.getEcsc39(), trust)); merged - .setOamandatepublications( - chooseReference(merged.getOamandatepublications(), enrich.getOamandatepublications(), trust)); + .setOamandatepublications( + chooseReference(merged.getOamandatepublications(), enrich.getOamandatepublications(), trust)); merged.setEcarticle29_3(chooseReference(merged.getEcarticle29_3(), enrich.getEcarticle29_3(), trust)); merged.setSubjects(unionDistinctLists(merged.getSubjects(), enrich.getSubjects(), trust)); merged.setFundingtree(unionDistinctLists(merged.getFundingtree(), enrich.getFundingtree(), trust)); @@ -893,8 +894,8 @@ public class MergeUtils { } merged - .setH2020classification( - unionDistinctLists(merged.getH2020classification(), enrich.getH2020classification(), trust)); + .setH2020classification( + 
unionDistinctLists(merged.getH2020classification(), enrich.getH2020classification(), trust)); return merged; } @@ -921,7 +922,7 @@ public class MergeUtils { * @return list of instances possibly enriched */ private static List enrichInstances(final List toEnrichInstances, - final List enrichmentInstances) { + final List enrichmentInstances) { final List enrichmentResult = new ArrayList<>(); if (toEnrichInstances == null) { @@ -959,42 +960,42 @@ public class MergeUtils { */ private static Map toInstanceMap(final List ri) { return ri - .stream() - .filter(i -> i.getPid() != null || i.getAlternateIdentifier() != null) - .flatMap(i -> { - final List> result = new ArrayList<>(); - if (i.getPid() != null) - i - .getPid() - .stream() - .filter(MergeUtils::validPid) - .forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i))); - if (i.getAlternateIdentifier() != null) - i - .getAlternateIdentifier() - .stream() - .filter(MergeUtils::validPid) - .forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i))); - return result.stream(); - }) - .collect( - Collectors - .toMap( - Pair::getLeft, - Pair::getRight, - (a, b) -> a)); + .stream() + .filter(i -> i.getPid() != null || i.getAlternateIdentifier() != null) + .flatMap(i -> { + final List> result = new ArrayList<>(); + if (i.getPid() != null) + i + .getPid() + .stream() + .filter(MergeUtils::validPid) + .forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i))); + if (i.getAlternateIdentifier() != null) + i + .getAlternateIdentifier() + .stream() + .filter(MergeUtils::validPid) + .forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i))); + return result.stream(); + }) + .collect( + Collectors + .toMap( + Pair::getLeft, + Pair::getRight, + (a, b) -> a)); } private static boolean isFromDelegatedAuthority(Result r) { return Optional - .ofNullable(r.getInstance()) - .map( - instance -> instance - .stream() - .filter(i -> Objects.nonNull(i.getCollectedfrom())) - .map(i -> i.getCollectedfrom().getKey()) - .anyMatch(cfId -> IdentifierFactory.delegatedAuthorityDatasourceIds().contains(cfId))) - .orElse(false); + .ofNullable(r.getInstance()) + .map( + instance -> instance + .stream() + .filter(i -> Objects.nonNull(i.getCollectedfrom())) + .map(i -> i.getCollectedfrom().getKey()) + .anyMatch(cfId -> IdentifierFactory.delegatedAuthorityDatasourceIds().contains(cfId))) + .orElse(false); } /** @@ -1030,15 +1031,15 @@ public class MergeUtils { * @return the list */ private static List findEnrichmentsByPID(final List pids, - final Map enrichments) { + final Map enrichments) { if (pids == null || enrichments == null) return null; return pids - .stream() - .map(MergeUtils::extractKeyFromPid) - .map(enrichments::get) - .filter(Objects::nonNull) - .collect(Collectors.toList()); + .stream() + .map(MergeUtils::extractKeyFromPid) + .map(enrichments::get) + .filter(Objects::nonNull) + .collect(Collectors.toList()); } /** @@ -1049,8 +1050,8 @@ public class MergeUtils { */ private static boolean isAnEnrichment(OafEntity e) { return e.getDataInfo() != null && - e.getDataInfo().getProvenanceaction() != null - && ModelConstants.PROVENANCE_ENRICH.equalsIgnoreCase(e.getDataInfo().getProvenanceaction().getClassid()); + e.getDataInfo().getProvenanceaction() != null + && ModelConstants.PROVENANCE_ENRICH.equalsIgnoreCase(e.getDataInfo().getProvenanceaction().getClassid()); } /** @@ -1073,17 +1074,17 @@ public class MergeUtils { merge.setHostedby(firstNonNull(merge.getHostedby(), enrichment.getHostedby())); 
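/*
   The enrichment pass around this point leans on the pid-keyed map built by
   toInstanceMap above: every pid and alternate identifier becomes a key, the
   first instance seen for a key wins, and findEnrichmentsByPID probes the map
   with the pids of the record being enriched. extractKeyFromPid is outside this
   hunk; the normalised "type::value" key below is an assumption made for the
   sketch:

   import java.util.*;
   import eu.dnetlib.dhp.schema.oaf.Instance;

   static String pidKey(String type, String value) {
       return (type + "::" + value).toLowerCase(Locale.ROOT); // assumed key shape
   }

   static Map<String, Instance> byPid(List<Instance> instances) {
       Map<String, Instance> m = new LinkedHashMap<>();
       for (Instance i : instances) {
           if (i.getPid() != null) {
               // alternate identifiers would be indexed the same way
               i.getPid().forEach(p ->
                   m.putIfAbsent(pidKey(p.getQualifier().getClassid(), p.getValue()), i));
           }
       }
       return m;
   }
*/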
merge.setUrl(unionDistinctLists(merge.getUrl(), enrichment.getUrl(), 0)); merge - .setDistributionlocation( - firstNonNull(merge.getDistributionlocation(), enrichment.getDistributionlocation())); + .setDistributionlocation( + firstNonNull(merge.getDistributionlocation(), enrichment.getDistributionlocation())); merge.setCollectedfrom(firstNonNull(merge.getCollectedfrom(), enrichment.getCollectedfrom())); // pid and alternateId are used for matching merge.setDateofacceptance(firstNonNull(merge.getDateofacceptance(), enrichment.getDateofacceptance())); merge - .setProcessingchargeamount( - firstNonNull(merge.getProcessingchargeamount(), enrichment.getProcessingchargeamount())); + .setProcessingchargeamount( + firstNonNull(merge.getProcessingchargeamount(), enrichment.getProcessingchargeamount())); merge - .setProcessingchargecurrency( - firstNonNull(merge.getProcessingchargecurrency(), enrichment.getProcessingchargecurrency())); + .setProcessingchargecurrency( + firstNonNull(merge.getProcessingchargecurrency(), enrichment.getProcessingchargecurrency())); merge.setRefereed(firstNonNull(merge.getRefereed(), enrichment.getRefereed())); merge.setMeasures(unionDistinctLists(merge.getMeasures(), enrichment.getMeasures(), 0)); merge.setFulltext(firstNonNull(merge.getFulltext(), enrichment.getFulltext())); @@ -1091,14 +1092,14 @@ public class MergeUtils { private static int compareTrust(Oaf a, Oaf b) { String left = Optional - .ofNullable(a.getDataInfo()) - .map(DataInfo::getTrust) - .orElse("0.0"); + .ofNullable(a.getDataInfo()) + .map(DataInfo::getTrust) + .orElse("0.0"); String right = Optional - .ofNullable(b.getDataInfo()) - .map(DataInfo::getTrust) - .orElse("0.0"); + .ofNullable(b.getDataInfo()) + .map(DataInfo::getTrust) + .orElse("0.0"); return left.compareTo(right); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java index 06924f05a..05f083740 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java @@ -346,15 +346,19 @@ public class ExtractPerson implements Serializable { .structuredProperty( op.getOrcid(), ModelConstants.ORCID, ModelConstants.ORCID_CLASSNAME, ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES, - OafMapperUtils.dataInfo(false, - null, - false, - false, - OafMapperUtils.qualifier(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY, - ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY, - ModelConstants.DNET_PID_TYPES, - ModelConstants.DNET_PID_TYPES), - "0.91"))); + OafMapperUtils + .dataInfo( + false, + null, + false, + false, + OafMapperUtils + .qualifier( + ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY, + ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY, + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES), + "0.91"))); person.setDateofcollection(op.getLastModifiedDate()); person.setOriginalId(Arrays.asList(op.getOrcid())); person.setDataInfo(ORCIDDATAINFO); diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gtr2/Gtr2PublicationsIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gtr2/Gtr2PublicationsIterator.java index 779c43712..1b1ff8db4 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gtr2/Gtr2PublicationsIterator.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/gtr2/Gtr2PublicationsIterator.java
@@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.plugin.gtr2; import java.nio.charset.StandardCharsets; import java.time.LocalDate; +import java.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator;
@@ -18,7 +19,6 @@ import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.DocumentHelper; import org.dom4j.Element; -import java.time.format.DateTimeFormatter; import org.slf4j.Logger; import org.slf4j.LoggerFactory;
@@ -188,11 +188,11 @@ public class Gtr2PublicationsIterator implements Iterator { private Document loadURL(final String cleanUrl, final int attempt) { try { - log.debug(" * Downloading Url: {}", cleanUrl); + log.debug(" * Downloading Url: {}", cleanUrl); final byte[] bytes = this.connector.getInputSource(cleanUrl).getBytes(StandardCharsets.UTF_8); return DocumentHelper.parseText(new String(bytes)); } catch (final Throwable e) { - log.error("Error dowloading url: {}, attempt = {}", cleanUrl, attempt, e); + log.error("Error dowloading url: {}, attempt = {}", cleanUrl, attempt, e); if (attempt >= MAX_ATTEMPTS) { throw new RuntimeException("Error downloading url: " + cleanUrl, e); }

From 03c262ccb981bed4d6351705026e699963c9e4fc Mon Sep 17 00:00:00 2001
From: Giambattista Bloisi
Date: Wed, 13 Nov 2024 10:56:17 +0100
Subject: [PATCH 072/111] Crossref: generate canonical openaire id for results in affiliation relationship

---
 .gitignore | 1 +
 .../eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6fafc7055..ef9144ae3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,4 @@ spark-warehouse /**/.scalafmt.conf /.java-version /dhp-shade-package/dependency-reduced-pom.xml +/**/job.properties
diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
index e7d68920b..d3a68c92e 100644
--- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
+++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
@@ -673,11 +673,12 @@ case object Crossref2Oaf { val doi = input.getString(0) val rorId = input.getString(1) - val pubId = s"50|${PidType.doi.toString.padTo(12, "_")}::${DoiCleaningRule.clean(doi)}" + + val pubId = IdentifierFactory.idFromPid("50", "doi", DoiCleaningRule.clean(doi), true) val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId) val r: Relation = new Relation - DoiCleaningRule.clean(doi) + r.setSource(pubId) r.setTarget(affId) r.setRelType(ModelConstants.RESULT_ORGANIZATION)

From fb1f0f8850b867f758fffdf9751ec9e4d2543db5 Mon Sep 17 00:00:00 2001
From: Miriam Baglioni
Date: Thu, 7 Nov 2024 14:05:02 +0100
Subject: [PATCH 073/111] [danishfunders] added the possibility to also link to a specific award when present in the metadata

---
 .../collection/crossref/Crossref2Oaf.scala | 21 ++++++++++++++++++-
 .../doiboost/crossref/Crossref2Oaf.scala | 3 +++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala
b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala index e7d68920b..59a12bc03 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala @@ -978,7 +978,26 @@ case object Crossref2Oaf { case "10.13039/501100010790" => generateSimpleRelationFromAward(funder, "erasmusplus_", a => a) case _ => logger.debug("no match for " + funder.DOI.get) - + //Add for Danish funders + //Independent Research Fund Denmark (IRFD) + case "10.13039/501100004836" => + generateSimpleRelationFromAward(funder, "irfd________", a => a) + val targetId = getProjectId("irfd________", "1e5e62235d094afd01cd56e65112fc63") + queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) + queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) + //Carlsberg Foundation (CF) + case "10.13039/501100002808" => + generateSimpleRelationFromAward(funder, "cf__________", a => a) + val targetId = getProjectId("cf__________", "1e5e62235d094afd01cd56e65112fc63") + queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) + queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) + //Novo Nordisk Foundation (NNF) + case "10.13039/501100009708" => + generateSimpleRelationFromAward(funder, "nnf___________", a => a) + val targetId = getProjectId("nnf_________", "1e5e62235d094afd01cd56e65112fc63") + queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) + queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) + case _ => logger.debug("no match for " + funder.DOI.get) } } else { diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index bf11ed0a8..031a04058 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -569,16 +569,19 @@ case object Crossref2Oaf { //Add for Danish funders //Independent Research Fund Denmark (IRFD) case "10.13039/501100004836" => + generateSimpleRelationFromAward(funder, "irfd________", a => a) val targetId = getProjectId("irfd________", "1e5e62235d094afd01cd56e65112fc63") queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) //Carlsberg Foundation (CF) case "10.13039/501100002808" => + generateSimpleRelationFromAward(funder, "cf__________", a => a) val targetId = getProjectId("cf__________", "1e5e62235d094afd01cd56e65112fc63") queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) //Novo Nordisk Foundation (NNF) case "10.13039/501100009708" => + generateSimpleRelationFromAward(funder, "nnf___________", a => a) val targetId = getProjectId("nnf_________", "1e5e62235d094afd01cd56e65112fc63") queue += generateRelation(sourceId, targetId, ModelConstants.IS_PRODUCED_BY) queue += generateRelation(targetId, sourceId, ModelConstants.PRODUCES) From ac0a94d62d7c34fc3953d47a9a2263ac5cfadb82 Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Wed, 13 Nov 2024 16:26:59 +0100 Subject: [PATCH 074/111] updated pubmed parser to add also ORCID id and affiliation 
string to authors --- .../dhp/sx/bio/pubmed/PMAffiliation.java | 39 +++ .../dnetlib/dhp/sx/bio/pubmed/PMAuthor.java | 39 +++ .../dhp/sx/bio/pubmed/PMIdentifier.java | 53 +++++ .../dnetlib/dhp/sx/bio/pubmed/PMParser2.scala | 30 ++- .../dhp/sx/bio/pubmed/PubMedToOaf.scala | 6 + .../dhp/sx/graph/bio/single_pubmed.xml | 223 +++++++----------- .../dnetlib/dhp/sx/bio/BioScholixTest.scala | 64 ++++- 7 files changed, 300 insertions(+), 154 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java new file mode 100644 index 000000000..54aba8715 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java @@ -0,0 +1,39 @@ +package eu.dnetlib.dhp.sx.bio.pubmed; + +/** + * The type Pubmed Affiliation. + * + * @author Sandro La Bruzzo + */ +public class PMAffiliation { + + private String name; + + private PMIdentifier identifier; + + public PMAffiliation() { + + } + public PMAffiliation(String name, PMIdentifier identifier) { + this.name = name; + this.identifier = identifier; + } + + public String getName() { + return name; + } + + public PMAffiliation setName(String name) { + this.name = name; + return this; + } + + public PMIdentifier getIdentifier() { + return identifier; + } + + public PMAffiliation setIdentifier(PMIdentifier identifier) { + this.identifier = identifier; + return this; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java index 68ef6459e..b0df25663 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java @@ -12,6 +12,8 @@ public class PMAuthor implements Serializable { private String lastName; private String foreName; + private PMIdentifier identifier; + private PMAffiliation affiliation; /** * Gets last name. @@ -59,4 +61,41 @@ public class PMAuthor implements Serializable { .format("%s, %s", this.foreName != null ? this.foreName : "", this.lastName != null ? this.lastName : ""); } + /** + * Gets identifier. + * + * @return the identifier + */ + public PMIdentifier getIdentifier() { + return identifier; + } + + /** + * Sets identifier. + * + * @param identifier the identifier + */ + public void setIdentifier(PMIdentifier identifier) { + this.identifier = identifier; + } + + /** + * Gets affiliation. + * + * @return the affiliation + */ + public PMAffiliation getAffiliation() { + return affiliation; + } + + /** + * Sets affiliation. 
+ * + * @param affiliation the affiliation + */ + public void setAffiliation(PMAffiliation affiliation) { + this.affiliation = affiliation; + } + + } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java new file mode 100644 index 000000000..0c8c55e40 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java @@ -0,0 +1,53 @@ +package eu.dnetlib.dhp.sx.bio.pubmed; + +public class PMIdentifier { + + private String pid; + private String type; + + + public PMIdentifier(String pid, String type) { + this.pid = cleanPid(pid); + this.type = type; + } + + public PMIdentifier() { + + } + + private String cleanPid(String pid) { + + if (pid == null) { + return null; + } + + // clean ORCID ID in the form 0000000163025705 to 0000-0001-6302-5705 + if (pid.matches("[0-9]{15}[0-9X]")) { + return pid.replaceAll("(.{4})(.{4})(.{4})(.{4})", "$1-$2-$3-$4"); + } + + // clean ORCID in the form http://orcid.org/0000-0001-8567-3543 to 0000-0001-8567-3543 + if (pid.matches("http://orcid.org/[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}")) { + return pid.replaceAll("http://orcid.org/", ""); + } + return pid; + } + + public String getPid() { + return pid; + } + + public PMIdentifier setPid(String pid) { + this.pid = cleanPid(pid); + return this; + } + + public String getType() { + return type; + } + + public PMIdentifier setType(String type) { + this.type = type; + return this; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala index c9e868185..2eb4bea65 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala @@ -81,6 +81,26 @@ class PMParser2 { val a = new PMAuthor a.setLastName((author \ "LastName").text) a.setForeName((author \ "ForeName").text) + val id = (author \ "Identifier").text + val idType =(author \ "Identifier" \ "@Source").text + + if(id != null && id.nonEmpty && idType != null && idType.nonEmpty) { + a.setIdentifier(new PMIdentifier(id, idType)) + } + + + val affiliation = (author \ "AffiliationInfo" \ "Affiliation").text + val affiliationId = (author \ "AffiliationInfo" \ "Identifier").text + val affiliationIdType = (author \ "AffiliationInfo" \ "Identifier" \ "@Source").text + + if(affiliation != null && affiliation.nonEmpty) { + val aff = new PMAffiliation() + aff.setName(affiliation) + if(affiliationId != null && affiliationId.nonEmpty && affiliationIdType != null && affiliationIdType.nonEmpty) { + aff.setIdentifier(new PMIdentifier(affiliationId, affiliationIdType)) + } + a.setAffiliation(aff) + } a }) .toList @@ -99,15 +119,7 @@ class PMParser2 { val authors = xml \ "MedlineCitation" \ "Article" \ "AuthorList" \ "Author" article.setAuthors( - authors - .map(author => { - val a = new PMAuthor - a.setLastName((author \ "LastName").text) - a.setForeName((author \ "ForeName").text) - a - }) - .toList - .asJava + extractAuthors(authors).asJava ) val pmId = xml \ "MedlineCitation" \ "PMID" diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala index d59d73bd0..5e14c731a 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala @@ -294,6 +294,12 @@ object PubMedToOaf { author.setName(a.getForeName) author.setSurname(a.getLastName) author.setFullname(a.getFullName) + if(a.getIdentifier != null) { + author.setPid(List(OafMapperUtils.structuredProperty(a.getIdentifier.getPid, + OafMapperUtils.qualifier(a.getIdentifier.getType,a.getIdentifier.getType,ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES), dataInfo)).asJava) + } + if (a.getAffiliation!= null) + author.setRawAffiliationString(List(a.getAffiliation.getName).asJava) author.setRank(index + 1) author }(collection.breakOut) diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml index 4b4d860d7..c2e503f57 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/sx/graph/bio/single_pubmed.xml @@ -1,77 +1,56 @@ - - 37885214 + + 37318999 2024 02 - 14 + 09 2024 02 - 14 + 09
- 2752-7549 + 1522-1229 - 40 - 5 + 47 + 3 - 2023 Sep-Oct + 2023 + Sep + 01 - Journal of pediatric hematology/oncology nursing - J Pediatr Hematol Oncol Nurs + Advances in physiology education + Adv Physiol Educ - Care Needs of Parents of Children With Cancer in a Low-Middle-Income Country. + Providing the choice of in-person or videoconference attendance in a clinical physiology course may harm learning outcomes for the entire cohort. - 295-304 + 548-556 - 10.1177/27527530231193972 + 10.1152/advan.00160.2022 - Background: Mapping out actual supportive care needs assists nurses in providing holistic individualized care. This study aimed to explore the care needs of parents of children with cancer in the Philippines. Method: Guided by the Supportive Care Needs Framework (SCNF), this study used an embedded mixed-method design with the quantitative revised Cancer Patient Needs Questionnaire and qualitative semistructured interviews to describe parents' care needs and priorities. Results: Filipino parents (N = 156) of children with cancer have various care needs which could be classified along the SCNF categories-practical, informational, spiritual, physical, emotional, and physical needs as ranked from highest to lowest. A number of variables were significantly associated with care needs. Solid tumor diagnosis was associated with greater practical, emotional, and psychosocial care needs; having a child who had undergone surgery was associated with more practical and spiritual care needs; and being within one year of the child's diagnosis was associated with practical, psychosocial, and spiritual care needs. Parent priority needs included (a) addressing financial needs; (b) access to temporary housing to minimize treatment-related costs; (c) support groups among parents of children with cancer as a source of information; (d) financial and social support between members of family and partners of parents of children with cancer; and (e) using prayer to facilitate acceptance. Conclusions: Supportive care needs of parents of children with cancer are important components of care that should be given recognition to enhance holistic individualized care throughout the childhood cancer experience. + Clinical Physiology 1 and 2 are flipped classes in which students watch prerecorded videos before class. During the 3-h class, students take practice assessments, work in groups on critical thinking exercises, work through case studies, and engage in drawing exercises. Due to the COVID pandemic, these courses were transitioned from in-person classes to online classes. Despite the university's return-to-class policy, some students were reluctant to return to in-person classes; therefore during the 2021-2022 academic year, Clinical Physiology 1 and 2 were offered as flipped, hybrid courses. In a hybrid format, students either attended the synchronous class in person or online. Here we evaluate the learning outcomes and the perceptions of the learning experience for students who attended Clinical Physiology 1 and 2 either online (2020-2021) or in a hybrid format (2021-2022). In addition to exam scores, in-class surveys and end of course evaluations were compiled to describe the student experience in the flipped hybrid setting. 
Retrospective linear mixed-model regression analysis of exam scores revealed that a hybrid modality (2021-2022) was associated with lower exam scores when controlling for sex, graduate/undergraduate status, delivery method, and the order in which the courses were taken (F test: F = 8.65, df1 = 2, df2 = 179.28, P = 0.0003). In addition, being a Black Indigenous Person of Color (BIPOC) student is associated with a lower exam score, controlling for the same previous factors (F test: F = 4.23, df1 = 1, df2 = 130.28, P = 0.04), albeit with lower confidence; the BIPOC representation in this sample is small (BIPOC: n = 144; total: n = 504). There is no significant interaction between the hybrid modality and race, meaning that BIPOC and White students are both negatively affected in a hybrid flipped course. Instructors should consider carefully about offering hybrid courses and build in extra student support.NEW & NOTEWORTHY The transition from online to in-person teaching has been as challenging as the original transition to remote teaching with the onset of the pandemic. Since not all students were ready to return to the classroom, students could choose to take this course in person or online. This arrangement provided flexibility and opportunities for innovative class activities for students but introduced tradeoffs in lower test scores from the hybrid modality than fully online or fully in-person modalities. - Banayat - Aprille Campos - AC - 0000-0001-9339-9871 + Anderson + Lisa Carney + LC + 0000-0003-2261-1921 - College of Nursing, University of the Philippines Manila, Manila, Philippines. + Department of Integrative Biology and Physiology, University of Minnesota, Minneapolis, Minnesota, United States. + https://ror.org/017zqws13 - Abad - Peter James B - PJB + Jacobson + Tate + T - College of Nursing, University of the Philippines Manila, Manila, Philippines. - - - - Bonito - Sheila R - SR - - College of Nursing, University of the Philippines Manila, Manila, Philippines. - - - - Manahan - Lydia T - LT - - College of Nursing, University of the Philippines Manila, Manila, Philippines. - - - - Peralta - Arnold B - AB - - College of Nursing, University of the Philippines Manila, Manila, Philippines. + Department of Statistics, University of Minnesota, Minneapolis, Minnesota, United States. @@ -81,142 +60,98 @@ 2023 - 10 - 26 + 06 + 15
United States - J Pediatr Hematol Oncol Nurs - 9918282681506676 - 2752-7530 + Adv Physiol Educ + 100913944 + 1043-4046 IM - Child + Physiology + education + + + Retrospective Studies + + + Learning + + + Pandemics + + + COVID-19 + + + Regression Analysis + + + Students Humans - Parents - psychology + Male - Social Support + Female - Spirituality + White People - Religion + Black People - Neoplasms - therapy + Education, Distance + + + Curriculum - cancer - mixed methods - parent - pediatric - research - supportive care + flipped teaching + hybrid teaching + inequity + learning outcomes + responsive teaching - Declaration of Conflicting InterestsThe author(s) declared no potential conflicts of interest with respect to the research, authorship, and/or publication of this article.
- 2024 - 2 - 12 - 18 - 42 + 2023 + 7 + 21 + 6 + 44 2023 - 10 - 27 - 6 - 42 + 6 + 15 + 19 + 14 2023 - 10 - 27 - 3 - 43 + 6 + 15 + 12 + 53 ppublish - 37885214 - 10.1177/27527530231193972 + 37318999 + 10.1152/advan.00160.2022
- -30522158 -32769323 -34061701 -34661197 -34837091 -35035475 -35211699 -35557982 -35782783 -35795240 -35832688 -35847411 -36081602 -36081858 -36468085 -36468934 -36580086 -36589526 -36619609 -36649460 -36654909 -36655054 -36700856 -36705625 -36713939 -36714172 -36741203 -36741905 -36743825 -36788221 -36844926 -36846546 -36935776 -36946757 -36972191 -37034422 -37124311 -37152108 -37171968 -37273889 -37333905 -37387733 -37431449 -37576947 -37601162 -37711214 -37901290 -37981909 -37981945 -37982005 -38037601 -38037602 -38150730 -38274640 -38332671 -38334184 -38335456 -38349506 -38349576 -38353676 - \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala index c942ca132..4a926df01 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala @@ -19,9 +19,11 @@ import org.mockito.junit.jupiter.MockitoExtension import org.slf4j.LoggerFactory import java.io.{BufferedReader, InputStream, InputStreamReader} +import java.util.regex.Pattern import java.util.zip.GZIPInputStream import javax.xml.stream.XMLInputFactory import scala.collection.JavaConverters._ +import scala.collection.mutable import scala.collection.mutable.ListBuffer import scala.io.Source @@ -51,6 +53,64 @@ class BioScholixTest extends AbstractVocabularyTest { } } + @Test + def testPid(): Unit = { + val pids = List( + "0000000163025705", + "000000018494732X", + "0000000308873343", + "0000000335964515", + "0000000333457333", + "0000000335964515", + "0000000302921949", + + "http://orcid.org/0000-0001-8567-3543", + "http://orcid.org/0000-0001-7868-8528", + "0000-0001-9189-1440", + "0000-0003-3727-9247", + "0000-0001-7246-1058", + "000000033962389X", + "0000000330371470", + "0000000171236123", + "0000000272569752", + "0000000293231371", + "http://orcid.org/0000-0003-3345-7333", + "0000000340145688", + "http://orcid.org/0000-0003-4894-1689" + ) + + pids.foreach(pid => { + val pidCleaned = new PMIdentifier(pid, "ORCID").getPid + // assert pid is in the format of ORCID + println(pidCleaned) + assertTrue(pidCleaned.matches("[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{3}[0-9X]")) + }) + } + + def extractAffiliation(s: String): List[String] = { + val regex: String = "(.*)<\\/Affiliation>" + val pattern = Pattern.compile(regex, Pattern.MULTILINE) + val matcher = pattern.matcher(s) + val l: mutable.ListBuffer[String] = mutable.ListBuffer() + while (matcher.find()) { + l += matcher.group(1) + } + l.toList + } + + case class AuthorPID(pidType: String, pid: String) {} + + def extractAuthorIdentifier(s: String): List[AuthorPID] = { + val regex: String = "(.*)<\\/Identifier>" + val pattern = Pattern.compile(regex, Pattern.MULTILINE) + val matcher = pattern.matcher(s) + val l: mutable.ListBuffer[AuthorPID] = mutable.ListBuffer() + while (matcher.find()) { + l += AuthorPID(pidType = matcher.group(1), pid = matcher.group(2)) + } + l.toList + } + @Test def testParsingPubmed2(): Unit = { val mapper = new ObjectMapper() @@ -58,7 +118,9 @@ class BioScholixTest extends AbstractVocabularyTest { val parser = new PMParser2() val article = parser.parse(xml) - println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(article)) +// println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(article)) + + 
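/*
   The testPid cases above pin down what PMIdentifier.cleanPid does: a bare
   16-character ORCID (the last character may be X) gains its dashes, an
   http://orcid.org/ prefix is stripped, and values already in canonical form
   pass through untouched. The core transformation, restated compactly in Java
   to match PMIdentifier:

   static String cleanOrcid(String pid) {
       if (pid == null) return null;
       if (pid.matches("[0-9]{15}[0-9X]")) {
           return pid.replaceAll("(.{4})(.{4})(.{4})(.{4})", "$1-$2-$3-$4");
       }
       return pid.replaceAll("^http://orcid\\.org/", "");
   }

   // cleanOrcid("0000000163025705")                     -> "0000-0001-6302-5705"
   // cleanOrcid("http://orcid.org/0000-0001-8567-3543") -> "0000-0001-8567-3543"
*/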
println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(PubMedToOaf.convert(article, vocabularies))) } From 4a3b173ca2d917c52de1671c352d1296ac211736 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 13 Nov 2024 16:27:00 +0100 Subject: [PATCH 075/111] defaults to 0000 - Unknown in case the instance type lookup in the dnet:result_typologies doesn't find a corresponding result type binding --- .../dhp/schema/oaf/utils/MergeUtils.java | 12 +--- .../raw/AbstractMdRecordToOafMapper.java | 55 ++++++++++--------- .../dhp/oa/graph/raw/OafToOafMapper.java | 4 +- .../dhp/oa/graph/raw/OdfToOafMapper.java | 4 +- 4 files changed, 36 insertions(+), 39 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java index dc76860f8..c9b235fd6 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java @@ -74,15 +74,9 @@ public class MergeUtils { if (!vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) { return (T) mergedResult; } else { - final Qualifier expectedResultType = vocs.lookupTermBySynonym( - ModelConstants.DNET_RESULT_TYPOLOGIES, - i.getInstancetype().getClassid()); - - if (Objects.isNull(expectedResultType)) { - throw new IllegalArgumentException( - "instance type not bound to any result type in dnet:result_typologies: " + - i.getInstancetype().getClassid()); - } + final Qualifier expectedResultType = Optional + .ofNullable(vocs.lookupTermBySynonym(ModelConstants.DNET_RESULT_TYPOLOGIES, i.getInstancetype().getClassid())) + .orElse(OafMapperUtils.unknown(ModelConstants.DNET_RESULT_TYPOLOGIES, ModelConstants.DNET_RESULT_TYPOLOGIES)); // there is a clash among the result types if (!expectedResultType.getClassid().equals(mergedResult.getResulttype().getClassid())) { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index ba6887a2e..be84778f5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -153,30 +153,33 @@ public abstract class AbstractMdRecordToOafMapper { final DataInfo entityInfo = prepareDataInfo(doc, this.invisible); final long lastUpdateTimestamp = new Date().getTime(); - final List instances = prepareInstances(doc, entityInfo, collectedFrom, hostedBy); + final Instance instance = prepareInstances(doc, entityInfo, collectedFrom, hostedBy); - final String type = getResultType(instances); + if (!Optional.ofNullable(instance.getInstancetype()) + .map(Qualifier::getClassid) + .filter(StringUtils::isNotBlank) + .isPresent()) { + return Lists.newArrayList(); + } - return createOafs(doc, type, instances, collectedFrom, entityInfo, lastUpdateTimestamp); + final String type = getResultType(instance); + + return createOafs(doc, type, instance, collectedFrom, entityInfo, lastUpdateTimestamp); } catch (final DocumentException e) { log.error("Error with record:\n" + xml); return Lists.newArrayList(); } } - protected String getResultType(final List instances) { - + protected String getResultType(final Instance instance) { if (this.vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) { - final String 
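/*
   Both the removed and the added getResultType funnel the instance type through
   the dnet:result_typologies vocabulary and fall back to the "0000" (Unknown)
   code; the new version, shown just below, additionally tolerates a missing or
   blank instancetype. The lookup-with-default shape in isolation:

   import java.util.Optional;

   static String resultType(Qualifier instanceType, VocabularyGroup vocs) {
       return Optional.ofNullable(instanceType)
           .map(Qualifier::getClassid)
           .map(classid -> Optional
               .ofNullable(vocs.getSynonymAsQualifier(
                   ModelConstants.DNET_RESULT_TYPOLOGIES, classid))
               .map(Qualifier::getClassid)
               .orElse("0000"))
           .orElse("0000");
   }
*/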
instanceType = instances - .stream() - .map(i -> i.getInstancetype().getClassid()) - .findFirst() - .filter(s -> !UNKNOWN.equalsIgnoreCase(s)) - .orElse("0000"); // Unknown - return Optional - .ofNullable(this.vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType)) - .map(Qualifier::getClassid) - .orElse("0000"); + return Optional.ofNullable(instance.getInstancetype()) + .map(Qualifier::getClassid) + .map(instanceType -> Optional + .ofNullable(this.vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType)) + .map(Qualifier::getClassid) + .orElse("0000")) + .orElse("0000"); } else { throw new IllegalStateException("Missing vocabulary: " + ModelConstants.DNET_RESULT_TYPOLOGIES); } @@ -196,12 +199,12 @@ public abstract class AbstractMdRecordToOafMapper { protected List createOafs( final Document doc, final String type, - final List instances, + final Instance instance, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) { - final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp); + final OafEntity entity = createEntity(doc, type, instance, collectedFrom, info, lastUpdateTimestamp); final Set originalId = Sets.newHashSet(entity.getOriginalId()); originalId.add(entity.getId()); @@ -234,19 +237,19 @@ public abstract class AbstractMdRecordToOafMapper { private OafEntity createEntity(final Document doc, final String type, - final List instances, + final Instance instance, final KeyValue collectedFrom, final DataInfo info, final long lastUpdateTimestamp) { switch (type.toLowerCase()) { case "publication": final Publication p = new Publication(); - populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp); + populateResultFields(p, doc, instance, collectedFrom, info, lastUpdateTimestamp); p.setJournal(prepareJournal(doc, info)); return p; case "dataset": final Dataset d = new Dataset(); - populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp); + populateResultFields(d, doc, instance, collectedFrom, info, lastUpdateTimestamp); d.setStoragedate(prepareDatasetStorageDate(doc, info)); d.setDevice(prepareDatasetDevice(doc, info)); d.setSize(prepareDatasetSize(doc, info)); @@ -257,7 +260,7 @@ public abstract class AbstractMdRecordToOafMapper { return d; case "software": final Software s = new Software(); - populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp); + populateResultFields(s, doc, instance, collectedFrom, info, lastUpdateTimestamp); s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); s.setLicense(prepareSoftwareLicenses(doc, info)); s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); @@ -267,7 +270,7 @@ public abstract class AbstractMdRecordToOafMapper { case "otherresearchproducts": default: final OtherResearchProduct o = new OtherResearchProduct(); - populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp); + populateResultFields(o, doc, instance, collectedFrom, info, lastUpdateTimestamp); o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); o.setTool(prepareOtherResearchProductTools(doc, info)); @@ -414,7 +417,7 @@ public abstract class AbstractMdRecordToOafMapper { private void populateResultFields( final Result r, final Document doc, - final List instances, + final Instance instance, final KeyValue collectedFrom, final DataInfo info, final 
long lastUpdateTimestamp) { @@ -448,8 +451,8 @@ public abstract class AbstractMdRecordToOafMapper { r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES r.setProcessingchargeamount(field(doc.valueOf("//oaf:processingchargeamount"), info)); r.setProcessingchargecurrency(field(doc.valueOf("//oaf:processingchargeamount/@currency"), info)); - r.setInstance(instances); - r.setBestaccessright(OafMapperUtils.createBestAccessRights(instances)); + r.setInstance(Arrays.asList(instance)); + r.setBestaccessright(OafMapperUtils.createBestAccessRights(Arrays.asList(instance))); r.setEoscifguidelines(prepareEOSCIfGuidelines(doc, info)); } @@ -508,7 +511,7 @@ public abstract class AbstractMdRecordToOafMapper { protected abstract Qualifier prepareResourceType(Document doc, DataInfo info); - protected abstract List prepareInstances( + protected abstract Instance prepareInstances( Document doc, DataInfo info, KeyValue collectedfrom, diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java index 98da48f9e..33351e91f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OafToOafMapper.java @@ -135,7 +135,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { } @Override - protected List prepareInstances( + protected Instance prepareInstances( final Document doc, final DataInfo info, final KeyValue collectedfrom, @@ -197,7 +197,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper { instance.getUrl().addAll(validUrl); } - return Lists.newArrayList(instance); + return instance; } /** diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index ad61304a0..a811aad46 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -126,7 +126,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { } @Override - protected List prepareInstances( + protected Instance prepareInstances( final Document doc, final DataInfo info, final KeyValue collectedfrom, @@ -210,7 +210,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { instance.setUrl(new ArrayList<>()); instance.getUrl().addAll(validUrl); } - return Arrays.asList(instance); + return instance; } protected String trimAndDecodeUrl(String url) { From b95672b4204667f1b011a7b6ed281b7fcbb3525c Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 Nov 2024 09:16:18 +0100 Subject: [PATCH 076/111] mergeUtils set the result identifier when enforcing the result type --- .../dhp/schema/oaf/utils/MergeUtils.java | 451 +++++++++--------- 1 file changed, 229 insertions(+), 222 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java index c9b235fd6..c092f6035 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java @@ -16,8 +16,6 @@ import java.util.function.Function; import java.util.stream.Collectors; import 
java.util.stream.Stream; -import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; -import eu.dnetlib.dhp.schema.common.EntityType; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; @@ -25,8 +23,10 @@ import org.apache.commons.lang3.tuple.Pair; import com.github.sisyphsu.dateparser.DateParserUtils; import com.google.common.base.Joiner; +import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.oa.merge.AuthorMerger; import eu.dnetlib.dhp.schema.common.AccessRightComparator; +import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; @@ -46,7 +46,7 @@ public class MergeUtils { } public static T mergeGroup(Iterator oafEntityIterator, - boolean checkDelegateAuthority, VocabularyGroup vocs) { + boolean checkDelegateAuthority, VocabularyGroup vocs) { ArrayList sortedEntities = new ArrayList<>(); oafEntityIterator.forEachRemaining(sortedEntities::add); @@ -74,22 +74,29 @@ public class MergeUtils { if (!vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) { return (T) mergedResult; } else { - final Qualifier expectedResultType = Optional - .ofNullable(vocs.lookupTermBySynonym(ModelConstants.DNET_RESULT_TYPOLOGIES, i.getInstancetype().getClassid())) - .orElse(OafMapperUtils.unknown(ModelConstants.DNET_RESULT_TYPOLOGIES, ModelConstants.DNET_RESULT_TYPOLOGIES)); + final String expectedResultType = Optional + .ofNullable( + vocs + .lookupTermBySynonym( + ModelConstants.DNET_RESULT_TYPOLOGIES, i.getInstancetype().getClassid())) + .orElse(ModelConstants.ORP_DEFAULT_RESULTTYPE) + .getClassid(); // there is a clash among the result types - if (!expectedResultType.getClassid().equals(mergedResult.getResulttype().getClassid())) { - try { - String resulttype = expectedResultType.getClassid(); - if (EntityType.otherresearchproduct.toString().equals(resulttype)) { - resulttype = "other"; - } - Result result = (Result) ModelSupport.oafTypes.get(resulttype).newInstance(); - return (T) mergeResultFields(result, mergedResult); - } catch (InstantiationException | IllegalAccessException e) { - throw new IllegalStateException(e); - } + if (!expectedResultType.equals(mergedResult.getResulttype().getClassid())) { + + Result result = (Result) Optional + .ofNullable(ModelSupport.oafTypes.get(expectedResultType)) + .map(r -> { + try { + return r.newInstance(); + } catch (InstantiationException | IllegalAccessException e) { + throw new IllegalStateException(e); + } + }) + .orElse(new OtherResearchProduct()); + result.setId(mergedResult.getId()); + return (T) mergeResultFields(result, mergedResult); } else { return (T) mergedResult; } @@ -116,10 +123,10 @@ public class MergeUtils { return mergeRelation((Relation) left, (Relation) right); } else { throw new RuntimeException( - String - .format( - "MERGE_FROM_AND_GET incompatible types: %s, %s", - left.getClass().getCanonicalName(), right.getClass().getCanonicalName())); + String + .format( + "MERGE_FROM_AND_GET incompatible types: %s, %s", + left.getClass().getCanonicalName(), right.getClass().getCanonicalName())); } } @@ -158,10 +165,10 @@ public class MergeUtils { return mergeProject((Project) left, (Project) right); } else { throw new RuntimeException( - String - .format( - "MERGE_FROM_AND_GET incompatible types: %s, %s", - left.getClass().getCanonicalName(), right.getClass().getCanonicalName())); + String + .format( + 
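/*
   The mergeGroup change above is the substance of this commit: when the
   vocabulary maps the instance type to a result type different from the merged
   record's, a fresh Result subclass is instantiated reflectively from
   ModelSupport.oafTypes, falling back to OtherResearchProduct; the actual fix is
   that the merged record's identifier is now copied onto the replacement before
   the fields are merged. Reduced to a skeleton using the names from the hunk:

   static Result enforceResultType(String expectedType, Result merged) {
       Result result = Optional.ofNullable(ModelSupport.oafTypes.get(expectedType))
           .map(clazz -> {
               try {
                   return (Result) clazz.newInstance();
               } catch (InstantiationException | IllegalAccessException e) {
                   throw new IllegalStateException(e);
               }
           })
           .orElse(new OtherResearchProduct());
       result.setId(merged.getId()); // previously the enforced record lost its id
       return result; // the caller then merges the remaining fields into it
   }
*/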
"MERGE_FROM_AND_GET incompatible types: %s, %s", + left.getClass().getCanonicalName(), right.getClass().getCanonicalName())); } } @@ -252,7 +259,7 @@ public class MergeUtils { } private static List mergeLists(final List left, final List right, int trust, - Function keyExtractor, BinaryOperator merger) { + Function keyExtractor, BinaryOperator merger) { if (left == null || left.isEmpty()) { return right != null ? right : new ArrayList<>(); } else if (right == null || right.isEmpty()) { @@ -263,11 +270,11 @@ public class MergeUtils { List l = trust >= 0 ? right : left; return new ArrayList<>(Stream - .concat(h.stream(), l.stream()) - .filter(Objects::nonNull) - .distinct() - .collect(Collectors.toMap(keyExtractor, v -> v, merger, LinkedHashMap::new)) - .values()); + .concat(h.stream(), l.stream()) + .filter(Objects::nonNull) + .distinct() + .collect(Collectors.toMap(keyExtractor, v -> v, merger, LinkedHashMap::new)) + .values()); } private static List unionDistinctLists(final List left, final List right, int trust) { @@ -281,10 +288,10 @@ public class MergeUtils { List l = trust >= 0 ? right : left; return Stream - .concat(h.stream(), l.stream()) - .filter(Objects::nonNull) - .distinct() - .collect(Collectors.toList()); + .concat(h.stream(), l.stream()) + .filter(Objects::nonNull) + .distinct() + .collect(Collectors.toList()); } private static List unionDistinctListOfString(final List l, final List r) { @@ -295,10 +302,10 @@ public class MergeUtils { } return Stream - .concat(l.stream(), r.stream()) - .filter(StringUtils::isNotBlank) - .distinct() - .collect(Collectors.toList()); + .concat(l.stream(), r.stream()) + .filter(StringUtils::isNotBlank) + .distinct() + .collect(Collectors.toList()); } // TODO review @@ -324,7 +331,7 @@ public class MergeUtils { } private static List unionTitle(List left, List right, - int trust) { + int trust) { if (left == null) { return right; } else if (right == null) { @@ -335,10 +342,10 @@ public class MergeUtils { List l = trust >= 0 ? 
right : left; return Stream - .concat(h.stream(), l.stream()) - .filter(Objects::isNull) - .distinct() - .collect(Collectors.toList()); + .concat(h.stream(), l.stream()) + .filter(Objects::isNull) + .distinct() + .collect(Collectors.toList()); } /** @@ -373,8 +380,8 @@ public class MergeUtils { merged.setPid(mergeLists(merged.getPid(), enrich.getPid(), trust, MergeUtils::spKeyExtractor, (p1, p2) -> p1)); merged.setDateofcollection(LocalDateTime.now().toString()); merged - .setDateoftransformation( - chooseString(merged.getDateoftransformation(), enrich.getDateoftransformation(), trust)); + .setDateoftransformation( + chooseString(merged.getDateoftransformation(), enrich.getDateoftransformation(), trust)); merged.setExtraInfo(unionDistinctLists(merged.getExtraInfo(), enrich.getExtraInfo(), trust)); // When merging records OAI provenance becomes null merged.setOaiprovenance(null); @@ -391,7 +398,7 @@ public class MergeUtils { checkArgument(Objects.equals(merge.getTarget(), enrich.getTarget()), "target ids must be equal"); checkArgument(Objects.equals(merge.getRelType(), enrich.getRelType()), "relType(s) must be equal"); checkArgument( - Objects.equals(merge.getSubRelType(), enrich.getSubRelType()), "subRelType(s) must be equal"); + Objects.equals(merge.getSubRelType(), enrich.getSubRelType()), "subRelType(s) must be equal"); checkArgument(Objects.equals(merge.getRelClass(), enrich.getRelClass()), "relClass(es) must be equal"); // merge.setProvenance(mergeLists(merge.getProvenance(), enrich.getProvenance())); @@ -402,10 +409,10 @@ public class MergeUtils { merge.setValidationDate(ModelSupport.oldest(merge.getValidationDate(), enrich.getValidationDate())); } catch (ParseException e) { throw new IllegalArgumentException(String - .format( - "invalid validation date format in relation [s:%s, t:%s]: %s", merge.getSource(), - merge.getTarget(), - merge.getValidationDate())); + .format( + "invalid validation date format in relation [s:%s, t:%s]: %s", merge.getSource(), + merge.getTarget(), + merge.getValidationDate())); } // TODO keyvalue merge @@ -419,7 +426,7 @@ public class MergeUtils { T merge = mergeOafEntityFields(original, enrich, trust); if (merge.getProcessingchargeamount() == null - || StringUtils.isBlank(merge.getProcessingchargeamount().getValue())) { + || StringUtils.isBlank(merge.getProcessingchargeamount().getValue())) { merge.setProcessingchargeamount(enrich.getProcessingchargeamount()); merge.setProcessingchargecurrency(enrich.getProcessingchargecurrency()); } @@ -451,8 +458,8 @@ public class MergeUtils { } merge - .setDateofacceptance( - mergeDateOfAcceptance(merge.getDateofacceptance(), enrich.getDateofacceptance(), trust)); + .setDateofacceptance( + mergeDateOfAcceptance(merge.getDateofacceptance(), enrich.getDateofacceptance(), trust)); merge.setPublisher(coalesce(merge.getPublisher(), enrich.getPublisher())); merge.setEmbargoenddate(coalesce(merge.getEmbargoenddate(), enrich.getEmbargoenddate())); @@ -467,7 +474,7 @@ public class MergeUtils { merge.setCoverage(unionDistinctLists(merge.getCoverage(), enrich.getCoverage(), trust)); if (enrich.getBestaccessright() != null - && new AccessRightComparator<>() + && new AccessRightComparator<>() .compare(enrich.getBestaccessright(), merge.getBestaccessright()) < 0) { merge.setBestaccessright(enrich.getBestaccessright()); } @@ -480,8 +487,8 @@ public class MergeUtils { // ok merge - .setExternalReference( - mergeExternalReference(merge.getExternalReference(), enrich.getExternalReference(), trust)); + .setExternalReference( + 
mergeExternalReference(merge.getExternalReference(), enrich.getExternalReference(), trust)); // instance enrichment or union // review instance equals => add pid to comparision @@ -489,17 +496,17 @@ public class MergeUtils { merge.setInstance(mergeInstances(merge.getInstance(), enrich.getInstance(), trust)); } else { final List enrichmentInstances = isAnEnrichment(merge) ? merge.getInstance() - : enrich.getInstance(); + : enrich.getInstance(); final List enrichedInstances = isAnEnrichment(merge) ? enrich.getInstance() - : merge.getInstance(); + : merge.getInstance(); if (isAnEnrichment(merge)) merge.setDataInfo(enrich.getDataInfo()); merge.setInstance(enrichInstances(enrichedInstances, enrichmentInstances)); } merge - .setEoscifguidelines( - mergeEosciifguidelines(merge.getEoscifguidelines(), enrich.getEoscifguidelines(), trust)); + .setEoscifguidelines( + mergeEosciifguidelines(merge.getEoscifguidelines(), enrich.getEoscifguidelines(), trust)); merge.setIsGreen(booleanOR(merge.getIsGreen(), enrich.getIsGreen())); // OK but should be list of values merge.setOpenAccessColor(coalesce(merge.getOpenAccessColor(), enrich.getOpenAccessColor())); @@ -525,7 +532,7 @@ public class MergeUtils { LocalDate enrich_date = LocalDate.parse(enrich.getValue(), DateTimeFormatter.ISO_DATE); if (enrich_date.getYear() > 1300 - && (merge_date.getYear() < 1300 || merge_date.isAfter(enrich_date))) { + && (merge_date.getYear() < 1300 || merge_date.isAfter(enrich_date))) { return enrich; } } catch (NullPointerException | DateTimeParseException e) { @@ -543,56 +550,56 @@ public class MergeUtils { private static List mergeInstances(List v1, List v2, int trust) { return mergeLists( - v1, v2, trust, - MergeUtils::instanceKeyExtractor, - MergeUtils::instanceMerger); + v1, v2, trust, + MergeUtils::instanceKeyExtractor, + MergeUtils::instanceMerger); } private static List mergeEosciifguidelines(List v1, List v2, - int trust) { + int trust) { return mergeLists( - v1, v2, trust, er -> Joiner - .on("||") - .useForNull("") - .join(er.getCode(), er.getLabel(), er.getUrl(), er.getSemanticRelation()), - (r, l) -> r); + v1, v2, trust, er -> Joiner + .on("||") + .useForNull("") + .join(er.getCode(), er.getLabel(), er.getUrl(), er.getSemanticRelation()), + (r, l) -> r); } private static List mergeExternalReference(List v1, - List v2, int trust) { + List v2, int trust) { return mergeLists( - v1, v2, trust, er -> Joiner - .on(',') - .useForNull("") - .join( - er.getSitename(), er.getLabel(), - er.getUrl(), toString(er.getQualifier()), er.getRefidentifier(), - er.getQuery(), toString(er.getDataInfo())), - (r, l) -> r); + v1, v2, trust, er -> Joiner + .on(',') + .useForNull("") + .join( + er.getSitename(), er.getLabel(), + er.getUrl(), toString(er.getQualifier()), er.getRefidentifier(), + er.getQuery(), toString(er.getDataInfo())), + (r, l) -> r); } private static String toString(DataInfo di) { return Joiner - .on(',') - .useForNull("") - .join( - di.getInvisible(), di.getInferred(), di.getDeletedbyinference(), di.getTrust(), - di.getInferenceprovenance(), toString(di.getProvenanceaction())); + .on(',') + .useForNull("") + .join( + di.getInvisible(), di.getInferred(), di.getDeletedbyinference(), di.getTrust(), + di.getInferenceprovenance(), toString(di.getProvenanceaction())); } private static String toString(Qualifier q) { return Joiner - .on(',') - .useForNull("") - .join(q.getClassid(), q.getClassname(), q.getSchemeid(), q.getSchemename()); + .on(',') + .useForNull("") + .join(q.getClassid(), q.getClassname(), q.getSchemeid(), 
q.getSchemename()); } private static String toString(StructuredProperty sp) { return Joiner - .on(',') - .useForNull("") - .join(toString(sp.getQualifier()), sp.getValue()); + .on(',') + .useForNull("") + .join(toString(sp.getQualifier()), sp.getValue()); } private static List mergeStructuredProperties(List v1, List v2, int trust) { @@ -631,17 +638,17 @@ public class MergeUtils { // 2. @@ // 3. || return String - .join( - "::", - kvKeyExtractor(i.getHostedby()), - kvKeyExtractor(i.getCollectedfrom()), - qualifierKeyExtractor(i.getAccessright()), - qualifierKeyExtractor(i.getInstancetype()), - Optional.ofNullable(i.getUrl()).map(u -> String.join("@@", u)).orElse(null), - Optional - .ofNullable(i.getPid()) - .map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("@@"))) - .orElse(null)); + .join( + "::", + kvKeyExtractor(i.getHostedby()), + kvKeyExtractor(i.getCollectedfrom()), + qualifierKeyExtractor(i.getAccessright()), + qualifierKeyExtractor(i.getInstancetype()), + Optional.ofNullable(i.getUrl()).map(u -> String.join("@@", u)).orElse(null), + Optional + .ofNullable(i.getPid()) + .map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("@@"))) + .orElse(null)); } private static Instance instanceMerger(Instance i1, Instance i2) { @@ -652,30 +659,30 @@ public class MergeUtils { i.setInstancetype(i1.getInstancetype()); i.setPid(mergeLists(i1.getPid(), i2.getPid(), 0, MergeUtils::spKeyExtractor, (sp1, sp2) -> sp1)); i - .setAlternateIdentifier( - mergeLists( - i1.getAlternateIdentifier(), i2.getAlternateIdentifier(), 0, MergeUtils::spKeyExtractor, - (sp1, sp2) -> sp1)); + .setAlternateIdentifier( + mergeLists( + i1.getAlternateIdentifier(), i2.getAlternateIdentifier(), 0, MergeUtils::spKeyExtractor, + (sp1, sp2) -> sp1)); i - .setRefereed( - Collections - .min( - Stream.of(i1.getRefereed(), i2.getRefereed()).collect(Collectors.toList()), - new RefereedComparator())); + .setRefereed( + Collections + .min( + Stream.of(i1.getRefereed(), i2.getRefereed()).collect(Collectors.toList()), + new RefereedComparator())); i - .setInstanceTypeMapping( - mergeLists( - i1.getInstanceTypeMapping(), i2.getInstanceTypeMapping(), 0, - MergeUtils::instanceTypeMappingKeyExtractor, (itm1, itm2) -> itm1)); + .setInstanceTypeMapping( + mergeLists( + i1.getInstanceTypeMapping(), i2.getInstanceTypeMapping(), 0, + MergeUtils::instanceTypeMappingKeyExtractor, (itm1, itm2) -> itm1)); i.setFulltext(selectFulltext(i1.getFulltext(), i2.getFulltext())); i.setDateofacceptance(selectOldestDate(i1.getDateofacceptance(), i2.getDateofacceptance())); i.setLicense(coalesce(i1.getLicense(), i2.getLicense())); i.setProcessingchargeamount(coalesce(i1.getProcessingchargeamount(), i2.getProcessingchargeamount())); i.setProcessingchargecurrency(coalesce(i1.getProcessingchargecurrency(), i2.getProcessingchargecurrency())); i - .setMeasures( - mergeLists(i1.getMeasures(), i2.getMeasures(), 0, MergeUtils::measureKeyExtractor, (m1, m2) -> m1)); + .setMeasures( + mergeLists(i1.getMeasures(), i2.getMeasures(), 0, MergeUtils::measureKeyExtractor, (m1, m2) -> m1)); i.setUrl(unionDistinctListOfString(i1.getUrl(), i2.getUrl())); @@ -684,14 +691,14 @@ public class MergeUtils { private static String measureKeyExtractor(Measure m) { return String - .join( - "::", - m.getId(), - m - .getUnit() - .stream() - .map(KeyValue::getKey) - .collect(Collectors.joining("::"))); + .join( + "::", + m.getId(), + m + .getUnit() + .stream() + .map(KeyValue::getKey) + .collect(Collectors.joining("::"))); } private 
static Field selectOldestDate(Field d1, Field d2) { @@ -702,16 +709,16 @@ public class MergeUtils { } return Stream - .of(d1, d2) - .min( - Comparator - .comparing( - f -> DateParserUtils - .parseDate(f.getValue()) - .toInstant() - .atZone(ZoneId.systemDefault()) - .toLocalDate())) - .orElse(d1); + .of(d1, d2) + .min( + Comparator + .comparing( + f -> DateParserUtils + .parseDate(f.getValue()) + .toInstant() + .atZone(ZoneId.systemDefault()) + .toLocalDate())) + .orElse(d1); } private static String selectFulltext(String ft1, String ft2) { @@ -726,12 +733,12 @@ public class MergeUtils { private static String instanceTypeMappingKeyExtractor(InstanceTypeMapping itm) { return String - .join( - "::", - itm.getOriginalType(), - itm.getTypeCode(), - itm.getTypeLabel(), - itm.getVocabularyName()); + .join( + "::", + itm.getOriginalType(), + itm.getTypeCode(), + itm.getTypeLabel(), + itm.getVocabularyName()); } private static String kvKeyExtractor(KeyValue kv) { @@ -748,13 +755,13 @@ public class MergeUtils { private static String spKeyExtractor(StructuredProperty sp) { return Optional - .ofNullable(sp) - .map( - s -> Joiner - .on("||") - .useForNull("") - .join(qualifierKeyExtractor(s.getQualifier()), s.getValue())) - .orElse(null); + .ofNullable(sp) + .map( + s -> Joiner + .on("||") + .useForNull("") + .join(qualifierKeyExtractor(s.getQualifier()), s.getValue())) + .orElse(null); } private static T mergeORP(T original, T enrich) { @@ -776,8 +783,8 @@ public class MergeUtils { merge.setLicense(unionDistinctLists(merge.getLicense(), enrich.getLicense(), trust)); merge.setCodeRepositoryUrl(chooseReference(merge.getCodeRepositoryUrl(), enrich.getCodeRepositoryUrl(), trust)); merge - .setProgrammingLanguage( - chooseReference(merge.getProgrammingLanguage(), enrich.getProgrammingLanguage(), trust)); + .setProgrammingLanguage( + chooseReference(merge.getProgrammingLanguage(), enrich.getProgrammingLanguage(), trust)); return merge; } @@ -791,11 +798,11 @@ public class MergeUtils { merge.setSize(chooseReference(merge.getSize(), enrich.getSize(), trust)); merge.setVersion(chooseReference(merge.getVersion(), enrich.getVersion(), trust)); merge - .setLastmetadataupdate( - chooseReference(merge.getLastmetadataupdate(), enrich.getLastmetadataupdate(), trust)); + .setLastmetadataupdate( + chooseReference(merge.getLastmetadataupdate(), enrich.getLastmetadataupdate(), trust)); merge - .setMetadataversionnumber( - chooseReference(merge.getMetadataversionnumber(), enrich.getMetadataversionnumber(), trust)); + .setMetadataversionnumber( + chooseReference(merge.getMetadataversionnumber(), enrich.getMetadataversionnumber(), trust)); merge.setGeolocation(unionDistinctLists(merge.getGeolocation(), enrich.getGeolocation(), trust)); return merge; @@ -817,26 +824,26 @@ public class MergeUtils { merged.setLegalshortname(chooseReference(merged.getLegalshortname(), enrich.getLegalshortname(), trust)); merged.setLegalname(chooseReference(merged.getLegalname(), enrich.getLegalname(), trust)); merged - .setAlternativeNames(unionDistinctLists(enrich.getAlternativeNames(), merged.getAlternativeNames(), trust)); + .setAlternativeNames(unionDistinctLists(enrich.getAlternativeNames(), merged.getAlternativeNames(), trust)); merged.setWebsiteurl(chooseReference(merged.getWebsiteurl(), enrich.getWebsiteurl(), trust)); merged.setLogourl(chooseReference(merged.getLogourl(), enrich.getLogourl(), trust)); merged.setEclegalbody(chooseReference(merged.getEclegalbody(), enrich.getEclegalbody(), trust)); 
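// Each field-level merge in this class follows the same trust-driven rule:
// compareTrust(merge, enrich) decides the winning side once, and helpers such
// as chooseReference keep the winner's value, using the other side only to
// fill gaps. A minimal sketch of the assumed contract (illustrative only; the
// real helper is defined elsewhere in MergeUtils, with trust produced by
// compareTrust(merge, enrich)):
//
//   static <T> T chooseReference(T merge, T enrich, int trust) {
//       if (trust >= 0) {
//           return merge != null ? merge : enrich; // the existing record wins ties
//       }
//       return enrich != null ? enrich : merge; // the enrichment carries higher trust
//   }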
merged.setEclegalperson(chooseReference(merged.getEclegalperson(), enrich.getEclegalperson(), trust)); merged.setEcnonprofit(chooseReference(merged.getEcnonprofit(), enrich.getEcnonprofit(), trust)); merged - .setEcresearchorganization( - chooseReference(merged.getEcresearchorganization(), enrich.getEcresearchorganization(), trust)); + .setEcresearchorganization( + chooseReference(merged.getEcresearchorganization(), enrich.getEcresearchorganization(), trust)); merged - .setEchighereducation(chooseReference(merged.getEchighereducation(), enrich.getEchighereducation(), trust)); + .setEchighereducation(chooseReference(merged.getEchighereducation(), enrich.getEchighereducation(), trust)); merged - .setEcinternationalorganizationeurinterests( - chooseReference( - merged.getEcinternationalorganizationeurinterests(), - enrich.getEcinternationalorganizationeurinterests(), trust)); + .setEcinternationalorganizationeurinterests( + chooseReference( + merged.getEcinternationalorganizationeurinterests(), + enrich.getEcinternationalorganizationeurinterests(), trust)); merged - .setEcinternationalorganization( - chooseReference( - merged.getEcinternationalorganization(), enrich.getEcinternationalorganization(), trust)); + .setEcinternationalorganization( + chooseReference( + merged.getEcinternationalorganization(), enrich.getEcinternationalorganization(), trust)); merged.setEcenterprise(chooseReference(merged.getEcenterprise(), enrich.getEcenterprise(), trust)); merged.setEcsmevalidated(chooseReference(merged.getEcsmevalidated(), enrich.getEcsmevalidated(), trust)); merged.setEcnutscode(chooseReference(merged.getEcnutscode(), enrich.getEcnutscode(), trust)); @@ -860,8 +867,8 @@ public class MergeUtils { merged.setDuration(chooseReference(merged.getDuration(), enrich.getDuration(), trust)); merged.setEcsc39(chooseReference(merged.getEcsc39(), enrich.getEcsc39(), trust)); merged - .setOamandatepublications( - chooseReference(merged.getOamandatepublications(), enrich.getOamandatepublications(), trust)); + .setOamandatepublications( + chooseReference(merged.getOamandatepublications(), enrich.getOamandatepublications(), trust)); merged.setEcarticle29_3(chooseReference(merged.getEcarticle29_3(), enrich.getEcarticle29_3(), trust)); merged.setSubjects(unionDistinctLists(merged.getSubjects(), enrich.getSubjects(), trust)); merged.setFundingtree(unionDistinctLists(merged.getFundingtree(), enrich.getFundingtree(), trust)); @@ -887,8 +894,8 @@ public class MergeUtils { } merged - .setH2020classification( - unionDistinctLists(merged.getH2020classification(), enrich.getH2020classification(), trust)); + .setH2020classification( + unionDistinctLists(merged.getH2020classification(), enrich.getH2020classification(), trust)); return merged; } @@ -915,7 +922,7 @@ public class MergeUtils { * @return list of instances possibly enriched */ private static List enrichInstances(final List toEnrichInstances, - final List enrichmentInstances) { + final List enrichmentInstances) { final List enrichmentResult = new ArrayList<>(); if (toEnrichInstances == null) { @@ -953,42 +960,42 @@ public class MergeUtils { */ private static Map toInstanceMap(final List ri) { return ri - .stream() - .filter(i -> i.getPid() != null || i.getAlternateIdentifier() != null) - .flatMap(i -> { - final List> result = new ArrayList<>(); - if (i.getPid() != null) - i - .getPid() - .stream() - .filter(MergeUtils::validPid) - .forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i))); - if (i.getAlternateIdentifier() != null) - i - 
.getAlternateIdentifier() - .stream() - .filter(MergeUtils::validPid) - .forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i))); - return result.stream(); - }) - .collect( - Collectors - .toMap( - Pair::getLeft, - Pair::getRight, - (a, b) -> a)); + .stream() + .filter(i -> i.getPid() != null || i.getAlternateIdentifier() != null) + .flatMap(i -> { + final List> result = new ArrayList<>(); + if (i.getPid() != null) + i + .getPid() + .stream() + .filter(MergeUtils::validPid) + .forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i))); + if (i.getAlternateIdentifier() != null) + i + .getAlternateIdentifier() + .stream() + .filter(MergeUtils::validPid) + .forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i))); + return result.stream(); + }) + .collect( + Collectors + .toMap( + Pair::getLeft, + Pair::getRight, + (a, b) -> a)); } private static boolean isFromDelegatedAuthority(Result r) { return Optional - .ofNullable(r.getInstance()) - .map( - instance -> instance - .stream() - .filter(i -> Objects.nonNull(i.getCollectedfrom())) - .map(i -> i.getCollectedfrom().getKey()) - .anyMatch(cfId -> IdentifierFactory.delegatedAuthorityDatasourceIds().contains(cfId))) - .orElse(false); + .ofNullable(r.getInstance()) + .map( + instance -> instance + .stream() + .filter(i -> Objects.nonNull(i.getCollectedfrom())) + .map(i -> i.getCollectedfrom().getKey()) + .anyMatch(cfId -> IdentifierFactory.delegatedAuthorityDatasourceIds().contains(cfId))) + .orElse(false); } /** @@ -1024,15 +1031,15 @@ public class MergeUtils { * @return the list */ private static List findEnrichmentsByPID(final List pids, - final Map enrichments) { + final Map enrichments) { if (pids == null || enrichments == null) return null; return pids - .stream() - .map(MergeUtils::extractKeyFromPid) - .map(enrichments::get) - .filter(Objects::nonNull) - .collect(Collectors.toList()); + .stream() + .map(MergeUtils::extractKeyFromPid) + .map(enrichments::get) + .filter(Objects::nonNull) + .collect(Collectors.toList()); } /** @@ -1043,8 +1050,8 @@ public class MergeUtils { */ private static boolean isAnEnrichment(OafEntity e) { return e.getDataInfo() != null && - e.getDataInfo().getProvenanceaction() != null - && ModelConstants.PROVENANCE_ENRICH.equalsIgnoreCase(e.getDataInfo().getProvenanceaction().getClassid()); + e.getDataInfo().getProvenanceaction() != null + && ModelConstants.PROVENANCE_ENRICH.equalsIgnoreCase(e.getDataInfo().getProvenanceaction().getClassid()); } /** @@ -1067,17 +1074,17 @@ public class MergeUtils { merge.setHostedby(firstNonNull(merge.getHostedby(), enrichment.getHostedby())); merge.setUrl(unionDistinctLists(merge.getUrl(), enrichment.getUrl(), 0)); merge - .setDistributionlocation( - firstNonNull(merge.getDistributionlocation(), enrichment.getDistributionlocation())); + .setDistributionlocation( + firstNonNull(merge.getDistributionlocation(), enrichment.getDistributionlocation())); merge.setCollectedfrom(firstNonNull(merge.getCollectedfrom(), enrichment.getCollectedfrom())); // pid and alternateId are used for matching merge.setDateofacceptance(firstNonNull(merge.getDateofacceptance(), enrichment.getDateofacceptance())); merge - .setProcessingchargeamount( - firstNonNull(merge.getProcessingchargeamount(), enrichment.getProcessingchargeamount())); + .setProcessingchargeamount( + firstNonNull(merge.getProcessingchargeamount(), enrichment.getProcessingchargeamount())); merge - .setProcessingchargecurrency( - firstNonNull(merge.getProcessingchargecurrency(), 
enrichment.getProcessingchargecurrency())); + .setProcessingchargecurrency( + firstNonNull(merge.getProcessingchargecurrency(), enrichment.getProcessingchargecurrency())); merge.setRefereed(firstNonNull(merge.getRefereed(), enrichment.getRefereed())); merge.setMeasures(unionDistinctLists(merge.getMeasures(), enrichment.getMeasures(), 0)); merge.setFulltext(firstNonNull(merge.getFulltext(), enrichment.getFulltext())); @@ -1085,14 +1092,14 @@ public class MergeUtils { private static int compareTrust(Oaf a, Oaf b) { String left = Optional - .ofNullable(a.getDataInfo()) - .map(DataInfo::getTrust) - .orElse("0.0"); + .ofNullable(a.getDataInfo()) + .map(DataInfo::getTrust) + .orElse("0.0"); String right = Optional - .ofNullable(b.getDataInfo()) - .map(DataInfo::getTrust) - .orElse("0.0"); + .ofNullable(b.getDataInfo()) + .map(DataInfo::getTrust) + .orElse("0.0"); return left.compareTo(right); } From 5f512f510e92a717f1e536b9be9db15399d42805 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 Nov 2024 09:16:51 +0100 Subject: [PATCH 077/111] code formatting --- .../raw/AbstractMdRecordToOafMapper.java | 26 +++++++++++-------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java index be84778f5..881d3202c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/AbstractMdRecordToOafMapper.java @@ -155,10 +155,11 @@ public abstract class AbstractMdRecordToOafMapper { final Instance instance = prepareInstances(doc, entityInfo, collectedFrom, hostedBy); - if (!Optional.ofNullable(instance.getInstancetype()) - .map(Qualifier::getClassid) - .filter(StringUtils::isNotBlank) - .isPresent()) { + if (!Optional + .ofNullable(instance.getInstancetype()) + .map(Qualifier::getClassid) + .filter(StringUtils::isNotBlank) + .isPresent()) { return Lists.newArrayList(); } @@ -173,13 +174,16 @@ public abstract class AbstractMdRecordToOafMapper { protected String getResultType(final Instance instance) { if (this.vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) { - return Optional.ofNullable(instance.getInstancetype()) - .map(Qualifier::getClassid) - .map(instanceType -> Optional - .ofNullable(this.vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType)) - .map(Qualifier::getClassid) - .orElse("0000")) - .orElse("0000"); + return Optional + .ofNullable(instance.getInstancetype()) + .map(Qualifier::getClassid) + .map( + instanceType -> Optional + .ofNullable( + this.vocs.getSynonymAsQualifier(ModelConstants.DNET_RESULT_TYPOLOGIES, instanceType)) + .map(Qualifier::getClassid) + .orElse("0000")) + .orElse("0000"); } else { throw new IllegalStateException("Missing vocabulary: " + ModelConstants.DNET_RESULT_TYPOLOGIES); } From cf7d9a32ab847d50bcafe03bc66795d2719e7d03 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 Nov 2024 09:17:28 +0100 Subject: [PATCH 078/111] disable autoBroadcastJoin in the cleaning workflow --- .../eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml 
b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml index 2512fc5bc..01aaadae5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/clean/oozie_app/workflow.xml @@ -162,6 +162,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.autoBroadcastJoinThreshold=-1 --conf spark.sql.shuffle.partitions=15000 --inputPath${graphInputPath}/publication @@ -197,6 +198,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.autoBroadcastJoinThreshold=-1 --conf spark.sql.shuffle.partitions=8000 --inputPath${graphInputPath}/dataset @@ -232,6 +234,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.autoBroadcastJoinThreshold=-1 --conf spark.sql.shuffle.partitions=5000 --inputPath${graphInputPath}/otherresearchproduct @@ -267,6 +270,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.autoBroadcastJoinThreshold=-1 --conf spark.sql.shuffle.partitions=2000 --inputPath${graphInputPath}/software @@ -302,6 +306,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.autoBroadcastJoinThreshold=-1 --conf spark.sql.shuffle.partitions=1000 --inputPath${graphInputPath}/datasource @@ -337,6 +342,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.autoBroadcastJoinThreshold=-1 --conf spark.sql.shuffle.partitions=1000 --inputPath${graphInputPath}/organization @@ -372,6 +378,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.autoBroadcastJoinThreshold=-1 --conf spark.sql.shuffle.partitions=2000 --inputPath${graphInputPath}/project @@ -407,6 +414,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.autoBroadcastJoinThreshold=-1 --conf spark.sql.shuffle.partitions=2000 --inputPath${graphInputPath}/person @@ -442,6 +450,7 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.autoBroadcastJoinThreshold=-1 --conf spark.sql.shuffle.partitions=20000 
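<!-- spark.sql.autoBroadcastJoinThreshold=-1 is Spark's documented sentinel for
     switching automatic broadcast hash joins off entirely: a mis-estimated
     broadcast relation on tables of this size can exhaust the driver memory,
     so every action in this workflow opts out and relies on sort-merge joins
     plus the explicit spark.sql.shuffle.partitions tuning set above. -->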
--inputPath${graphInputPath}/relation From 9e439f5ecaea8c92ca7db4e62158289766fee84e Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 15 Nov 2024 12:19:26 +0100 Subject: [PATCH 079/111] map the abstracts considering both the datacite and the dc nsPrefix --- .../dhp/oa/graph/raw/OdfToOafMapper.java | 2 +- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 23 +++++ .../dhp/oa/graph/raw/odf_guidelines4.xml | 95 +++++++++++++++++++ 3 files changed, 119 insertions(+), 1 deletion(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_guidelines4.xml diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java index a811aad46..efe05eb68 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/OdfToOafMapper.java @@ -319,7 +319,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper { @Override protected List> prepareDescriptions(final Document doc, final DataInfo info) { - return prepareListFields(doc, "//*[local-name()='description' and ./@descriptionType='Abstract']", info); + return prepareListFields(doc, "//datacite:description[./@descriptionType='Abstract'] | //dc:description", info); } @Override diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index 2cf3ea0c0..ea9503d17 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -906,6 +906,29 @@ class MappersTest { assertEquals("IT", p.getCountry().get(0).getClassid()); assertEquals("FR", p.getCountry().get(1).getClassid()); assertEquals("DE", p.getCountry().get(2).getClassid()); + + assertNotNull(p.getDescription()); + assertEquals(1, p.getDescription().size()); + assertNotNull(p.getDescription().get(0)); + assertTrue(StringUtils.isNotBlank(p.getDescription().get(0).getValue())); + } + + @Test + void testODFRecord_guidelines4() throws IOException { + final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_guidelines4.xml"))); + final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); + + final Publication p = (Publication) list.get(0); + assertValidId(p.getId()); + assertValidId(p.getCollectedfrom().get(0).getKey()); + assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); + + assertNotNull(p.getDescription()); + assertEquals(2, p.getDescription().size()); + assertNotNull(p.getDescription().get(0)); + assertTrue(StringUtils.isNotBlank(p.getDescription().get(0).getValue())); + assertNotNull(p.getDescription().get(1)); + assertTrue(StringUtils.isNotBlank(p.getDescription().get(1).getValue())); } @Test diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_guidelines4.xml b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_guidelines4.xml new file mode 100644 index 000000000..4f390afb7 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/raw/odf_guidelines4.xml @@ -0,0 +1,95 @@ + + +
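The union XPath above accepts abstracts expressed either through the DataCite
descriptionType attribute or as plain dc:description elements; the fixture
below exercises exactly that, carrying an Italian and an English abstract. A
minimal dom4j sketch of the same lookup (hypothetical method, assuming dom4j
2.x generics and that the datacite/dc prefixes are bound to their namespace
URIs as in the mapper):

    import java.util.List;
    import java.util.stream.Collectors;

    import org.dom4j.Document;
    import org.dom4j.Node;

    static List<String> extractAbstracts(final Document doc) {
        // DataCite-style abstracts OR plain Dublin Core descriptions, in document order
        final List<Node> nodes = doc
            .selectNodes("//datacite:description[./@descriptionType='Abstract'] | //dc:description");
        return nodes.stream().map(Node::getText).collect(Collectors.toList());
    }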
+ od______1261::66de86a37fa980b9b8f4cefdf1c5a0c5 + oai:air.unimi.it:2434/636308 + 2020-09-04T14:36:48.411Z + od______1261 + oai:air.unimi.it:2434/636308 + 2022-10-14T14:17:28Z + com_2434_73555 + col_2434_73557 + openaire + 2024-11-07T06:17:52.454Z +
+ + + + I processi incorpati di costruzione della conoscenza nelle pratiche di cura + + + + S. Visioli + rp17381 + University of Milan + 53567 + + + L. Zannini + 0000-0001-7287-6616 + rp31993 + University of Milan + 5556 + + + 487 + 2 + 506 + 8 + METIS + + 10.30557/MT00041 + http://hdl.handle.net/2434/636308 + + + 2018 + 2018 + 2019-05-23 + + ita + journal article + In ambito educativo, il corpo è stato tradizionalmente marginalizzato o rifiutato come fonte di conoscenza. Negli ultimi decenni, la letteratura pedagogica ha manifestato crescente interesse per il ruolo che ha il corpo dell’insegnante, nel pianificare e realizzare l’insegnamento, sfidando le ideologie e le epistemologie dominanti, secondo le quali la nostra mente è la fonte primaria di apprendimento. Al contrario, una pedagogia della conoscenza incorpata (embodiment) considera il corpo centrale nel nostro conoscere (le pratiche d’insegnamento) e nel nostro essere (insegnanti). Analogamente, la letteratura delle professioni del caring si è rivelata molto attenta ai temi della conoscenza incorpata, ossia di quel sapere che è presente in un corpo, spesso in modo tacito. Le infermiere hanno da tempo esplorato il ruolo del loro corpo nei processi di costruzione della conoscenza, come anche il ruolo del corpo dello studente, e addirittura del paziente, nel conoscere la malattia. In questo paper riportiamo alcune riflessioni ed esperienze sulla conoscenza incorpata nelle pratiche assistenziali, mettendo in luce come questa, benché spesso tacita, abbia un ruolo centrale nell’aver cura delle persone malate. + In educational contexts, the body has been traditionally marginalized or rejected as a source of knowledge. In the last decades, some pedagogical contributions have shown interest for the role that the teacher’s body has in planning and implementing teaching, challenging dominant ideologies and epistemologies that tell us our minds are the primary sources of learning. Conversely, a pedagogy of embodiment makes the body central in our knowing (educational practices) and being (a teacher). Similarly, literature from the caring professions has revealed very receptive to-ward issues related to embodied knowledge, that is to say the knowledge that the body owns, which is frequently tacit. Since many years, nurses have explored the role of body in the processes of knowledge building, as well as the role of the student’s body, and even the patient’s body, in knowing the illness. In this paper we report some reflections and experiences concerning embodied knowledge in the nursing practice, highlighting that embodied knowledge, even if tacit, has a pivotal role when caring for sick people. + application/pdf + 2434/636308 + open access + + body; educational practices; embodied knowledge; nursing; teaching + + + + 413926 bytes + + + https://air.unimi.it/bitstream/2434/636308/2/Finale%20pubblicato.pdf + + https://air.unimi.it/bitstream/2434/636308/2/Finale%20pubblicato.pdf + 0001 + 2018-01-01 + OPEN + ita + + + +
\ No newline at end of file From a1297082e2df74e672512334238143542ad323ce Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Tue, 19 Nov 2024 14:57:18 +0100 Subject: [PATCH 080/111] Crossref Enhancements: -Accurate Review Type Assignment: Resolved an issue identified in ticket https://support.openaire.eu/issues/9525#note-13. When a relationship of "is-review-of" is detected, the publication type is now correctly set to "Review." -Enhanced Author Affiliation Data: Implemented Miriam's suggestion by including a new field, "RawAffiliationString," in each author entry. This additional data provides a more granular level of detail regarding author affiliations, potentially improving discoverability and research analysis. --- .../collection/crossref/Crossref2Oaf.scala | 67 ++--- .../collection/crossref/affiliationTest.json | 232 ++++++++++++++++++ .../crossref/CrossrefMappingTest.scala | 12 +- 3 files changed, 265 insertions(+), 46 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/affiliationTest.json diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala index e7d68920b..e15312e43 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala @@ -37,7 +37,7 @@ case class mappingAuthor( family: Option[String], sequence: Option[String], ORCID: Option[String], - affiliation: Option[mappingAffiliation] + affiliation: Option[List[mappingAffiliation]] ) {} case class funderInfo(id: String, uri: String, name: String, synonym: List[String]) {} @@ -457,15 +457,14 @@ case object Crossref2Oaf { } //Mapping Author - val authorList: List[mappingAuthor] = - (json \ "author").extract[List[mappingAuthor]].filter(a => a.family.isDefined) + val authorList: List[mappingAuthor] = (json \ "author").extract[List[mappingAuthor]].filter(a => a.family.isDefined) val sorted_list = authorList.sortWith((a: mappingAuthor, b: mappingAuthor) => a.sequence.isDefined && a.sequence.get.equalsIgnoreCase("first") ) result.setAuthor(sorted_list.zipWithIndex.map { case (a, index) => - generateAuhtor(a.given.orNull, a.family.get, a.ORCID.orNull, index) + generateAuthor(a.given.orNull, a.family.get, a.ORCID.orNull, index, a.affiliation) }.asJava) // Mapping instance @@ -504,18 +503,6 @@ case object Crossref2Oaf { ) } - val is_review = json \ "relation" \ "is-review-of" \ "id" - - if (is_review != JNothing) { - instance.setInstancetype( - OafMapperUtils.qualifier( - "0015", - "peerReviewed", - ModelConstants.DNET_REVIEW_LEVELS, - ModelConstants.DNET_REVIEW_LEVELS - ) - ) - } if (doi.startsWith("10.3410") || doi.startsWith("10.12703")) instance.setHostedby( @@ -569,17 +556,24 @@ case object Crossref2Oaf { result } + + def generateIdentifier(oaf: Result, doi: String): String = { val id = DHPUtils.md5(doi.toLowerCase) s"50|doiboost____|$id" } - def generateAuhtor(given: String, family: String, orcid: String, index: Int): Author = { + private def generateAuthor(given: String, family: String, orcid: String, index: Int, affiliation: Option[List[mappingAffiliation]]): Author = { val a = new Author a.setName(given) a.setSurname(family) a.setFullname(s"$given $family") a.setRank(index + 1) + + // Adding Raw affiliation if it's defined + if (affiliation.isDefined) { + 
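      // Crossref ships affiliations as a list of objects under author.affiliation;
      // only their free-text names survive the mapping. Shape assumed from the
      // fixture added below (illustrative, shortened value):
      //   affiliation = Some(List(mappingAffiliation("Royal Marsden Hospital, Surrey, United Kingdom")))
      //   affiliation.get.map(_.name).asJava // -> java.util.List[String]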
a.setRawAffiliationString(affiliation.get.map(a => a.name).asJava) + } if (StringUtils.isNotBlank(orcid)) a.setPid( List( @@ -705,11 +699,21 @@ case object Crossref2Oaf { val objectType = (json \ "type").extractOrElse[String](null) if (objectType == null) return resultList - val typology = getTypeQualifier(objectType, vocabularies) + + + // If the item has a relations is-review-of, then we force it to a peer-review + val is_review = json \ "relation" \ "is-review-of" \ "id" + var force_to_review = false + if (is_review != JNothing) { + force_to_review = true + } + + val typology = getTypeQualifier(if (force_to_review) "peer-review" else objectType, vocabularies) if (typology == null) return List() + val result = generateItemFromType(typology._2) if (result == null) return List() @@ -757,33 +761,6 @@ case object Crossref2Oaf { else resultList } - - // if (uw != null) { -// result.getCollectedfrom.add(createUnpayWallCollectedFrom()) -// val i: Instance = new Instance() -// i.setCollectedfrom(createUnpayWallCollectedFrom()) -// if (uw.best_oa_location != null) { -// -// i.setUrl(List(uw.best_oa_location.url).asJava) -// if (uw.best_oa_location.license.isDefined) { -// i.setLicense(field[String](uw.best_oa_location.license.get, null)) -// } -// -// val colour = get_unpaywall_color(uw.oa_status) -// if (colour.isDefined) { -// val a = new AccessRight -// a.setClassid(ModelConstants.ACCESS_RIGHT_OPEN) -// a.setClassname(ModelConstants.ACCESS_RIGHT_OPEN) -// a.setSchemeid(ModelConstants.DNET_ACCESS_MODES) -// a.setSchemename(ModelConstants.DNET_ACCESS_MODES) -// a.setOpenAccessRoute(colour.get) -// i.setAccessright(a) -// } -// i.setPid(result.getPid) -// result.getInstance().add(i) -// } -// } - } private def createCiteRelation(source: Result, targetPid: String, targetPidType: String): List[Relation] = { diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/affiliationTest.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/affiliationTest.json new file mode 100644 index 000000000..201138e45 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/crossref/affiliationTest.json @@ -0,0 +1,232 @@ +{ + "indexed": { + "date-parts": [ + [ + 2022, + 4, + 3 + ] + ], + "date-time": "2022-04-03T01:45:59Z", + "timestamp": 1648950359167 + }, + "reference-count": 0, + "publisher": "American Society of Clinical Oncology (ASCO)", + "issue": "18_suppl", + "content-domain": { + "domain": [], + "crossmark-restriction": false + }, + "short-container-title": [ + "JCO" + ], + "published-print": { + "date-parts": [ + [ + 2007, + 6, + 20 + ] + ] + }, + "abstract": " 3507 Purpose: To detect IGF-1R on circulating tumor cells (CTCs) as a biomarker in the clinical development of a monoclonal human antibody, CP-751,871, targeting IGF-1R. Experimental Design: An automated sample preparation and analysis system for enumerating CTCs (Celltracks) was adapted for detecting IGF-1R positive CTCs with a diagnostic antibody targeting a different IGF-1R epitope to CP-751,871. This assay was utilized in three phase I trials of CP-751,871 as a single agent or with chemotherapy and was validated using cell lines and blood samples from healthy volunteers and patients with metastatic carcinoma. Results: There was no interference between the analytical and therapeutic antibodies. 
CP-751,871 was well tolerated as a single agent, and in combination with docetaxel or carboplatin and paclitaxel, at doses ranging from 0.05 mg/kg to 20 mg/kg. Eighty patients were enrolled on phase 1 studies of CP-751,871, with 47 (59%) patients having CTCs detected during the study. Prior to treatment 26 patients (33%) had CTCs, with 23 having detectable IGF-1R positive CTCs. CP-751,871 alone, and CP-751,871 with cytotoxic chemotherapy, decreased CTCs and IGF-1R positive CTCs; these increased towards the end of the 21-day cycle in some patients, falling again with retreatment. CTCs were commonest in advanced hormone refractory prostate cancer (11/20). Detectable IGF-1R expression on CTCs before treatment with CP-751,871 and docetaxel was associated with a higher frequency of PSA decline by more than 50% (6/10 vs 2/8 patients). A relationship was observed between sustained falls in CTCs counts and PSA declines by more than 50%. Conclusions: IGF-1R expression is detectable by immunofluorescence on CTCs. These data support the further evaluation of CTCs in pharmacodynamic studies and patient selection, particularly in advanced prostate cancer. No significant financial relationships to disclose. ", + "DOI": "10.1200/jco.2007.25.18_suppl.3507", + "type": "journal-article", + "created": { + "date-parts": [ + [ + 2020, + 3, + 6 + ] + ], + "date-time": "2020-03-06T20:50:42Z", + "timestamp": 1583527842000 + }, + "page": "3507-3507", + "source": "Crossref", + "is-referenced-by-count": 0, + "title": [ + "Circulating tumor cells expressing the insulin growth factor-1 receptor (IGF-1R): Method of detection, incidence and potential applications" + ], + "prefix": "10.1200", + "volume": "25", + "author": [ + { + "given": "J. S.", + "family": "de Bono", + "sequence": "first", + "affiliation": [ + { + "name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA" + } + ] + }, + { + "given": "A.", + "family": "Adjei", + "sequence": "additional", + "affiliation": [ + { + "name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA" + } + ] + }, + { + "given": "G.", + "family": "Attard", + "sequence": "additional", + "affiliation": [ + { + "name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA" + } + ] + }, + { + "given": "M.", + "family": "Pollak", + "sequence": "additional", + "affiliation": [ + { + "name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA" + } + ] + }, + { + "given": "P.", + "family": "Fong", + "sequence": "additional", + "affiliation": [ + { + "name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA" + } + ] + }, + 
{ + "given": "P.", + "family": "Haluska", + "sequence": "additional", + "affiliation": [ + { + "name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA" + } + ] + }, + { + "given": "L.", + "family": "Roberts", + "sequence": "additional", + "affiliation": [ + { + "name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA" + } + ] + }, + { + "given": "D.", + "family": "Chainese", + "sequence": "additional", + "affiliation": [ + { + "name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA" + } + ] + }, + { + "given": "L.", + "family": "Terstappen", + "sequence": "additional", + "affiliation": [ + { + "name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA" + } + ] + }, + { + "given": "A.", + "family": "Gualberto", + "sequence": "additional", + "affiliation": [ + { + "name": "Royal Marsden Hospital, Surrey, United Kingdom; Mayo Clinic, Rochester, MN; McGill University & Lady Davis Research Institute, Montreal, PQ, Canada; Pfizer Global Research & Development, New London, CT; Immunicon Corporation, Huntingdon Valley, PA" + } + ] + } + ], + "member": "233", + "container-title": [ + "Journal of Clinical Oncology" + ], + "original-title": [], + "language": "en", + "deposited": { + "date-parts": [ + [ + 2020, + 3, + 6 + ] + ], + "date-time": "2020-03-06T20:51:03Z", + "timestamp": 1583527863000 + }, + "score": 1, + "resource": { + "primary": { + "URL": "http://ascopubs.org/doi/10.1200/jco.2007.25.18_suppl.3507" + } + }, + "subtitle": [], + "short-title": [], + "issued": { + "date-parts": [ + [ + 2007, + 6, + 20 + ] + ] + }, + "references-count": 0, + "journal-issue": { + "issue": "18_suppl", + "published-print": { + "date-parts": [ + [ + 2007, + 6, + 20 + ] + ] + } + }, + "alternative-id": [ + "10.1200/jco.2007.25.18_suppl.3507" + ], + "URL": "http://dx.doi.org/10.1200/jco.2007.25.18_suppl.3507", + "relation": {}, + "ISSN": [ + "0732-183X", + "1527-7755" + ], + "issn-type": [ + { + "value": "0732-183X", + "type": "print" + }, + { + "value": "1527-7755", + "type": "electronic" + } + ], + "subject": [], + "published": { + "date-parts": [ + [ + 2007, + 6, + 20 + ] + ] + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala index c3ea884eb..f6f71ca66 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala @@ -25,8 +25,18 @@ class CrossrefMappingTest extends AbstractVocabularyTest { val input = 
IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8") - println(Crossref2Oaf.convert(input, vocabularies, TransformationType.All)) + Crossref2Oaf.convert(input, vocabularies, TransformationType.All).foreach(record => { + println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(record)) + }) } + + @Test + def mappingAffiliation(): Unit = { + val input = + IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/affiliationTest.json"), "utf-8") + val data = Crossref2Oaf.convert(input, vocabularies, TransformationType.OnlyResult) + println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(data.head)) + } } From 5d344323983e5aa6cff4cd02557bf40d0dc6c5dd Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 19 Nov 2024 15:12:04 +0100 Subject: [PATCH 081/111] align MergeUtils with beta branch --- .../dhp/schema/oaf/utils/MergeUtils.java | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java index cd8506583..c092f6035 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java @@ -74,29 +74,29 @@ public class MergeUtils { if (!vocs.vocabularyExists(ModelConstants.DNET_RESULT_TYPOLOGIES)) { return (T) mergedResult; } else { - final Qualifier expectedResultType = vocs - .lookupTermBySynonym( - ModelConstants.DNET_RESULT_TYPOLOGIES, - i.getInstancetype().getClassid()); - - if (Objects.isNull(expectedResultType)) { - throw new IllegalArgumentException( - "instance type not bound to any result type in dnet:result_typologies: " + - i.getInstancetype().getClassid()); - } + final String expectedResultType = Optional + .ofNullable( + vocs + .lookupTermBySynonym( + ModelConstants.DNET_RESULT_TYPOLOGIES, i.getInstancetype().getClassid())) + .orElse(ModelConstants.ORP_DEFAULT_RESULTTYPE) + .getClassid(); // there is a clash among the result types - if (!expectedResultType.getClassid().equals(mergedResult.getResulttype().getClassid())) { - try { - String resulttype = expectedResultType.getClassid(); - if (EntityType.otherresearchproduct.toString().equals(resulttype)) { - resulttype = "other"; - } - Result result = (Result) ModelSupport.oafTypes.get(resulttype).newInstance(); - return (T) mergeResultFields(result, mergedResult); - } catch (InstantiationException | IllegalAccessException e) { - throw new IllegalStateException(e); - } + if (!expectedResultType.equals(mergedResult.getResulttype().getClassid())) { + + Result result = (Result) Optional + .ofNullable(ModelSupport.oafTypes.get(expectedResultType)) + .map(r -> { + try { + return r.newInstance(); + } catch (InstantiationException | IllegalAccessException e) { + throw new IllegalStateException(e); + } + }) + .orElse(new OtherResearchProduct()); + result.setId(mergedResult.getId()); + return (T) mergeResultFields(result, mergedResult); } else { return (T) mergedResult; } From 4e55ddc547451c18b9e14cce8244015e6825b3cf Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Tue, 19 Nov 2024 16:50:42 +0100 Subject: [PATCH 082/111] [PubMed aggregation] storing contents into mdStoreVersion/store --- .../dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git 
a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala index 1bdd2a4bc..adac9ffb6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreatePubmedDump.scala @@ -2,9 +2,13 @@ package eu.dnetlib.dhp.sx.bio.ebi import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.application.AbstractScalaApplication +import eu.dnetlib.dhp.common.Constants +import eu.dnetlib.dhp.common.Constants.{MDSTORE_DATA_PATH, MDSTORE_SIZE_PATH} import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.schema.mdstore.MDStoreVersion import eu.dnetlib.dhp.sx.bio.pubmed.{PMArticle, PMParser2, PubMedToOaf} +import eu.dnetlib.dhp.transformation.TransformSparkJobNode +import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile import eu.dnetlib.dhp.utils.ISLookupClientFactory import org.apache.spark.sql.{Encoder, Encoders, SparkSession} import org.slf4j.{Logger, LoggerFactory} @@ -82,7 +86,10 @@ class SparkCreatePubmedDump(propertyPath: String, args: Array[String], log: Logg .write .option("compression", "gzip") .mode("overwrite") - .text(targetPath) + .text(targetPath + MDSTORE_DATA_PATH) + + val mdStoreSize = spark.read.text(targetPath + MDSTORE_DATA_PATH).count + writeHdfsFile(spark.sparkContext.hadoopConfiguration, "" + mdStoreSize, targetPath + MDSTORE_SIZE_PATH) } } From 496007188a720b18ce8301b6050c6e0b924522d1 Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Wed, 20 Nov 2024 09:50:09 +0100 Subject: [PATCH 083/111] Added assertion on CrossrefMappingTest --- .../crossref/CrossrefMappingTest.scala | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala index f6f71ca66..12ca14ba1 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala @@ -3,12 +3,15 @@ package eu.dnetlib.dhp.collection.crossref import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType +import eu.dnetlib.dhp.schema.oaf.Publication import org.apache.commons.io.IOUtils -import org.junit.jupiter.api.{BeforeEach, Test} +import org.junit.jupiter.api.{Assertions, BeforeEach, Test} import org.junit.jupiter.api.extension.ExtendWith import org.mockito.junit.jupiter.MockitoExtension import org.slf4j.{Logger, LoggerFactory} +import scala.collection.JavaConverters.asScalaBufferConverter + @ExtendWith(Array(classOf[MockitoExtension])) class CrossrefMappingTest extends AbstractVocabularyTest { @@ -26,7 +29,7 @@ class CrossrefMappingTest extends AbstractVocabularyTest { IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8") Crossref2Oaf.convert(input, vocabularies, TransformationType.All).foreach(record => { - println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(record)) + Assertions.assertNotNull(record) }) } @@ -37,6 +40,16 @@ class CrossrefMappingTest 
From 496007188a720b18ce8301b6050c6e0b924522d1 Mon Sep 17 00:00:00 2001
From: "sandro.labruzzo"
Date: Wed, 20 Nov 2024 09:50:09 +0100
Subject: [PATCH 083/111] Added assertion on CrossrefMappingTest

---
 .../crossref/CrossrefMappingTest.scala | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala
index f6f71ca66..12ca14ba1 100644
--- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala
+++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala
@@ -3,12 +3,15 @@ package eu.dnetlib.dhp.collection.crossref
 import com.fasterxml.jackson.databind.ObjectMapper
 import eu.dnetlib.dhp.aggregation.AbstractVocabularyTest
 import eu.dnetlib.dhp.collection.crossref.Crossref2Oaf.TransformationType
+import eu.dnetlib.dhp.schema.oaf.Publication
 import org.apache.commons.io.IOUtils
-import org.junit.jupiter.api.{BeforeEach, Test}
+import org.junit.jupiter.api.{Assertions, BeforeEach, Test}
 import org.junit.jupiter.api.extension.ExtendWith
 import org.mockito.junit.jupiter.MockitoExtension
 import org.slf4j.{Logger, LoggerFactory}

+import scala.collection.JavaConverters.asScalaBufferConverter
+
 @ExtendWith(Array(classOf[MockitoExtension]))
 class CrossrefMappingTest extends AbstractVocabularyTest {
@@ -26,7 +29,7 @@ class CrossrefMappingTest
       IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8")

     Crossref2Oaf.convert(input, vocabularies, TransformationType.All).foreach(record => {
-      println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(record))
+      Assertions.assertNotNull(record)
     })
   }

@@ -37,6 +40,16 @@ class CrossrefMappingTest extends AbstractVocabularyTest {
     val input =
       IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/affiliationTest.json"), "utf-8")
     val data = Crossref2Oaf.convert(input, vocabularies, TransformationType.OnlyResult)
+    data.foreach(record => {
+      Assertions.assertNotNull(record)
+      Assertions.assertTrue(record.isInstanceOf[Publication])
+      val publication = record.asInstanceOf[Publication]
+      publication.getAuthor.asScala.foreach(author => {
+        Assertions.assertNotNull(author.getRawAffiliationString)
+        Assertions.assertTrue(author.getRawAffiliationString.size()>0)
+
+      })
+    })
     println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(data.head))
   }
 }

From 15227f82b8acecaf7b51cddba83d68bde5f10169 Mon Sep 17 00:00:00 2001
From: Claudio Atzori
Date: Wed, 20 Nov 2024 15:52:40 +0100
Subject: [PATCH 084/111] added related author's given name and family name in the solr json payload serialisation

---
 .../model/ProvisionModelSupport.java | 23 +++++++++++++++++--
 pom.xml | 2 +-
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java
index 738d75189..0da0f6955 100644
--- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java
+++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java
@@ -5,6 +5,7 @@ import java.io.StringReader;
 import java.util.*;
 import java.util.stream.Collectors;

+import eu.dnetlib.dhp.schema.solr.PersonTopic;
 import org.apache.commons.lang3.StringUtils;
 import org.dom4j.Document;
 import org.dom4j.DocumentException;
@@ -39,7 +40,6 @@ import eu.dnetlib.dhp.schema.solr.OpenAccessColor;
 import eu.dnetlib.dhp.schema.solr.OpenAccessRoute;
 import eu.dnetlib.dhp.schema.solr.Organization;
 import eu.dnetlib.dhp.schema.solr.Person;
-import eu.dnetlib.dhp.schema.solr.PersonTopic;
 import eu.dnetlib.dhp.schema.solr.Pid;
 import eu.dnetlib.dhp.schema.solr.Project;
 import eu.dnetlib.dhp.schema.solr.Result;
@@ -174,6 +174,8 @@ public class ProvisionModelSupport {
 			&& StringUtils.isNotBlank(relation.getValidationDate())) {
 			rr.setValidationDate(relation.getValidationDate());
 		}
+		rr.setGivenName(re.getGivenName());
+		rr.setFamilyName(re.getFamilyName());

 		return rr;
 	}
@@ -208,11 +210,28 @@ public class ProvisionModelSupport {
 		ps.setAlternativeNames(p.getAlternativeNames());
 		ps.setBiography(p.getBiography());
 		ps.setConsent(p.getConsent());
-		// ps.setSubject(...));
+		ps.setSubject(mapPersonTopics(p.getSubject()));

 		return ps;
 	}

+	private static List mapPersonTopics(List subjects) {
+		return Optional
+			.ofNullable(subjects)
+			.map(ss -> ss
+				.stream()
+				.map(ProvisionModelSupport::mapPersonTopic)
+				.collect(Collectors.toList()))
+			.orElse(null);
+	}
+
+	private static PersonTopic mapPersonTopic(eu.dnetlib.dhp.schema.oaf.PersonTopic pt) {
+		PersonTopic topic = new PersonTopic();
+		topic.setValue(pt.getValue());
+		topic.setSchema(pt.getSchema());
+		topic.setFromYear(pt.getFromYear());
+		topic.setToYear(pt.getToYear());
+		return topic;
+	}
+
 	private static Funding mapFunding(List fundingtree, VocabularyGroup vocs) {
 		SAXReader reader = new SAXReader();
 		return Optional
diff --git a/pom.xml b/pom.xml
index 9480ddfc0..033d88b0b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -937,7 +937,7 @@
 		1.1.3
 		1.7
 		1.0.7
-		[9.0.0]
+		[10.0.0]
 		cdh5.9.2
 		3.5
 		11.0.2
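The mapPersonTopics/mapPersonTopic pair above follows the null-tolerant mapping idiom used throughout ProvisionModelSupport: a null input list stays null in the Solr payload rather than becoming an empty list, so "field absent" survives serialisation. A generic sketch of the idiom; mapList is a hypothetical helper written for illustration, not code from the patch.

	import java.util.List;
	import java.util.Optional;
	import java.util.function.Function;
	import java.util.stream.Collectors;

	public class NullTolerantMappingSketch {
		// null in -> null out: absent fields stay absent in the serialized payload
		static <S, T> List<T> mapList(List<S> in, Function<S, T> mapper) {
			return Optional
				.ofNullable(in)
				.map(ss -> ss.stream().map(mapper).collect(Collectors.toList()))
				.orElse(null);
		}
	}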
From e5b04e61ff8f7f7b1d80f607e0fcb4da7c7c9f37 Mon Sep 17 00:00:00 2001
From: Miriam Baglioni
Date: Thu, 21 Nov 2024 10:20:12 +0100
Subject: [PATCH 085/111] [CommunityPatents] extends the community propagation
 considering also the results of type patent linked with isRelatedTo semantics

---
 .../main/java/eu/dnetlib/dhp/api/Utils.java | 2 +-
 .../PrepareResultCommunitySetStep1.java | 51 ++++++++++++++++---
 .../PrepareResultCommunitySetStep2.java | 18 ++-----
 3 files changed, 49 insertions(+), 22 deletions(-)

diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java
index 6079da365..1e39d99c3 100644
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java
@@ -171,7 +171,7 @@ public class Utils implements Serializable {
 	public static List getCommunityIdList(String baseURL) throws IOException {
 		return getValidCommunities(baseURL)
 			.stream()
-			.map(community -> community.getId())
+			.map(CommunityModel::getId)
 			.collect(Collectors.toList());
 	}

diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java
index aede9ef05..ff496bb87 100644
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java
@@ -3,11 +3,14 @@ package eu.dnetlib.dhp.resulttocommunityfromsemrel;
 import static eu.dnetlib.dhp.PropagationConstant.*;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
+import static java.lang.String.join;

 import java.io.IOException;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.List;

+import eu.dnetlib.dhp.schema.common.ModelConstants;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.sql.*;
@@ -45,7 +48,7 @@ public class PrepareResultCommunitySetStep1 {
 	/**
 	 * a dataset for example could be linked to more than one publication. For each publication linked to that dataset
-	 * the previous query will produce a row: targetId set of community context the target could possibly inherit with
+	 * the previous query will produce a row: targetId, set of community context the target could possibly inherit. With
 	 * the following query there will be a single row for each result linked to more than one result of the result type
 	 * currently being used
 	 */
@@ -56,6 +59,16 @@
 		+ "where length(co) > 0 "
 		+ "group by resultId";

+	private static final String RESULT_CONTEXT_QUERY_TEMPLATE_IS_RELATED_TO = "select target resultId, community_context "
+		+ "from (select id, collect_set(co.id) community_context "
+		+ "      from result "
+		+ "      lateral view explode (context) c as co "
+		+ "      where datainfo.deletedbyinference = false %s "
+		+ "      and array_contains(instance.instancetype.classname, 'Patent') group by id) p "
+		+ "      JOIN "
+		+ "      (select source, target from relation "
+		+ "      where datainfo.deletedbyinference = false %s ) r ON p.id = r.source";
+
 	public static void main(String[] args) throws Exception {
 		String jsonConfiguration = IOUtils
 			.toString(
@@ -82,15 +95,20 @@
 		SparkConf conf = new SparkConf();
 		conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

-		final List allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
+		final String allowedsemrel = join(",", Arrays.stream(parser.get("allowedsemrels").split(";"))
+			.map(value -> "'" + value.toLowerCase() + "'")
+			.toArray(String[]::new));
+
 		log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));

 		final String baseURL = parser.get("baseURL");
 		log.info("baseURL: {}", baseURL);

-		final List communityIdList = getCommunityList(baseURL);
-		log.info("communityIdList: {}", new Gson().toJson(communityIdList));
+		final String communityIdList = join(",", getCommunityList(baseURL).stream()
+			.map(value -> "'" + value.toLowerCase() + "'")
+			.toArray(String[]::new));
+		log.info("communityIdList: {}", new Gson().toJson(communityIdList));

 		final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
 		log.info("resultType: {}", resultType);
@@ -118,10 +136,10 @@
 		SparkSession spark,
 		String inputPath,
 		String outputPath,
-		List allowedsemrel,
+		String allowedsemrel,
 		Class resultClazz,
 		String resultType,
-		List communityIdList) {
+		String communityIdList) {

 		final String inputResultPath = inputPath + "/" + resultType;
 		log.info("Reading Graph table from: {}", inputResultPath);
@@ -141,8 +159,15 @@
 		String resultContextQuery = String
 			.format(
 				RESULT_CONTEXT_QUERY_TEMPLATE,
-				getConstraintList(" lower(co.id) = '", communityIdList),
-				getConstraintList(" lower(relClass) = '", allowedsemrel));
+				" lower(co.id) IN " + communityIdList,
+				" AND lower(relClass) IN " + allowedsemrel);
+
+		String resultContextQueryIsRelatedTo = String
+			.format(
+				RESULT_CONTEXT_QUERY_TEMPLATE_IS_RELATED_TO,
+				" AND lower(co.id) IN " + communityIdList,
+				"AND lower(relClass) = '" +
+					ModelConstants.IS_RELATED_TO.toLowerCase() + "'");

 		Dataset result_context = spark.sql(resultContextQuery);
 		result_context.createOrReplaceTempView("result_context");
@@ -154,6 +179,16 @@
 			.option("compression", "gzip")
 			.mode(SaveMode.Overwrite)
 			.json(outputResultPath);
+
+		result_context = spark.sql(resultContextQueryIsRelatedTo);
+		result_context.createOrReplaceTempView("result_context");
+		spark
+			.sql(RESULT_COMMUNITY_LIST_QUERY)
+			.as(Encoders.bean(ResultCommunityList.class))
+			.write()
+			.option("compression", "gzip")
+			.mode(SaveMode.Append)
+			.json(outputResultPath);
 	}

 	public static List getCommunityList(final String baseURL) throws IOException {
diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java
index a53d3dfe3..9bebc36e5 100644
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java
@@ -4,6 +4,7 @@ package eu.dnetlib.dhp.resulttocommunityfromsemrel;
 import static eu.dnetlib.dhp.PropagationConstant.*;
 import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

+import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Set;

@@ -76,22 +77,13 @@ public class PrepareResultCommunitySetStep2 {
 					if (b == null) {
 						return a;
 					}
-					Set community_set = new HashSet<>();
-					a.getCommunityList().stream().forEach(aa -> community_set.add(aa));
-					b
-						.getCommunityList()
-						.stream()
-						.forEach(
-							aa -> {
-								if (!community_set.contains(aa)) {
-									a.getCommunityList().add(aa);
-									community_set.add(aa);
-								}
-							});
+					Set community_set = new HashSet<>(a.getCommunityList());
+					community_set.addAll(b.getCommunityList());
+					a.setCommunityList(new ArrayList<>(community_set));
 					return a;
 				})
 				.map(Tuple2::_2)
-				.map(r -> OBJECT_MAPPER.writeValueAsString(r))
+				.map(OBJECT_MAPPER::writeValueAsString)
 				.saveAsTextFile(outputPath, GzipCodec.class);
 	}
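The follow-up commit below reworks the patents query into three temporary views (resultWithContext, relatedToRelations, patents) joined in a single statement; Spark SQL accepts the chained-join form used there. For readability, an equivalent formulation with explicit join conditions is sketched here. This is an illustration of what the views compose to, not code from the patch.

	public class PatentsPropagationSketch {
		// The patent (r.target) inherits the community context collected on the
		// related result (rwc.id = r.source); community-id placeholders are omitted.
		static final String EQUIVALENT_QUERY =
			"select r.target as resultId, rwc.community_context "
				+ "from resultWithContext rwc "
				+ "join relatedToRelations r on rwc.id = r.source "
				+ "join patents p on r.target = p.id";
	}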
From 821700299a31e5201f616fd4235dcd599018d19b Mon Sep 17 00:00:00 2001
From: Miriam Baglioni
Date: Fri, 22 Nov 2024 17:21:58 +0100
Subject: [PATCH 086/111] [patents] added test and resources

---
 .../PrepareResultCommunitySetStep1.java | 93 +++++++++++--------
 .../ResultToCommunityJobTest.java | 55 +++++++++++
 .../graph/publication/part-00000.json | 13 +++
 .../graph/relation/part-00000.json | 24 +++++
 4 files changed, 147 insertions(+), 38 deletions(-)
 create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/graph/publication/part-00000.json
 create mode 100644 dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/graph/relation/part-00000.json

diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java
index ff496bb87..8f23a4cfd 100644
--- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java
+++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java
@@ -37,7 +37,8 @@ public class PrepareResultCommunitySetStep1 {
 	 * relation
 	 */
 	// TODO
-	private static final String RESULT_CONTEXT_QUERY_TEMPLATE = "select target resultId, community_context "
+	private static final String RESULT_CONTEXT_QUERY_TEMPLATE =
+		"select target resultId, community_context "
 		+ "from (select id, collect_set(co.id) community_context "
-		+ "      from result "
-		+ "      lateral view explode (context) c as co "
-		+ "      where datainfo.deletedbyinference = false %s "
-		+ "      and array_contains(instance.instancetype.classname, 'Patent') group by id) p "
-		+ "      JOIN "
-		+ "      (select source, target from relation "
-		+ "      where datainfo.deletedbyinference = false %s ) r ON p.id = r.source";
+	private static final String RESULT_CONTEXT_QUERY_TEMPLATE_IS_RELATED_TO =
+		"select target as resultId, community_context "
+			+ "from resultWithContext rwc "
+			+ "join relatedToRelations r "
+			+ "join patents p "
+			+ "on rwc.id = r.source and r.target = p.id";
+
+	private static final String RESULT_WITH_CONTEXT = "select id, collect_set(co.id) community_context \n"
+		+ "	from result "
+		+ "	lateral view explode (context) c as co "
+		+ "	where datainfo.deletedbyinference = false AND lower(co.id) IN %s"
+		+ "	group by id";
+
+	private static final String RESULT_PATENT = "select id "
+		+ "	from result "
+		+ "	where array_contains(instance.instancetype.classname, 'Patent')";
+
+	private static final String IS_RELATED_TO_RELATIONS = "select source, target "
+		+ "	from relation "
+		+ "	where lower(relClass) = 'isrelatedto' and datainfo.deletedbyinference = false";

 	public static void main(String[] args) throws Exception {
 		String jsonConfiguration = IOUtils
 			.toString(
@@ -95,20 +107,18 @@
 		SparkConf conf = new SparkConf();
 		conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));

-		final String allowedsemrel = join(",", Arrays.stream(parser.get("allowedsemrels").split(";"))
-			.map(value -> "'" + value.toLowerCase() + "'")
-			.toArray(String[]::new));
-
-		log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
+		final String allowedsemrel ="(" + join(",",
+			Arrays.asList(parser.get("allowedsemrels").split(";")).stream().map(value -> "'" + value.toLowerCase() + "'")
+			.toArray(String[]::new)) + ")";
+		log.info("allowedSemRel: {}", allowedsemrel);

 		final String baseURL = parser.get("baseURL");
 		log.info("baseURL: {}", baseURL);

-		final String communityIdList = join(",", getCommunityList(baseURL).stream()
+		final String communityIdList = "(" + join(",", getCommunityList(baseURL).stream()
 			.map(value -> "'" + value.toLowerCase() + "'")
-			.toArray(String[]::new));
-		log.info("communityIdList: {}", new Gson().toJson(communityIdList));
+			.toArray(String[]::new)) + ")";

 		final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
 		log.info("resultType: {}", resultType);
@@ -156,32 +166,38 @@
 		final String outputResultPath = outputPath + "/" + resultType;
 		log.info("writing output results to: {}", outputResultPath);

+
 		String resultContextQuery = String
 			.format(
 				RESULT_CONTEXT_QUERY_TEMPLATE,
-				" lower(co.id) IN " + communityIdList,
-				" AND lower(relClass) IN " + allowedsemrel);
+				"AND lower(co.id) IN " + communityIdList,
+				"AND lower(relClass) IN " + allowedsemrel);
 		Dataset result_context = spark.sql(resultContextQuery);
+		//result_context.createOrReplaceTempView("result_context");
+
+//		spark
+//			.sql(RESULT_COMMUNITY_LIST_QUERY)
+//			.as(Encoders.bean(ResultCommunityList.class))
+//			.write()
+//			.option("compression", "gzip")
+//			.mode(SaveMode.Overwrite)
+//			.json(outputResultPath);
+
+		Dataset rwc = spark.sql(String.format(RESULT_WITH_CONTEXT, communityIdList));
+		Dataset patents = spark.sql(RESULT_PATENT);
+		Dataset relatedToRelations = spark.sql(IS_RELATED_TO_RELATIONS);
+
+		rwc.createOrReplaceTempView("resultWithContext");
+
+		patents.createOrReplaceTempView("patents");
+
+		relatedToRelations.createOrReplaceTempView("relatedTorelations");
+
+
+		result_context = result_context.unionAll(spark.sql(RESULT_CONTEXT_QUERY_TEMPLATE_IS_RELATED_TO));
+
 		result_context.createOrReplaceTempView("result_context");
+
 		spark
 			.sql(RESULT_COMMUNITY_LIST_QUERY)
 			.as(Encoders.bean(ResultCommunityList.class))
 			.write()
 			.option("compression", "gzip")
 			.mode(SaveMode.Append)
 			.json(outputResultPath);
+
 	}

 	public static List getCommunityList(final String baseURL) throws IOException {
diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java
index 0d5b12c80..4361b6f39 100644
--- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java
+++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java
@@ -6,8 +6,11 @@ import static org.apache.spark.sql.functions.desc;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
+import java.util.ArrayList;
 import java.util.List;
+import java.util.stream.Collectors;

+import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
 import org.apache.commons.io.FileUtils;
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaRDD;
@@ -25,6 +28,7 @@
 import org.slf4j.LoggerFactory;

 import com.fasterxml.jackson.databind.ObjectMapper;

 import eu.dnetlib.dhp.schema.oaf.Dataset;
+import scala.collection.Seq;

 public class ResultToCommunityJobTest {

@@ -271,4 +275,55 @@ public class ResultToCommunityJobTest {
 				.get(0)
 				.getString(0));
 	}
+
+	@Test
+	public void prepareStep1Test() throws Exception {
+		/*
+
+
+		final String allowedsemrel = join(",", Arrays.stream(parser.get("allowedsemrels").split(";"))
+			.map(value -> "'" + value.toLowerCase() + "'")
+			.toArray(String[]::new));
+
+		log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
+
+		final String baseURL = parser.get("baseURL");
+		log.info("baseURL: {}", baseURL);
+		 */
+		PrepareResultCommunitySetStep1
+			.main(
+				new String[] {
+					"-isSparkSessionManaged", Boolean.FALSE.toString(),
+					"-sourcePath", getClass()
+						.getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/graph")
+						.getPath(),
+					"-hive_metastore_uris", "",
+					"-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication",
+					"-outputPath", workingDir.toString() + "/preparedInfo",
+					"-allowedsemrels","issupplementto;issupplementedby",
+					"-baseURL","https://dev-openaire.d4science.org/openaire/community/"
+				});
+
+
+		org.apache.spark.sql.Dataset resultCommunityList = spark.read().schema(Encoders.bean(ResultCommunityList.class).schema())
+			.json(workingDir.toString() + "/preparedInfo/publication")
+			.as(Encoders.bean(ResultCommunityList.class));
+
+		Assertions.assertEquals(2, resultCommunityList.count());
+		Assertions.assertEquals(1,resultCommunityList.filter("resultId = '50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783'").count());
+		Assertions.assertEquals(1,resultCommunityList.filter("resultId = '50|pending_org_::82f63b2d21ae88596b9d8991780e9888'").count());
+
+		ArrayList communities = resultCommunityList
+			.filter("resultId = '50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783'")
+			.first().getCommunityList();
+		Assertions.assertEquals(2, communities.size());
+		Assertions.assertTrue(communities.stream().anyMatch(cid -> "beopen".equals(cid)));
+		Assertions.assertTrue(communities.stream().anyMatch(cid -> "dh-ch".equals(cid)));
+
+		communities = resultCommunityList
+			.filter("resultId = '50|pending_org_::82f63b2d21ae88596b9d8991780e9888'")
+			.first().getCommunityList();
+		Assertions.assertEquals(1, communities.size());
+		Assertions.assertEquals("dh-ch", communities.get(0));
+	}
 }
diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/graph/publication/part-00000.json b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/graph/publication/part-00000.json
new file mode 100644
index 000000000..7957bcfd5
--- /dev/null
+++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/graph/publication/part-00000.json
@@ -0,0 +1,13 @@
+{"author":[{"fullname":"Tanouayi, Gnon","name":"Gnon","pid":[],"rank":1,"surname":"Tanouayi"},{"fullname":"GNANDI, Kissao","name":"Kissao","pid":[],"rank":2,"surname":"Gnandi"},{"fullname":"Ouro-Sama, Kamilou","name":"Kamilou","pid":[],"rank":3,"surname":"Ouro-Sama"},{"fullname":"Ahoudi, Housséni","name":"Housséni","pid":[],"rank":4,"surname":"Ahoudi"},{"fullname":"Solitoke, Hodabalo Dhéoulaba","name":"Hodabalo Dhéoulaba","pid":[],"rank":5,"surname":"Solitoke"},{"fullname":"Badassan, Tchaa Esso-Essinam","name":"Tchaa Esso-Essinam","pid":[],"rank":6,"surname":"Badassan"},{"fullname":"Nyametso, A. 
Yawovi","pid":[],"rank":7,"surname":"Nyametso"},{"fullname":"Agbéko, Aduayi-Akué Adoté","name":"Aduayi-Akué Adoté","pid":[],"rank":8,"surname":"Agbéko"}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|eurocrisdris::fe4903425d9040f680d8610d9079ea14","value":"Episciences"}],"context":[],"contributor":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Importer of dst articles previously hosted by inist Eid system account"}],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2017-05-01"},"dateofcollection":"2022-03-01T00:17:08.896Z","dateoftransformation":"2022-03-01T02:57:50.566Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"This study is a contribution to the development of adsorption techniques for the removal of fluoride in natural waters. The work is carried out on a laboratory scale using local geo-materials sorbents, on the one hand the residues from the treatment of natural phosphorite of Hahotoé-Kpogamé and on the other hand the attapulgite clay mineral from the costal basin of Togo. The work carried out concerns the adsorption of fluoride on those sorbents. The following parameters are batch tested on synthetic fluoride solutions: time, solution pH, geo-material dose and fluoride concentration. The fluoride is analyzed by absorption spectrometry. The adsorption yields on the phosphorite treatment residues for aqueous fluoride solutions at an initial concentration of 10 mg/L and an adsorbent concentration of 10 g/L are 49 % at pH 6.5 and 66 % at pH 4.0. In the same experimental conditions, the yields on clay minerals are 28.2 % and 36.3 %. These yields are logically improved by increasing the adsorbent dosage (from 2 to 30 g/L). Additional tests are carried out on natural water at an initial fluoride concentration of 3.76 mg/L."},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Cette étude est une contribution au développement de techniques d’adsorption pour l’élimination du fluor dans les eaux naturelles. 
Les travaux ont été réalisés à l’échelle du laboratoire en utilisant comme sorbants des géo-matériaux locaux, d’une part les résidus du traitement des phosphates naturels de Hahotoé-Kpogamé et d’autre part l’argilite feuilletée du bassin sédimentaire côtier du Togo. Les travaux réalisés concernent l’adsorption du fluor sur les sorbants considérés. Les paramètres suivants ont été testés en batch sur des solutions synthétiques de fluor : le temps, le pH de la solution, la dose du géo-matériau et la concentration du fluor. Le fluor a été dosé par spectromètrie d’absorption. Les rendements d’adsorption sur les résidus de traitement des phosphates pour des solutions aqueuses de fluor à concentration initiale de 10 mg/L et une concentration en adsorbant de 10 g/L ont été de 49 % à pH 6,5 et 66 % à pH de 4,0. Dans les mêmes conditions expérimentales, les rendements sur les argilites ont été de 28,2 % et 36,3 %. Ces rendements ont logiquement été améliorés en augmentant le dosage en adsorbant (de 2 à 30 g/L). Des essais complémentaires ont été réalisés sur une eau naturelle à une concentration initiale en fluor de 3,76 mg/L."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|06cdd3ff4700::49ec404cee4e1452808aabeaffbd3072","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.4267/dechets-sciences-techniques.3534"}],"collectedfrom":{"key":"10|openaire____::6824b298c96ba906a3e6a70593affbf5","value":"Episciences"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2017-05-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::6824b298c96ba906a3e6a70593affbf5","value":"Episciences"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://eid.episciences.org/7781"]}],"language":{"classid":"fra/fre","classname":"French","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646506202085,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Foai.episciences.org%2F","datestamp":"2017-05-01","harvestDate":"2022-03-01T00:17:08.896Z","identifier":"oai:episciences.org:eid:7781","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["oai:episciences.org:eid:7781","50|06cdd3ff4700::49ec404cee4e1452808aabeaffbd3072"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{
"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"ISSN: 2778-844X"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Environnement, Ingénierie & Développement"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Episciences.org"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"eid:7781 - Environnement, Ingénierie & Développement, 2017-05-01, N°73 - mai 2017"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"clay"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"geo-materials"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"phosphorite of Hahotoé-Kpogamé"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"removal of 
fluoride"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"argilite"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"élimination du fluor"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"géo-matériaux"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"phosphate de Hahotoé-Kpogamé"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"[SDE.IE]Environmental Sciences/Environmental Engineering"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Défluoruration des eaux à l’aide des résidus du traitement des phosphates naturels et des argilites feuilletées"}]} +{"author":[],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::160a261e5d06fd542c2efcac6e17e08c","value":"RS Global 
Journals"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2016-02-28"},"dateofcollection":"2020-06-01T07:11:47.22Z","dateoftransformation":"2020-07-25T07:25:11.051Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"The results of treatment of 21 patients with multiple injuries, including 12 (57.2%) patients with lesions of limb bones, 9 (42.8%) patients with injuries of the pelvis treated at the Department of Traumatology number 2, 5 for the period of 2013 to 2014 were analyzed by the authors. Developed gentle immobilization of the lower limbs in patients with multiple injuries provides adequate fixation and extension of the lower limb in the intensive care period and during emergency external fixation by the device of external fixation device shin bone or hip.The introduction of surgical treatment of patients in the acute period of polytrauma, using minimally invasive fracture fixation technology allowed providing the early stabilization of the victim’s condition, to avoid diagnostic errors and obtain positive results of treatment in 98% of cases."}],"externalReference":[],"extraInfo":[],"format":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"application/pdf"}],"fulltext":[],"id":"50|07b5c0ccd4fe::e7f5459cc97865f2af6e3da964c1250b","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::160a261e5d06fd542c2efcac6e17e08c","value":"RS Global Journals"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2016-02-28"},"distributionlocation":"","hostedby":{"key":"10|eurocrisdris::fe4903425d9040f680d8610d9079ea14","value":"World 
Science"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"license":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"https://creativecommons.org/licenses/by/4.0"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://rsglobal.pl/index.php/ws/article/view/895"]}],"journal":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"edition":"","ep":"","iss":"","issnLinking":"","issnOnline":"2413-1032","issnPrinted":"2414-6404","name":"World Science","sp":"","vol":""},"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646506135978,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Frsglobal.pl%2Findex.php%2Findex%2Foai","datestamp":"2020-05-23T18:09:46Z","harvestDate":"2020-06-01T07:11:47.22Z","identifier":"oai:ojs2.rsglobal.pl:article/895","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["oai:ojs2.rsglobal.pl:article/895","50|07b5c0ccd4fe::e7f5459cc97865f2af6e3da964c1250b"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"RS Global Sp. 
z O.O."},"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"World Science; Vol 3 No 2(6) (2016): World Science; 43-50"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"World Science; Том 3 № 2(6) (2016): World Science; 43-50"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2414-6404"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2413-1032"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Polytrauma"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"multiple trauma"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"combined injury injury"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"the severity of the 
damage"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"the device of external fixator"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"submersible osteosynthesis"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"«damage control»"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"Polytrauma"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"multiple trauma"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"combined injury injury"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"the severity of the 
damage"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"the device of external fixator"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"submersible osteosynthesis"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:subject_classification_typologies","schemename":"dnet:subject_classification_typologies"},"value":"«damage control»"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"ЛЕЧЕНИЯ ПАЦИЕНТОВ С СОЧЕТАННЫМИ И МНОЖЕСТВЕННЫМИ ТРАВМАМИ КОНЕЧНОСТЕЙ И КОСТЕЙ ТАЗА"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"ЛЕЧЕНИЯ ПАЦИЕНТОВ С СОЧЕТАННЫМИ И МНОЖЕСТВЕННЫМИ ТРАВМАМИ КОНЕЧНОСТЕЙ И КОСТЕЙ ТАЗА"}]} +{"author":[{"fullname":"Kemppainen, Mika","name":"Mika","pid":[],"rank":1,"surname":"Kemppainen"},{"fullname":"Virkkunen, Iikka","name":"Iikka","pid":[],"rank":2,"surname":"Virkkunen"},{"fullname":"Pitkänen, Jorma","name":"Jorma","pid":[],"rank":3,"surname":"Pitkänen"},{"fullname":"Paussu, Raimo","name":"Raimo","pid":[],"rank":4,"surname":"Paussu"},{"fullname":"Hänninen, Hannu","name":"Hannu","pid":[],"rank":5,"surname":"Hänninen"}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information 
System"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2003-01-01"},"dateofcollection":"2022-02-28T12:34:39.606Z","dateoftransformation":"2022-02-28T14:02:46.142Z","description":[],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|355e65625b88::210c52944502777ba567442480e6a76e","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|eurocrisdris::fe4903425d9040f680d8610d9079ea14","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2003-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/66c69ab9-a33b-4817-97cd-1f38b3b31405"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505945226,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2019-09-23T08:45:14Z","harvestDate":"2022-02-28T12:34:39.606Z","identifier":"oai:cris.vtt.fi:publications/66c69ab9-a33b-4817-97cd-1f38b3b31405","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|355e65625b88::210c52944502777ba567442480e6a76e","oai:cris.vtt.fi:publications/66c69ab9-a33b-4817-97cd-1f38b3b31405"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Kemppainen , M , Virkkunen , I , Pitkänen , J , Paussu , R & Hänninen , H 2003 , ' Comparison of realistic artificial cracks and in-service cracks ' , The e-Journal of Nondestructive Testing & Ultrasonics , vol. 8 , no. 3 , 6 . 
< http://www.ndt.net/article/ecndt02/401/401.htm >"}],"subject":[],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Comparison of realistic artificial cracks and in-service cracks"}]} +{"author":[{"fullname":"Kelhä, Väinö","name":"Väinö","pid":[],"rank":1,"surname":"Kelhä"},{"fullname":"Manninen, M.","name":"M.","pid":[],"rank":2,"surname":"Manninen"},{"fullname":"Oittinen, P.","name":"P.","pid":[],"rank":3,"surname":"Oittinen"},{"fullname":"Tiesmäki, Jarkko","name":"Jarkko","pid":[],"rank":4,"surname":"Tiesmäki"}],"bestaccessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1974-01-01"},"dateofcollection":"2022-02-28T12:32:28.547Z","dateoftransformation":"2022-02-28T14:46:21.4Z","description":[],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|355e65625b88::3754cff043a1700077031ea29f8cc240","instance":[{"accessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1974-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information 
System"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/d2b5302f-004a-407c-8f9c-4312e0dbf679"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505968213,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2021-08-31T11:55:56Z","harvestDate":"2022-02-28T12:32:28.547Z","identifier":"oai:cris.vtt.fi:publications/d2b5302f-004a-407c-8f9c-4312e0dbf679","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|355e65625b88::3754cff043a1700077031ea29f8cc240","oai:cris.vtt.fi:publications/d2b5302f-004a-407c-8f9c-4312e0dbf679"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Kelhä , V , Manninen , M , Oittinen , P & Tiesmäki , J 1974 , ' A parallel plate tackmeter for measuring the splitting resistance of printing inks ' , Surface Coatings International: JOCCA , vol. 57 , pp. 184-188 ."}],"subject":[],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"A parallel plate tackmeter for measuring the splitting resistance of printing inks"}]} +{"author":[{"fullname":"Mononen, Petri","name":"Petri","pid":[],"rank":1,"surname":"Mononen"},{"fullname":"Innamaa, Satu","name":"Satu","pid":[],"rank":2,"surname":"Innamaa"}],"bestaccessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information 
System"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2013-01-01"},"dateofcollection":"2022-02-28T12:38:14.128Z","dateoftransformation":"2022-02-28T14:48:17.052Z","description":[],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|355e65625b88::3875365f5052758953b072682e62bc80","instance":[{"accessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2013-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/8602dae4-00e8-4f45-828b-65a367eb4730"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505969378,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2021-09-17T12:01:51Z","harvestDate":"2022-02-28T12:38:14.128Z","identifier":"oai:cris.vtt.fi:publications/8602dae4-00e8-4f45-828b-65a367eb4730","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["oai:cris.vtt.fi:publications/8602dae4-00e8-4f45-828b-65a367eb4730","50|355e65625b88::3875365f5052758953b072682e62bc80"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Mononen , P & Innamaa , S 2013 , ' Enhancing journey quality : Field Operational Test of Aftermarket and Nomadic Devices in Vehicles ' , Baltic Transport Journal , pp. 
46-47 ."}],"subject":[],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Enhancing journey quality:Field Operational Test of Aftermarket and Nomadic Devices in Vehicles"}]} +{"author":[{"fullname":"Tsupari, Eemeli","name":"Eemeli","pid":[],"rank":1,"surname":"Tsupari"}],"bestaccessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539","value":"VTT Research Information System"},{"key":"10|opendoar____::f0dd4a99fba6075a9494772b58f95280","value":"VTT Research Information System"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2018-10-01"},"dateofcollection":"2022-02-28T12:38:39.075Z","dateoftransformation":"2022-02-28T15:37:43.154Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Despite international agreements, global greenhouse gas (GHG) emissions have not decreased according to the targets. Consequently, our generation is creating an enormous problem for future generations. As climate change is a global problem, GHG emissions must decrease globally. Consequently, international policies are needed, actions should be effective and the impacts should be assessed with broad boundaries. In Europe, the cornerstone of climate policy is the EU Emissions Trading Scheme (EU ETS) but the rebound impacts within the EU ETS are often excluded in the assessments. This dissertation examines the impacts of major CO2 emission reduction solutions with different system boundaries, highlighting the importance of boundary selection on the results. In addition, the economic feasibilities of the selected solutions are evaluated.The case examples represent the most important sectors in terms of global CO2 emissions, such as electricity and heat production, the steel industry and transport. The studied technologies include efficient Waste-to-Energy (WtE) concepts with high power-to-heat ratio, utilisation of CO2 Capture and Storage (CCS) in different applications, replacing steel mill blast furnaces with Oxygen Blast Furnaces (OBF), Combined Heat and Power (CHP) and Carbon Capture and Utilisation (CCU) for storable fuels, which can be used for example in transportation. 
The results highlight the importance of the consequences in the electricity production system as well as the rebound impacts in the EU ETS. For example, the studied concepts to decrease direct GHG emissions of steel mills lead to increased power purchase from markets and consequently increase in emissions of the power system. The impacts of CCU concepts based on electrolysis increase the emissions in electricity production but enable a decrease in the usage of fossil fuels in transportation. In addition, converting electricity to storable fuels enable higher shares of variable solar and wind energy in the power systems. The consequences in the power systems are complex, including for example the impacts on electricity imports and exports, future investments and the EU ETS. Even if these impacts can be recognised by qualitative means, unambiguous quantitative consequences cannot be given. Understanding the decisive impacts of the framework and boundaries is crucial to interpreting different assessments and making effective actions and policy decisions. Solutions which decrease emissions within a narrow system boundary can actually increase the emissions of the broader system."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|355e65625b88::54a1c76f520bb2c8da27d12e42891088","instance":[{"accessright":{"classid":"OPEN","classname":"Open Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2018-10-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0006","classname":"Doctoral thesis","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/d62ac5ef-7347-400f-95b2-59d970ceb505"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505683219,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2021-05-18T10:33:00Z","harvestDate":"2022-02-28T12:38:39.075Z","identifier":"oai:cris.vtt.fi:publications/d62ac5ef-7347-400f-95b2-59d970ceb505","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["oai:cris.vtt.fi:publications/d62ac5ef-7347-400f-95b2-59d970ceb505","50|355e65625b88::54a1c76f520bb2c8da27d12e42891088"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Aalto 
University"},"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Tsupari , E 2018 , ' Impact of system boundaries on the effectiveness of climate change mitigation actions : Dissertation ' , Doctor Degree , Aalto University . < http://urn.fi/URN:ISBN:978-952-60-8358-2 >"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"energy"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"environmental science"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"climate change mitigation"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"greenhouse gases"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"carbon dioxide"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"emissions 
trading"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"economic feasibility"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"/dk/atira/pure/sustainabledevelopmentgoals/climate_action"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"SDG 13 - Climate Action"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Impact of system boundaries on the effectiveness of climate change mitigation actions:Dissertation"}]} +{"author":[{"fullname":"Turkia, Heidi","name":"Heidi","pid":[],"rank":1,"surname":"Turkia"},{"fullname":"Sirén, Heli","name":"Heli","pid":[],"rank":2,"surname":"Sirén"},{"fullname":"Penttilä, Merja","name":"Merja","pid":[],"rank":3,"surname":"Penttilä"},{"fullname":"Pitkänen, Juha Pekka","name":"Juha Pekka","pid":[],"rank":4,"surname":"Pitkänen"}],"bestaccessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2013-02-22"},"dateofcollection":"2022-02-28T12:29:51.291Z","dateoftransformation":"2022-02-28T16:19:35.201Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"
\n\nHydrolysates of lignocellulosic biomass, used as substrates for the sustainable production of fuels and chemicals often contain high amounts of phenolic compounds inhibiting the production microbiota. Quantification of these inhibitor compounds may help to understand possible difficulties in bioprocessing and further the development of more efficient, robust and tolerable processes. A separation method based on capillary electrophoresis with UV detection was developed for the simultaneous quantification of 10 phenolic compounds that may have inhibitor properties. Intraday relative standard deviations were less than 0.7% for migration times and between 2.6% and 6.4% for peak areas. Interday relative standard deviations were less than 3.0% for migration times and between 5.0% and 7.2% for peak areas. The method was applied to demonstrate that Saccharomyces cerevisiae was able to decrease the concentrations of vanillin, coniferyl aldehyde, syringaldehyde, acetoguaiacone and cinnamic acid during the cultivation, whereas the concentrations of phenols increased.\n\n
"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|355e65625b88::6c232359e3b3165574cb88f0554d9264","instance":[{"accessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1016/j.chroma.2013.01.004"}],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2013-02-22"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/91f411d0-d8f8-4bf1-9072-345303cc776c"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505708387,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2022-01-29T08:16:51Z","harvestDate":"2022-02-28T12:29:51.291Z","identifier":"oai:cris.vtt.fi:publications/91f411d0-d8f8-4bf1-9072-345303cc776c","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["oai:cris.vtt.fi:publications/91f411d0-d8f8-4bf1-9072-345303cc776c","50|355e65625b88::6c232359e3b3165574cb88f0554d9264"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Turkia , H , Sirén , H , Penttilä , M & Pitkänen , J P 2013 , ' Capillary electrophoresis for the monitoring of phenolic compounds in bioprocesses ' , Journal of Chromatography A , vol. 1278 , pp. 175-180 . 
https://doi.org/10.1016/j.chroma.2013.01.004"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"Bioprocess monitoring"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"Capillary electrophoresis"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"Phenolic compounds"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Capillary electrophoresis for the monitoring of phenolic compounds in bioprocesses"}]} +{"author":[{"fullname":"Veijalainen, Jari","name":"Jari","pid":[],"rank":1,"surname":"Veijalainen"},{"fullname":"Wolski, Antoni","name":"Antoni","pid":[],"rank":2,"surname":"Wolski"}],"bestaccessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[{"dataInfo": null,"id": "dariah"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1991-01-01"},"dateofcollection":"2022-02-28T12:33:57.005Z","dateoftransformation":"2022-02-28T16:33:35.101Z","description":[],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|355e65625b88::74009c567c81b4aa55c813db658734df","instance":[{"accessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information 
System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1991-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0002","classname":"Book","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/bb17c77a-f574-4921-a5cb-32dc1f283fa3"]},{"accessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1991-01-01"},"distributionlocation":"","hostedby":{"key":"10|eurocrisdris::fe4903425d9040f680d8610d9079ea14","value":"VTT Research Information System"},"instancetype":{"classid":"0002","classname":"Book","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/bb17c77a-f574-4921-a5cb-32dc1f283fa3"]}, {"accessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1991-01-01"},"distributionlocation":"","hostedby":{"key":"10|opendoar____::f0dd4a99fba6075a9494772b58f95280","value":"VTT Research Information 
System"},"instancetype":{"classid":"0002","classname":"Book","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/bb17c77a-f574-4921-a5cb-32dc1f283fa3"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505716994,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2020-12-21T07:05:54Z","harvestDate":"2022-02-28T12:33:57.005Z","identifier":"oai:cris.vtt.fi:publications/bb17c77a-f574-4921-a5cb-32dc1f283fa3","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|355e65625b88::74009c567c81b4aa55c813db658734df","oai:cris.vtt.fi:publications/bb17c77a-f574-4921-a5cb-32dc1f283fa3"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"VTT Technical Research Centre of Finland"},"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Veijalainen , J & Wolski , A 1991 , Prepare and commit certification for decentralized transaction management in rigorous multidatabases : Research Report No. J-1 . VTT Technical Research Centre of Finland ."}],"subject":[],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Prepare and commit certification for decentralized transaction management in rigorous multidatabases:Research Report No. 
J-1"}]} +{"author":[{"fullname":"Hanhijärvi, Antti","name":"Antti","pid":[],"rank":1,"surname":"Hanhijärvi"},{"fullname":"Hukka, A.","name":"A.","pid":[],"rank":2,"surname":"Hukka"},{"fullname":"Paajanen, T.","name":"T.","pid":[],"rank":3,"surname":"Paajanen"},{"fullname":"Pulkkinen, P.","name":"P.","pid":[],"rank":4,"surname":"Pulkkinen"},{"fullname":"Sundman, S.","name":"S.","pid":[],"rank":5,"surname":"Sundman"}],"bestaccessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2003-01-01"},"dateofcollection":"2022-02-28T12:32:33.974Z","dateoftransformation":"2022-02-28T17:38:24.191Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"The paper presents experimental results of jet drying tests of birch (Betula pendula) and spruce (Picea abies) veneers at temperatures of 140°C and 190°C. Results include drying rates for 1.5 mm thick birch veneers as well as 1.5 mm and 2.6 mm thick spruce veneers of both heartwood and sapwood. Based on the test results, material parameter values for a simplified drying model are assessed. The model is based on the use of an effective diffusion coefficient and an effective surface emission coefficient, which values are calibrated to fit to the experimental results. 
It is observed, that separate model parameter sets are needed for the two different species but also for occurrence of heartwood or sapwood (spruce) and different thickness values of veneers."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|dedup_wf_001::08d6f2001319c86d0e69b0f83ad75df2","instance":[{"accessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1007/s00107-003-0379-4"}],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2003-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/8cc91100-904f-43c5-bb3d-1cc3e0f4a4b5"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505760180,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2022-01-31T21:18:51Z","harvestDate":"2022-02-28T12:32:33.974Z","identifier":"oai:cris.vtt.fi:publications/8cc91100-904f-43c5-bb3d-1cc3e0f4a4b5","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|355e65625b88::9cb10895b4a92b0215b85acb2c3268b9","oai:cris.vtt.fi:publications/8cc91100-904f-43c5-bb3d-1cc3e0f4a4b5"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Hanhijärvi , A , Hukka , A , Paajanen , T , Pulkkinen , P & Sundman , S 2003 , ' Experimental investigation of jet drying of birch and spruce veneers and modelling with a simplified approach ' , Holz als Roh- und Werkstoff , vol. 61 , no. 2 , pp. 83-88 . 
https://doi.org/10.1007/s00107-003-0379-4"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"jet drying"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"drying"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"veneers"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"birch"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"spruce"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"heartwood"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"sapwood"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Experimental investigation of jet drying of birch and spruce veneers and modelling with a simplified approach"}]} 
+{"author":[{"fullname":"Vainonen-Ahlgren, Elizaveta","name":"Elizaveta","pid":[],"rank":1,"surname":"Vainonen-Ahlgren"},{"fullname":"Likonen, Jari","name":"Jari","pid":[],"rank":2,"surname":"Likonen"},{"fullname":"Renvall,","pid":[],"rank":3},{"fullname":"Rohde, V.","name":"V.","pid":[],"rank":4,"surname":"Rohde"},{"fullname":"Mayer, M.","name":"M.","pid":[],"rank":5,"surname":"Mayer"}],"bestaccessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2007-01-01"},"dateofcollection":"2022-02-28T12:32:50.667Z","dateoftransformation":"2022-02-28T17:49:49.964Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"To investigate material transport in scrape-off layer plasma and long term deposition in divertor, 13CH4 was puffed at the end of 2004 and 2005 experimental campaigns into ASDEX Upgrade from the outer mid-plane.
\nEx situ analyses of the tiles were performed by secondary ion mass spectrometry.\nThe peaks of 13C were detected below the bottom inner strike point and at the horizontal tile at the outer lower divertor. It was detected ∼21% of the total puffed 13C amount.\n
The deposition rate for carbon by plasma was also calculated in long term experiment. It was obtained to be 22 × 10−3 and 8.7 × 10−3 g/s for the upper (campaign 2004) and lower (campaign 2003) divertors, respectively."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783","instance":[{"accessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1016/j.jnucmat.2007.01.026"}],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2007-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0001","classname":"Patent","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/2472b21e-1fdc-4121-946e-e9c8fae6d02d"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505766149,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2022-02-01T02:35:05Z","harvestDate":"2022-02-28T12:32:50.667Z","identifier":"oai:cris.vtt.fi:publications/2472b21e-1fdc-4121-946e-e9c8fae6d02d","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|355e65625b88::a29614444f5030f11e75c6c27264d272","oai:cris.vtt.fi:publications/2472b21e-1fdc-4121-946e-e9c8fae6d02d"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Vainonen-Ahlgren , E , Likonen , J , Renvall , Rohde , V & Mayer , M 2007 , ' Migration of 13C and deposition at ASDEX Upgrade ' , Journal of Nuclear Materials , vol. 363-365 , pp. 270-275 . 
https://doi.org/10.1016/j.jnucmat.2007.01.026"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"ASDEX upgrade"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"divertor"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"carbon based materials"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"erosion"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"deposition"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"ITER"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"JET"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"plasma"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction"
:{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"fusion energy"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Migration of 13C and deposition at ASDEX Upgrade"}]} +{"author":[{"fullname":"Aalto, Timo","name":"Timo","pid":[],"rank":1,"surname":"Aalto"},{"fullname":"Harjanne, Mikko","name":"Mikko","pid":[],"rank":2,"surname":"Harjanne"},{"fullname":"Kapulainen, Markku","name":"Markku","pid":[],"rank":3,"surname":"Kapulainen"}],"bestaccessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[{"dataInfo": null,"id": "beopen"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2003-01-01"},"dateofcollection":"2022-02-28T12:32:37.581Z","dateoftransformation":"2022-02-28T19:39:18.717Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"A novel method has been developed for measuring the rotational angle of a fiber's or a waveguide's polarization axis with respect to a reference angle. The reference angle is the polarization axis of the measuring device. The method also gives the true polarization extinction ratio of the measured fiber or waveguide. The method is suitable for the characterization and rotational alignment of polarization-maintaining waveguides and fibers. In particular, the method can be used to rotationally align the fiber-waveguide interconnections during waveguide characterization. The measuring device is either a linear polarizer or a polarization splitter that is accurately rotated with respect to the device under test. 
According to the experiments with a polarization-maintaining fiber, the method is very easy and inexpensive to implement, and the angular accuracy can be better than 0.2 deg."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|openorgs____::64badd35233ba2cd4946368ef2f4cf57","instance":[{"accessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1117/1.1600730"}],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2003-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/8cd538fb-6484-4655-81dd-47348d358fd4"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505829230,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2022-01-31T21:47:37Z","harvestDate":"2022-02-28T12:32:37.581Z","identifier":"oai:cris.vtt.fi:publications/8cd538fb-6484-4655-81dd-47348d358fd4","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|355e65625b88::df0143af011fd82af8ac2d07b03ee8cd","oai:cris.vtt.fi:publications/8cd538fb-6484-4655-81dd-47348d358fd4"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Aalto , T , Harjanne , M & Kapulainen , M 2003 , ' Method for the rotational alignment of polarization-maintaining optical fibers and waveguides ' , Optical Engineering , vol. 42 , no. 10 , pp. 2861-2867 . 
https://doi.org/10.1117/1.1600730"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"optical waveguide"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarization-maintaining fiber"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"characterization"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"fiber-waveguide coupling"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarization"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarization axis"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarizer"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarization 
splitter"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Method for the rotational alignment of polarization-maintaining optical fibers and waveguides"}]} +{"author":[{"fullname":"Aalto, Timo","name":"Timo","pid":[],"rank":1,"surname":"Aalto"},{"fullname":"Harjanne, Mikko","name":"Mikko","pid":[],"rank":2,"surname":"Harjanne"},{"fullname":"Kapulainen, Markku","name":"Markku","pid":[],"rank":3,"surname":"Kapulainen"}],"bestaccessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[{"dataInfo": null,"id": "beopen"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2003-01-01"},"dateofcollection":"2022-02-28T12:32:37.581Z","dateoftransformation":"2022-02-28T19:39:18.717Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"A novel method has been developed for measuring the rotational angle of a fiber's or a waveguide's polarization axis with respect to a reference angle. The reference angle is the polarization axis of the measuring device. The method also gives the true polarization extinction ratio of the measured fiber or waveguide. The method is suitable for the characterization and rotational alignment of polarization-maintaining waveguides and fibers. In particular, the method can be used to rotationally align the fiber-waveguide interconnections during waveguide characterization. The measuring device is either a linear polarizer or a polarization splitter that is accurately rotated with respect to the device under test. 
According to the experiments with a polarization-maintaining fiber, the method is very easy and inexpensive to implement, and the angular accuracy can be better than 0.2 deg."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|openorgs____::64badd35233ba2cd4946368ef2f4cf57","instance":[{"accessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1117/1.1600730"}],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2003-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/8cd538fb-6484-4655-81dd-47348d358fd4"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505829230,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2022-01-31T21:47:37Z","harvestDate":"2022-02-28T12:32:37.581Z","identifier":"oai:cris.vtt.fi:publications/8cd538fb-6484-4655-81dd-47348d358fd4","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|355e65625b88::df0143af011fd82af8ac2d07b03ee8cd","oai:cris.vtt.fi:publications/8cd538fb-6484-4655-81dd-47348d358fd4"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Aalto , T , Harjanne , M & Kapulainen , M 2003 , ' Method for the rotational alignment of polarization-maintaining optical fibers and waveguides ' , Optical Engineering , vol. 42 , no. 10 , pp. 2861-2867 . 
https://doi.org/10.1117/1.1600730"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"optical waveguide"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarization-maintaining fiber"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"characterization"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"fiber-waveguide coupling"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarization"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarization axis"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarizer"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarization 
splitter"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Method for the rotational alignment of polarization-maintaining optical fibers and waveguides"}]} +{"author":[{"fullname":"Penttilä, Raimo","name":"Raimo","pid":[],"rank":1,"surname":"Penttilä"},{"fullname":"Vanttaja, Ilkka","name":"Ilkka","pid":[],"rank":2,"surname":"Vanttaja"},{"fullname":"Haapamäki, Petteri","name":"Petteri","pid":[],"rank":3,"surname":"Haapamäki"},{"fullname":"Kujanpää, Veli","name":"Veli","pid":[],"rank":4,"surname":"Kujanpää"}],"bestaccessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|opendoar____::f0dd4a99fba6075a9494772b58f95280","value":"VTT Research Information System"}],"context": [{"dataInfo":null, "id":"dh-ch"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1994-01-01"},"dateofcollection":"2022-02-28T12:35:26.769Z","dateoftransformation":"2022-02-28T19:54:10.494Z","description":[],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|355e65625b88::e7d48a470b13bda61f7ebe3513e20cb6","instance":[{"accessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1994-01-01"},"distributionlocation":"","hostedby":{"key":"10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539","value":"VTT Research Information 
System"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/ddbd28ea-5fbf-43e1-896f-e69856870c26"]}],"language":{"classid":"fin","classname":"Finnish","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505838552,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2019-08-08T07:09:42Z","harvestDate":"2022-02-28T12:35:26.769Z","identifier":"oai:cris.vtt.fi:publications/ddbd28ea-5fbf-43e1-896f-e69856870c26","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["oai:cris.vtt.fi:publications/ddbd28ea-5fbf-43e1-896f-e69856870c26","50|355e65625b88::e7d48a470b13bda61f7ebe3513e20cb6"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Penttilä , R , Vanttaja , I , Haapamäki , P & Kujanpää , V 1994 , ' Liimauksen ja puristusliittämisen yhdistämisellä lisää lujuutta, jäykkyyttä ja tiiveyttä ' , Ohutlevyuutiset , no. 2 , pp. 17-19 ."}],"subject":[],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Liimauksen ja puristusliittämisen yhdistämisellä lisää lujuutta, jäykkyyttä ja tiiveyttä"}]} \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/graph/relation/part-00000.json b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/graph/relation/part-00000.json new file mode 100644 index 000000000..1733e2549 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/graph/relation/part-00000.json @@ -0,0 +1,24 @@ +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"issupplementedby","relType":"resultOrganization","source":"50|355e65625b88::e7d48a470b13bda61f7ebe3513e20cb6","subRelType":"affiliation","target":"50|pending_org_::82f63b2d21ae88596b9d8991780e9888","validated":false} 
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"issupplementedby","relType":"resultOrganization","source":"50|355e65625b88::e7d48a470b13bda61f7ebe3513e20cb6","subRelType":"affiliation","target":"50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","source":"10|opendoar____::f0dd4a99fba6075a9494772b58f95280","subRelType":"affiliation","target":"20|openorgs____::322ff2a6524820640bc5d1311871585e","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","source":"10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539","subRelType":"affiliation","target":"20|openorgs____::58e60f1715d219aa6757ba0b0f2ccbce","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","target":"20|openorgs____::64badd35233ba2cd4946368ef2f4cf57","subRelType":"affiliation","source":"10|issn___print::a7a2010e75d849442790955162ef4e42","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","source":"10|issn___print::a7a2010e75d849442790955162ef4e43","subRelType":"affiliation","target":"20|openorgs____::64badd35233ba2cd4946368ef2f4cf57","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional 
repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","source":"10|issn___print::a7a2010e75d849442790955162ef4e44","subRelType":"affiliation","target":"20|openorgs____::548cbb0c5a93722f3a9aa62aa17a1ba1","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"IsProvidedBy","relType":"resultOrganization","source":"10|issn___print::a7a2010e75d849442790955162ef4e45","subRelType":"affiliation","target":"20|pending_org_::c522a7c935f9fd9578122e60eeec282c","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isrelatedto","relType":"resultOrganization","source":"50|openorgs____::64badd35233ba2cd4946368ef2f4cf57","subRelType":"affiliation","target":"50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783","subRelType":"affiliation","target":"20|openorgs____::64badd35233ba2cd4946368ef2f4cf57","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isrelatedto","relType":"resultOrganization","source":"50|355e65625b88::74009c567c81b4aa55c813db658734df","subRelType":"affiliation","target":"50|dedup_wf_001::08d6f2001319c86d0e69b0f83ad75df2","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::08d6f2001319c86d0e69b0f83ad75df2","subRelType":"affiliation","target":"20|openorgs____::91a81877815afb4ebf25c1a3f3b03c5d","validated":false} 
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|openorgs____::548cbb0c5a93722f3a9aa62aa17a1ba1","subRelType":"affiliation","target":"50|dedup_wf_001::0a1cdf269375d32ce341fdeb0e92dfa8","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::0a1cdf269375d32ce341fdeb0e92dfa8","subRelType":"affiliation","target":"20|openorgs____::548cbb0c5a93722f3a9aa62aa17a1ba1","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|pending_org_::a50fdd7f7e77b74ea2b16823151c391a","subRelType":"affiliation","target":"50|dedup_wf_001::0ab92bed024ee6883c7a1244722e5eec","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::0ab92bed024ee6883c7a1244722e5eec","subRelType":"affiliation","target":"20|pending_org_::a50fdd7f7e77b74ea2b16823151c391a","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|openorgs____::64badd35233ba2cd4946368ef2f4cf57","subRelType":"affiliation","target":"50|dedup_wf_001::0ca26c736ad4d15b3d5ee90a4d7853e1","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional 
repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::0ca26c736ad4d15b3d5ee90a4d7853e1","subRelType":"affiliation","target":"20|openorgs____::64badd35233ba2cd4946368ef2f4cf57","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|pending_org_::a50fdd7f7e77b74ea2b16823151c391a","subRelType":"affiliation","target":"50|dedup_wf_001::0ef8dfab3927cb4d69df0d3113f05a42","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::0ef8dfab3927cb4d69df0d3113f05a42","subRelType":"affiliation","target":"20|pending_org_::a50fdd7f7e77b74ea2b16823151c391a","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|openorgs____::548cbb0c5a93722f3a9aa62aa17a1ba1","subRelType":"affiliation","target":"50|dedup_wf_001::0f488ad00253126c14a21abe6b2d406c","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::0f488ad00253126c14a21abe6b2d406c","subRelType":"affiliation","target":"20|openorgs____::548cbb0c5a93722f3a9aa62aa17a1ba1","validated":false} +{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"isAuthorInstitutionOf","relType":"resultOrganization","source":"20|pending_org_::c522a7c935f9fd9578122e60eeec282c","subRelType":"affiliation","target":"50|dedup_wf_001::12206bf78aabd7d52132477182d19147","validated":false} 
+{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"propagation","inferred":true,"invisible":false,"provenanceaction":{"classid":"result:organization:instrepo","classname":"Propagation of affiliation to result collected from datasources of type institutional repository","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.85"},"properties":[],"relClass":"hasAuthorInstitution","relType":"resultOrganization","source":"50|dedup_wf_001::12206bf78aabd7d52132477182d19147","subRelType":"affiliation","target":"20|pending_org_::c522a7c935f9fd9578122e60eeec282c","validated":false} \ No newline at end of file From 189a7c255a52127f79db59d7cde171f216d76152 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Mon, 25 Nov 2024 16:52:13 +0100 Subject: [PATCH 087/111] [patents] added test and resources --- .../PrepareResultCommunitySetStep1.java | 11 ----------- .../ResultToCommunityJobTest.java | 2 ++ 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java index 8f23a4cfd..5af2bf481 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java @@ -173,24 +173,13 @@ public class PrepareResultCommunitySetStep1 { "AND lower(co.id) IN " + communityIdList, "AND lower(relClass) IN " + allowedsemrel); Dataset result_context = spark.sql(resultContextQuery); - //result_context.createOrReplaceTempView("result_context"); - -// spark -// .sql(RESULT_COMMUNITY_LIST_QUERY) -// .as(Encoders.bean(ResultCommunityList.class)) -// .write() -// .option("compression", "gzip") -// .mode(SaveMode.Overwrite) -// .json(outputResultPath); Dataset rwc = spark.sql(String.format(RESULT_WITH_CONTEXT, communityIdList)); Dataset patents = spark.sql(RESULT_PATENT); Dataset relatedToRelations = spark.sql(IS_RELATED_TO_RELATIONS); rwc.createOrReplaceTempView("resultWithContext"); - patents.createOrReplaceTempView("patents"); - relatedToRelations.createOrReplaceTempView("relatedTorelations"); diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java index 4361b6f39..c1fcff4d9 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java @@ -326,4 +326,6 @@ public class ResultToCommunityJobTest { Assertions.assertEquals(1, communities.size()); Assertions.assertEquals("dh-ch", communities.get(0)); } + + } From 2e54715d715426cd66cca94f1b63fe29abbcc8ce Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 22 Nov 2024 10:05:56 +0100 Subject: [PATCH 088/111] Applying PR#512 - Sequential ActionSet promotion --- .../wf/main/oozie_app/workflow.xml | 47 ++++--------------- 1 file changed, 9 insertions(+), 38 deletions(-) diff --git a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml 
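For context on the PrepareResultCommunitySetStep1 cleanup in patch 087 above: with the commented-out temp-view registration removed, the surviving logic registers three datasets as Spark SQL temporary views and joins them with plain SQL. A minimal standalone sketch of that pattern follows; the view names match the patch, but the input paths and the id/source/target column names are invented for illustration and are not the project's actual schema.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class TempViewJoinSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("temp-view-join-sketch")
            .master("local[*]")
            .getOrCreate();

        // hypothetical inputs standing in for the patch's three datasets
        Dataset<Row> resultWithContext = spark.read().json("/tmp/resultWithContext");
        Dataset<Row> patents = spark.read().json("/tmp/patents");
        Dataset<Row> related = spark.read().json("/tmp/relations");

        // registering a temp view only names the dataset for SQL; nothing is materialized
        resultWithContext.createOrReplaceTempView("resultWithContext");
        patents.createOrReplaceTempView("patents");
        related.createOrReplaceTempView("relatedTorelations");

        spark.sql("SELECT rwc.* FROM resultWithContext rwc "
                + "JOIN relatedTorelations r ON rwc.id = r.source "
                + "JOIN patents p ON r.target = p.id")
            .write().mode("overwrite").json("/tmp/out");
    }
}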
b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml index 7ccfb342e..cdb4de908 100644 --- a/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-actionmanager/src/main/resources/eu/dnetlib/dhp/actionmanager/wf/main/oozie_app/workflow.xml @@ -135,22 +135,10 @@ --outputPath${workingDir}/action_payload_by_type --isLookupUrl${isLookupUrl} - +
- - - - - - - - - - - - ${wf:appPath()}/promote_action_payload_for_dataset_table @@ -162,7 +150,7 @@
- + @@ -177,7 +165,7 @@ - + @@ -192,7 +180,7 @@ - + @@ -207,7 +195,7 @@ - + @@ -222,7 +210,7 @@ - + @@ -237,7 +225,7 @@ - + @@ -252,7 +240,7 @@ - + @@ -267,26 +255,9 @@ - + - - - ${wf:appPath()}/promote_action_payload_for_person_table - - - - inputActionPayloadRootPath - ${workingDir}/action_payload_by_type - - - - - - - - - \ No newline at end of file From ca2d480df372b174d92f141f38cdf14b19130979 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Tue, 26 Nov 2024 13:36:27 +0100 Subject: [PATCH 089/111] [BulkTagging] added fix to consider when the set of constraints for the datasource is empty. Added check for remove constraints and advanced constraints to verify if the constraints list is empty and in that case do nothing --- .../dhp/bulktag/community/ResultTagger.java | 15 +++++++++------ .../bulktag/community/SelectionConstraints.java | 2 ++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java index 2ea229e3e..64cbd70ba 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java @@ -130,6 +130,7 @@ public class ResultTagger implements Serializable { // log.info("Remove constraints for " + communityId); if (conf.getRemoveConstraintsMap().keySet().contains(communityId) && conf.getRemoveConstraintsMap().get(communityId).getCriteria() != null && + !conf.getRemoveConstraintsMap().get(communityId).getCriteria().isEmpty() && conf .getRemoveConstraintsMap() .get(communityId) @@ -161,29 +162,30 @@ public class ResultTagger implements Serializable { // Tagging for datasource final Set datasources = new HashSet<>(); - final Set collfrom = new HashSet<>(); + final Set cfhb = new HashSet<>(); final Set hostdby = new HashSet<>(); if (Objects.nonNull(result.getInstance())) { for (Instance i : result.getInstance()) { if (Objects.nonNull(i.getCollectedfrom()) && Objects.nonNull(i.getCollectedfrom().getKey())) { - collfrom.add(i.getCollectedfrom().getKey()); + cfhb.add(i.getCollectedfrom().getKey()); } if (Objects.nonNull(i.getHostedby()) && Objects.nonNull(i.getHostedby().getKey())) { + cfhb.add(i.getHostedby().getKey()); hostdby.add(i.getHostedby().getKey()); } } - collfrom + cfhb .forEach( dsId -> datasources .addAll( conf.getCommunityForDatasource(dsId, param))); hostdby.forEach(dsId -> { - datasources - .addAll( - conf.getCommunityForDatasource(dsId, param)); +// datasources +// .addAll( +// conf.getCommunityForDatasource(dsId, param)); if (conf.isEoscDatasource(dsId)) { datasources.add("eosc"); } @@ -226,6 +228,7 @@ public class ResultTagger implements Serializable { .forEach(communityId -> { if (!removeCommunities.contains(communityId) && conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null && + !conf.getSelectionConstraintsMap().get(communityId).getCriteria().isEmpty() && conf .getSelectionConstraintsMap() .get(communityId) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/SelectionConstraints.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/SelectionConstraints.java index 57cc658fc..8a23a7017 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/SelectionConstraints.java +++ 
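The bulk-tagging fix in patch 089 has two halves: the ResultTagger guards above skip communities whose criteria list is null or empty, and SelectionConstraints.verifyCriteria() just below short-circuits an empty criteria list to true. A condensed sketch of the tagger-side logic, with the schema classes replaced by plain collections (hypothetical, not the project code):

import java.util.*;

class TaggingGuardSketch {
    // both collectedfrom and hostedby keys now feed one lookup set ("cfhb"),
    // as in the ResultTagger change above (Instance modelled as a plain map)
    static Set<String> datasourceKeys(List<Map<String, String>> instances) {
        Set<String> cfhb = new HashSet<>();
        for (Map<String, String> i : instances) {
            if (i.get("collectedfrom") != null)
                cfhb.add(i.get("collectedfrom"));
            if (i.get("hostedby") != null)
                cfhb.add(i.get("hostedby"));
        }
        return cfhb;
    }

    // the new guard: a null or empty criteria list means there is nothing to
    // verify, so the community is neither selected nor removed on its account
    static boolean hasVerifiableCriteria(List<?> criteria) {
        return criteria != null && !criteria.isEmpty();
    }
}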
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/SelectionConstraints.java @@ -33,6 +33,8 @@ public class SelectionConstraints implements Serializable { // Constraints in or public boolean verifyCriteria(final Map> param) { + if (criteria.isEmpty()) + return true; for (Constraints selc : criteria) { if (selc.verifyCriteria(param)) { return true; From 0517e452e34cf30d471552f5930983a9435a91c5 Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Mon, 2 Dec 2024 14:00:59 +0100 Subject: [PATCH 090/111] Fixed error on empty affiliation --- .../main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java index 54aba8715..a8dacd132 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java @@ -23,9 +23,8 @@ public class PMAffiliation { return name; } - public PMAffiliation setName(String name) { + public void setName(String name) { this.name = name; - return this; } public PMIdentifier getIdentifier() { From cc6bbbb80474553a4d3e71bc0fa80c4b84035194 Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Tue, 3 Dec 2024 14:31:11 +0100 Subject: [PATCH 091/111] make setter void --- .../main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java index a8dacd132..e3dcc95dd 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java @@ -31,8 +31,7 @@ public class PMAffiliation { return identifier; } - public PMAffiliation setIdentifier(PMIdentifier identifier) { + public void setIdentifier(PMIdentifier identifier) { this.identifier = identifier; - return this; } } From 65902a87e3ae7b161cd8e682f4a3535df996e3cd Mon Sep 17 00:00:00 2001 From: "michele.artini" Date: Wed, 4 Dec 2024 13:18:17 +0100 Subject: [PATCH 092/111] support of the new apis --- .../plugin/researchfi/ResearchFiIterator.java | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/researchfi/ResearchFiIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/researchfi/ResearchFiIterator.java index 269a89f71..cad499962 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/researchfi/ResearchFiIterator.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/researchfi/ResearchFiIterator.java @@ -6,7 +6,7 @@ import java.util.Queue; import java.util.concurrent.PriorityBlockingQueue; import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.math.NumberUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.http.Header; @@ -27,25 +27,25 @@ public class ResearchFiIterator implements Iterator { private final String baseUrl; 
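The PMAffiliation changes in patches 090 and 091 above replace fluent setters (returning this) with void ones. Strict JavaBean introspection via java.beans.Introspector only recognises the void form as a write method, so frameworks that rely on it (bean encoders, some mappers) can silently leave such properties unset, which is plausibly the "error on empty affiliation" being fixed. A side-by-side sketch on a hypothetical class:

// Two setter styles for the same property (hypothetical Affiliation class).
public class Affiliation {
    private String name;

    // bean-style setter: what patches 090 and 091 converge on;
    // recognised by standard JavaBean introspection
    public void setName(String name) {
        this.name = name;
    }

    // fluent style: convenient for call chaining, but not a JavaBean
    // setter in the strict sense
    public Affiliation withName(String name) {
        this.name = name;
        return this;
    }

    public String getName() {
        return name;
    }
}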
private final String authToken; - private int currPage; - private int nPages; + private String nextUrl; + private int nCalls = 0; private final Queue queue = new PriorityBlockingQueue<>(); public ResearchFiIterator(final String baseUrl, final String authToken) { this.baseUrl = baseUrl; this.authToken = authToken; - this.currPage = 0; - this.nPages = 0; + this.nextUrl = null; } private void verifyStarted() { - if (this.currPage == 0) { - try { - nextCall(); - } catch (final CollectorException e) { - throw new IllegalStateException(e); + + try { + if (this.nCalls == 0) { + this.nextUrl = invokeUrl(this.baseUrl); } + } catch (final CollectorException e) { + throw new IllegalStateException(e); } } @@ -62,9 +62,9 @@ public class ResearchFiIterator implements Iterator { synchronized (this.queue) { verifyStarted(); final String res = this.queue.poll(); - while (this.queue.isEmpty() && (this.currPage < this.nPages)) { + while (this.queue.isEmpty() && StringUtils.isNotBlank(this.nextUrl)) { try { - nextCall(); + this.nextUrl = invokeUrl(this.nextUrl); } catch (final CollectorException e) { throw new IllegalStateException(e); } @@ -73,18 +73,11 @@ public class ResearchFiIterator implements Iterator { } } - private void nextCall() throws CollectorException { + private String invokeUrl(final String url) throws CollectorException { - this.currPage += 1; + this.nCalls += 1; + String next = null; - final String url; - if (!this.baseUrl.contains("?")) { - url = String.format("%s?PageNumber=%d&PageSize=%d", this.baseUrl, this.currPage, PAGE_SIZE); - } else if (!this.baseUrl.contains("PageSize=")) { - url = String.format("%s&PageNumber=%d&PageSize=%d", this.baseUrl, this.currPage, PAGE_SIZE); - } else { - url = String.format("%s&PageNumber=%d", this.baseUrl, this.currPage); - } log.info("Calling url: " + url); try (final CloseableHttpClient client = HttpClients.createDefault()) { @@ -94,11 +87,15 @@ public class ResearchFiIterator implements Iterator { try (final CloseableHttpResponse response = client.execute(req)) { for (final Header header : response.getAllHeaders()) { log.debug("HEADER: " + header.getName() + " = " + header.getValue()); - if ("x-page-count".equals(header.getName())) { - final int totalPages = NumberUtils.toInt(header.getValue()); - if (this.nPages != totalPages) { - this.nPages = NumberUtils.toInt(header.getValue()); - log.info("Total pages: " + totalPages); + if ("link".equals(header.getName())) { + final String s = StringUtils.substringBetween(header.getValue(), "<", ">"); + final String token = StringUtils + .substringBefore(StringUtils.substringAfter(s, "NextPageToken="), "&"); + + if (this.baseUrl.contains("?")) { + next = this.baseUrl + "&NextPageToken=" + token; + } else { + next = this.baseUrl + "?NextPageToken=" + token; } } } @@ -108,6 +105,9 @@ public class ResearchFiIterator implements Iterator { jsonArray.forEach(obj -> this.queue.add(JsonUtils.convertToXML(obj.toString()))); } + + return next; + } catch (final Throwable e) { log.warn("Error calling url: " + url, e); throw new CollectorException("Error calling url: " + url, e); From 32e2a8b34053dc8162bf20bb91c977d544d05b94 Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Wed, 4 Dec 2024 13:36:21 +0100 Subject: [PATCH 093/111] implemented zenodo dump collector plugin --- .../CollectZenodoDumpCollectorPlugin.java | 96 ++++++++++++++++++ .../plugin/zenodo/ZenodoTarIterator.java | 59 +++++++++++ .../zenodo/ZenodoPluginCollectionTest.java | 35 +++++++ .../dhp/collection/zenodo/zenodo.tar.gz | Bin 0 -> 7412 bytes 4 files 
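Patch 092 above reworks ResearchFi harvesting from counted PageNumber/x-page-count paging to cursor-style paging: each response advertises the next page in an HTTP Link header, and the iterator follows it until no token is returned. A sketch of the token extraction, mirroring the commons-lang calls used in the patch; the header value and base URL below are made up for illustration.

import org.apache.commons.lang3.StringUtils;

// Given a Link header such as
//   <https://api.example.org/items?NextPageToken=abc123&PageSize=100>; rel="next"
// pull out the NextPageToken and rebuild the next URL against the base URL.
public class NextPageTokenSketch {
    static String nextUrl(String baseUrl, String linkHeaderValue) {
        String target = StringUtils.substringBetween(linkHeaderValue, "<", ">");
        String token = StringUtils.substringBefore(
            StringUtils.substringAfter(target, "NextPageToken="), "&");
        if (StringUtils.isBlank(token)) {
            return null; // no next page advertised
        }
        return baseUrl + (baseUrl.contains("?") ? "&" : "?") + "NextPageToken=" + token;
    }

    public static void main(String[] args) {
        System.out.println(nextUrl("https://api.example.org/items",
            "<https://api.example.org/items?NextPageToken=abc123&PageSize=100>; rel=\"next\""));
    }
}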
changed, 190 insertions(+) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/zenodo/CollectZenodoDumpCollectorPlugin.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/zenodo/ZenodoTarIterator.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/zenodo/ZenodoPluginCollectionTest.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/zenodo/zenodo.tar.gz diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/zenodo/CollectZenodoDumpCollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/zenodo/CollectZenodoDumpCollectorPlugin.java new file mode 100644 index 000000000..3ea29a9b0 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/zenodo/CollectZenodoDumpCollectorPlugin.java @@ -0,0 +1,96 @@ + +package eu.dnetlib.dhp.collection.plugin.zenodo; + +import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration; + +import java.io.IOException; +import java.io.InputStream; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.fs.FSDataOutputStream; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.compress.CompressionCodec; +import org.apache.hadoop.io.compress.CompressionCodecFactory; +import org.apache.http.client.config.RequestConfig; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClientBuilder; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; +import eu.dnetlib.dhp.common.aggregation.AggregatorReport; +import eu.dnetlib.dhp.common.collection.CollectorException; + +public class CollectZenodoDumpCollectorPlugin implements CollectorPlugin { + + final private Logger log = LoggerFactory.getLogger(getClass()); + + private void downloadItem(final String name, final String itemURL, final String basePath, + final FileSystem fileSystem) { + try { + final Path hdfsWritePath = new Path(String.format("%s/%s", basePath, name)); + final FSDataOutputStream fsDataOutputStream = fileSystem.create(hdfsWritePath, true); + final HttpGet request = new HttpGet(itemURL); + final int timeout = 60; // seconds + final RequestConfig config = RequestConfig + .custom() + .setConnectTimeout(timeout * 1000) + .setConnectionRequestTimeout(timeout * 1000) + .setSocketTimeout(timeout * 1000) + .build(); + log.info("Downloading url {} into {}", itemURL, hdfsWritePath.getName()); + try (CloseableHttpClient client = HttpClientBuilder.create().setDefaultRequestConfig(config).build(); + CloseableHttpResponse response = client.execute(request)) { + int responseCode = response.getStatusLine().getStatusCode(); + log.info("Response code is {}", responseCode); + if (responseCode >= 200 && responseCode < 400) { + IOUtils.copy(response.getEntity().getContent(), fsDataOutputStream); + } + } catch (Throwable eu) { + throw new RuntimeException(eu); + } + } catch (Throwable e) { + throw new RuntimeException(e); + } + } + + @Override + public Stream collect(ApiDescriptor api, AggregatorReport report) throws 
CollectorException { + try { + final String zenodoURL = api.getBaseUrl(); + final String hdfsURI = api.getParams().get("hdfsURI"); + final FileSystem fileSystem = FileSystem.get(getHadoopConfiguration(hdfsURI)); + downloadItem("zenodoDump.tar.gz", zenodoURL, "/tmp", fileSystem); + CompressionCodecFactory factory = new CompressionCodecFactory(fileSystem.getConf()); + + Path sourcePath = new Path("/tmp/zenodoDump.tar.gz"); + CompressionCodec codec = factory.getCodec(sourcePath); + InputStream gzipInputStream = null; + try { + gzipInputStream = codec.createInputStream(fileSystem.open(sourcePath)); + return iterateTar(gzipInputStream); + + } catch (IOException e) { + throw new CollectorException(e); + } finally { + log.info("Closing gzip stream"); + org.apache.hadoop.io.IOUtils.closeStream(gzipInputStream); + } + } catch (Exception e) { + throw new CollectorException(e); + } + } + + private Stream iterateTar(InputStream gzipInputStream) throws Exception { + + Iterable iterable = () -> new ZenodoTarIterator(gzipInputStream); + return StreamSupport.stream(iterable.spliterator(), false); + + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/zenodo/ZenodoTarIterator.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/zenodo/ZenodoTarIterator.java new file mode 100644 index 000000000..8e627683e --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/zenodo/ZenodoTarIterator.java @@ -0,0 +1,59 @@ + +package eu.dnetlib.dhp.collection.plugin.zenodo; + +import java.io.Closeable; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.Iterator; + +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; +import org.apache.commons.io.IOUtils; + +public class ZenodoTarIterator implements Iterator, Closeable { + + private final InputStream gzipInputStream; + private final StringBuilder currentItem = new StringBuilder(); + private TarArchiveInputStream tais; + private boolean hasNext; + + public ZenodoTarIterator(InputStream gzipInputStream) { + this.gzipInputStream = gzipInputStream; + tais = new TarArchiveInputStream(gzipInputStream); + hasNext = getNextItem(); + } + + private boolean getNextItem() { + try { + TarArchiveEntry entry; + while ((entry = tais.getNextTarEntry()) != null) { + if (entry.isFile()) { + currentItem.setLength(0); + currentItem.append(IOUtils.toString(new InputStreamReader(tais))); + return true; + } + } + return false; + } catch (Throwable e) { + throw new RuntimeException(e); + } + } + + @Override + public boolean hasNext() { + return hasNext; + } + + @Override + public String next() { + final String data = currentItem.toString(); + hasNext = getNextItem(); + return data; + } + + @Override + public void close() throws IOException { + gzipInputStream.close(); + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/zenodo/ZenodoPluginCollectionTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/zenodo/ZenodoPluginCollectionTest.java new file mode 100644 index 000000000..9b5cf1850 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/collection/plugin/zenodo/ZenodoPluginCollectionTest.java @@ -0,0 +1,35 @@ + +package eu.dnetlib.dhp.collection.plugin.zenodo; + +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import 
java.util.zip.GZIPInputStream; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.collection.ApiDescriptor; +import eu.dnetlib.dhp.common.collection.CollectorException; + +public class ZenodoPluginCollectionTest { + + @Test + public void testZenodoIterator() throws Exception { + + final GZIPInputStream gis = new GZIPInputStream( + getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/zenodo/zenodo.tar.gz")); + try (ZenodoTarIterator it = new ZenodoTarIterator(gis)) { + Assertions.assertTrue(it.hasNext()); + int i = 0; + while (it.hasNext()) { + Assertions.assertNotNull(it.next()); + i++; + } + Assertions.assertEquals(10, i); + + } + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/zenodo/zenodo.tar.gz b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/collection/zenodo/zenodo.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..6c06bf4e5a5d568aa6c534cc4636796d9938f97e GIT binary patch literal 7412 zcmV44`c0g6Vvm2)J$*9; zQILd$BG?2eTe;eI*ca%jx!CHh+Ka8;CuqOPKFOW~iu$oB*|FrFwjNqPe| zq*``Y6H&he#qs~5p&6x}wEXU+>u-+TGuam=sCmU1!?@QE6MHPb>q~ zJDZ-or$a^p_$O`h7>bL4`aAnIP$;y^3gnay6XugHBMlmjs6P;UrMZXq_xgOq0(F{K zbar>IM~B=WtMf_id!@N+d$iZ%K3BtOALg47J`p`q>qp#6{5XlIPe9F)4-C4<0~&D} zkW-I?LSPc;y!wYA$29Q;IgNn<#N z`YbvD83K1+C)^*f2nsc|rR-TOl+Pbk-@gWHrW3pggB)h<+vS{@EAix~^HRo?W=b&T zvCp#K{WkAw#3Vc*uYrV^G@!yRC9NTg`bbOxr+bVJ)>skT7>Fp5qyd@&=pk(p4OL@= zN^-(RjQXrgI%6O^>2fB?xdd;8P;x2~c={}&;a(|idGZUCgh|Ke5=P(t&q~+tm1bAc zo2i-oE2GiA2@dL{z}(e6nRGrfFP77@{W=FvX~Hz{7$#Szx6kOM2*hyyR%uoV7+*H| zA^1-^eh#Ee7#L)vp{P4W(^B3AyHW5w#B){Bw2Qi1G>oidX;O9Odf)?O{*TuzU@$zO zwIS`Z9h6WK!RVSVgOB^z!h)f@agE%#GIJWzd%(Y}YsaAdj(R)!w7i8&FmY<2AU&Tc zj{}UcTat!e1x6r0s%6tCI!NcWHot%{bTHNax8rw=aO*q8jd60AH?#I%Q!nH>CddxtDQi^GGre91>^jXgNH2{ zc`ssenq7w4)nohFNm^x>NiX6d_C+rIMdr@aUe-D?j;I&!WP0jol&VTlI-o<{Uu=%l~&G= z(AF5W_F}zIjp0Bo8!2am=`L3)_xjP@Oyb#}h5uMdgeRo-#qpUE2a#;#3t#jw>;7 z9dBCF046K7XD-CR@uIR|5(lz z|M?on7XR7eKljCdl-hU<{AbeegW*4`>fapx^CRULUqPuTkn|JSO13mUyk0CnNQ4>} z14L|gjt`4}stJdN{G@Tv+}TIfTZI43V!2u_7cHw?-776Vu^4fJ*@NIJ0et{Au;mF~ z^iylw8VNtaBIuGpgoHQ1)T^h!VxCcc0p8Ut+AL(3CO8ovabOYFnm!E1-Ou9I0%=nW zZk4Yyy><-Yt*7WX6+!=#PZw%`0}jq#k=g3z0Pa>n#3nR5pxj4_m(xbxu&4v>Nirla z&T0oQZ|z6OIN13z`X&p#`Z$%KCch75qESza2Y({jNEeaEyK1243SwO>;!Of$jYY+G zpx2j=vT1?*@%1KIe|T9%)3vI_s_wdvGrT?y2g&zAoM^0@Xz|4=2`te1#h}N*>+_Qq zF%0w;&n}oh0?Duoq5>N5G1I}yA{c?>#pH}p7|ib=T9UCCDvLtdOVW$Pa5`4kMm7*n zco1YGJ2?h2V^mT%AA&Gzz(hD0$28~mJK7I~&qWCFN$&k|6oaQO^KLEjET=@q>-#}5 zrd2fbqVY{bF-F-gmr!;PSzS`CZ0ee+8BYO=fyp~JqP5VcpShaIyj!X@-CDEhRNL*Q z+iq8!reiqmc1y3@7F?^@Pl5iWSQ0oQ#+tBBuIwA&6L#z4B_y7)%Q)ZPGtGyP_g$!& zh&e`Oi*Jw?h3pu%RdY-Ogj@$9*FfIQcCAvb8|7BJp;x{(9ET3%1CFe31&+%ss78(l zA{L7_b11;d1eCnh2>87d&U{+vBI$2uN`W!Go{&uOGyVx&>sBZEv^<+|ayP&KX%!n8 z`L+XGu=HCdf2|jBuNM8pN7wqlP{_mdO=1%4D+CrX&@MNCUeghnvjCGmm*>X5U78D< zJ-(5bBZh;aPSw7JTVKS8r66*xE7DjHqBBK`3Nr=9fvRQ*uAzN0XNoWc`+)^xm|G2j zAm#Iek>C--T#G(ac@|g&^&$cEDWOsVu`&^1D57`*A*eDwX5t1Cej%o0GvrzQpf|G% z0bu{tPA{T6L|shfb{5eFYe5G(S4qFg2ix36Y!Sg)Y$+phb2IN$RdyxTmvGK ztSc-ajjR7%=nmYR^hi|)O_EsP>jM8d_Ue~}k>XsZu6~nP&SzKuiG8AB7;%Xrc?dZD zj&QR5kZ^K}n;pYF;H9E%qnPp=7$G3r8xYk|M+%?a4bVdmkE#(EUGkBZbdmsM#bOQq zFAY=!C8P8p2Rcq1?{yqn0S72kcKhXveJ1_F*XZRQjy@N zn4&Co#wA93arU6sFvxo{i{Of}e~Ffx>7`2@niUngY3Cgz_PhN+ot_Vn>4cjuZ$u=aI^!oHW5FZ*-A^Xr_=S zbk4wad7sDBhiZkNTA*BxNyRcsB<}~xZs6SjN0U_Vim%fG^-52SK>2jQ_Tu~CzAqqb z@YC1H(242O49v62DHtPt5JU@~Mo@0XjYG7BWGFyBv;*cQwaU9WFOk3dk;dLYbutZI 
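How the pieces of the new Zenodo plugin in patch 093 compose: ZenodoTarIterator walks the un-gzipped tar stream entry by entry, and StreamSupport bridges it to the Stream of records that the CollectorPlugin contract expects. A consumption sketch in the spirit of ZenodoPluginCollectionTest above; the local file path is illustrative only, since the plugin itself reads the dump from HDFS.

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import java.util.zip.GZIPInputStream;

public class ZenodoIterationSketch {
    public static void main(String[] args) throws Exception {
        try (InputStream gis = new GZIPInputStream(new FileInputStream("/tmp/zenodoDump.tar.gz"));
             ZenodoTarIterator it = new ZenodoTarIterator(gis)) {
            // a one-shot Iterable over the single-pass iterator
            Iterable<String> iterable = () -> it;
            Stream<String> records = StreamSupport.stream(iterable.spliterator(), false);
            records.limit(3).forEach(r -> System.out.println(r.length() + " chars"));
        }
    }
}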
Date: Wed, 4 Dec 2024 13:37:14 +0100
Subject: [PATCH 094/111] code formatted

---
 .../dhp/sx/bio/pubmed/PMAffiliation.java | 42 ++++++-----
 .../dnetlib/dhp/sx/bio/pubmed/PMAuthor.java | 1 -
 .../dhp/sx/bio/pubmed/PMIdentifier.java | 74 +++++++++----------
 .../collection/crossref/Crossref2Oaf.scala | 1 -
 .../dnetlib/dhp/sx/bio/pubmed/PMParser2.scala | 13 ++--
 .../dhp/sx/bio/pubmed/PubMedToOaf.scala | 20 ++++-
 .../dnetlib/dhp/sx/bio/BioScholixTest.scala | 1 -
 7 files changed, 82 insertions(+), 70 deletions(-)

diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java
index e3dcc95dd..427eb2725 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java
@@ -1,3 +1,4 @@
+
 package eu.dnetlib.dhp.sx.bio.pubmed;

 /**
@@ -7,31 +8,32 @@ public class PMAffiliation {
- private String name;
+ private String name;
- private PMIdentifier identifier;
+ private PMIdentifier identifier;
- public PMAffiliation() {
+ public PMAffiliation() {
- }
- public PMAffiliation(String name, PMIdentifier identifier) {
- this.name = name;
- this.identifier = identifier;
- }
+ }
- public String getName() {
- return name;
- }
+ public PMAffiliation(String name, PMIdentifier identifier) {
+ this.name = name;
+ this.identifier = identifier;
+ }
- public void setName(String name) {
- this.name = name;
- }
+ public String getName() {
+ return name;
+ }
- public PMIdentifier getIdentifier() {
- return identifier;
- }
+ public void setName(String name) {
+ this.name = name;
+ }
- public void setIdentifier(PMIdentifier identifier) {
- this.identifier = identifier;
- }
+ public PMIdentifier getIdentifier() {
+ return identifier;
+ }
+
+ public void setIdentifier(PMIdentifier identifier) {
+ this.identifier = identifier;
+ }
 }
diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java
index b0df25663..e023f2e62 100644
--- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java
+++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java
@@ -97,5 +97,4 @@ public class
PMAuthor implements Serializable { this.affiliation = affiliation; } - } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java index 0c8c55e40..6cd17a90c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java @@ -1,53 +1,53 @@ + package eu.dnetlib.dhp.sx.bio.pubmed; public class PMIdentifier { - private String pid; - private String type; + private String pid; + private String type; + public PMIdentifier(String pid, String type) { + this.pid = cleanPid(pid); + this.type = type; + } - public PMIdentifier(String pid, String type) { - this.pid = cleanPid(pid); - this.type = type; - } + public PMIdentifier() { - public PMIdentifier() { + } - } + private String cleanPid(String pid) { - private String cleanPid(String pid) { + if (pid == null) { + return null; + } - if (pid == null) { - return null; - } + // clean ORCID ID in the form 0000000163025705 to 0000-0001-6302-5705 + if (pid.matches("[0-9]{15}[0-9X]")) { + return pid.replaceAll("(.{4})(.{4})(.{4})(.{4})", "$1-$2-$3-$4"); + } - // clean ORCID ID in the form 0000000163025705 to 0000-0001-6302-5705 - if (pid.matches("[0-9]{15}[0-9X]")) { - return pid.replaceAll("(.{4})(.{4})(.{4})(.{4})", "$1-$2-$3-$4"); - } + // clean ORCID in the form http://orcid.org/0000-0001-8567-3543 to 0000-0001-8567-3543 + if (pid.matches("http://orcid.org/[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}")) { + return pid.replaceAll("http://orcid.org/", ""); + } + return pid; + } - // clean ORCID in the form http://orcid.org/0000-0001-8567-3543 to 0000-0001-8567-3543 - if (pid.matches("http://orcid.org/[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}")) { - return pid.replaceAll("http://orcid.org/", ""); - } - return pid; - } + public String getPid() { + return pid; + } - public String getPid() { - return pid; - } + public PMIdentifier setPid(String pid) { + this.pid = cleanPid(pid); + return this; + } - public PMIdentifier setPid(String pid) { - this.pid = cleanPid(pid); - return this; - } + public String getType() { + return type; + } - public String getType() { - return type; - } - - public PMIdentifier setType(String type) { - this.type = type; - return this; - } + public PMIdentifier setType(String type) { + this.type = type; + return this; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala index e4a238c8f..de68ebb58 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala @@ -673,7 +673,6 @@ case object Crossref2Oaf { val doi = input.getString(0) val rorId = input.getString(1) - val pubId = IdentifierFactory.idFromPid("50", "doi", DoiCleaningRule.clean(doi), true) val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala index 2eb4bea65..bc9a2cf02 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala +++ 
b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala @@ -82,21 +82,22 @@ class PMParser2 { a.setLastName((author \ "LastName").text) a.setForeName((author \ "ForeName").text) val id = (author \ "Identifier").text - val idType =(author \ "Identifier" \ "@Source").text + val idType = (author \ "Identifier" \ "@Source").text - if(id != null && id.nonEmpty && idType != null && idType.nonEmpty) { + if (id != null && id.nonEmpty && idType != null && idType.nonEmpty) { a.setIdentifier(new PMIdentifier(id, idType)) } - val affiliation = (author \ "AffiliationInfo" \ "Affiliation").text - val affiliationId = (author \ "AffiliationInfo" \ "Identifier").text + val affiliationId = (author \ "AffiliationInfo" \ "Identifier").text val affiliationIdType = (author \ "AffiliationInfo" \ "Identifier" \ "@Source").text - if(affiliation != null && affiliation.nonEmpty) { + if (affiliation != null && affiliation.nonEmpty) { val aff = new PMAffiliation() aff.setName(affiliation) - if(affiliationId != null && affiliationId.nonEmpty && affiliationIdType != null && affiliationIdType.nonEmpty) { + if ( + affiliationId != null && affiliationId.nonEmpty && affiliationIdType != null && affiliationIdType.nonEmpty + ) { aff.setIdentifier(new PMIdentifier(affiliationId, affiliationIdType)) } a.setAffiliation(aff) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala index 5e14c731a..281ca0e07 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala @@ -294,11 +294,23 @@ object PubMedToOaf { author.setName(a.getForeName) author.setSurname(a.getLastName) author.setFullname(a.getFullName) - if(a.getIdentifier != null) { - author.setPid(List(OafMapperUtils.structuredProperty(a.getIdentifier.getPid, - OafMapperUtils.qualifier(a.getIdentifier.getType,a.getIdentifier.getType,ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES), dataInfo)).asJava) + if (a.getIdentifier != null) { + author.setPid( + List( + OafMapperUtils.structuredProperty( + a.getIdentifier.getPid, + OafMapperUtils.qualifier( + a.getIdentifier.getType, + a.getIdentifier.getType, + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES + ), + dataInfo + ) + ).asJava + ) } - if (a.getAffiliation!= null) + if (a.getAffiliation != null) author.setRawAffiliationString(List(a.getAffiliation.getName).asJava) author.setRank(index + 1) author diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala index 4a926df01..cb7826dbf 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala @@ -63,7 +63,6 @@ class BioScholixTest extends AbstractVocabularyTest { "0000000333457333", "0000000335964515", "0000000302921949", - "http://orcid.org/0000-0001-8567-3543", "http://orcid.org/0000-0001-7868-8528", "0000-0001-9189-1440", From 730a7751b68b0dc2c5a8fa1f97ee4e82bdb2b3f2 Mon Sep 17 00:00:00 2001 From: "sandro.labruzzo" Date: Wed, 4 Dec 2024 15:03:59 +0100 Subject: [PATCH 095/111] added zenodoDump to enum of CollectorPlugin --- 
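Note: with this change, a datasource whose protocol field is "zenodoDump" resolves to the new CollectZenodoDumpCollectorPlugin through the CollectorPlugin.NAME enum extended below. A minimal dispatch sketch, assuming (as the existing cases suggest) that the protocol string is matched verbatim against the enum constants; the sketch class itself is hypothetical and not part of this patch:

import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;

public class ZenodoDumpDispatchSketch {

	public static void main(String[] args) {
		// valueOf requires an exact, case-sensitive match with the enum constant
		CollectorPlugin.NAME plugin = CollectorPlugin.NAME.valueOf("zenodoDump");
		System.out.println(plugin); // prints: zenodoDump
	}
}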
.../main/java/eu/dnetlib/dhp/collection/CollectorWorker.java | 3 +++ .../java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java index f63bfcb48..4c6d0653e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java @@ -7,6 +7,7 @@ import java.io.IOException; import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; +import eu.dnetlib.dhp.collection.plugin.zenodo.CollectZenodoDumpCollectorPlugin; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; @@ -129,6 +130,8 @@ public class CollectorWorker extends ReportingJob { return new Gtr2PublicationsCollectorPlugin(this.clientParams); case osfPreprints: return new OsfPreprintsCollectorPlugin(this.clientParams); + case zenodoDump: + return new CollectZenodoDumpCollectorPlugin(); case other: final CollectorPlugin.NAME.OTHER_NAME plugin = Optional .ofNullable(this.api.getParams().get("other_plugin_type")) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java index 72e691579..93e65b6a7 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/plugin/CollectorPlugin.java @@ -11,7 +11,7 @@ public interface CollectorPlugin { enum NAME { - oai, other, rest_json2xml, file, fileGzip, baseDump, gtr2Publications, osfPreprints; + oai, other, rest_json2xml, file, fileGzip, baseDump, gtr2Publications, osfPreprints, zenodoDump; public enum OTHER_NAME { mdstore_mongodb_dump, mdstore_mongodb From bde59a7c8f49cd964317a9e240dacb3be1eeec01 Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 5 Dec 2024 11:09:30 +0100 Subject: [PATCH 096/111] implementation of the utilities for the inclusion of raids in the graph --- .../java/eu/dnetlib/dhp/common/Constants.java | 4 + .../dnetlib/dhp/actionmanager/Constants.java | 8 + .../raid/GenerateRAiDActionSetJob.java | 190 ++++++++++++++++++ .../raid/model/GenerateRAiDActionSetJob.java | 2 + .../actionmanager/raid/model/RAiDEntity.java | 102 ++++++++++ .../ror/GenerateRorActionSetJob.java | 8 +- .../raid/action_set_parameters.json | 14 ++ .../raid/oozie_app/action_set_parameters.json | 0 .../raid/oozie_app/config-default.xml | 58 ++++++ .../actionmanager/raid/oozie_app/workflow.xml | 55 +++++ .../raid/GenerateRAiDActionSetJobTest.java | 112 +++++++++++ .../dhp/actionmanager/raid/raid_example.json | 6 + 12 files changed, 552 insertions(+), 7 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/GenerateRAiDActionSetJob.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/RAiDEntity.java create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/action_set_parameters.json create mode 100644 
dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/action_set_parameters.json create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/config-default.xml create mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/workflow.xml create mode 100644 dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJobTest.java create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/raid/raid_example.json diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java index 0477d6399..b00199ea5 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java @@ -10,6 +10,10 @@ public class Constants { public static final Map accessRightsCoarMap = Maps.newHashMap(); public static final Map coarCodeLabelMap = Maps.newHashMap(); + public static final String RAID_NS_PREFIX = "raid________"; + public static final String RAID_DATASOURCE_NAME = "Research Activity Identifier Service (RAiD)"; + public static final String RAID_OPENAIRE_ID = ""; + public static final String ROR_NS_PREFIX = "ror_________"; public static final String ROR_OPENAIRE_ID = "10|openaire____::993a7ae7a863813cf95028b50708e222"; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java index 73b4b77cb..722415c2e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java @@ -3,6 +3,8 @@ package eu.dnetlib.dhp.actionmanager; import java.util.Optional; +import eu.dnetlib.dhp.schema.oaf.Instance; +import eu.dnetlib.dhp.schema.oaf.Qualifier; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -110,6 +112,12 @@ public class Constants { } + public static Instance getInstance(Qualifier qualifier) { + Instance instance = new Instance(); + instance.setInstancetype(qualifier); + return instance; + } + public static void removeOutputDir(SparkSession spark, String path) { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java new file mode 100644 index 000000000..8e5e1bdcb --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java @@ -0,0 +1,190 @@ +package eu.dnetlib.dhp.actionmanager.raid; + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.Constants; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import eu.dnetlib.dhp.utils.DHPUtils; +import org.apache.commons.io.IOUtils; 
+import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import scala.Tuple2; + +import java.util.*; +import java.util.stream.Collectors; + +import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_ID; +import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_NAME; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static eu.dnetlib.dhp.schema.common.ModelConstants.*; +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; + +public class GenerateRAiDActionSetJob { + + private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class); + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private static final List RAID_COLLECTED_FROM = listKeyValues( + OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME); + + private static final Qualifier RAID_QUALIFIER = qualifier("raid:openaireinference", "raid:openaireinference", DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS); + + private static final DataInfo RAID_DATA_INFO = dataInfo( + false, OPENAIRE_DATASOURCE_NAME, true, false, RAID_QUALIFIER, "0.92"); + + public static void main(final String[] args) throws Exception { + + final String jsonConfiguration = IOUtils + .toString( + eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class + .getResourceAsStream("/eu/dnetlib/dhp/actionmanager/raid/action_set_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + final String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath {}: ", outputPath); + + final SparkConf conf = new SparkConf(); + + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, outputPath); + processRAiDEntities(spark, inputPath, outputPath); + }); + } + + private static void removeOutputDir(final SparkSession spark, final String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } + + static void processRAiDEntities(final SparkSession spark, + final String inputPath, + final String outputPath) { + readInputPath(spark, inputPath) + .map(GenerateRAiDActionSetJob::prepareRAiD) + .flatMap(List::iterator) + .mapToPair( + aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), + new Text(OBJECT_MAPPER.writeValueAsString(aa)))) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); + + } + + protected static List> prepareRAiD(final RAiDEntity r) { + + final Date now = new Date(); + final OtherResearchProduct orp = new OtherResearchProduct(); + final List> res = new ArrayList<>(); + String raidId = calculateOpenaireId(r.getRaid()); + + orp.setId(raidId); + orp.setCollectedfrom(RAID_COLLECTED_FROM); + orp.setDataInfo(RAID_DATA_INFO); + orp.setResourcetype(RAID_QUALIFIER); + orp.setTitle( + Collections.singletonList( + 
structuredProperty( + r.getTitle(), + qualifier("main title", "main title", DNET_DATACITE_TITLE, DNET_DATACITE_TITLE), + RAID_DATA_INFO)) + ); + orp.setDescription(listFields(RAID_DATA_INFO, r.getSummary())); + orp.setAuthor(createAuthors(r.getAuthors())); + orp.setInstance(Collections.singletonList(eu.dnetlib.dhp.actionmanager.Constants.getInstance(RAID_QUALIFIER))); + orp.setSubject( + r.getSubjects() + .stream() + .map(s -> subject(s, qualifier(DNET_SUBJECT_KEYWORD, DNET_SUBJECT_KEYWORD, DNET_SUBJECT_TYPOLOGIES, DNET_SUBJECT_TYPOLOGIES), RAID_DATA_INFO)) + .collect(Collectors.toList()) + ); + orp.setRelevantdate( + Arrays.asList( + structuredProperty(r.getEndDate(), qualifier("endDate","endDate", DNET_DATACITE_DATE, DNET_DATACITE_DATE), RAID_DATA_INFO), + structuredProperty(r.getStartDate(), qualifier("startDate", "startDate", DNET_DATACITE_DATE, DNET_DATACITE_DATE), RAID_DATA_INFO) + ) + ); + orp.setLastupdatetimestamp(now.getTime()); + orp.setDateofcollection(r.getStartDate()); + + res.add(new AtomicAction<>(OtherResearchProduct.class, orp)); + + for(String resultId: r.getIds()) { + Relation rel1 = OafMapperUtils.getRelation( + raidId, + resultId, + ModelConstants.RESULT_RESULT, + ModelConstants.OUTCOME, + PART, + RAID_COLLECTED_FROM, + RAID_DATA_INFO, + now.getTime(), + null, + null + ); + Relation rel2 = OafMapperUtils.getRelation( + resultId, + raidId, + ModelConstants.RESULT_RESULT, + ModelConstants.OUTCOME, + IS_PART_OF, + RAID_COLLECTED_FROM, + RAID_DATA_INFO, + now.getTime(), + null, + null + ); + res.add(new AtomicAction<>(Relation.class, rel1)); + res.add(new AtomicAction<>(Relation.class, rel2)); + } + + return res; + } + + public static String calculateOpenaireId(final String raid) { + return String.format("50|%s::%s", Constants.RAID_NS_PREFIX, DHPUtils.md5(raid)); + } + + public static List createAuthors(final List author) { + return author.stream().map(s-> { + Author a = new Author(); + a.setFullname(s); + return a; + }).collect(Collectors.toList()); + } + + private static JavaRDD readInputPath( + final SparkSession spark, + final String path) { + + return spark + .read() + .json(path) + .as(Encoders.bean(RAiDEntity.class)) + .toJavaRDD(); + + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/GenerateRAiDActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/GenerateRAiDActionSetJob.java new file mode 100644 index 000000000..b0aec71d3 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/GenerateRAiDActionSetJob.java @@ -0,0 +1,2 @@ +package eu.dnetlib.dhp.actionmanager.raid.model;public class GenerateRAiDActionSetJob { +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/RAiDEntity.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/RAiDEntity.java new file mode 100644 index 000000000..bd7e28926 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/RAiDEntity.java @@ -0,0 +1,102 @@ +package eu.dnetlib.dhp.actionmanager.raid.model; + +import java.io.Serializable; +import java.util.List; + +public class RAiDEntity implements Serializable { + + String raid; + List authors; + String startDate; + String endDate; + List subjects; + List titles; + List ids; + String title; + String summary; + + public RAiDEntity(){} + public RAiDEntity(String raid, List authors, String startDate, String 
endDate, List subjects, List titles, List ids, String title, String summary) { + this.raid = raid; + this.authors = authors; + this.startDate = startDate; + this.endDate = endDate; + this.subjects = subjects; + this.titles = titles; + this.ids = ids; + this.title = title; + this.summary = summary; + } + + public String getRaid() { + return raid; + } + + public void setRaid(String raid) { + this.raid = raid; + } + + public List getAuthors() { + return authors; + } + + public void setAuthors(List authors) { + this.authors = authors; + } + + public String getStartDate() { + return startDate; + } + + public void setStartDate(String startDate) { + this.startDate = startDate; + } + + public String getEndDate() { + return endDate; + } + + public void setEndDate(String endDate) { + this.endDate = endDate; + } + + public List getSubjects() { + return subjects; + } + + public void setSubjects(List subjects) { + this.subjects = subjects; + } + + public List getTitles() { + return titles; + } + + public void setTitles(List titles) { + this.titles = titles; + } + + public List getIds() { + return ids; + } + + public void setIds(List ids) { + this.ids = ids; + } + + public String getTitle() { + return title; + } + + public void setTitle(String title) { + this.title = title; + } + + public String getSummary() { + return summary; + } + + public void setSummary(String summary) { + this.summary = summary; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java index 5f3493d56..ce1973a7f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java @@ -21,6 +21,7 @@ import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; +import eu.dnetlib.dhp.schema.oaf.*; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; @@ -44,13 +45,6 @@ import eu.dnetlib.dhp.common.Constants; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import eu.dnetlib.dhp.schema.oaf.Field; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.Organization; -import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/action_set_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/action_set_parameters.json new file mode 100644 index 000000000..2049630d2 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/action_set_parameters.json @@ -0,0 +1,14 @@ +[ + { + "paramName": "i", + "paramLongName": "inputPath", + "paramDescription": "the path of the input json", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "outputPath", + "paramDescription": "the path of the new ActionSet", + "paramRequired": true + } +] \ No newline at end of file diff --git 
a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/action_set_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/action_set_parameters.json new file mode 100644 index 000000000..e69de29bb diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/config-default.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/config-default.xml new file mode 100644 index 000000000..a1755f329 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/config-default.xml @@ -0,0 +1,58 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + + hive_metastore_uris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + spark2YarnHistoryServerAddress + http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + + + oozie.launcher.mapreduce.user.classpath.first + true + + + sparkExecutorNumber + 4 + + + spark2EventLogDir + /user/spark/spark2ApplicationHistory + + + sparkDriverMemory + 15G + + + sparkExecutorMemory + 6G + + + sparkExecutorCores + 1 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/workflow.xml new file mode 100644 index 000000000..9b5aa5905 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/workflow.xml @@ -0,0 +1,55 @@ + + + + raidJsonInputPath + the path of the json + + + raidActionSetPath + path where to store the action set + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + + + yarn + cluster + ProcessRAiDFile + eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob + dhp-aggregation-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.shuffle.partitions=3840 + + --inputPath${raidJsonInputPath} + --outputPath${raidActionSetPath} + + + + + + + diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJobTest.java new file mode 100644 index 000000000..1f33f45b2 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJobTest.java @@ -0,0 +1,112 @@ +package eu.dnetlib.dhp.actionmanager.raid; + +import eu.dnetlib.dhp.actionmanager.opencitations.CreateOpenCitationsASTest; +import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import 
eu.dnetlib.dhp.schema.oaf.Relation; +import org.apache.commons.io.FileUtils; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.Test; +import scala.Tuple2; + +import java.io.File; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.List; + +import static eu.dnetlib.dhp.actionmanager.Constants.OBJECT_MAPPER; +import static java.nio.file.Files.createTempDirectory; +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class GenerateRAiDActionSetJobTest { + private static String input_path; + private static String output_path; + static SparkSession spark; + + @BeforeEach + void setUp() throws Exception { + + input_path = Paths + .get(GenerateRAiDActionSetJobTest.class.getResource("/eu/dnetlib/dhp/actionmanager/raid/raid_example.json").toURI()) + .toFile() + .getAbsolutePath(); + + output_path = createTempDirectory(GenerateRAiDActionSetJobTest.class.getSimpleName() + "-") + .toAbsolutePath() + .toString(); + + SparkConf conf = new SparkConf(); + conf.setAppName(GenerateRAiDActionSetJobTest.class.getSimpleName()); + + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", output_path); + conf.set("hive.metastore.warehouse.dir", output_path); + + spark = SparkSession + .builder() + .appName(GenerateRAiDActionSetJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } + + @AfterAll + static void cleanUp() throws Exception { + FileUtils.deleteDirectory(new File(output_path)); + } + + @Test + @Disabled + void testProcessRAiDEntities() { + GenerateRAiDActionSetJob.processRAiDEntities(spark, input_path, output_path + "/test_raid_action_set"); + + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + + JavaRDD result = sc + .sequenceFile(output_path + "/test_raid_action_set", Text.class, Text.class) + .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) + .map(AtomicAction::getPayload); + + assertEquals(80, result.count()); + } + + @Test + void testPrepareRAiD() { + + List> atomicActions = GenerateRAiDActionSetJob.prepareRAiD(new RAiDEntity( + "-92190526", + Arrays.asList("Berli, Justin", "Le Mao, Bérénice", "Guillaume Touya", "Wenclik, Laura", "Courtial, Azelle", "Muehlenhaus, Ian", "Justin Berli", "Touya, Guillaume", "Gruget, Maïeul", "Azelle Courtial", "Ian Muhlenhaus", "Maïeul Gruget", "Marion Dumont", "Maïeul GRUGET", "Cécile Duchêne"), + "2021-09-10", + "2024-02-16", + Arrays.asList("cartography, zoom, pan, desert fog", "Road network", "zooming", "Pan-scalar maps", "pan-scalar map", "Python library", "QGIS", "map design", "landmarks", "Cartes transscalaires", "anchor", "disorientation", "[INFO]Computer Science [cs]", "[SHS.GEO]Humanities and Social Sciences/Geography", "cognitive cartography", "eye-tracking", "Computers in Earth Sciences", "Topographic map", "National Mapping Agency", "General Medicine", "Geography, Planning and Development", "multi-scales", 
"pan-scalar maps", "Selection", "cartography", "General Earth and Planetary Sciences", "progressiveness", "map generalisation", "Eye-tracker", "zoom", "algorithms", "Map Design", "cartography, map generalisation, zoom, multi-scale map", "Interactive maps", "Map generalisation", "Earth and Planetary Sciences (miscellaneous)", "Cartographic generalization", "rivers", "Benchmark", "General Environmental Science", "open source", "drawing", "Constraint", "Multi-scale maps"), + Arrays.asList("Where do people look at during multi-scale map tasks?", "FogDetector survey raw data", "Collection of cartographic disorientation stories", "Anchorwhat dataset", "BasqueRoads: A Benchmark for Road Network Selection", "Progressive river network selection for pan-scalar maps", "BasqueRoads, a dataset to benchmark road selection algorithms", "Missing the city for buildings? A critical review of pan-scalar map generalization and design in contemporary zoomable maps", "Empirical approach to advance the generalisation of multi-scale maps", "L'Alpe d'Huez: a dataset to benchmark topographic map generalisation", "eye-tracking data from a survey on zooming in a pan-scalar map", "Material of the experiment 'More is Less' from the MapMuxing project", "Cartagen4py, an open source Python library for map generalisation", "L’Alpe d’Huez: A Benchmark for Topographic Map Generalisation"), + Arrays.asList("50|doi_dedup___::6915135e0aa39f913394513f809ae58a", "50|doi_dedup___::754e3c283639bc6e104c925ff3e34007", "50|doi_dedup___::13517477f3c1261d57a3364363ce6ce0", "50|doi_dedup___::675b16c73accc4e7242bbb4ed9b3724a", "50|doi_dedup___::94ce09906b2d7d37eb2206cea8a50153", "50|dedup_wf_002::cc575d5ca5651ff8c3029a3a76e7e70a", "50|doi_dedup___::c5e52baddda17c755d1bae012a97dc13", "50|doi_dedup___::4f5f38c9e08fe995f7278963183f8ad4", "50|doi_dedup___::a9bc4453273b2d02648a5cb453195042", "50|doi_dedup___::5e893dc0cb7624a33f41c9b428bd59f7", "50|doi_dedup___::c1ecdef48fd9be811a291deed950e1c5", "50|doi_dedup___::9e93c8f2d97c35de8a6a57a5b53ef283", "50|dedup_wf_002::d08be0ed27b13d8a880e891e08d093ea", "50|doi_dedup___::f8d8b3b9eddeca2fc0e3bc9e63996555"), + "Exploring Multi-Scale Map Generalization and Design", + "This project aims to advance the generalization of multi-scale maps by investigating the impact of different design elements on user experience. The research involves collecting and analyzing data from various sources, including surveys, eye-tracking studies, and user experiments. The goal is to identify best practices for map generalization and design, with a focus on reducing disorientation and improving information retrieval during exploration. The project has led to the development of several datasets, including BasqueRoads, AnchorWhat, and L'Alpe d'Huez, which can be used to benchmark road selection algorithms and topographic map generalization techniques. The research has also resulted in the creation of a Python library, Cartagen4py, for map generalization. The findings of this project have the potential to improve the design and usability of multi-scale maps, making them more effective tools for navigation and information retrieval." 
+ ));
+
+ OtherResearchProduct orp = (OtherResearchProduct) atomicActions.get(0).getPayload();
+ Relation rel = (Relation) atomicActions.get(1).getPayload();
+
+ assertEquals("Exploring Multi-Scale Map Generalization and Design", orp.getTitle().get(0).getValue());
+ assertEquals("50|raid________::759a564ce5cc7360cab030c517c7366b", rel.getSource());
+ assertEquals("50|doi_dedup___::6915135e0aa39f913394513f809ae58a", rel.getTarget());
+
+ }
+
+}
diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/raid/raid_example.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/raid/raid_example.json
new file mode 100644
index 000000000..7694b605c
--- /dev/null
+++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/raid/raid_example.json
@@ -0,0 +1,6 @@
+{"raid": "-9222092103004099540", "authors": ["Department of Archaeology & Museums", "Department of Archaeology and Museums", "Department Of Archaeology & Museums"], "subjects": ["Begamganj", "Raisen", "Bhopal", "Budhni", "Malwa site survey", "सीहोर", "Gauharganj", "बुधनी", "Budni", "Berasia"], "titles": ["Malwa site survey : Raisen District, Begamganj Tahsīl, photographic documentation", "Malwa site survey : Bhopal District, photographic documentation (version 1, TIFF files)", "Malwa site survey : Raisen District, Gauharganj Tahsīl, village finds", "Malwa site survey : Sehore सीहोर District, Budni Tahsīl, photographic documentation (part 1)", "Malwa site survey: Bhopal District, Berasia Tahsīl, photographic documentation (with villages named)", "Malwa site survey : Sehore सीहोर District, Budni Tahsīl, photographic documentation (part 2)", "Malwa site survey : Bhopal District, photographic documentation (version 2, JPEG files)"], "ids": ["50|doi_dedup___::7523d165970830dd857e6cbea4302adf", "50|doi_dedup___::02309ae8a9fae291df321e317f5c5330", "50|doi_dedup___::95347ba2c4264414fab39712ee7fe481", "50|doi_dedup___::970aa708fe667596754fd02a708780f5", "50|doi_dedup___::b7cd9128cc53b1257a4f000347f339b0", "50|doi_dedup___::c7d65da0ecedef4d2c702b9db197d90c", "50|doi_dedup___::addbb67cf5046e340f342ba091bcebfa"], "title": "Documentation of Malwa Region", "summary": "This project involves the documentation of the Malwa region through photographic surveys. The surveys were conducted by the Department of Archaeology and Museums, Madhya Pradesh, and cover various districts and tahsils. The documentation includes photographic records of sites, villages, and other relevant features. The project aims to provide a comprehensive understanding of the region's cultural and historical significance.", "startDate": "2019-03-06", "endDate": "2019-03-08"}
+{"raid": "-9221424331076109424", "authors": ["Hutchings, Judy", "Ward, Catherine", "Baban, Adriana", "Dănilă, Ingrid", "Frantz, Inga", "Gardner, Frances", "Lachman, Jamie", "Lachman, Jamie M.", "Foran, Heather", "Heinrichs, Nina", "Murphy, Hugh", "Băban, Adriana", "Raleva, Marija", "Fang, Xiangming", "Jansen, Elena", "Taut, Diana", "Foran, Heather M.", "Tăut, Diana", "Ward, Catherine L.", "Williams, Margiad", "Lesco, Galina", "Brühl, Antonia"], "subjects": ["3. Good health", "5. Gender equality", "Criminology not elsewhere classified", "1. No poverty", "2. 
Zero hunger"], "titles": ["sj-docx-1-vaw-10.1177_10778012231188090 - Supplemental material for Co-Occurrence of Intimate Partner Violence Against Mothers and Maltreatment of Their Children With Behavioral Problems in Eastern Europe", "Hunger in vulnerable families in Southeastern Europe: Associations with health and violence", "Prevention of child mental health problems through parenting interventions in Southeastern Europe (RISE): study protocol for a multi-site randomised controlled trial"], "ids": ["50|doi_dedup___::a70015063e5400dae2e097ee10b4a589", "50|doi_dedup___::6e1d12026fcde9087724622ccdeed430", "50|doi_dedup___::5b7bd5d46c5d95e2ef5b36663504a67e"], "title": "Exploring the Impact of Hunger and Violence on Child Health in Southeastern Europe", "summary": "This study aims to investigate the relationship between hunger, violence, and child health in vulnerable families in Southeastern Europe. The research will explore the experiences of families in FYR Macedonia, Republic of Moldova, and Romania, and examine the associations between hunger, maltreatment, and other health indicators. The study will also test the efficacy of a parenting intervention targeting child behavioral problems in alleviating these issues. The findings of this research will contribute to the development of effective interventions to address the complex needs of vulnerable families in the region.", "startDate": "2019-06-04", "endDate": "2023-01-01"} +{"raid": "-9219052635741785098", "authors": ["Berli, Justin", "Le Mao, Bérénice", "Guillaume Touya", "Wenclik, Laura", "Courtial, Azelle", "Muehlenhaus, Ian", "Justin Berli", "Touya, Guillaume", "Gruget, Maïeul", "Azelle Courtial", "Ian Muhlenhaus", "Maïeul Gruget", "Marion Dumont", "Maïeul GRUGET", "Cécile Duchêne"], "subjects": ["cartography, zoom, pan, desert fog", "Road network", "zooming", "Pan-scalar maps", "pan-scalar map", "Python library", "QGIS", "map design", "landmarks", "Cartes transscalaires", "anchor", "disorientation", "[INFO]Computer Science [cs]", "[SHS.GEO]Humanities and Social Sciences/Geography", "cognitive cartography", "eye-tracking", "Computers in Earth Sciences", "Topographic map", "National Mapping Agency", "General Medicine", "Geography, Planning and Development", "multi-scales", "pan-scalar maps", "Selection", "cartography", "General Earth and Planetary Sciences", "progressiveness", "map generalisation", "Eye-tracker", "zoom", "algorithms", "Map Design", "cartography, map generalisation, zoom, multi-scale map", "Interactive maps", "Map generalisation", "Earth and Planetary Sciences (miscellaneous)", "Cartographic generalization", "rivers", "Benchmark", "General Environmental Science", "open source", "drawing", "Constraint", "Multi-scale maps"], "titles": ["Where do people look at during multi-scale map tasks?", "FogDetector survey raw data", "Collection of cartographic disorientation stories", "Anchorwhat dataset", "BasqueRoads: A Benchmark for Road Network Selection", "Progressive river network selection for pan-scalar maps", "BasqueRoads, a dataset to benchmark road selection algorithms", "Missing the city for buildings? 
A critical review of pan-scalar map generalization and design in contemporary zoomable maps", "Empirical approach to advance the generalisation of multi-scale maps", "L'Alpe d'Huez: a dataset to benchmark topographic map generalisation", "eye-tracking data from a survey on zooming in a pan-scalar map", "Material of the experiment \"More is Less\" from the MapMuxing project", "Cartagen4py, an open source Python library for map generalisation", "L’Alpe d’Huez: A Benchmark for Topographic Map Generalisation"], "ids": ["50|doi_dedup___::6915135e0aa39f913394513f809ae58a", "50|doi_dedup___::754e3c283639bc6e104c925ff3e34007", "50|doi_dedup___::13517477f3c1261d57a3364363ce6ce0", "50|doi_dedup___::675b16c73accc4e7242bbb4ed9b3724a", "50|doi_dedup___::94ce09906b2d7d37eb2206cea8a50153", "50|dedup_wf_002::cc575d5ca5651ff8c3029a3a76e7e70a", "50|doi_dedup___::c5e52baddda17c755d1bae012a97dc13", "50|doi_dedup___::4f5f38c9e08fe995f7278963183f8ad4", "50|doi_dedup___::a9bc4453273b2d02648a5cb453195042", "50|doi_dedup___::5e893dc0cb7624a33f41c9b428bd59f7", "50|doi_dedup___::c1ecdef48fd9be811a291deed950e1c5", "50|doi_dedup___::9e93c8f2d97c35de8a6a57a5b53ef283", "50|dedup_wf_002::d08be0ed27b13d8a880e891e08d093ea", "50|doi_dedup___::f8d8b3b9eddeca2fc0e3bc9e63996555"], "title": "Exploring Multi-Scale Map Generalization and Design", "summary": "This project aims to advance the generalization of multi-scale maps by investigating the impact of different design elements on user experience. The research involves collecting and analyzing data from various sources, including surveys, eye-tracking studies, and user experiments. The goal is to identify best practices for map generalization and design, with a focus on reducing disorientation and improving information retrieval during exploration. The project has led to the development of several datasets, including BasqueRoads, AnchorWhat, and L'Alpe d'Huez, which can be used to benchmark road selection algorithms and topographic map generalization techniques. The research has also resulted in the creation of a Python library, Cartagen4py, for map generalization. The findings of this project have the potential to improve the design and usability of multi-scale maps, making them more effective tools for navigation and information retrieval.", "startDate": "2021-09-10", "endDate": "2024-02-16"} +{"raid": "-9216828847055450272", "authors": ["Grey, Alan", "Gorelov, Sergey", "Pall, Szilard", "Merz, Pascal", "Justin A., Lemkul", "Szilárd Páll", "Pasquadibisceglie, Andrea", "Kutzner, Carsten", "Schulz, Roland", "Nabet, Julien", "Abraham, Mark", "Jalalypour, Farzaneh", "Lundborg, Magnus", "Gray, Alan", "Villa, Alessandra", "Berk Hess", "Santuz, Hubert", "Irrgang, M. 
Eric", "Wingbermuehle, Sebastian", "Lemkul, Justin A.", "Jordan, Joe", "Pellegrino, Michele", "Doijade, Mahesh", "Shvetsov, Alexey", "Hess, Berk", "Behera, Sudarshan", "Andrey Alekseenko", "Shugaeva, Tatiana", "Fleischmann, Stefan", "Bergh, Cathrine", "Morozov, Dmitry", "Adam Hospital", "Briand, Eliane", "Lindahl, Erik", "Brown, Ania", "Marta Lloret Llinares", "Miletic, Vedran", "Alekseenko, Andrey", "Gouaillardet, Gilles", "Fiorin, Giacomo", "Basov, Vladimir"], "subjects": ["webinar"], "titles": ["Introduction to HPC: molecular dynamics simulations with GROMACS: log files", "BioExcel webinar #73: Competency frameworks to support training design and professional development", "Introduction to HPC: molecular dynamics simulations with GROMACS: output files - Devana", "GROMACS 2024.0 Manual", "BioExcel Webinar #71: GROMACS-PMX for accurate estimation of free energy differences", "Introduction to HPC: molecular dynamics simulations with GROMACS: input files", "BioExcel Webinar #68: What's new in GROMACS 2023", "BioExcel Webinar #69: BioBB-Wfs and BioBB-API, integrated web-based platform and programmatic interface for biomolecular simulations workflows using the BioExcel Building Blocks library", "GROMACS 2024-beta Source code"], "ids": ["50|doi_dedup___::8318fbc815ee1943c3269be7567f220b", "50|doi_dedup___::9530e03fb2aac63e82b18a40dc09e32c", "50|doi_dedup___::30174ab31075e76a428ca5b4f4d236b8", "50|doi_________::70b7c6dce09ae6f1361d22913fdf95eb", "50|doi_dedup___::337dd48600618f3c06257edd750d6201", "50|doi_dedup___::d622992ba9077617f37ebd268b3e806d", "50|doi_dedup___::0b0bcc6825d6c052c37882fd5cfc1e8c", "50|doi_dedup___::4b1541a7cee32527c65ace5d1ed57335", "50|doi_dedup___::1379861df59bd755e4fb39b9f95ffbd3"], "title": "Exploring High-Performance Computing and Biomolecular Simulations", "summary": "This project involves exploring high-performance computing (HPC) and biomolecular simulations using GROMACS. The objectives include understanding molecular dynamics simulations, log files, input files, and output files. Additionally, the project aims to explore competency frameworks for professional development, specifically in the field of computational biomolecular research. The tools and techniques used will include GROMACS, BioExcel Building Blocks, and competency frameworks. The expected outcomes include a deeper understanding of HPC and biomolecular simulations, as well as the development of skills in using GROMACS and BioExcel Building Blocks. The project will also contribute to the development of competency frameworks for professional development in the field of computational biomolecular research.", "startDate": "2023-04-25", "endDate": "2024-01-30"} +{"raid": "-9210544816395499758", "authors": ["Bateson, Melissa", "Andrews, Clare", "Verhulst, Simon", "Nettle, Daniel", "Zuidersma, Erica"], "subjects": ["2. Zero hunger"], "titles": ["Exposure to food insecurity increases energy storage and reduces somatic maintenance in European starlings", "Data and code archive for Andrews et al. 'Exposure to food insecurity increases energy storage and reduces somatic maintenance in European starlings'"], "ids": ["50|doi_dedup___::176117239be06189523c253e0ca9c5ec", "50|doi_dedup___::343e0b0ddf0d54763a89a62af1f7a379"], "title": "Investigating the Effects of Food Insecurity on Energy Storage and Somatic Maintenance in European Starlings", "summary": "This study examines the impact of food insecurity on energy storage and somatic maintenance in European starlings. 
The research involved exposing juvenile starlings to either uninterrupted food availability or a regime of unpredictable food unavailability. The results show that birds exposed to food insecurity stored more energy, but at the expense of somatic maintenance and repair. The study provides insights into the adaptive responses of birds to food scarcity and the trade-offs involved in energy storage and maintenance.", "startDate": "2021-06-28", "endDate": "2021-06-28"} +{"raid": "-9208499171224730388", "authors": ["Maniati, Eleni", "Bakker, Bjorn", "McClelland, Sarah E.", "Shaikh, Nadeem", "De Angelis, Simone", "Johnson, Sarah C.", "Wang, Jun", "Foijer, Floris", "Spierings, Diana C. J.", "Boemo, Michael A.", "Wardenaar, René", "Mazzagatti, Alice"], "subjects": [], "titles": ["Additional file 2 of Replication stress generates distinctive landscapes of DNA copy number alterations and chromosome scale losses", "Additional file 5 of Replication stress generates distinctive landscapes of DNA copy number alterations and chromosome scale losses"], "ids": ["50|doi_dedup___::a1bfeb173971f74a274fab8bdd78a4bc", "50|doi_dedup___::3d6e151aaeb2f7c40a320207fdd80ade"], "title": "Analysis of DNA Copy Number Alterations and Chromosome Scale Losses", "summary": "This study analyzed the effects of replication stress on DNA copy number alterations and chromosome scale losses. The results show distinctive landscapes of these alterations and losses, which were further investigated in additional files. The study provides valuable insights into the mechanisms of replication stress and its impact on genomic stability.", "startDate": "2022-01-01", "endDate": "2022-01-01"} \ No newline at end of file From 6af3fd16b6362c4a9357f44ba779b7822f5b7033 Mon Sep 17 00:00:00 2001 From: miconis Date: Thu, 5 Dec 2024 14:39:42 +0100 Subject: [PATCH 097/111] attributes fixes --- .../java/eu/dnetlib/pace/tree/DateRange.java | 90 ++--- .../eu/dnetlib/pace/tree/JsonListMatch.java | 2 +- .../pace/comparators/ComparatorTest.java | 30 +- .../dnetlib/dhp/actionmanager/Constants.java | 4 +- .../raid/GenerateRAiDActionSetJob.java | 309 ++++++++++-------- .../raid/model/GenerateRAiDActionSetJob.java | 5 +- .../actionmanager/raid/model/RAiDEntity.java | 154 ++++----- .../ror/GenerateRorActionSetJob.java | 2 +- .../dhp/sx/bio/pubmed/PMAffiliation.java | 44 +-- .../dnetlib/dhp/sx/bio/pubmed/PMAuthor.java | 1 - .../dhp/sx/bio/pubmed/PMIdentifier.java | 74 ++--- .../raid/oozie_app/action_set_parameters.json | 0 .../actionmanager/raid/oozie_app/workflow.xml | 2 - .../collection/crossref/Crossref2Oaf.scala | 1 - .../dnetlib/dhp/sx/bio/pubmed/PMParser2.scala | 13 +- .../dhp/sx/bio/pubmed/PubMedToOaf.scala | 20 +- .../raid/GenerateRAiDActionSetJobTest.java | 203 +++++++----- .../dnetlib/dhp/sx/bio/BioScholixTest.scala | 1 - .../dhp/bulktag/community/ResultTagger.java | 4 +- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 3 +- .../model/ProvisionModelSupport.java | 15 +- 21 files changed, 546 insertions(+), 431 deletions(-) delete mode 100644 dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/action_set_parameters.json diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java index c913109a4..194677e6e 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java @@ -1,10 +1,5 @@ -package eu.dnetlib.pace.tree; -import 
com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractStringComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; -import org.joda.time.DateTime; +package eu.dnetlib.pace.tree; import java.time.DateTimeException; import java.time.LocalDate; @@ -13,55 +8,62 @@ import java.time.format.DateTimeFormatter; import java.util.Locale; import java.util.Map; +import org.joda.time.DateTime; + +import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + @ComparatorClass("dateRange") public class DateRange extends AbstractStringComparator { - int YEAR_RANGE; + int YEAR_RANGE; - public DateRange(Map params) { - super(params, new com.wcohen.ss.JaroWinkler()); - YEAR_RANGE = Integer.parseInt(params.getOrDefault("year_range", "3")); - } + public DateRange(Map params) { + super(params, new com.wcohen.ss.JaroWinkler()); + YEAR_RANGE = Integer.parseInt(params.getOrDefault("year_range", "3")); + } - public DateRange(final double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } + public DateRange(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } - protected DateRange(final double weight, final AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } + protected DateRange(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } - public static boolean isNumeric(String str) { - return str.matches("\\d+"); //match a number with optional '-' and decimal. - } + public static boolean isNumeric(String str) { + return str.matches("\\d+"); // match a number with optional '-' and decimal. + } - @Override - public double distance(final String a, final String b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) { - return -1.0; // return -1 if a field is missing - } + @Override + public double distance(final String a, final String b, final Config conf) { + if (a.isEmpty() || b.isEmpty()) { + return -1.0; // return -1 if a field is missing + } - try { - DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.ENGLISH); - LocalDate d1 = LocalDate.parse(a, formatter); - LocalDate d2 = LocalDate.parse(b, formatter); - Period period = Period.between(d1, d2); + try { + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.ENGLISH); + LocalDate d1 = LocalDate.parse(a, formatter); + LocalDate d2 = LocalDate.parse(b, formatter); + Period period = Period.between(d1, d2); - return period.getYears() <= YEAR_RANGE? 1.0 : 0.0; - } - catch (DateTimeException e) { - return -1.0; - } + return period.getYears() <= YEAR_RANGE ? 
1.0 : 0.0; + } catch (DateTimeException e) { + return -1.0; + } - } + } - @Override - public double getWeight() { - return super.weight; - } + @Override + public double getWeight() { + return super.weight; + } - @Override - protected double normalize(final double d) { - return d; - } + @Override + protected double normalize(final double d) { + return d; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java index e95d9206e..d9558df90 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java @@ -62,7 +62,7 @@ public class JsonListMatch extends AbstractListComparator { Set types = Sets.intersection(typesA, typesB); - if (types.isEmpty()) // if no common type, it is impossible to compare + if (types.isEmpty()) // if no common type, it is impossible to compare return -1; ca = ca.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet()); diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index 83539de4a..0abde84bc 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -72,14 +72,34 @@ public class ComparatorTest extends AbstractPaceTest { CodeMatch codeMatch = new CodeMatch(params); // names have different codes - assertEquals(0.0, codeMatch.distance("physical oceanography at ctd station june 1998 ev02a", "physical oceanography at ctd station june 1998 ir02", conf)); + assertEquals( + 0.0, + codeMatch + .distance( + "physical oceanography at ctd station june 1998 ev02a", + "physical oceanography at ctd station june 1998 ir02", conf)); // names have same code - assertEquals(1.0, codeMatch.distance("physical oceanography at ctd station june 1998 ev02a", "physical oceanography at ctd station june 1998 ev02a", conf)); + assertEquals( + 1.0, + codeMatch + .distance( + "physical oceanography at ctd station june 1998 ev02a", + "physical oceanography at ctd station june 1998 ev02a", conf)); // code is not in both names - assertEquals(-1, codeMatch.distance("physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998 ev02a", conf)); - assertEquals(1.0, codeMatch.distance("physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998", conf)); + assertEquals( + -1, + codeMatch + .distance( + "physical oceanography at ctd station june 1998", + "physical oceanography at ctd station june 1998 ev02a", conf)); + assertEquals( + 1.0, + codeMatch + .distance( + "physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998", + conf)); } @Test @@ -275,7 +295,7 @@ public class ComparatorTest extends AbstractPaceTest { Arrays .asList( "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}", - 
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_1\"}"), + "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_1\"}"), "authors"); List b = createFieldList( Arrays diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java index 722415c2e..394cc22a3 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java @@ -3,8 +3,6 @@ package eu.dnetlib.dhp.actionmanager; import java.util.Optional; -import eu.dnetlib.dhp.schema.oaf.Instance; -import eu.dnetlib.dhp.schema.oaf.Qualifier; import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; @@ -15,6 +13,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Instance; +import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.Subject; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java index 8e5e1bdcb..3b2405956 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java @@ -1,15 +1,15 @@ + package eu.dnetlib.dhp.actionmanager.raid; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.Constants; -import eu.dnetlib.dhp.common.HdfsSupport; -import eu.dnetlib.dhp.schema.action.AtomicAction; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; -import eu.dnetlib.dhp.utils.DHPUtils; +import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_ID; +import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_NAME; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; +import static 
eu.dnetlib.dhp.schema.common.ModelConstants.*; +import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; + +import java.util.*; +import java.util.stream.Collectors; + import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.SequenceFileOutputFormat; @@ -19,172 +19,191 @@ import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; import org.slf4j.Logger; import org.slf4j.LoggerFactory; + +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.common.Constants; +import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; -import java.util.*; -import java.util.stream.Collectors; - -import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_ID; -import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_NAME; -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; -import static eu.dnetlib.dhp.schema.common.ModelConstants.*; -import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; - public class GenerateRAiDActionSetJob { - private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class); + private static final Logger log = LoggerFactory + .getLogger(eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class); - private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); - private static final List RAID_COLLECTED_FROM = listKeyValues( - OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME); + private static final List RAID_COLLECTED_FROM = listKeyValues( + OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME); - private static final Qualifier RAID_QUALIFIER = qualifier("raid:openaireinference", "raid:openaireinference", DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS); + private static final Qualifier RAID_QUALIFIER = qualifier("0049", "Research Activity Identifier", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE); - private static final DataInfo RAID_DATA_INFO = dataInfo( - false, OPENAIRE_DATASOURCE_NAME, true, false, RAID_QUALIFIER, "0.92"); + private static final Qualifier RAID_INFERENCE_QUALIFIER = qualifier( + "raid:openaireinference", "Inferred by OpenAIRE", DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS); - public static void main(final String[] args) throws Exception { + private static final DataInfo RAID_DATA_INFO = dataInfo( + false, OPENAIRE_DATASOURCE_NAME, true, false, RAID_INFERENCE_QUALIFIER, "0.92"); - final String jsonConfiguration = IOUtils - .toString( - eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class - .getResourceAsStream("/eu/dnetlib/dhp/actionmanager/raid/action_set_parameters.json")); + public static void main(final String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + final String jsonConfiguration = IOUtils + .toString( + eu.dnetlib.dhp.actionmanager.raid.GenerateRAiDActionSetJob.class + .getResourceAsStream("/eu/dnetlib/dhp/actionmanager/raid/action_set_parameters.json")); - 
parser.parseArgument(args); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); - final Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); + parser.parseArgument(args); - log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + final Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); - final String inputPath = parser.get("inputPath"); - log.info("inputPath: {}", inputPath); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String outputPath = parser.get("outputPath"); - log.info("outputPath {}: ", outputPath); + final String inputPath = parser.get("inputPath"); + log.info("inputPath: {}", inputPath); - final SparkConf conf = new SparkConf(); + final String outputPath = parser.get("outputPath"); + log.info("outputPath {}: ", outputPath); - runWithSparkSession(conf, isSparkSessionManaged, spark -> { - removeOutputDir(spark, outputPath); - processRAiDEntities(spark, inputPath, outputPath); - }); - } + final SparkConf conf = new SparkConf(); - private static void removeOutputDir(final SparkSession spark, final String path) { - HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); - } + runWithSparkSession(conf, isSparkSessionManaged, spark -> { + removeOutputDir(spark, outputPath); + processRAiDEntities(spark, inputPath, outputPath); + }); + } - static void processRAiDEntities(final SparkSession spark, - final String inputPath, - final String outputPath) { - readInputPath(spark, inputPath) - .map(GenerateRAiDActionSetJob::prepareRAiD) - .flatMap(List::iterator) - .mapToPair( - aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), - new Text(OBJECT_MAPPER.writeValueAsString(aa)))) - .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); + private static void removeOutputDir(final SparkSession spark, final String path) { + HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); + } - } + static void processRAiDEntities(final SparkSession spark, + final String inputPath, + final String outputPath) { + readInputPath(spark, inputPath) + .map(GenerateRAiDActionSetJob::prepareRAiD) + .flatMap(List::iterator) + .mapToPair( + aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), + new Text(OBJECT_MAPPER.writeValueAsString(aa)))) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); - protected static List> prepareRAiD(final RAiDEntity r) { + } - final Date now = new Date(); - final OtherResearchProduct orp = new OtherResearchProduct(); - final List> res = new ArrayList<>(); - String raidId = calculateOpenaireId(r.getRaid()); + protected static List> prepareRAiD(final RAiDEntity r) { - orp.setId(raidId); - orp.setCollectedfrom(RAID_COLLECTED_FROM); - orp.setDataInfo(RAID_DATA_INFO); - orp.setResourcetype(RAID_QUALIFIER); - orp.setTitle( - Collections.singletonList( - structuredProperty( - r.getTitle(), - qualifier("main title", "main title", DNET_DATACITE_TITLE, DNET_DATACITE_TITLE), - RAID_DATA_INFO)) - ); - orp.setDescription(listFields(RAID_DATA_INFO, r.getSummary())); - orp.setAuthor(createAuthors(r.getAuthors())); - orp.setInstance(Collections.singletonList(eu.dnetlib.dhp.actionmanager.Constants.getInstance(RAID_QUALIFIER))); - orp.setSubject( - r.getSubjects() - .stream() - .map(s -> subject(s, qualifier(DNET_SUBJECT_KEYWORD, 
DNET_SUBJECT_KEYWORD, DNET_SUBJECT_TYPOLOGIES, DNET_SUBJECT_TYPOLOGIES), RAID_DATA_INFO)) - .collect(Collectors.toList()) - ); - orp.setRelevantdate( - Arrays.asList( - structuredProperty(r.getEndDate(), qualifier("endDate","endDate", DNET_DATACITE_DATE, DNET_DATACITE_DATE), RAID_DATA_INFO), - structuredProperty(r.getStartDate(), qualifier("startDate", "startDate", DNET_DATACITE_DATE, DNET_DATACITE_DATE), RAID_DATA_INFO) - ) - ); - orp.setLastupdatetimestamp(now.getTime()); - orp.setDateofcollection(r.getStartDate()); + final Date now = new Date(); + final OtherResearchProduct orp = new OtherResearchProduct(); + final List> res = new ArrayList<>(); + String raidId = calculateOpenaireId(r.getRaid()); - res.add(new AtomicAction<>(OtherResearchProduct.class, orp)); + orp.setId(raidId); + orp.setCollectedfrom(RAID_COLLECTED_FROM); + orp.setDataInfo(RAID_DATA_INFO); + orp + .setTitle( + Collections + .singletonList( + structuredProperty( + r.getTitle(), + qualifier("main title", "main title", DNET_DATACITE_TITLE, DNET_DATACITE_TITLE), + RAID_DATA_INFO))); + orp.setDescription(listFields(RAID_DATA_INFO, r.getSummary())); +// orp.setAuthor(createAuthors(r.getAuthors())); + orp.setInstance(Collections.singletonList(eu.dnetlib.dhp.actionmanager.Constants.getInstance(RAID_QUALIFIER))); + orp + .setSubject( + r + .getSubjects() + .stream() + .map( + s -> subject( + s, + qualifier( + DNET_SUBJECT_KEYWORD, DNET_SUBJECT_KEYWORD, DNET_SUBJECT_TYPOLOGIES, + DNET_SUBJECT_TYPOLOGIES), + RAID_DATA_INFO)) + .collect(Collectors.toList())); + orp + .setRelevantdate( + Arrays + .asList( + structuredProperty( + r.getEndDate(), qualifier("endDate", "endDate", DNET_DATACITE_DATE, DNET_DATACITE_DATE), + RAID_DATA_INFO), + structuredProperty( + r.getStartDate(), + qualifier("startDate", "startDate", DNET_DATACITE_DATE, DNET_DATACITE_DATE), + RAID_DATA_INFO))); + orp.setLastupdatetimestamp(now.getTime()); + orp.setDateofacceptance(field(r.getStartDate(), RAID_DATA_INFO)); - for(String resultId: r.getIds()) { - Relation rel1 = OafMapperUtils.getRelation( - raidId, - resultId, - ModelConstants.RESULT_RESULT, - ModelConstants.OUTCOME, - PART, - RAID_COLLECTED_FROM, - RAID_DATA_INFO, - now.getTime(), - null, - null - ); - Relation rel2 = OafMapperUtils.getRelation( - resultId, - raidId, - ModelConstants.RESULT_RESULT, - ModelConstants.OUTCOME, - IS_PART_OF, - RAID_COLLECTED_FROM, - RAID_DATA_INFO, - now.getTime(), - null, - null - ); - res.add(new AtomicAction<>(Relation.class, rel1)); - res.add(new AtomicAction<>(Relation.class, rel2)); - } + res.add(new AtomicAction<>(OtherResearchProduct.class, orp)); - return res; - } + for (String resultId : r.getIds()) { + Relation rel1 = OafMapperUtils + .getRelation( + raidId, + resultId, + ModelConstants.RESULT_RESULT, + PART, + HAS_PART, + RAID_COLLECTED_FROM, + RAID_DATA_INFO, + now.getTime(), + null, + null); + Relation rel2 = OafMapperUtils + .getRelation( + resultId, + raidId, + ModelConstants.RESULT_RESULT, + PART, + IS_PART_OF, + RAID_COLLECTED_FROM, + RAID_DATA_INFO, + now.getTime(), + null, + null); + res.add(new AtomicAction<>(Relation.class, rel1)); + res.add(new AtomicAction<>(Relation.class, rel2)); + } - public static String calculateOpenaireId(final String raid) { - return String.format("50|%s::%s", Constants.RAID_NS_PREFIX, DHPUtils.md5(raid)); - } + return res; + } - public static List createAuthors(final List author) { - return author.stream().map(s-> { - Author a = new Author(); - a.setFullname(s); - return a; - }).collect(Collectors.toList()); - } + public 
static String calculateOpenaireId(final String raid) { + return String.format("50|%s::%s", Constants.RAID_NS_PREFIX, DHPUtils.md5(raid)); + } - private static JavaRDD readInputPath( - final SparkSession spark, - final String path) { + public static List createAuthors(final List author) { + return author.stream().map(s -> { + Author a = new Author(); + a.setFullname(s); + return a; + }).collect(Collectors.toList()); + } - return spark - .read() - .json(path) - .as(Encoders.bean(RAiDEntity.class)) - .toJavaRDD(); + private static JavaRDD readInputPath( + final SparkSession spark, + final String path) { - } + return spark + .read() + .json(path) + .as(Encoders.bean(RAiDEntity.class)) + .toJavaRDD(); + + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/GenerateRAiDActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/GenerateRAiDActionSetJob.java index b0aec71d3..856b52e18 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/GenerateRAiDActionSetJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/GenerateRAiDActionSetJob.java @@ -1,2 +1,5 @@ -package eu.dnetlib.dhp.actionmanager.raid.model;public class GenerateRAiDActionSetJob { + +package eu.dnetlib.dhp.actionmanager.raid.model; + +public class GenerateRAiDActionSetJob { } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/RAiDEntity.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/RAiDEntity.java index bd7e28926..1203b28a7 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/RAiDEntity.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/model/RAiDEntity.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.actionmanager.raid.model; import java.io.Serializable; @@ -5,98 +6,101 @@ import java.util.List; public class RAiDEntity implements Serializable { - String raid; - List authors; - String startDate; - String endDate; - List subjects; - List titles; - List ids; - String title; - String summary; + String raid; + List authors; + String startDate; + String endDate; + List subjects; + List titles; + List ids; + String title; + String summary; - public RAiDEntity(){} - public RAiDEntity(String raid, List authors, String startDate, String endDate, List subjects, List titles, List ids, String title, String summary) { - this.raid = raid; - this.authors = authors; - this.startDate = startDate; - this.endDate = endDate; - this.subjects = subjects; - this.titles = titles; - this.ids = ids; - this.title = title; - this.summary = summary; - } + public RAiDEntity() { + } - public String getRaid() { - return raid; - } + public RAiDEntity(String raid, List authors, String startDate, String endDate, List subjects, + List titles, List ids, String title, String summary) { + this.raid = raid; + this.authors = authors; + this.startDate = startDate; + this.endDate = endDate; + this.subjects = subjects; + this.titles = titles; + this.ids = ids; + this.title = title; + this.summary = summary; + } - public void setRaid(String raid) { - this.raid = raid; - } + public String getRaid() { + return raid; + } - public List getAuthors() { - return authors; - } + public void setRaid(String raid) { + this.raid = raid; + } - public void setAuthors(List authors) { - this.authors = authors; - } + public List getAuthors() { + 
return authors; + } - public String getStartDate() { - return startDate; - } + public void setAuthors(List authors) { + this.authors = authors; + } - public void setStartDate(String startDate) { - this.startDate = startDate; - } + public String getStartDate() { + return startDate; + } - public String getEndDate() { - return endDate; - } + public void setStartDate(String startDate) { + this.startDate = startDate; + } - public void setEndDate(String endDate) { - this.endDate = endDate; - } + public String getEndDate() { + return endDate; + } - public List getSubjects() { - return subjects; - } + public void setEndDate(String endDate) { + this.endDate = endDate; + } - public void setSubjects(List subjects) { - this.subjects = subjects; - } + public List getSubjects() { + return subjects; + } - public List getTitles() { - return titles; - } + public void setSubjects(List subjects) { + this.subjects = subjects; + } - public void setTitles(List titles) { - this.titles = titles; - } + public List getTitles() { + return titles; + } - public List getIds() { - return ids; - } + public void setTitles(List titles) { + this.titles = titles; + } - public void setIds(List ids) { - this.ids = ids; - } + public List getIds() { + return ids; + } - public String getTitle() { - return title; - } + public void setIds(List ids) { + this.ids = ids; + } - public void setTitle(String title) { - this.title = title; - } + public String getTitle() { + return title; + } - public String getSummary() { - return summary; - } + public void setTitle(String title) { + this.title = title; + } - public void setSummary(String summary) { - this.summary = summary; - } + public String getSummary() { + return summary; + } + + public void setSummary(String summary) { + this.summary = summary; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java index ce1973a7f..6e8f48bda 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/ror/GenerateRorActionSetJob.java @@ -21,7 +21,6 @@ import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; -import eu.dnetlib.dhp.schema.oaf.*; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.conf.Configuration; @@ -45,6 +44,7 @@ import eu.dnetlib.dhp.common.Constants; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.utils.DHPUtils; import scala.Tuple2; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java index a8dacd132..5ac1920ea 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAffiliation.java @@ -1,3 +1,4 @@ + package eu.dnetlib.dhp.sx.bio.pubmed; /** @@ -7,32 +8,33 @@ package eu.dnetlib.dhp.sx.bio.pubmed; */ public class PMAffiliation { - private String name; + private String name; - private PMIdentifier identifier; + private PMIdentifier identifier; - public 
PMAffiliation() { + public PMAffiliation() { - } - public PMAffiliation(String name, PMIdentifier identifier) { - this.name = name; - this.identifier = identifier; - } + } - public String getName() { - return name; - } + public PMAffiliation(String name, PMIdentifier identifier) { + this.name = name; + this.identifier = identifier; + } - public void setName(String name) { - this.name = name; - } + public String getName() { + return name; + } - public PMIdentifier getIdentifier() { - return identifier; - } + public void setName(String name) { + this.name = name; + } - public PMAffiliation setIdentifier(PMIdentifier identifier) { - this.identifier = identifier; - return this; - } + public PMIdentifier getIdentifier() { + return identifier; + } + + public PMAffiliation setIdentifier(PMIdentifier identifier) { + this.identifier = identifier; + return this; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java index b0df25663..e023f2e62 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMAuthor.java @@ -97,5 +97,4 @@ public class PMAuthor implements Serializable { this.affiliation = affiliation; } - } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java index 0c8c55e40..6cd17a90c 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/sx/bio/pubmed/PMIdentifier.java @@ -1,53 +1,53 @@ + package eu.dnetlib.dhp.sx.bio.pubmed; public class PMIdentifier { - private String pid; - private String type; + private String pid; + private String type; + public PMIdentifier(String pid, String type) { + this.pid = cleanPid(pid); + this.type = type; + } - public PMIdentifier(String pid, String type) { - this.pid = cleanPid(pid); - this.type = type; - } + public PMIdentifier() { - public PMIdentifier() { + } - } + private String cleanPid(String pid) { - private String cleanPid(String pid) { + if (pid == null) { + return null; + } - if (pid == null) { - return null; - } + // clean ORCID ID in the form 0000000163025705 to 0000-0001-6302-5705 + if (pid.matches("[0-9]{15}[0-9X]")) { + return pid.replaceAll("(.{4})(.{4})(.{4})(.{4})", "$1-$2-$3-$4"); + } - // clean ORCID ID in the form 0000000163025705 to 0000-0001-6302-5705 - if (pid.matches("[0-9]{15}[0-9X]")) { - return pid.replaceAll("(.{4})(.{4})(.{4})(.{4})", "$1-$2-$3-$4"); - } + // clean ORCID in the form http://orcid.org/0000-0001-8567-3543 to 0000-0001-8567-3543 + if (pid.matches("http://orcid.org/[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}")) { + return pid.replaceAll("http://orcid.org/", ""); + } + return pid; + } - // clean ORCID in the form http://orcid.org/0000-0001-8567-3543 to 0000-0001-8567-3543 - if (pid.matches("http://orcid.org/[0-9]{4}-[0-9]{4}-[0-9]{4}-[0-9]{4}")) { - return pid.replaceAll("http://orcid.org/", ""); - } - return pid; - } + public String getPid() { + return pid; + } - public String getPid() { - return pid; - } + public PMIdentifier setPid(String pid) { + this.pid = cleanPid(pid); + return this; + } - public PMIdentifier setPid(String pid) { - this.pid = cleanPid(pid); - return this; - } + public String getType() { + 
return type; + } - public String getType() { - return type; - } - - public PMIdentifier setType(String type) { - this.type = type; - return this; - } + public PMIdentifier setType(String type) { + this.type = type; + return this; + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/action_set_parameters.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/action_set_parameters.json deleted file mode 100644 index e69de29bb..000000000 diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/workflow.xml b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/workflow.xml index 9b5aa5905..d3392596f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/raid/oozie_app/workflow.xml @@ -20,8 +20,6 @@ - - diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala index e4a238c8f..de68ebb58 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala @@ -673,7 +673,6 @@ case object Crossref2Oaf { val doi = input.getString(0) val rorId = input.getString(1) - val pubId = IdentifierFactory.idFromPid("50", "doi", DoiCleaningRule.clean(doi), true) val affId = GenerateRorActionSetJob.calculateOpenaireId(rorId) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala index 2eb4bea65..bc9a2cf02 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PMParser2.scala @@ -82,21 +82,22 @@ class PMParser2 { a.setLastName((author \ "LastName").text) a.setForeName((author \ "ForeName").text) val id = (author \ "Identifier").text - val idType =(author \ "Identifier" \ "@Source").text + val idType = (author \ "Identifier" \ "@Source").text - if(id != null && id.nonEmpty && idType != null && idType.nonEmpty) { + if (id != null && id.nonEmpty && idType != null && idType.nonEmpty) { a.setIdentifier(new PMIdentifier(id, idType)) } - val affiliation = (author \ "AffiliationInfo" \ "Affiliation").text - val affiliationId = (author \ "AffiliationInfo" \ "Identifier").text + val affiliationId = (author \ "AffiliationInfo" \ "Identifier").text val affiliationIdType = (author \ "AffiliationInfo" \ "Identifier" \ "@Source").text - if(affiliation != null && affiliation.nonEmpty) { + if (affiliation != null && affiliation.nonEmpty) { val aff = new PMAffiliation() aff.setName(affiliation) - if(affiliationId != null && affiliationId.nonEmpty && affiliationIdType != null && affiliationIdType.nonEmpty) { + if ( + affiliationId != null && affiliationId.nonEmpty && affiliationIdType != null && affiliationIdType.nonEmpty + ) { aff.setIdentifier(new PMIdentifier(affiliationId, affiliationIdType)) } a.setAffiliation(aff) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala 
b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala index 5e14c731a..281ca0e07 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/pubmed/PubMedToOaf.scala @@ -294,11 +294,23 @@ object PubMedToOaf { author.setName(a.getForeName) author.setSurname(a.getLastName) author.setFullname(a.getFullName) - if(a.getIdentifier != null) { - author.setPid(List(OafMapperUtils.structuredProperty(a.getIdentifier.getPid, - OafMapperUtils.qualifier(a.getIdentifier.getType,a.getIdentifier.getType,ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES), dataInfo)).asJava) + if (a.getIdentifier != null) { + author.setPid( + List( + OafMapperUtils.structuredProperty( + a.getIdentifier.getPid, + OafMapperUtils.qualifier( + a.getIdentifier.getType, + a.getIdentifier.getType, + ModelConstants.DNET_PID_TYPES, + ModelConstants.DNET_PID_TYPES + ), + dataInfo + ) + ).asJava + ) } - if (a.getAffiliation!= null) + if (a.getAffiliation != null) author.setRawAffiliationString(List(a.getAffiliation.getName).asJava) author.setRank(index + 1) author diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJobTest.java index 1f33f45b2..9417822af 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJobTest.java @@ -1,11 +1,16 @@ + package eu.dnetlib.dhp.actionmanager.raid; -import eu.dnetlib.dhp.actionmanager.opencitations.CreateOpenCitationsASTest; -import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity; -import eu.dnetlib.dhp.schema.action.AtomicAction; -import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; -import eu.dnetlib.dhp.schema.oaf.Relation; +import static java.nio.file.Files.createTempDirectory; + +import static eu.dnetlib.dhp.actionmanager.Constants.OBJECT_MAPPER; +import static org.junit.jupiter.api.Assertions.assertEquals; + +import java.io.File; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.List; + import org.apache.commons.io.FileUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; @@ -20,93 +25,141 @@ import org.junit.jupiter.api.AfterAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; + +import eu.dnetlib.dhp.actionmanager.opencitations.CreateOpenCitationsASTest; +import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct; +import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; -import java.io.File; -import java.nio.file.Paths; -import java.util.Arrays; -import java.util.List; - -import static eu.dnetlib.dhp.actionmanager.Constants.OBJECT_MAPPER; -import static java.nio.file.Files.createTempDirectory; -import static org.junit.jupiter.api.Assertions.assertEquals; - public class GenerateRAiDActionSetJobTest { - private static String input_path; - private static String output_path; - static SparkSession spark; + private static String input_path; + private 
static String output_path; + static SparkSession spark; - @BeforeEach - void setUp() throws Exception { + @BeforeEach + void setUp() throws Exception { - input_path = Paths - .get(GenerateRAiDActionSetJobTest.class.getResource("/eu/dnetlib/dhp/actionmanager/raid/raid_example.json").toURI()) - .toFile() - .getAbsolutePath(); + input_path = Paths + .get( + GenerateRAiDActionSetJobTest.class + .getResource("/eu/dnetlib/dhp/actionmanager/raid/raid_example.json") + .toURI()) + .toFile() + .getAbsolutePath(); - output_path = createTempDirectory(GenerateRAiDActionSetJobTest.class.getSimpleName() + "-") - .toAbsolutePath() - .toString(); + output_path = createTempDirectory(GenerateRAiDActionSetJobTest.class.getSimpleName() + "-") + .toAbsolutePath() + .toString(); - SparkConf conf = new SparkConf(); - conf.setAppName(GenerateRAiDActionSetJobTest.class.getSimpleName()); + SparkConf conf = new SparkConf(); + conf.setAppName(GenerateRAiDActionSetJobTest.class.getSimpleName()); - conf.setMaster("local[*]"); - conf.set("spark.driver.host", "localhost"); - conf.set("hive.metastore.local", "true"); - conf.set("spark.ui.enabled", "false"); - conf.set("spark.sql.warehouse.dir", output_path); - conf.set("hive.metastore.warehouse.dir", output_path); + conf.setMaster("local[*]"); + conf.set("spark.driver.host", "localhost"); + conf.set("hive.metastore.local", "true"); + conf.set("spark.ui.enabled", "false"); + conf.set("spark.sql.warehouse.dir", output_path); + conf.set("hive.metastore.warehouse.dir", output_path); - spark = SparkSession - .builder() - .appName(GenerateRAiDActionSetJobTest.class.getSimpleName()) - .config(conf) - .getOrCreate(); - } + spark = SparkSession + .builder() + .appName(GenerateRAiDActionSetJobTest.class.getSimpleName()) + .config(conf) + .getOrCreate(); + } - @AfterAll - static void cleanUp() throws Exception { - FileUtils.deleteDirectory(new File(output_path)); - } + @AfterAll + static void cleanUp() throws Exception { + FileUtils.deleteDirectory(new File(output_path)); + } - @Test - @Disabled - void testProcessRAiDEntities() { - GenerateRAiDActionSetJob.processRAiDEntities(spark, input_path, output_path + "/test_raid_action_set"); + @Test + @Disabled + void testProcessRAiDEntities() { + GenerateRAiDActionSetJob.processRAiDEntities(spark, input_path, output_path + "/test_raid_action_set"); - JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); + JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaRDD result = sc - .sequenceFile(output_path + "/test_raid_action_set", Text.class, Text.class) - .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) - .map(AtomicAction::getPayload); + JavaRDD result = sc + .sequenceFile(output_path + "/test_raid_action_set", Text.class, Text.class) + .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) + .map(AtomicAction::getPayload); - assertEquals(80, result.count()); - } + assertEquals(80, result.count()); + } - @Test - void testPrepareRAiD() { + @Test + void testPrepareRAiD() { - List> atomicActions = GenerateRAiDActionSetJob.prepareRAiD(new RAiDEntity( - "-92190526", - Arrays.asList("Berli, Justin", "Le Mao, Bérénice", "Guillaume Touya", "Wenclik, Laura", "Courtial, Azelle", "Muehlenhaus, Ian", "Justin Berli", "Touya, Guillaume", "Gruget, Maïeul", "Azelle Courtial", "Ian Muhlenhaus", "Maïeul Gruget", "Marion Dumont", "Maïeul GRUGET", "Cécile Duchêne"), - "2021-09-10", - "2024-02-16", - Arrays.asList("cartography, zoom, pan, desert 
fog", "Road network", "zooming", "Pan-scalar maps", "pan-scalar map", "Python library", "QGIS", "map design", "landmarks", "Cartes transscalaires", "anchor", "disorientation", "[INFO]Computer Science [cs]", "[SHS.GEO]Humanities and Social Sciences/Geography", "cognitive cartography", "eye-tracking", "Computers in Earth Sciences", "Topographic map", "National Mapping Agency", "General Medicine", "Geography, Planning and Development", "multi-scales", "pan-scalar maps", "Selection", "cartography", "General Earth and Planetary Sciences", "progressiveness", "map generalisation", "Eye-tracker", "zoom", "algorithms", "Map Design", "cartography, map generalisation, zoom, multi-scale map", "Interactive maps", "Map generalisation", "Earth and Planetary Sciences (miscellaneous)", "Cartographic generalization", "rivers", "Benchmark", "General Environmental Science", "open source", "drawing", "Constraint", "Multi-scale maps"), - Arrays.asList("Where do people look at during multi-scale map tasks?", "FogDetector survey raw data", "Collection of cartographic disorientation stories", "Anchorwhat dataset", "BasqueRoads: A Benchmark for Road Network Selection", "Progressive river network selection for pan-scalar maps", "BasqueRoads, a dataset to benchmark road selection algorithms", "Missing the city for buildings? A critical review of pan-scalar map generalization and design in contemporary zoomable maps", "Empirical approach to advance the generalisation of multi-scale maps", "L'Alpe d'Huez: a dataset to benchmark topographic map generalisation", "eye-tracking data from a survey on zooming in a pan-scalar map", "Material of the experiment 'More is Less' from the MapMuxing project", "Cartagen4py, an open source Python library for map generalisation", "L’Alpe d’Huez: A Benchmark for Topographic Map Generalisation"), - Arrays.asList("50|doi_dedup___::6915135e0aa39f913394513f809ae58a", "50|doi_dedup___::754e3c283639bc6e104c925ff3e34007", "50|doi_dedup___::13517477f3c1261d57a3364363ce6ce0", "50|doi_dedup___::675b16c73accc4e7242bbb4ed9b3724a", "50|doi_dedup___::94ce09906b2d7d37eb2206cea8a50153", "50|dedup_wf_002::cc575d5ca5651ff8c3029a3a76e7e70a", "50|doi_dedup___::c5e52baddda17c755d1bae012a97dc13", "50|doi_dedup___::4f5f38c9e08fe995f7278963183f8ad4", "50|doi_dedup___::a9bc4453273b2d02648a5cb453195042", "50|doi_dedup___::5e893dc0cb7624a33f41c9b428bd59f7", "50|doi_dedup___::c1ecdef48fd9be811a291deed950e1c5", "50|doi_dedup___::9e93c8f2d97c35de8a6a57a5b53ef283", "50|dedup_wf_002::d08be0ed27b13d8a880e891e08d093ea", "50|doi_dedup___::f8d8b3b9eddeca2fc0e3bc9e63996555"), - "Exploring Multi-Scale Map Generalization and Design", - "This project aims to advance the generalization of multi-scale maps by investigating the impact of different design elements on user experience. The research involves collecting and analyzing data from various sources, including surveys, eye-tracking studies, and user experiments. The goal is to identify best practices for map generalization and design, with a focus on reducing disorientation and improving information retrieval during exploration. The project has led to the development of several datasets, including BasqueRoads, AnchorWhat, and L'Alpe d'Huez, which can be used to benchmark road selection algorithms and topographic map generalization techniques. The research has also resulted in the creation of a Python library, Cartagen4py, for map generalization. 
The findings of this project have the potential to improve the design and usability of multi-scale maps, making them more effective tools for navigation and information retrieval." - )); + List> atomicActions = GenerateRAiDActionSetJob + .prepareRAiD( + new RAiDEntity( + "-92190526", + Arrays + .asList( + "Berli, Justin", "Le Mao, Bérénice", "Guillaume Touya", "Wenclik, Laura", + "Courtial, Azelle", "Muehlenhaus, Ian", "Justin Berli", "Touya, Guillaume", + "Gruget, Maïeul", "Azelle Courtial", "Ian Muhlenhaus", "Maïeul Gruget", "Marion Dumont", + "Maïeul GRUGET", "Cécile Duchêne"), + "2021-09-10", + "2024-02-16", + Arrays + .asList( + "cartography, zoom, pan, desert fog", "Road network", "zooming", "Pan-scalar maps", + "pan-scalar map", "Python library", "QGIS", "map design", "landmarks", + "Cartes transscalaires", "anchor", "disorientation", "[INFO]Computer Science [cs]", + "[SHS.GEO]Humanities and Social Sciences/Geography", "cognitive cartography", + "eye-tracking", "Computers in Earth Sciences", "Topographic map", "National Mapping Agency", + "General Medicine", "Geography, Planning and Development", "multi-scales", + "pan-scalar maps", "Selection", "cartography", "General Earth and Planetary Sciences", + "progressiveness", "map generalisation", "Eye-tracker", "zoom", "algorithms", "Map Design", + "cartography, map generalisation, zoom, multi-scale map", "Interactive maps", + "Map generalisation", "Earth and Planetary Sciences (miscellaneous)", + "Cartographic generalization", "rivers", "Benchmark", "General Environmental Science", + "open source", "drawing", "Constraint", "Multi-scale maps"), + Arrays + .asList( + "Where do people look at during multi-scale map tasks?", "FogDetector survey raw data", + "Collection of cartographic disorientation stories", "Anchorwhat dataset", + "BasqueRoads: A Benchmark for Road Network Selection", + "Progressive river network selection for pan-scalar maps", + "BasqueRoads, a dataset to benchmark road selection algorithms", + "Missing the city for buildings? A critical review of pan-scalar map generalization and design in contemporary zoomable maps", + "Empirical approach to advance the generalisation of multi-scale maps", + "L'Alpe d'Huez: a dataset to benchmark topographic map generalisation", + "eye-tracking data from a survey on zooming in a pan-scalar map", + "Material of the experiment 'More is Less' from the MapMuxing project", + "Cartagen4py, an open source Python library for map generalisation", + "L’Alpe d’Huez: A Benchmark for Topographic Map Generalisation"), + Arrays + .asList( + "50|doi_dedup___::6915135e0aa39f913394513f809ae58a", + "50|doi_dedup___::754e3c283639bc6e104c925ff3e34007", + "50|doi_dedup___::13517477f3c1261d57a3364363ce6ce0", + "50|doi_dedup___::675b16c73accc4e7242bbb4ed9b3724a", + "50|doi_dedup___::94ce09906b2d7d37eb2206cea8a50153", + "50|dedup_wf_002::cc575d5ca5651ff8c3029a3a76e7e70a", + "50|doi_dedup___::c5e52baddda17c755d1bae012a97dc13", + "50|doi_dedup___::4f5f38c9e08fe995f7278963183f8ad4", + "50|doi_dedup___::a9bc4453273b2d02648a5cb453195042", + "50|doi_dedup___::5e893dc0cb7624a33f41c9b428bd59f7", + "50|doi_dedup___::c1ecdef48fd9be811a291deed950e1c5", + "50|doi_dedup___::9e93c8f2d97c35de8a6a57a5b53ef283", + "50|dedup_wf_002::d08be0ed27b13d8a880e891e08d093ea", + "50|doi_dedup___::f8d8b3b9eddeca2fc0e3bc9e63996555"), + "Exploring Multi-Scale Map Generalization and Design", + "This project aims to advance the generalization of multi-scale maps by investigating the impact of different design elements on user experience. 
The research involves collecting and analyzing data from various sources, including surveys, eye-tracking studies, and user experiments. The goal is to identify best practices for map generalization and design, with a focus on reducing disorientation and improving information retrieval during exploration. The project has led to the development of several datasets, including BasqueRoads, AnchorWhat, and L'Alpe d'Huez, which can be used to benchmark road selection algorithms and topographic map generalization techniques. The research has also resulted in the creation of a Python library, Cartagen4py, for map generalization. The findings of this project have the potential to improve the design and usability of multi-scale maps, making them more effective tools for navigation and information retrieval.")); - OtherResearchProduct orp = (OtherResearchProduct) atomicActions.get(0).getPayload(); - Relation rel = (Relation) atomicActions.get(1).getPayload(); + OtherResearchProduct orp = (OtherResearchProduct) atomicActions.get(0).getPayload(); + Relation rel = (Relation) atomicActions.get(1).getPayload(); - assertEquals("Exploring Multi-Scale Map Generalization and Design", orp.getTitle().get(0).getValue()); - assertEquals("50|raid________::759a564ce5cc7360cab030c517c7366b", rel.getSource()); - assertEquals("50|doi_dedup___::6915135e0aa39f913394513f809ae58a", rel.getTarget()); + assertEquals("Exploring Multi-Scale Map Generalization and Design", orp.getTitle().get(0).getValue()); + assertEquals("50|raid________::759a564ce5cc7360cab030c517c7366b", rel.getSource()); + assertEquals("50|doi_dedup___::6915135e0aa39f913394513f809ae58a", rel.getTarget()); - } + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala index 4a926df01..cb7826dbf 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala @@ -63,7 +63,6 @@ class BioScholixTest extends AbstractVocabularyTest { "0000000333457333", "0000000335964515", "0000000302921949", - "http://orcid.org/0000-0001-8567-3543", "http://orcid.org/0000-0001-7868-8528", "0000-0001-9189-1440", diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java index 64cbd70ba..0d6c81627 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java @@ -130,7 +130,7 @@ public class ResultTagger implements Serializable { // log.info("Remove constraints for " + communityId); if (conf.getRemoveConstraintsMap().keySet().contains(communityId) && conf.getRemoveConstraintsMap().get(communityId).getCriteria() != null && - !conf.getRemoveConstraintsMap().get(communityId).getCriteria().isEmpty() && + !conf.getRemoveConstraintsMap().get(communityId).getCriteria().isEmpty() && conf .getRemoveConstraintsMap() .get(communityId) @@ -228,7 +228,7 @@ public class ResultTagger implements Serializable { .forEach(communityId -> { if (!removeCommunities.contains(communityId) && conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null && - !conf.getSelectionConstraintsMap().get(communityId).getCriteria().isEmpty() && + 
!conf.getSelectionConstraintsMap().get(communityId).getCriteria().isEmpty() && conf .getSelectionConstraintsMap() .get(communityId) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index ea9503d17..e1710db54 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -915,7 +915,8 @@ class MappersTest { @Test void testODFRecord_guidelines4() throws IOException { - final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_guidelines4.xml"))); + final String xml = IOUtils + .toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_guidelines4.xml"))); final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); final Publication p = (Publication) list.get(0); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java index 0da0f6955..2c977a390 100644 --- a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java @@ -5,7 +5,6 @@ import java.io.StringReader; import java.util.*; import java.util.stream.Collectors; -import eu.dnetlib.dhp.schema.solr.PersonTopic; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.DocumentException; @@ -40,6 +39,7 @@ import eu.dnetlib.dhp.schema.solr.OpenAccessColor; import eu.dnetlib.dhp.schema.solr.OpenAccessRoute; import eu.dnetlib.dhp.schema.solr.Organization; import eu.dnetlib.dhp.schema.solr.Person; +import eu.dnetlib.dhp.schema.solr.PersonTopic; import eu.dnetlib.dhp.schema.solr.Pid; import eu.dnetlib.dhp.schema.solr.Project; import eu.dnetlib.dhp.schema.solr.Result; @@ -216,11 +216,14 @@ public class ProvisionModelSupport { } private static List mapPersonTopics(List subjects) { - return Optional.ofNullable(subjects) - .map(ss -> ss.stream() - .map(ProvisionModelSupport::mapPersonTopic) - .collect(Collectors.toList())) - .orElse(null); + return Optional + .ofNullable(subjects) + .map( + ss -> ss + .stream() + .map(ProvisionModelSupport::mapPersonTopic) + .collect(Collectors.toList())) + .orElse(null); } private static PersonTopic mapPersonTopic(eu.dnetlib.dhp.schema.oaf.PersonTopic pt) { From fed13e083e8d9370d7d16885c17c13a8aa1a8d46 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Thu, 5 Dec 2024 15:21:32 +0100 Subject: [PATCH 098/111] Fix: do not import joda formatting --- .../java/eu/dnetlib/pace/tree/DateRange.java | 88 +++++++++---------- .../eu/dnetlib/pace/tree/JsonListMatch.java | 2 +- .../pace/comparators/ComparatorTest.java | 30 +++++-- .../dhp/bulktag/community/ResultTagger.java | 4 +- .../dnetlib/dhp/oa/graph/raw/MappersTest.java | 3 +- .../model/ProvisionModelSupport.java | 15 ++-- 6 files changed, 83 insertions(+), 59 deletions(-) diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java index c913109a4..6349c944b 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java +++ 
b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/DateRange.java @@ -1,10 +1,5 @@ -package eu.dnetlib.pace.tree; -import com.wcohen.ss.AbstractStringDistance; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.tree.support.AbstractStringComparator; -import eu.dnetlib.pace.tree.support.ComparatorClass; -import org.joda.time.DateTime; +package eu.dnetlib.pace.tree; import java.time.DateTimeException; import java.time.LocalDate; @@ -13,55 +8,60 @@ import java.time.format.DateTimeFormatter; import java.util.Locale; import java.util.Map; +import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + @ComparatorClass("dateRange") public class DateRange extends AbstractStringComparator { - int YEAR_RANGE; + int YEAR_RANGE; - public DateRange(Map<String, String> params) { - super(params, new com.wcohen.ss.JaroWinkler()); - YEAR_RANGE = Integer.parseInt(params.getOrDefault("year_range", "3")); - } + public DateRange(Map<String, String> params) { + super(params, new com.wcohen.ss.JaroWinkler()); + YEAR_RANGE = Integer.parseInt(params.getOrDefault("year_range", "3")); + } - public DateRange(final double weight) { - super(weight, new com.wcohen.ss.JaroWinkler()); - } + public DateRange(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } - protected DateRange(final double weight, final AbstractStringDistance ssalgo) { - super(weight, ssalgo); - } + protected DateRange(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } - public static boolean isNumeric(String str) { - return str.matches("\\d+"); //match a number with optional '-' and decimal. - } + public static boolean isNumeric(String str) { + return str.matches("\\d+"); // match digits-only strings (no sign, no decimal part) + } - @Override - public double distance(final String a, final String b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) { - return -1.0; // return -1 if a field is missing - } + @Override + public double distance(final String a, final String b, final Config conf) { + if (a.isEmpty() || b.isEmpty()) { + return -1.0; // return -1 if a field is missing + } - try { - DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.ENGLISH); - LocalDate d1 = LocalDate.parse(a, formatter); - LocalDate d2 = LocalDate.parse(b, formatter); - Period period = Period.between(d1, d2); + try { + DateTimeFormatter formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd", Locale.ENGLISH); + LocalDate d1 = LocalDate.parse(a, formatter); + LocalDate d2 = LocalDate.parse(b, formatter); + Period period = Period.between(d1, d2); - return period.getYears() <= YEAR_RANGE? 
1.0 : 0.0; + } catch (DateTimeException e) { + return -1.0; + } - } + } - @Override - public double getWeight() { - return super.weight; - } + @Override + public double getWeight() { + return super.weight; + } - @Override - protected double normalize(final double d) { - return d; - } + @Override + protected double normalize(final double d) { + return d; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java index e95d9206e..d9558df90 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/JsonListMatch.java @@ -62,7 +62,7 @@ public class JsonListMatch extends AbstractListComparator { Set types = Sets.intersection(typesA, typesB); - if (types.isEmpty()) // if no common type, it is impossible to compare + if (types.isEmpty()) // if no common type, it is impossible to compare return -1; ca = ca.stream().filter(s -> types.contains(s.split("::")[0])).collect(Collectors.toSet()); diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java index 83539de4a..0abde84bc 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/comparators/ComparatorTest.java @@ -72,14 +72,34 @@ public class ComparatorTest extends AbstractPaceTest { CodeMatch codeMatch = new CodeMatch(params); // names have different codes - assertEquals(0.0, codeMatch.distance("physical oceanography at ctd station june 1998 ev02a", "physical oceanography at ctd station june 1998 ir02", conf)); + assertEquals( + 0.0, + codeMatch + .distance( + "physical oceanography at ctd station june 1998 ev02a", + "physical oceanography at ctd station june 1998 ir02", conf)); // names have same code - assertEquals(1.0, codeMatch.distance("physical oceanography at ctd station june 1998 ev02a", "physical oceanography at ctd station june 1998 ev02a", conf)); + assertEquals( + 1.0, + codeMatch + .distance( + "physical oceanography at ctd station june 1998 ev02a", + "physical oceanography at ctd station june 1998 ev02a", conf)); // code is not in both names - assertEquals(-1, codeMatch.distance("physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998 ev02a", conf)); - assertEquals(1.0, codeMatch.distance("physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998", conf)); + assertEquals( + -1, + codeMatch + .distance( + "physical oceanography at ctd station june 1998", + "physical oceanography at ctd station june 1998 ev02a", conf)); + assertEquals( + 1.0, + codeMatch + .distance( + "physical oceanography at ctd station june 1998", "physical oceanography at ctd station june 1998", + conf)); } @Test @@ -275,7 +295,7 @@ public class ComparatorTest extends AbstractPaceTest { Arrays .asList( "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"grid\",\"classname\":\"GRID Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"grid_1\"}", - 
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_1\"}"), + "{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"ror\",\"classname\":\"Research Organization Registry\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"ror_1\"}"), "authors"); List b = createFieldList( Arrays diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java index 64cbd70ba..0d6c81627 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/ResultTagger.java @@ -130,7 +130,7 @@ public class ResultTagger implements Serializable { // log.info("Remove constraints for " + communityId); if (conf.getRemoveConstraintsMap().keySet().contains(communityId) && conf.getRemoveConstraintsMap().get(communityId).getCriteria() != null && - !conf.getRemoveConstraintsMap().get(communityId).getCriteria().isEmpty() && + !conf.getRemoveConstraintsMap().get(communityId).getCriteria().isEmpty() && conf .getRemoveConstraintsMap() .get(communityId) @@ -228,7 +228,7 @@ public class ResultTagger implements Serializable { .forEach(communityId -> { if (!removeCommunities.contains(communityId) && conf.getSelectionConstraintsMap().get(communityId).getCriteria() != null && - !conf.getSelectionConstraintsMap().get(communityId).getCriteria().isEmpty() && + !conf.getSelectionConstraintsMap().get(communityId).getCriteria().isEmpty() && conf .getSelectionConstraintsMap() .get(communityId) diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java index ea9503d17..e1710db54 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/raw/MappersTest.java @@ -915,7 +915,8 @@ class MappersTest { @Test void testODFRecord_guidelines4() throws IOException { - final String xml = IOUtils.toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_guidelines4.xml"))); + final String xml = IOUtils + .toString(Objects.requireNonNull(getClass().getResourceAsStream("odf_guidelines4.xml"))); final List list = new OdfToOafMapper(vocs, false, true).processMdRecord(xml); final Publication p = (Publication) list.get(0); diff --git a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java index 0da0f6955..2c977a390 100644 --- 
a/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java +++ b/dhp-workflows/dhp-graph-provision/src/main/java/eu/dnetlib/dhp/oa/provision/model/ProvisionModelSupport.java @@ -5,7 +5,6 @@ import java.io.StringReader; import java.util.*; import java.util.stream.Collectors; -import eu.dnetlib.dhp.schema.solr.PersonTopic; import org.apache.commons.lang3.StringUtils; import org.dom4j.Document; import org.dom4j.DocumentException; @@ -40,6 +39,7 @@ import eu.dnetlib.dhp.schema.solr.OpenAccessColor; import eu.dnetlib.dhp.schema.solr.OpenAccessRoute; import eu.dnetlib.dhp.schema.solr.Organization; import eu.dnetlib.dhp.schema.solr.Person; +import eu.dnetlib.dhp.schema.solr.PersonTopic; import eu.dnetlib.dhp.schema.solr.Pid; import eu.dnetlib.dhp.schema.solr.Project; import eu.dnetlib.dhp.schema.solr.Result; @@ -216,11 +216,14 @@ public class ProvisionModelSupport { } private static List mapPersonTopics(List subjects) { - return Optional.ofNullable(subjects) - .map(ss -> ss.stream() - .map(ProvisionModelSupport::mapPersonTopic) - .collect(Collectors.toList())) - .orElse(null); + return Optional + .ofNullable(subjects) + .map( + ss -> ss + .stream() + .map(ProvisionModelSupport::mapPersonTopic) + .collect(Collectors.toList())) + .orElse(null); } private static PersonTopic mapPersonTopic(eu.dnetlib.dhp.schema.oaf.PersonTopic pt) { From fd1038b44d10e17146d0aac51841818460872eca Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 6 Dec 2024 09:12:06 +0100 Subject: [PATCH 099/111] removed a sneaky break that was committed by mistake. --- .../java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java | 1 - 1 file changed, 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java index 11f4c55d8..8172456bb 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java @@ -154,7 +154,6 @@ public class ORCIDExtractor extends Thread { extractedItem++; if (extractedItem % 100000 == 0) { log.info("Thread {}: Extracted {} items", id, extractedItem); - break; } } } From 1c144a4dcb951319ed88a7cac4825837c5385316 Mon Sep 17 00:00:00 2001 From: miconis Date: Fri, 6 Dec 2024 09:18:10 +0100 Subject: [PATCH 100/111] minor change --- .../dhp/actionmanager/raid/GenerateRAiDActionSetJob.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java index 3b2405956..c82934cdb 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java @@ -43,7 +43,8 @@ public class GenerateRAiDActionSetJob { private static final List RAID_COLLECTED_FROM = listKeyValues( OPENAIRE_DATASOURCE_ID, OPENAIRE_DATASOURCE_NAME); - private static final Qualifier RAID_QUALIFIER = qualifier("0049", "Research Activity Identifier", DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE); + private static final Qualifier RAID_QUALIFIER = qualifier( + "0049", "Research Activity Identifier", 
DNET_PUBLICATION_RESOURCE, DNET_PUBLICATION_RESOURCE); private static final Qualifier RAID_INFERENCE_QUALIFIER = qualifier( "raid:openaireinference", "Inferred by OpenAIRE", DNET_PROVENANCE_ACTIONS, DNET_PROVENANCE_ACTIONS); From dade7d5bb86d4030fc2b69f3a26940e055e216eb Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 6 Dec 2024 10:02:07 +0100 Subject: [PATCH 101/111] minor changes --- .../java/eu/dnetlib/dhp/common/Constants.java | 5 ++-- .../raid/GenerateRAiDActionSetJob.java | 27 +++++++------------ 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java index b00199ea5..6a4bb34d3 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/Constants.java @@ -11,8 +11,9 @@ public class Constants { public static final Map coarCodeLabelMap = Maps.newHashMap(); public static final String RAID_NS_PREFIX = "raid________"; - public static final String RAID_DATASOURCE_NAME = "Research Activity Identifier Service (RAiD)"; - public static final String RAID_OPENAIRE_ID = ""; + + public static final String END_DATE = "endDate"; + public static final String START_DATE = "startDate"; public static final String ROR_NS_PREFIX = "ror_________"; diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java index c82934cdb..e67e7171f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/raid/GenerateRAiDActionSetJob.java @@ -3,6 +3,7 @@ package eu.dnetlib.dhp.actionmanager.raid; import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_ID; import static eu.dnetlib.dhp.actionmanager.personentity.ExtractPerson.OPENAIRE_DATASOURCE_NAME; +import static eu.dnetlib.dhp.common.Constants.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*; @@ -24,7 +25,6 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.actionmanager.raid.model.RAiDEntity; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.common.Constants; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.common.ModelConstants; @@ -120,8 +120,10 @@ public class GenerateRAiDActionSetJob { qualifier("main title", "main title", DNET_DATACITE_TITLE, DNET_DATACITE_TITLE), RAID_DATA_INFO))); orp.setDescription(listFields(RAID_DATA_INFO, r.getSummary())); -// orp.setAuthor(createAuthors(r.getAuthors())); - orp.setInstance(Collections.singletonList(eu.dnetlib.dhp.actionmanager.Constants.getInstance(RAID_QUALIFIER))); + + Instance instance = new Instance(); + instance.setInstancetype(RAID_QUALIFIER); + orp.setInstance(Collections.singletonList(instance)); orp .setSubject( r @@ -140,11 +142,11 @@ public class GenerateRAiDActionSetJob { Arrays .asList( structuredProperty( - r.getEndDate(), qualifier("endDate", "endDate", DNET_DATACITE_DATE, DNET_DATACITE_DATE), + r.getEndDate(), qualifier(END_DATE, END_DATE, DNET_DATACITE_DATE, DNET_DATACITE_DATE), 
RAID_DATA_INFO), structuredProperty( r.getStartDate(), - qualifier("startDate", "startDate", DNET_DATACITE_DATE, DNET_DATACITE_DATE), + qualifier(START_DATE, START_DATE, DNET_DATACITE_DATE, DNET_DATACITE_DATE), RAID_DATA_INFO))); orp.setLastupdatetimestamp(now.getTime()); orp.setDateofacceptance(field(r.getStartDate(), RAID_DATA_INFO)); @@ -159,11 +161,7 @@ public class GenerateRAiDActionSetJob { ModelConstants.RESULT_RESULT, PART, HAS_PART, - RAID_COLLECTED_FROM, - RAID_DATA_INFO, - now.getTime(), - null, - null); + orp); Relation rel2 = OafMapperUtils .getRelation( resultId, @@ -171,11 +169,7 @@ public class GenerateRAiDActionSetJob { ModelConstants.RESULT_RESULT, PART, IS_PART_OF, - RAID_COLLECTED_FROM, - RAID_DATA_INFO, - now.getTime(), - null, - null); + orp); res.add(new AtomicAction<>(Relation.class, rel1)); res.add(new AtomicAction<>(Relation.class, rel2)); } @@ -184,7 +178,7 @@ public class GenerateRAiDActionSetJob { } public static String calculateOpenaireId(final String raid) { - return String.format("50|%s::%s", Constants.RAID_NS_PREFIX, DHPUtils.md5(raid)); + return String.format("50|%s::%s", RAID_NS_PREFIX, DHPUtils.md5(raid)); } public static List createAuthors(final List author) { @@ -204,7 +198,6 @@ public class GenerateRAiDActionSetJob { .json(path) .as(Encoders.bean(RAiDEntity.class)) .toJavaRDD(); - } } From 8a5ba8df45d6fb1b570853307fb99f465d0667f5 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 6 Dec 2024 10:03:11 +0100 Subject: [PATCH 102/111] minor changes --- .../main/java/eu/dnetlib/dhp/actionmanager/Constants.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java index 394cc22a3..d7ad7fcb9 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/Constants.java @@ -112,12 +112,6 @@ public class Constants { } - public static Instance getInstance(Qualifier qualifier) { - Instance instance = new Instance(); - instance.setInstancetype(qualifier); - return instance; - } - public static void removeOutputDir(SparkSession spark, String path) { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } From ee84db7a6a5ff1e271f5d6a0cfdfcc07e1a59ac9 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Fri, 6 Dec 2024 12:20:13 +0100 Subject: [PATCH 103/111] [communityfromsemrelpropagation] added filtering to remove the deletedbyinference and invisible results --- .../PrepareResultCommunitySetStep1.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java index 5af2bf481..764390442 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java @@ -70,7 +70,7 @@ public class PrepareResultCommunitySetStep1 { private static final String RESULT_WITH_CONTEXT = "select id, collect_set(co.id) community_context \n" + " from result " + " lateral view explode (context) c as co " + - " where 
datainfo.deletedbyinference = false AND lower(co.id) IN %s" + + " where lower(co.id) IN %s" + " group by id"; private static final String RESULT_PATENT = "select id " + @@ -160,7 +160,8 @@ public class PrepareResultCommunitySetStep1 { Dataset relation = readPath(spark, inputRelationPath, Relation.class); relation.createOrReplaceTempView("relation"); - Dataset result = readPath(spark, inputResultPath, resultClazz); + Dataset result = readPath(spark, inputResultPath, resultClazz) + .where("datainfo.deletedbyinference != true AND datainfo.invisible != true"); result.createOrReplaceTempView("result"); final String outputResultPath = outputPath + "/" + resultType; From 666155bafaa04db7153c98d91b911d88e9cb2f01 Mon Sep 17 00:00:00 2001 From: Miriam Baglioni Date: Fri, 6 Dec 2024 12:26:41 +0100 Subject: [PATCH 104/111] [communityfromsemrelpropagation] changed resource to have deletedbyinference = false. --- .../graph/publication/part-00000.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/graph/publication/part-00000.json b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/graph/publication/part-00000.json index 7957bcfd5..ae9083c65 100644 --- a/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/graph/publication/part-00000.json +++ b/dhp-workflows/dhp-enrichment/src/test/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/graph/publication/part-00000.json @@ -7,7 +7,7 @@ {"author":[{"fullname":"Turkia, Heidi","name":"Heidi","pid":[],"rank":1,"surname":"Turkia"},{"fullname":"Sirén, Heli","name":"Heli","pid":[],"rank":2,"surname":"Sirén"},{"fullname":"Penttilä, Merja","name":"Merja","pid":[],"rank":3,"surname":"Penttilä"},{"fullname":"Pitkänen, Juha Pekka","name":"Juha Pekka","pid":[],"rank":4,"surname":"Pitkänen"}],"bestaccessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2013-02-22"},"dateofcollection":"2022-02-28T12:29:51.291Z","dateoftransformation":"2022-02-28T16:19:35.201Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"

Hydrolysates of lignocellulosic biomass, used as substrates for the sustainable production of fuels and chemicals often contain high amounts of phenolic compounds inhibiting the production microbiota. Quantification of these inhibitor compounds may help to understand possible difficulties in bioprocessing and further the development of more efficient, robust and tolerable processes. A separation method based on capillary electrophoresis with UV detection was developed for the simultaneous quantification of 10 phenolic compounds that may have inhibitor properties. Intraday relative standard deviations were less than 0.7% for migration times and between 2.6% and 6.4% for peak areas. Interday relative standard deviations were less than 3.0% for migration times and between 5.0% and 7.2% for peak areas. The method was applied to demonstrate that Saccharomyces cerevisiae was able to decrease the concentrations of vanillin, coniferyl aldehyde, syringaldehyde, acetoguaiacone and cinnamic acid during the cultivation, whereas the concentrations of phenols increased.

"}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|355e65625b88::6c232359e3b3165574cb88f0554d9264","instance":[{"accessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1016/j.chroma.2013.01.004"}],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2013-02-22"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/91f411d0-d8f8-4bf1-9072-345303cc776c"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505708387,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2022-01-29T08:16:51Z","harvestDate":"2022-02-28T12:29:51.291Z","identifier":"oai:cris.vtt.fi:publications/91f411d0-d8f8-4bf1-9072-345303cc776c","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["oai:cris.vtt.fi:publications/91f411d0-d8f8-4bf1-9072-345303cc776c","50|355e65625b88::6c232359e3b3165574cb88f0554d9264"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Turkia , H , Sirén , H , Penttilä , M & Pitkänen , J P 2013 , ' Capillary electrophoresis for the monitoring of phenolic compounds in bioprocesses ' , Journal of Chromatography A , vol. 1278 , pp. 175-180 . 
https://doi.org/10.1016/j.chroma.2013.01.004"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"Bioprocess monitoring"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"Capillary electrophoresis"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"Phenolic compounds"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Capillary electrophoresis for the monitoring of phenolic compounds in bioprocesses"}]} {"author":[{"fullname":"Veijalainen, Jari","name":"Jari","pid":[],"rank":1,"surname":"Veijalainen"},{"fullname":"Wolski, Antoni","name":"Antoni","pid":[],"rank":2,"surname":"Wolski"}],"bestaccessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[{"dataInfo": null,"id": "dariah"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1991-01-01"},"dateofcollection":"2022-02-28T12:33:57.005Z","dateoftransformation":"2022-02-28T16:33:35.101Z","description":[],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|355e65625b88::74009c567c81b4aa55c813db658734df","instance":[{"accessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information 
System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1991-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0002","classname":"Book","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/bb17c77a-f574-4921-a5cb-32dc1f283fa3"]},{"accessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1991-01-01"},"distributionlocation":"","hostedby":{"key":"10|eurocrisdris::fe4903425d9040f680d8610d9079ea14","value":"VTT Research Information System"},"instancetype":{"classid":"0002","classname":"Book","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/bb17c77a-f574-4921-a5cb-32dc1f283fa3"]}, {"accessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1991-01-01"},"distributionlocation":"","hostedby":{"key":"10|opendoar____::f0dd4a99fba6075a9494772b58f95280","value":"VTT Research Information 
System"},"instancetype":{"classid":"0002","classname":"Book","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/bb17c77a-f574-4921-a5cb-32dc1f283fa3"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505716994,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2020-12-21T07:05:54Z","harvestDate":"2022-02-28T12:33:57.005Z","identifier":"oai:cris.vtt.fi:publications/bb17c77a-f574-4921-a5cb-32dc1f283fa3","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|355e65625b88::74009c567c81b4aa55c813db658734df","oai:cris.vtt.fi:publications/bb17c77a-f574-4921-a5cb-32dc1f283fa3"],"pid":[],"publisher":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"VTT Technical Research Centre of Finland"},"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Veijalainen , J & Wolski , A 1991 , Prepare and commit certification for decentralized transaction management in rigorous multidatabases : Research Report No. J-1 . VTT Technical Research Centre of Finland ."}],"subject":[],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Prepare and commit certification for decentralized transaction management in rigorous multidatabases:Research Report No. 
J-1"}]} {"author":[{"fullname":"Hanhijärvi, Antti","name":"Antti","pid":[],"rank":1,"surname":"Hanhijärvi"},{"fullname":"Hukka, A.","name":"A.","pid":[],"rank":2,"surname":"Hukka"},{"fullname":"Paajanen, T.","name":"T.","pid":[],"rank":3,"surname":"Paajanen"},{"fullname":"Pulkkinen, P.","name":"P.","pid":[],"rank":4,"surname":"Pulkkinen"},{"fullname":"Sundman, S.","name":"S.","pid":[],"rank":5,"surname":"Sundman"}],"bestaccessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2003-01-01"},"dateofcollection":"2022-02-28T12:32:33.974Z","dateoftransformation":"2022-02-28T17:38:24.191Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"The paper presents experimental results of jet drying tests of birch (Betula pendula) and spruce (Picea abies) veneers at temperatures of 140°C and 190°C. Results include drying rates for 1.5 mm thick birch veneers as well as 1.5 mm and 2.6 mm thick spruce veneers of both heartwood and sapwood. Based on the test results, material parameter values for a simplified drying model are assessed. The model is based on the use of an effective diffusion coefficient and an effective surface emission coefficient, which values are calibrated to fit to the experimental results. 
It is observed, that separate model parameter sets are needed for the two different species but also for occurrence of heartwood or sapwood (spruce) and different thickness values of veneers."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|dedup_wf_001::08d6f2001319c86d0e69b0f83ad75df2","instance":[{"accessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1007/s00107-003-0379-4"}],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2003-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/8cc91100-904f-43c5-bb3d-1cc3e0f4a4b5"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505760180,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2022-01-31T21:18:51Z","harvestDate":"2022-02-28T12:32:33.974Z","identifier":"oai:cris.vtt.fi:publications/8cc91100-904f-43c5-bb3d-1cc3e0f4a4b5","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|355e65625b88::9cb10895b4a92b0215b85acb2c3268b9","oai:cris.vtt.fi:publications/8cc91100-904f-43c5-bb3d-1cc3e0f4a4b5"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Hanhijärvi , A , Hukka , A , Paajanen , T , Pulkkinen , P & Sundman , S 2003 , ' Experimental investigation of jet drying of birch and spruce veneers and modelling with a simplified approach ' , Holz als Roh- und Werkstoff , vol. 61 , no. 2 , pp. 83-88 . 
https://doi.org/10.1007/s00107-003-0379-4"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"jet drying"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"drying"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"veneers"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"birch"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"spruce"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"heartwood"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"sapwood"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Experimental investigation of jet drying of birch and spruce veneers and modelling with a simplified approach"}]} 
-{"author":[{"fullname":"Vainonen-Ahlgren, Elizaveta","name":"Elizaveta","pid":[],"rank":1,"surname":"Vainonen-Ahlgren"},{"fullname":"Likonen, Jari","name":"Jari","pid":[],"rank":2,"surname":"Likonen"},{"fullname":"Renvall,","pid":[],"rank":3},{"fullname":"Rohde, V.","name":"V.","pid":[],"rank":4,"surname":"Rohde"},{"fullname":"Mayer, M.","name":"M.","pid":[],"rank":5,"surname":"Mayer"}],"bestaccessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2007-01-01"},"dateofcollection":"2022-02-28T12:32:50.667Z","dateoftransformation":"2022-02-28T17:49:49.964Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"To investigate material transport in scrape-off layer plasma and long term deposition in divertor, 13CH4 was puffed at the end of 2004 and 2005 experimental campaigns into ASDEX Upgrade from the outer mid-plane.
Ex situ analyses of the tiles were performed by secondary ion mass spectrometry.
The peaks of 13C were detected below the bottom inner strike point and at the horizontal tile at the outer lower divertor. It was detected ∼21% of the total puffed 13C amount.
The deposition rate for carbon by plasma was also calculated in long term experiment. It was obtained to be 22 × 10−3 and 8.7 × 10−3 g/s for the upper (campaign 2004) and lower (campaign 2003) divertors, respectively."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783","instance":[{"accessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1016/j.jnucmat.2007.01.026"}],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2007-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0001","classname":"Patent","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/2472b21e-1fdc-4121-946e-e9c8fae6d02d"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505766149,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2022-02-01T02:35:05Z","harvestDate":"2022-02-28T12:32:50.667Z","identifier":"oai:cris.vtt.fi:publications/2472b21e-1fdc-4121-946e-e9c8fae6d02d","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|355e65625b88::a29614444f5030f11e75c6c27264d272","oai:cris.vtt.fi:publications/2472b21e-1fdc-4121-946e-e9c8fae6d02d"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Vainonen-Ahlgren , E , Likonen , J , Renvall , Rohde , V & Mayer , M 2007 , ' Migration of 13C and deposition at ASDEX Upgrade ' , Journal of Nuclear Materials , vol. 363-365 , pp. 270-275 . 
https://doi.org/10.1016/j.jnucmat.2007.01.026"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"ASDEX upgrade"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"divertor"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"carbon based materials"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"erosion"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"deposition"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"ITER"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"JET"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"plasma"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction"
:{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"fusion energy"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Migration of 13C and deposition at ASDEX Upgrade"}]} +{"author":[{"fullname":"Vainonen-Ahlgren, Elizaveta","name":"Elizaveta","pid":[],"rank":1,"surname":"Vainonen-Ahlgren"},{"fullname":"Likonen, Jari","name":"Jari","pid":[],"rank":2,"surname":"Likonen"},{"fullname":"Renvall,","pid":[],"rank":3},{"fullname":"Rohde, V.","name":"V.","pid":[],"rank":4,"surname":"Rohde"},{"fullname":"Mayer, M.","name":"M.","pid":[],"rank":5,"surname":"Mayer"}],"bestaccessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2007-01-01"},"dateofcollection":"2022-02-28T12:32:50.667Z","dateoftransformation":"2022-02-28T17:49:49.964Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"To investigate material transport in scrape-off layer plasma and long term deposition in divertor, 13CH4 was puffed at the end of 2004 and 2005 experimental campaigns into ASDEX Upgrade from the outer mid-plane.
Ex situ analyses of the tiles were performed by secondary ion mass spectrometry.
The peaks of 13C were detected below the bottom inner strike point and at the horizontal tile at the outer lower divertor. It was detected ∼21% of the total puffed 13C amount.
The deposition rate for carbon by plasma was also calculated in long term experiment. It was obtained to be 22 × 10−3 and 8.7 × 10−3 g/s for the upper (campaign 2004) and lower (campaign 2003) divertors, respectively."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783","instance":[{"accessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1016/j.jnucmat.2007.01.026"}],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2007-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0001","classname":"Patent","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/2472b21e-1fdc-4121-946e-e9c8fae6d02d"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505766149,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2022-02-01T02:35:05Z","harvestDate":"2022-02-28T12:32:50.667Z","identifier":"oai:cris.vtt.fi:publications/2472b21e-1fdc-4121-946e-e9c8fae6d02d","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|355e65625b88::a29614444f5030f11e75c6c27264d272","oai:cris.vtt.fi:publications/2472b21e-1fdc-4121-946e-e9c8fae6d02d"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Vainonen-Ahlgren , E , Likonen , J , Renvall , Rohde , V & Mayer , M 2007 , ' Migration of 13C and deposition at ASDEX Upgrade ' , Journal of Nuclear Materials , vol. 363-365 , pp. 270-275 . 
https://doi.org/10.1016/j.jnucmat.2007.01.026"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"ASDEX upgrade"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"divertor"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"carbon based materials"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"erosion"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"deposition"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"ITER"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"JET"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"plasma"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction"
:{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"fusion energy"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Migration of 13C and deposition at ASDEX Upgrade"}]} {"author":[{"fullname":"Aalto, Timo","name":"Timo","pid":[],"rank":1,"surname":"Aalto"},{"fullname":"Harjanne, Mikko","name":"Mikko","pid":[],"rank":2,"surname":"Harjanne"},{"fullname":"Kapulainen, Markku","name":"Markku","pid":[],"rank":3,"surname":"Kapulainen"}],"bestaccessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[{"dataInfo": null,"id": "beopen"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2003-01-01"},"dateofcollection":"2022-02-28T12:32:37.581Z","dateoftransformation":"2022-02-28T19:39:18.717Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"A novel method has been developed for measuring the rotational angle of a fiber's or a waveguide's polarization axis with respect to a reference angle. The reference angle is the polarization axis of the measuring device. The method also gives the true polarization extinction ratio of the measured fiber or waveguide. The method is suitable for the characterization and rotational alignment of polarization-maintaining waveguides and fibers. In particular, the method can be used to rotationally align the fiber-waveguide interconnections during waveguide characterization. The measuring device is either a linear polarizer or a polarization splitter that is accurately rotated with respect to the device under test. 
According to the experiments with a polarization-maintaining fiber, the method is very easy and inexpensive to implement, and the angular accuracy can be better than 0.2 deg."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|openorgs____::64badd35233ba2cd4946368ef2f4cf57","instance":[{"accessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1117/1.1600730"}],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2003-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/8cd538fb-6484-4655-81dd-47348d358fd4"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505829230,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2022-01-31T21:47:37Z","harvestDate":"2022-02-28T12:32:37.581Z","identifier":"oai:cris.vtt.fi:publications/8cd538fb-6484-4655-81dd-47348d358fd4","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|355e65625b88::df0143af011fd82af8ac2d07b03ee8cd","oai:cris.vtt.fi:publications/8cd538fb-6484-4655-81dd-47348d358fd4"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Aalto , T , Harjanne , M & Kapulainen , M 2003 , ' Method for the rotational alignment of polarization-maintaining optical fibers and waveguides ' , Optical Engineering , vol. 42 , no. 10 , pp. 2861-2867 . 
https://doi.org/10.1117/1.1600730"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"optical waveguide"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarization-maintaining fiber"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"characterization"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"fiber-waveguide coupling"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarization"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarization axis"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarizer"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarization 
splitter"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Method for the rotational alignment of polarization-maintaining optical fibers and waveguides"}]} {"author":[{"fullname":"Aalto, Timo","name":"Timo","pid":[],"rank":1,"surname":"Aalto"},{"fullname":"Harjanne, Mikko","name":"Mikko","pid":[],"rank":2,"surname":"Harjanne"},{"fullname":"Kapulainen, Markku","name":"Markku","pid":[],"rank":3,"surname":"Kapulainen"}],"bestaccessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"}],"context":[{"dataInfo": null,"id": "beopen"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2003-01-01"},"dateofcollection":"2022-02-28T12:32:37.581Z","dateoftransformation":"2022-02-28T19:39:18.717Z","description":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"A novel method has been developed for measuring the rotational angle of a fiber's or a waveguide's polarization axis with respect to a reference angle. The reference angle is the polarization axis of the measuring device. The method also gives the true polarization extinction ratio of the measured fiber or waveguide. The method is suitable for the characterization and rotational alignment of polarization-maintaining waveguides and fibers. In particular, the method can be used to rotationally align the fiber-waveguide interconnections during waveguide characterization. The measuring device is either a linear polarizer or a polarization splitter that is accurately rotated with respect to the device under test. 
According to the experiments with a polarization-maintaining fiber, the method is very easy and inexpensive to implement, and the angular accuracy can be better than 0.2 deg."}],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|openorgs____::64badd35233ba2cd4946368ef2f4cf57","instance":[{"accessright":{"classid":"CLOSED","classname":"Closed Access","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"doi","classname":"Digital Object Identifier","schemeid":"dnet:pid_types","schemename":"dnet:pid_types"},"value":"10.1117/1.1600730"}],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"2003-01-01"},"distributionlocation":"","hostedby":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/8cd538fb-6484-4655-81dd-47348d358fd4"]}],"language":{"classid":"eng","classname":"English","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505829230,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2022-01-31T21:47:37Z","harvestDate":"2022-02-28T12:32:37.581Z","identifier":"oai:cris.vtt.fi:publications/8cd538fb-6484-4655-81dd-47348d358fd4","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["50|355e65625b88::df0143af011fd82af8ac2d07b03ee8cd","oai:cris.vtt.fi:publications/8cd538fb-6484-4655-81dd-47348d358fd4"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Aalto , T , Harjanne , M & Kapulainen , M 2003 , ' Method for the rotational alignment of polarization-maintaining optical fibers and waveguides ' , Optical Engineering , vol. 42 , no. 10 , pp. 2861-2867 . 
https://doi.org/10.1117/1.1600730"}],"subject":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"optical waveguide"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarization-maintaining fiber"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"characterization"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"fiber-waveguide coupling"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarization"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarization axis"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarizer"},{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"keyword","classname":"keyword","schemeid":"dnet:result_subject","schemename":"dnet:result_subject"},"value":"polarization 
splitter"}],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Method for the rotational alignment of polarization-maintaining optical fibers and waveguides"}]} {"author":[{"fullname":"Penttilä, Raimo","name":"Raimo","pid":[],"rank":1,"surname":"Penttilä"},{"fullname":"Vanttaja, Ilkka","name":"Ilkka","pid":[],"rank":2,"surname":"Vanttaja"},{"fullname":"Haapamäki, Petteri","name":"Petteri","pid":[],"rank":3,"surname":"Haapamäki"},{"fullname":"Kujanpää, Veli","name":"Veli","pid":[],"rank":4,"surname":"Kujanpää"}],"bestaccessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"collectedfrom":[{"key":"10|opendoar____::f0dd4a99fba6075a9494772b58f95280","value":"VTT Research Information System"}],"context": [{"dataInfo":null, "id":"dh-ch"}],"contributor":[],"country":[],"coverage":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1994-01-01"},"dateofcollection":"2022-02-28T12:35:26.769Z","dateoftransformation":"2022-02-28T19:54:10.494Z","description":[],"externalReference":[],"extraInfo":[],"format":[],"fulltext":[],"id":"50|355e65625b88::e7d48a470b13bda61f7ebe3513e20cb6","instance":[{"accessright":{"classid":"RESTRICTED","classname":"Restricted","schemeid":"dnet:access_modes","schemename":"dnet:access_modes"},"alternateIdentifier":[],"collectedfrom":{"key":"10|openaire____::4692342f0992d91f9e705c26959f09e0","value":"VTT Research Information System"},"dateofacceptance":{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"1994-01-01"},"distributionlocation":"","hostedby":{"key":"10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539","value":"VTT Research Information 
System"},"instancetype":{"classid":"0001","classname":"Article","schemeid":"dnet:publication_resource","schemename":"dnet:publication_resource"},"pid":[],"refereed":{"classid":"0000","classname":"UNKNOWN","schemeid":"dnet:review_levels","schemename":"dnet:review_levels"},"url":["https://cris.vtt.fi/en/publications/ddbd28ea-5fbf-43e1-896f-e69856870c26"]}],"language":{"classid":"fin","classname":"Finnish","schemeid":"dnet:languages","schemename":"dnet:languages"},"lastupdatetimestamp":1646505838552,"oaiprovenance":{"originDescription":{"altered":true,"baseURL":"https%3A%2F%2Fcris.vtt.fi%2Fws%2Foai","datestamp":"2019-08-08T07:09:42Z","harvestDate":"2022-02-28T12:35:26.769Z","identifier":"oai:cris.vtt.fi:publications/ddbd28ea-5fbf-43e1-896f-e69856870c26","metadataNamespace":"http://www.openarchives.org/OAI/2.0/oai_dc/"}},"originalId":["oai:cris.vtt.fi:publications/ddbd28ea-5fbf-43e1-896f-e69856870c26","50|355e65625b88::e7d48a470b13bda61f7ebe3513e20cb6"],"pid":[],"relevantdate":[],"resourcetype":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:dataCite_resource","schemename":"dnet:dataCite_resource"},"resulttype":{"classid":"publication","classname":"publication","schemeid":"dnet:result_typologies","schemename":"dnet:result_typologies"},"source":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"value":"Penttilä , R , Vanttaja , I , Haapamäki , P & Kujanpää , V 1994 , ' Liimauksen ja puristusliittämisen yhdistämisellä lisää lujuutta, jäykkyyttä ja tiiveyttä ' , Ohutlevyuutiset , no. 2 , pp. 17-19 ."}],"subject":[],"title":[{"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"sysimport:crosswalk:repository","classname":"Harvested","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"qualifier":{"classid":"main title","classname":"main title","schemeid":"dnet:dataCite_title","schemename":"dnet:dataCite_title"},"value":"Liimauksen ja puristusliittämisen yhdistämisellä lisää lujuutta, jäykkyyttä ja tiiveyttä"}]} \ No newline at end of file From e4b814b3f1bdb3ab52a439aff82aa56359a80cb7 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 6 Dec 2024 13:58:39 +0100 Subject: [PATCH 105/111] code formatting --- .../dhp/collection/CollectorWorker.java | 2 +- .../collection/crossref/Crossref2Oaf.scala | 13 ++-- .../crossref/CrossrefMappingTest.scala | 18 +++-- .../PrepareResultCommunitySetStep1.java | 76 ++++++++++--------- .../PrepareResultCommunitySetStep2.java | 2 +- .../ResultToCommunityJobTest.java | 70 ++++++++--------- 6 files changed, 97 insertions(+), 84 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java index 4c6d0653e..67966d523 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/CollectorWorker.java @@ -7,7 +7,6 @@ import java.io.IOException; import java.util.Optional; import java.util.concurrent.atomic.AtomicInteger; -import eu.dnetlib.dhp.collection.plugin.zenodo.CollectZenodoDumpCollectorPlugin; import org.apache.hadoop.fs.FileSystem; import 
org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; @@ -29,6 +28,7 @@ import eu.dnetlib.dhp.collection.plugin.mongodb.MongoDbDumpCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.oai.OaiCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.osf.OsfPreprintsCollectorPlugin; import eu.dnetlib.dhp.collection.plugin.rest.RestCollectorPlugin; +import eu.dnetlib.dhp.collection.plugin.zenodo.CollectZenodoDumpCollectorPlugin; import eu.dnetlib.dhp.common.aggregation.AggregatorReport; import eu.dnetlib.dhp.common.collection.CollectorException; import eu.dnetlib.dhp.common.collection.HttpClientParams; diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala index c72b366a0..ea2177497 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala @@ -503,7 +503,6 @@ case object Crossref2Oaf { ) } - if (doi.startsWith("10.3410") || doi.startsWith("10.12703")) instance.setHostedby( OafMapperUtils.keyValue(OafMapperUtils.createOpenaireId(10, "openaire____::H1Connect", true), "H1Connect") @@ -556,14 +555,18 @@ case object Crossref2Oaf { result } - - def generateIdentifier(oaf: Result, doi: String): String = { val id = DHPUtils.md5(doi.toLowerCase) s"50|doiboost____|$id" } - private def generateAuthor(given: String, family: String, orcid: String, index: Int, affiliation: Option[List[mappingAffiliation]]): Author = { + private def generateAuthor( + given: String, + family: String, + orcid: String, + index: Int, + affiliation: Option[List[mappingAffiliation]] + ): Author = { val a = new Author a.setName(given) a.setSurname(family) @@ -700,7 +703,6 @@ case object Crossref2Oaf { if (objectType == null) return resultList - // If the item has a relations is-review-of, then we force it to a peer-review val is_review = json \ "relation" \ "is-review-of" \ "id" var force_to_review = false @@ -713,7 +715,6 @@ case object Crossref2Oaf { if (typology == null) return List() - val result = generateItemFromType(typology._2) if (result == null) return List() diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala index 12ca14ba1..ebe247d8a 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/collection/crossref/CrossrefMappingTest.scala @@ -28,17 +28,21 @@ class CrossrefMappingTest extends AbstractVocabularyTest { val input = IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/issn_pub.json"), "utf-8") - Crossref2Oaf.convert(input, vocabularies, TransformationType.All).foreach(record => { - Assertions.assertNotNull(record) - }) + Crossref2Oaf + .convert(input, vocabularies, TransformationType.All) + .foreach(record => { + Assertions.assertNotNull(record) + }) } - @Test def mappingAffiliation(): Unit = { val input = - IOUtils.toString(getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/affiliationTest.json"), "utf-8") + IOUtils.toString( + getClass.getResourceAsStream("/eu/dnetlib/dhp/collection/crossref/affiliationTest.json"), + "utf-8" + ) val data = 
Crossref2Oaf.convert(input, vocabularies, TransformationType.OnlyResult) data.foreach(record => { Assertions.assertNotNull(record) @@ -46,10 +50,10 @@ class CrossrefMappingTest extends AbstractVocabularyTest { val publication = record.asInstanceOf[Publication] publication.getAuthor.asScala.foreach(author => { Assertions.assertNotNull(author.getRawAffiliationString) - Assertions.assertTrue(author.getRawAffiliationString.size()>0) + Assertions.assertTrue(author.getRawAffiliationString.size() > 0) - }) }) + }) println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(data.head)) } } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java index 764390442..ecb7cc827 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java @@ -1,16 +1,16 @@ package eu.dnetlib.dhp.resulttocommunityfromsemrel; +import static java.lang.String.join; + import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; -import static java.lang.String.join; import java.io.IOException; import java.util.Arrays; import java.util.Collections; import java.util.List; -import eu.dnetlib.dhp.schema.common.ModelConstants; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.sql.*; @@ -22,6 +22,7 @@ import com.google.gson.Gson; import eu.dnetlib.dhp.api.Utils; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; +import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.utils.ISLookupClientFactory; @@ -37,8 +38,7 @@ public class PrepareResultCommunitySetStep1 { * relation */ // TODO - private static final String RESULT_CONTEXT_QUERY_TEMPLATE = - "select target resultId, community_context " + private static final String RESULT_CONTEXT_QUERY_TEMPLATE = "select target resultId, community_context " + "from (select id, collect_set(co.id) community_context " + " from result " + " lateral view explode (context) c as co " @@ -60,26 +60,26 @@ public class PrepareResultCommunitySetStep1 { + "where length(co) > 0 " + "group by resultId"; - private static final String RESULT_CONTEXT_QUERY_TEMPLATE_IS_RELATED_TO = - "select target as resultId, community_context " + - "from resultWithContext rwc " + - "join relatedToRelations r " + - "join patents p " + - "on rwc.id = r.source and r.target = p.id"; + private static final String RESULT_CONTEXT_QUERY_TEMPLATE_IS_RELATED_TO = "select target as resultId, community_context " + + + "from resultWithContext rwc " + + "join relatedToRelations r " + + "join patents p " + + "on rwc.id = r.source and r.target = p.id"; private static final String RESULT_WITH_CONTEXT = "select id, collect_set(co.id) community_context \n" + - " from result " + - " lateral view explode (context) c as co " + - " where lower(co.id) IN %s" + - " group by id"; + " from result " + + " lateral view explode (context) c as co " + + " where lower(co.id) IN %s" + + " group by id"; private static final String RESULT_PATENT = "select id 
" + - " from result " + - " where array_contains(instance.instancetype.classname, 'Patent')"; + " from result " + + " where array_contains(instance.instancetype.classname, 'Patent')"; private static final String IS_RELATED_TO_RELATIONS = "select source, target " + - " from relation " + - " where lower(relClass) = 'isrelatedto' and datainfo.deletedbyinference = false"; + " from relation " + + " where lower(relClass) = 'isrelatedto' and datainfo.deletedbyinference = false"; public static void main(String[] args) throws Exception { String jsonConfiguration = IOUtils @@ -107,17 +107,25 @@ public class PrepareResultCommunitySetStep1 { SparkConf conf = new SparkConf(); conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); - final String allowedsemrel ="(" + join(",", - Arrays.asList(parser.get("allowedsemrels").split(";")).stream().map(value -> "'" + value.toLowerCase() + "'") - .toArray(String[]::new)) + ")"; + final String allowedsemrel = "(" + join( + ",", + Arrays + .asList(parser.get("allowedsemrels").split(";")) + .stream() + .map(value -> "'" + value.toLowerCase() + "'") + .toArray(String[]::new)) + + ")"; log.info("allowedSemRel: {}", allowedsemrel); final String baseURL = parser.get("baseURL"); log.info("baseURL: {}", baseURL); - final String communityIdList = "(" + join(",", getCommunityList(baseURL).stream() + final String communityIdList = "(" + join( + ",", getCommunityList(baseURL) + .stream() .map(value -> "'" + value.toLowerCase() + "'") - .toArray(String[]::new)) + ")"; + .toArray(String[]::new)) + + ")"; final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase(); log.info("resultType: {}", resultType); @@ -161,18 +169,17 @@ public class PrepareResultCommunitySetStep1 { relation.createOrReplaceTempView("relation"); Dataset result = readPath(spark, inputResultPath, resultClazz) - .where("datainfo.deletedbyinference != true AND datainfo.invisible != true"); + .where("datainfo.deletedbyinference != true AND datainfo.invisible != true"); result.createOrReplaceTempView("result"); final String outputResultPath = outputPath + "/" + resultType; log.info("writing output results to: {}", outputResultPath); - String resultContextQuery = String .format( RESULT_CONTEXT_QUERY_TEMPLATE, - "AND lower(co.id) IN " + communityIdList, - "AND lower(relClass) IN " + allowedsemrel); + "AND lower(co.id) IN " + communityIdList, + "AND lower(relClass) IN " + allowedsemrel); Dataset result_context = spark.sql(resultContextQuery); Dataset rwc = spark.sql(String.format(RESULT_WITH_CONTEXT, communityIdList)); @@ -183,18 +190,17 @@ public class PrepareResultCommunitySetStep1 { patents.createOrReplaceTempView("patents"); relatedToRelations.createOrReplaceTempView("relatedTorelations"); - - result_context = result_context.unionAll( spark.sql(RESULT_CONTEXT_QUERY_TEMPLATE_IS_RELATED_TO)); + result_context = result_context.unionAll(spark.sql(RESULT_CONTEXT_QUERY_TEMPLATE_IS_RELATED_TO)); result_context.createOrReplaceTempView("result_context"); spark - .sql(RESULT_COMMUNITY_LIST_QUERY) - .as(Encoders.bean(ResultCommunityList.class)) - .write() - .option("compression", "gzip") - .mode(SaveMode.Append) - .json(outputResultPath); + .sql(RESULT_COMMUNITY_LIST_QUERY) + .as(Encoders.bean(ResultCommunityList.class)) + .write() + .option("compression", "gzip") + .mode(SaveMode.Append) + .json(outputResultPath); } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java index 9bebc36e5..9801b1bf6 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java @@ -77,7 +77,7 @@ public class PrepareResultCommunitySetStep2 { if (b == null) { return a; } - Set community_set = new HashSet<>(a.getCommunityList()); + Set community_set = new HashSet<>(a.getCommunityList()); community_set.addAll(b.getCommunityList()); a.setCommunityList(new ArrayList<>(community_set)); return a; diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java index c1fcff4d9..2b52c91de 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/ResultToCommunityJobTest.java @@ -10,7 +10,6 @@ import java.util.ArrayList; import java.util.List; import java.util.stream.Collectors; -import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; import org.apache.commons.io.FileUtils; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; @@ -27,6 +26,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; import eu.dnetlib.dhp.schema.oaf.Dataset; import scala.collection.Seq; @@ -279,53 +279,55 @@ public class ResultToCommunityJobTest { @Test public void prepareStep1Test() throws Exception { /* - - - final String allowedsemrel = join(",", Arrays.stream(parser.get("allowedsemrels").split(";")) - .map(value -> "'" + value.toLowerCase() + "'") - .toArray(String[]::new)); - - log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel)); - - final String baseURL = parser.get("baseURL"); - log.info("baseURL: {}", baseURL); + * final String allowedsemrel = join(",", Arrays.stream(parser.get("allowedsemrels").split(";")) .map(value -> + * "'" + value.toLowerCase() + "'") .toArray(String[]::new)); log.info("allowedSemRel: {}", new + * Gson().toJson(allowedsemrel)); final String baseURL = parser.get("baseURL"); log.info("baseURL: {}", + * baseURL); */ PrepareResultCommunitySetStep1 - .main( - new String[] { - "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-sourcePath", getClass() - .getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/graph") - .getPath(), - "-hive_metastore_uris", "", - "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", - "-outputPath", workingDir.toString() + "/preparedInfo", - "-allowedsemrels","issupplementto;issupplementedby", - "-baseURL","https://dev-openaire.d4science.org/openaire/community/" - }); + .main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-sourcePath", getClass() + .getResource("/eu/dnetlib/dhp/resulttocommunityfromsemrel/graph") + .getPath(), + "-hive_metastore_uris", "", + "-resultTableName", "eu.dnetlib.dhp.schema.oaf.Publication", + "-outputPath", workingDir.toString() + "/preparedInfo", + "-allowedsemrels", "issupplementto;issupplementedby", + "-baseURL", 
"https://dev-openaire.d4science.org/openaire/community/" + }); - - org.apache.spark.sql.Dataset resultCommunityList = spark.read().schema(Encoders.bean(ResultCommunityList.class).schema()) - .json(workingDir.toString() + "/preparedInfo/publication") - .as(Encoders.bean(ResultCommunityList.class)); + org.apache.spark.sql.Dataset resultCommunityList = spark + .read() + .schema(Encoders.bean(ResultCommunityList.class).schema()) + .json(workingDir.toString() + "/preparedInfo/publication") + .as(Encoders.bean(ResultCommunityList.class)); Assertions.assertEquals(2, resultCommunityList.count()); - Assertions.assertEquals(1,resultCommunityList.filter("resultId = '50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783'").count()); - Assertions.assertEquals(1,resultCommunityList.filter("resultId = '50|pending_org_::82f63b2d21ae88596b9d8991780e9888'").count()); + Assertions + .assertEquals( + 1, + resultCommunityList.filter("resultId = '50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783'").count()); + Assertions + .assertEquals( + 1, + resultCommunityList.filter("resultId = '50|pending_org_::82f63b2d21ae88596b9d8991780e9888'").count()); ArrayList communities = resultCommunityList - .filter("resultId = '50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783'") - .first().getCommunityList(); + .filter("resultId = '50|dedup_wf_001::06e51d2bf295531b2d2e7a1b55500783'") + .first() + .getCommunityList(); Assertions.assertEquals(2, communities.size()); Assertions.assertTrue(communities.stream().anyMatch(cid -> "beopen".equals(cid))); Assertions.assertTrue(communities.stream().anyMatch(cid -> "dh-ch".equals(cid))); communities = resultCommunityList - .filter("resultId = '50|pending_org_::82f63b2d21ae88596b9d8991780e9888'") - .first().getCommunityList(); + .filter("resultId = '50|pending_org_::82f63b2d21ae88596b9d8991780e9888'") + .first() + .getCommunityList(); Assertions.assertEquals(1, communities.size()); Assertions.assertEquals("dh-ch", communities.get(0)); } - } From 0d050061146fc8fb7dfb1f619601719b3892da82 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Fri, 6 Dec 2024 14:23:47 +0100 Subject: [PATCH 106/111] code formatted --- .../java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java index 8172456bb..1adad104e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java @@ -160,7 +160,7 @@ public class ORCIDExtractor extends Thread { } } finally { for (SequenceFile.Writer k : fileMap.values()) { - log.info("Thread {}: Completed processed {} items", id, extractedItem); + log.info("Thread {}: Completed processed {} items", id, extractedItem); k.hflush(); k.close(); } From 72fd618ebb51cc2ac07bbcda36a88e239d146f90 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 21 Oct 2024 16:42:19 +0200 Subject: [PATCH 107/111] Give a base path to FileSystem.get to resolve s3 file system --- .../main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java 
index 49021ab58..91795eb47 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkUpdateEntity.java @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.oa.dedup; import java.io.IOException; +import java.net.URI; import java.util.Map; import org.apache.commons.io.IOUtils; @@ -132,7 +133,7 @@ public class SparkUpdateEntity extends AbstractSparkAction { boolean result = false; - FileSystem fileSystem = FileSystem.get(new Configuration()); + FileSystem fileSystem = FileSystem.get(URI.create(basePath), new Configuration()); FileStatus[] fileStatuses = fileSystem.listStatus(new Path(basePath)); for (FileStatus fs : fileStatuses) { From bccf84a1a947a2858f5fa5c0fc76cee2435df470 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 21 Oct 2024 21:38:51 +0200 Subject: [PATCH 108/111] Support for empty value of master option --- .../eu/dnetlib/dhp/application/SparkScalaApplication.scala | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala index 526bbd295..09a1887b1 100644 --- a/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala +++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/application/SparkScalaApplication.scala @@ -2,6 +2,7 @@ package eu.dnetlib.dhp.application import eu.dnetlib.dhp.common.Constants import eu.dnetlib.dhp.utils.DHPUtils.writeHdfsFile +import org.apache.commons.lang3.StringUtils import scala.io.Source @@ -69,7 +70,7 @@ abstract class AbstractScalaApplication( .builder() .config(conf) .appName(getClass.getSimpleName) - if (master != null) + if (StringUtils.isNotBlank(master)) b.master(master) b.getOrCreate() } From 329b47178791923ae318614fb9aa66667acd65d9 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Wed, 23 Oct 2024 13:55:28 +0200 Subject: [PATCH 109/111] Include graph-mapper in shaded package --- dhp-shade-package/pom.xml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dhp-shade-package/pom.xml b/dhp-shade-package/pom.xml index c4f9b262e..42735f9d7 100644 --- a/dhp-shade-package/pom.xml +++ b/dhp-shade-package/pom.xml @@ -56,11 +56,11 @@ - - - - - + + eu.dnetlib.dhp + dhp-graph-mapper + ${project.version} + From 8bb31e205b142ad0daa6b6224ba17c223c72dda3 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Wed, 30 Oct 2024 11:22:04 +0100 Subject: [PATCH 110/111] New CopyEntitiesSparkJob as replacement to distcp for copying intermediate graph data --- .../dhp/oa/merge/CopyEntitiesSparkJob.java | 108 ++++++++++++++++++ .../merge/copy_graph_entities_parameters.json | 32 ++++++ 2 files changed, 140 insertions(+) create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/CopyEntitiesSparkJob.java create mode 100644 dhp-common/src/main/resources/eu/dnetlib/dhp/oa/merge/copy_graph_entities_parameters.json diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/CopyEntitiesSparkJob.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/CopyEntitiesSparkJob.java new file mode 100644 index 000000000..ba378c7ea --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/CopyEntitiesSparkJob.java @@ -0,0 +1,108 @@ + +package eu.dnetlib.dhp.oa.merge; + +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.util.Arrays; +import java.util.Optional; + +import 
org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.common.ModelSupport; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; + +/** + * Copy specified entities from a graph snapshot to another + */ +public class CopyEntitiesSparkJob { + private static final Logger log = LoggerFactory.getLogger(CopyEntitiesSparkJob.class); + + private ArgumentApplicationParser parser; + + public CopyEntitiesSparkJob(ArgumentApplicationParser parser) { + this.parser = parser; + } + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + CopyEntitiesSparkJob.class + .getResourceAsStream( + "/eu/dnetlib/dhp/oa/merge/copy_graph_entities_parameters.json")); + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + parser.parseArgument(args); + + Boolean isSparkSessionManaged = Optional + .ofNullable(parser.get("isSparkSessionManaged")) + .map(Boolean::valueOf) + .orElse(Boolean.TRUE); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + new CopyEntitiesSparkJob(parser).run(isSparkSessionManaged); + } + + public void run(Boolean isSparkSessionManaged) + throws ISLookUpException { + + String graphInputPath = parser.get("graphInputPath"); + log.info("graphInputPath: {}", graphInputPath); + + String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + String entities = parser.get("entities"); + log.info("entities: {}", entities); + + String format = parser.get("format"); + log.info("format: {}", format); + + SparkConf conf = new SparkConf(); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> { + Arrays + .stream(entities.split(",")) + .map(x -> x.trim().toLowerCase()) + .filter(ModelSupport.oafTypes::containsKey) + .forEachOrdered( + entity -> { + switch (format.toLowerCase()) { + case "text": + spark + .read() + .text(graphInputPath + "/" + entity) + .write() + .option("compression", "gzip") + .mode("overwrite") + .text(outputPath + "/" + entity); + break; + case "json": + spark + .read() + .json(graphInputPath + "/" + entity) + .write() + .option("compression", "gzip") + .mode("overwrite") + .json(outputPath + "/" + entity); + break; + case "parquet": + spark + .read() + .parquet(graphInputPath + "/" + entity) + .write() + .option("compression", "gzip") + .mode("overwrite") + .parquet(outputPath + "/" + entity); + break; + } + }); + }); + } +} diff --git a/dhp-common/src/main/resources/eu/dnetlib/dhp/oa/merge/copy_graph_entities_parameters.json b/dhp-common/src/main/resources/eu/dnetlib/dhp/oa/merge/copy_graph_entities_parameters.json new file mode 100644 index 000000000..0617228d1 --- /dev/null +++ b/dhp-common/src/main/resources/eu/dnetlib/dhp/oa/merge/copy_graph_entities_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "issm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "when true will stop SparkSession after job execution", + "paramRequired": false + }, + { + "paramName": "gin", + "paramLongName": "graphInputPath", + "paramDescription": "the input graph root path", + "paramRequired": true + }, + { + "paramName": "out", + "paramLongName": "outputPath", + "paramDescription": "the output graph root path", + "paramRequired": true + }, + { + "paramName": "ent", + "paramLongName": "entities", + "paramDescription": "the output graph root path", + 
"paramRequired": true + }, + { + "paramName": "fmt", + "paramLongName": "format", + "paramDescription": "the output graph root path", + "paramRequired": true + } +] \ No newline at end of file From 8570bba6d22ceff2fe0aa0459a26d21e1d8e82cd Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 21 Oct 2024 09:03:20 +0200 Subject: [PATCH 111/111] Import cnr-rmi-api project, rename it dnet-rmi-api and use jakarta and metro in place of javax for portability to java 17 --- .gitignore | 3 +- dhp-common/pom.xml | 30 +--- dhp-rmi-api/pom.xml | 34 +++++ .../common/rmi/APIDeprecatedException.java | 11 ++ .../eu/dnetlib/common/rmi/BaseService.java | 35 +++++ .../common/rmi/DNetRestDocumentation.java | 19 +++ .../eu/dnetlib/common/rmi/RMIException.java | 27 ++++ .../common/rmi/UnimplementedException.java | 11 ++ .../rmi/CollectionService.java | 31 ++++ .../rmi/CollectionServiceException.java | 27 ++++ .../publisher/rmi/PublisherService.java | 57 ++++++++ .../mdstore/DocumentNotFoundException.java | 30 ++++ .../dnetlib/data/mdstore/MDStoreService.java | 118 +++++++++++++++ .../data/mdstore/MDStoreServiceException.java | 33 +++++ .../rmi/ObjectPackagingException.java | 28 ++++ .../rmi/ObjectPackagingService.java | 37 +++++ .../dnetlib/enabling/dlm/rmi/DlmService.java | 17 +++ .../hcm/rmi/HostingContextManagerService.java | 19 +++ .../hnm/rmi/HostingNodeManagerService.java | 15 ++ .../ISLookUpDocumentNotFoundException.java | 47 ++++++ .../is/lookup/rmi/ISLookUpException.java | 27 ++++ .../is/lookup/rmi/ISLookUpService.java | 57 ++++++++ .../ISRegistryDocumentNotFoundException.java | 28 ++++ .../is/registry/rmi/ISRegistryException.java | 25 ++++ .../is/registry/rmi/ISRegistryService.java | 72 ++++++++++ .../enabling/is/sn/rmi/ISSNException.java | 25 ++++ .../enabling/is/sn/rmi/ISSNService.java | 121 ++++++++++++++++ .../SubscriptionRequestRejectedException.java | 21 +++ .../is/store/rmi/ISStoreException.java | 25 ++++ .../enabling/is/store/rmi/ISStoreService.java | 47 ++++++ .../resultset/rmi/ResultSetException.java | 22 +++ .../resultset/rmi/ResultSetService.java | 135 ++++++++++++++++++ dhp-shade-package/dependency-reduced-pom.xml | 113 --------------- dhp-shade-package/pom.xml | 1 - .../dhp/collection/orcid/ORCIDExtractor.java | 2 +- dhp-workflows/pom.xml | 7 +- pom.xml | 17 +-- 37 files changed, 1225 insertions(+), 149 deletions(-) create mode 100644 dhp-rmi-api/pom.xml create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/APIDeprecatedException.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/BaseService.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/DNetRestDocumentation.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/RMIException.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/UnimplementedException.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/data/information/collectionservice/rmi/CollectionService.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/data/information/collectionservice/rmi/CollectionServiceException.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/data/information/publisher/rmi/PublisherService.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/data/mdstore/DocumentNotFoundException.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/data/mdstore/MDStoreService.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/data/mdstore/MDStoreServiceException.java create mode 100644 
dhp-rmi-api/src/main/java/eu/dnetlib/data/utility/objectpackaging/rmi/ObjectPackagingException.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/data/utility/objectpackaging/rmi/ObjectPackagingService.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/enabling/dlm/rmi/DlmService.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/enabling/hcm/rmi/HostingContextManagerService.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/enabling/hnm/rmi/HostingNodeManagerService.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/lookup/rmi/ISLookUpDocumentNotFoundException.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/lookup/rmi/ISLookUpException.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/lookup/rmi/ISLookUpService.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/registry/ISRegistryDocumentNotFoundException.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/registry/rmi/ISRegistryException.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/registry/rmi/ISRegistryService.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/sn/rmi/ISSNException.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/sn/rmi/ISSNService.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/sn/rmi/SubscriptionRequestRejectedException.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/store/rmi/ISStoreException.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/store/rmi/ISStoreService.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/enabling/resultset/rmi/ResultSetException.java create mode 100644 dhp-rmi-api/src/main/java/eu/dnetlib/enabling/resultset/rmi/ResultSetService.java delete mode 100644 dhp-shade-package/dependency-reduced-pom.xml diff --git a/.gitignore b/.gitignore index ef9144ae3..caeab2b81 100644 --- a/.gitignore +++ b/.gitignore @@ -27,5 +27,6 @@ spark-warehouse /**/.factorypath /**/.scalafmt.conf /.java-version -/dhp-shade-package/dependency-reduced-pom.xml +/**/dependency-reduced-pom.xml /**/job.properties + diff --git a/dhp-common/pom.xml b/dhp-common/pom.xml index bfec019af..a998ba7f6 100644 --- a/dhp-common/pom.xml +++ b/dhp-common/pom.xml @@ -62,6 +62,12 @@ + + eu.dnetlib + dhp-rmi-api + ${project.version} + + edu.cmu secondstring @@ -125,10 +131,6 @@ org.apache.cxf cxf-rt-transports-http - - eu.dnetlib - cnr-rmi-api - com.ximpleware @@ -168,24 +170,4 @@ opencsv - - - - - spark-34 - - - javax.xml.bind - jaxb-api - 2.2.11 - - - com.sun.xml.ws - jaxws-ri - 2.3.3 - pom - - - - diff --git a/dhp-rmi-api/pom.xml b/dhp-rmi-api/pom.xml new file mode 100644 index 000000000..2073bd8ba --- /dev/null +++ b/dhp-rmi-api/pom.xml @@ -0,0 +1,34 @@ + + + + eu.dnetlib.dhp + dhp + 1.2.5-SNAPSHOT + ../pom.xml + + 4.0.0 + eu.dnetlib + dhp-rmi-api + jar + 1.2.5-SNAPSHOT + + + org.apache.cxf + cxf-core + ${cxf.version} + + + + org.apache.cxf + cxf-rt-frontend-jaxws + ${cxf.version} + + + + org.glassfish.metro + webservices-rt + ${metro.version} + + + diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/APIDeprecatedException.java b/dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/APIDeprecatedException.java new file mode 100644 index 000000000..409bf71c5 --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/APIDeprecatedException.java @@ -0,0 +1,11 @@ + +package eu.dnetlib.common.rmi; + +public class 
APIDeprecatedException extends RuntimeException {
+
+	/**
+	 *
+	 */
+	private static final long serialVersionUID = -5606373588445519515L;
+
+}
diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/BaseService.java b/dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/BaseService.java
new file mode 100644
index 000000000..0d521796f
--- /dev/null
+++ b/dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/BaseService.java
@@ -0,0 +1,35 @@
+
+package eu.dnetlib.common.rmi;
+
+import jakarta.jws.WebMethod;
+import jakarta.jws.WebParam;
+import jakarta.jws.WebService;
+
+@WebService(targetNamespace = "http://services.dnetlib.eu/")
+public interface BaseService {
+
+	/**
+	 * All DRIVER services must implement method notify() in order to communicate with the IS_SN
+	 *
+	 * @param subscriptionId
+	 * @param topic
+	 * @param isId
+	 * @param message
+	 */
+	@WebMethod(operationName = "notify")
+	void notify(@WebParam(name = "subscrId") String subscriptionId,
+		@WebParam(name = "topic") String topic,
+		@WebParam(name = "is_id") String isId,
+		@WebParam(name = "message") String message);
+
+	/**
+	 * Identifies the service's version. Version syntax: ${NAME}-${MAJOR}.${MINOR}.${MICRO}[-${LABEL}]
+	 *
+	 * @return the service's version
+	 */
+	@WebMethod(operationName = "identify")
+	String identify();
+
+	void start();
+
+}
diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/DNetRestDocumentation.java b/dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/DNetRestDocumentation.java
new file mode 100644
index 000000000..4b9bd6b4f
--- /dev/null
+++ b/dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/DNetRestDocumentation.java
@@ -0,0 +1,19 @@
+
+package eu.dnetlib.common.rmi;
+
+import java.lang.annotation.ElementType;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
+import java.lang.annotation.Target;
+
+/**
+ * Created by claudio on 30/11/2016.
+ * to be used in REST controllers, and autodiscovered to build and publish their documentation + */ +@Target({ + ElementType.TYPE +}) +@Retention(RetentionPolicy.RUNTIME) +public @interface DNetRestDocumentation { + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/RMIException.java b/dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/RMIException.java new file mode 100644 index 000000000..bf62b7da3 --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/RMIException.java @@ -0,0 +1,27 @@ + +package eu.dnetlib.common.rmi; + +/** + * All RMI exception thrown from the service remote method invocation interfaces inherit this class + * + * @author marko + */ +abstract public class RMIException extends Exception { // NOPMD + + /** + * + */ + private static final long serialVersionUID = 428841258652765265L; + + public RMIException(final Throwable exception) { + super(exception); + } + + public RMIException(final String string) { + super(string); + } + + public RMIException(final String string, final Throwable exception) { + super(string, exception); + } +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/UnimplementedException.java b/dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/UnimplementedException.java new file mode 100644 index 000000000..d3b6611df --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/common/rmi/UnimplementedException.java @@ -0,0 +1,11 @@ + +package eu.dnetlib.common.rmi; + +public class UnimplementedException extends RuntimeException { + + /** + * + */ + private static final long serialVersionUID = 6040968020696349497L; + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/data/information/collectionservice/rmi/CollectionService.java b/dhp-rmi-api/src/main/java/eu/dnetlib/data/information/collectionservice/rmi/CollectionService.java new file mode 100644 index 000000000..6827dcfbe --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/data/information/collectionservice/rmi/CollectionService.java @@ -0,0 +1,31 @@ + +package eu.dnetlib.data.information.collectionservice.rmi; + +import java.util.List; + +import eu.dnetlib.common.rmi.BaseService; +import jakarta.jws.WebParam; +import jakarta.jws.WebService; + +/** + * The Collection Service is used to ... 
+ */ + +@WebService(targetNamespace = "http://services.dnetlib.eu/") +public interface CollectionService extends BaseService { + + String getCollection(@WebParam(name = "collId") + final String collId) throws CollectionServiceException; + + List getCollections(@WebParam(name = "collIds") + final List collIds) throws CollectionServiceException; + + void updateCollection(@WebParam(name = "coll") + final String coll) throws CollectionServiceException; + + void deleteCollection(@WebParam(name = "collId") + final String collId) throws CollectionServiceException; + + String createCollection(@WebParam(name = "coll") + final String coll) throws CollectionServiceException; +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/data/information/collectionservice/rmi/CollectionServiceException.java b/dhp-rmi-api/src/main/java/eu/dnetlib/data/information/collectionservice/rmi/CollectionServiceException.java new file mode 100644 index 000000000..1d0fd0bd1 --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/data/information/collectionservice/rmi/CollectionServiceException.java @@ -0,0 +1,27 @@ + +package eu.dnetlib.data.information.collectionservice.rmi; + +import eu.dnetlib.common.rmi.RMIException; +import jakarta.xml.ws.WebFault; + +@WebFault +public class CollectionServiceException extends RMIException { + + /** + * + */ + private static final long serialVersionUID = 8094008463553904905L; + + public CollectionServiceException(Throwable e) { + super(e); + } + + public CollectionServiceException(String message, Throwable e) { + super(message, e); + } + + public CollectionServiceException(String message) { + super(message); + } + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/data/information/publisher/rmi/PublisherService.java b/dhp-rmi-api/src/main/java/eu/dnetlib/data/information/publisher/rmi/PublisherService.java new file mode 100644 index 000000000..f9865c7bc --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/data/information/publisher/rmi/PublisherService.java @@ -0,0 +1,57 @@ + +package eu.dnetlib.data.information.publisher.rmi; + +import java.util.List; + +import eu.dnetlib.common.rmi.BaseService; +import jakarta.jws.WebMethod; +import jakarta.jws.WebParam; +import jakarta.jws.WebService; +import jakarta.xml.ws.wsaddressing.W3CEndpointReference; + +/** + * Publisher service. Provides access to metadata records and objects. + * + * @author marko + */ +@WebService(targetNamespace = "http://services.dnetlib.eu/") +public interface PublisherService extends BaseService { + + /** + * Get a (metadata) resource by ID. + * + * @param id + * @param format + * @param layout + * @param interpretation + * @return + */ + @WebMethod + String getResourceById(@WebParam(name = "id") + final String id, + @WebParam(name = "format") + final String format, + @WebParam(name = "layout") + final String layout, + @WebParam(name = "interpretation") + final String interpretation); + + /** + * Get (metadata) resources by IDs. 
+ * + * @param ids + * @param format + * @param layout + * @param interpretation + * @return + */ + @WebMethod + W3CEndpointReference getResourcesByIds(@WebParam(name = "ids") + final List ids, + @WebParam(name = "format") + final String format, + @WebParam(name = "layout") + final String layout, + @WebParam(name = "interpretation") + final String interpretation); +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/data/mdstore/DocumentNotFoundException.java b/dhp-rmi-api/src/main/java/eu/dnetlib/data/mdstore/DocumentNotFoundException.java new file mode 100644 index 000000000..cffb113f2 --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/data/mdstore/DocumentNotFoundException.java @@ -0,0 +1,30 @@ + +package eu.dnetlib.data.mdstore; + +/** + * Signals that a metadata record cannot be found in a given MDStore. + */ +public class DocumentNotFoundException extends MDStoreServiceException { + + /** + * + */ + private static final long serialVersionUID = 5188036989114250548L; + + public DocumentNotFoundException(final String s, final Throwable e) { + super(s, e); + } + + public DocumentNotFoundException(final String s) { + super(s); + } + + public DocumentNotFoundException(final Throwable e) { + super(e); + } + + public DocumentNotFoundException() { + super(); + } + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/data/mdstore/MDStoreService.java b/dhp-rmi-api/src/main/java/eu/dnetlib/data/mdstore/MDStoreService.java new file mode 100644 index 000000000..e59c4e9b7 --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/data/mdstore/MDStoreService.java @@ -0,0 +1,118 @@ + +package eu.dnetlib.data.mdstore; + +import java.util.List; + +import eu.dnetlib.common.rmi.BaseService; +import jakarta.jws.WebMethod; +import jakarta.jws.WebParam; +import jakarta.jws.WebService; +import jakarta.xml.ws.wsaddressing.W3CEndpointReference; + +@WebService(targetNamespace = "http://services.dnetlib.eu/") +public interface MDStoreService extends BaseService { + + /** + * Identifies service and version. + * + * @return + */ + @Override + String identify(); + + /** + * Returns ResultSet EPR for delivered mdstore records. + * + * @param mdId + * @param from + * @param until + * @param recordFilter REGEX on the metadata record + * @return ResultSet EPR + * @throws MDStoreServiceException + */ + W3CEndpointReference deliverMDRecords(@WebParam(name = "mdId") + final String mdId, + @WebParam(name = "from") + final String from, + @WebParam(name = "until") + final String until, + @WebParam(name = "recordsFilter") + final String recordFilter) throws MDStoreServiceException; + + /** + * Deliver single record from selected mdstore. + * + * @param mdId + * @param recordId + * @return record + * @throws MDStoreServiceException + */ + String deliverRecord(@WebParam(name = "mdId") + final String mdId, @WebParam(name = "recordId") + final String recordId) throws MDStoreServiceException; + + /** + * Returns list of all stored indices. 
+ * + * @return list of all stored indices + */ + List getListOfMDStores() throws MDStoreServiceException; + + List listMDStores(@WebParam(name = "format") + final String format, + @WebParam(name = "layout") + final String layout, + @WebParam(name = "interpretation") + final String interpretation) throws MDStoreServiceException; + + W3CEndpointReference bulkDeliverMDRecords(@WebParam(name = "format") + final String format, + @WebParam(name = "layout") + final String layout, + @WebParam(name = "interpretation") + final String interpretation) throws MDStoreServiceException; + + /** + * Store md records from a result set + * + * @param mdId + * @param rsId + * @param storingType + * @return returns true immediately. + * @throws MDStoreServiceException + */ + @Deprecated + boolean storeMDRecordsFromRS(@WebParam(name = "mdId") + final String mdId, + @WebParam(name = "rsId") + final String rsId, + @WebParam(name = "storingType") + final String storingType) throws MDStoreServiceException; + + /** + * Gets the size of the mdstore with the given identifier. + * + * @param mdId identifier of an mdstore + * @return the number of records in the store + */ + @WebMethod(operationName = "size") + int size(@WebParam(name = "mdId") + final String mdId) throws MDStoreServiceException; + + /** + * Gets the sum of records stored in all mdstore with the given format, layout , interpretation + * + * @param format format + * @param layout layout + * @param interpretation interpretation + * @return the total number of records in the mdstores of the given type + */ + @WebMethod(operationName = "sizeByFormat") + int size(@WebParam(name = "format") + final String format, + @WebParam(name = "layout") + final String layout, + @WebParam(name = "interpretation") + final String interpretation) throws MDStoreServiceException; + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/data/mdstore/MDStoreServiceException.java b/dhp-rmi-api/src/main/java/eu/dnetlib/data/mdstore/MDStoreServiceException.java new file mode 100644 index 000000000..471b2a90a --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/data/mdstore/MDStoreServiceException.java @@ -0,0 +1,33 @@ + +package eu.dnetlib.data.mdstore; + +/** + * General mdstore service exception. 
+ * + * @author claudio atzori + * @version 1.0.0 + */ +public class MDStoreServiceException extends Exception { + + /** + * + */ + private static final long serialVersionUID = -6772977735282310658L; + + public MDStoreServiceException(String s, Throwable e) { + super(s, e); + } + + public MDStoreServiceException(String s) { + super(s); + } + + public MDStoreServiceException(Throwable e) { + super(e); + } + + public MDStoreServiceException() { + super(); + } + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/data/utility/objectpackaging/rmi/ObjectPackagingException.java b/dhp-rmi-api/src/main/java/eu/dnetlib/data/utility/objectpackaging/rmi/ObjectPackagingException.java new file mode 100644 index 000000000..ade6d9ffc --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/data/utility/objectpackaging/rmi/ObjectPackagingException.java @@ -0,0 +1,28 @@ + +package eu.dnetlib.data.utility.objectpackaging.rmi; + +import eu.dnetlib.common.rmi.RMIException; +import jakarta.xml.ws.WebFault; + +@WebFault +public class ObjectPackagingException extends RMIException { + + private static final long serialVersionUID = 3468254939586031822L; + + /** + * + */ + + public ObjectPackagingException(Throwable e) { + super(e); + } + + public ObjectPackagingException(String message, Throwable e) { + super(message, e); + } + + public ObjectPackagingException(String message) { + super(message); + } + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/data/utility/objectpackaging/rmi/ObjectPackagingService.java b/dhp-rmi-api/src/main/java/eu/dnetlib/data/utility/objectpackaging/rmi/ObjectPackagingService.java new file mode 100644 index 000000000..6453cfe1d --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/data/utility/objectpackaging/rmi/ObjectPackagingService.java @@ -0,0 +1,37 @@ + +package eu.dnetlib.data.utility.objectpackaging.rmi; + +import java.util.List; + +import eu.dnetlib.common.rmi.BaseService; +import jakarta.jws.WebParam; +import jakarta.jws.WebService; +import jakarta.xml.ws.wsaddressing.W3CEndpointReference; + +/** + * The Object Packaging Service is used to combine the records spread + * into one information package, namely an Object Record. + */ + +@WebService(targetNamespace = "http://services.dnetlib.eu/") +public interface ObjectPackagingService extends BaseService { + /** + * Return the EPR of the resultSet containing the generated packages + * + * @param eprs A list of EPRs used to access the input resultSets. ResultSets MUST be ordered using an order key identified by xpath_ID + * @param xpath_ID A valid xpath, used to access the ordered ID of the elements of the input resultSets. 
+ * @return EPR of the generated resultset + */ + W3CEndpointReference generatePackages(@WebParam(name = "eprs") List eprs, + @WebParam(name = "xpath_ID") String xpath_ID) throws ObjectPackagingException; + + /** + * Return the EPR of the resultSet containing the unpackaged element + * + * @param epr The epr used to access the resultset that contains input packages, packages are xml record in this format: REC1REC2REC3 + * @return EPR of the generated resultset + */ + W3CEndpointReference splitPackages(@WebParam(name = "epr") W3CEndpointReference epr) + throws ObjectPackagingException; + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/dlm/rmi/DlmService.java b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/dlm/rmi/DlmService.java new file mode 100644 index 000000000..046b78a44 --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/dlm/rmi/DlmService.java @@ -0,0 +1,17 @@ + +package eu.dnetlib.enabling.dlm.rmi; + +import eu.dnetlib.common.rmi.BaseService; +import jakarta.jws.WebService; + +/** + * Distributed lock manager. Currently is used mostly to start the underlying lock manager (e.g. zookeeper) and let + * client interface directly with it. + * + *
+	 * The DLM service profile contains the entry point of the underlying locking service.
+ * + * @author marko + */ +@WebService(targetNamespace = "http://services.dnetlib.eu/") +public interface DlmService extends BaseService { +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/hcm/rmi/HostingContextManagerService.java b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/hcm/rmi/HostingContextManagerService.java new file mode 100644 index 000000000..3d11ca129 --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/hcm/rmi/HostingContextManagerService.java @@ -0,0 +1,19 @@ + +package eu.dnetlib.enabling.hcm.rmi; + +import eu.dnetlib.common.rmi.BaseService; +import jakarta.jws.WebService; + +/** + * Like a HostingNodeManager, but any webapp (web context) can have its own. + *
+	 * useful for dispatching notifications shared by all the services local to a single context.
+	 *
+ * + * @author marko + * @author antonis + */ +@WebService(targetNamespace = "http://services.dnetlib.eu/") +public interface HostingContextManagerService extends BaseService { + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/hnm/rmi/HostingNodeManagerService.java b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/hnm/rmi/HostingNodeManagerService.java new file mode 100644 index 000000000..bd8431818 --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/hnm/rmi/HostingNodeManagerService.java @@ -0,0 +1,15 @@ + +package eu.dnetlib.enabling.hnm.rmi; + +import eu.dnetlib.common.rmi.BaseService; +import jakarta.jws.WebParam; +import jakarta.jws.WebService; + +/** + * The HostingNodeManager Service is used to ... + */ + +@WebService(targetNamespace = "http://services.dnetlib.eu/") +public interface HostingNodeManagerService extends BaseService { + String echo(@WebParam(name = "s") String s); +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/lookup/rmi/ISLookUpDocumentNotFoundException.java b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/lookup/rmi/ISLookUpDocumentNotFoundException.java new file mode 100644 index 000000000..82c3a1750 --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/lookup/rmi/ISLookUpDocumentNotFoundException.java @@ -0,0 +1,47 @@ + +package eu.dnetlib.enabling.is.lookup.rmi; + +import jakarta.xml.ws.WebFault; + +/** + * Thrown when a given document is not found. + * + * @author marko + */ +@WebFault +public class ISLookUpDocumentNotFoundException extends ISLookUpException { + + /** + * exception chain + message. + * + * @param message message + * @param e + */ + public ISLookUpDocumentNotFoundException(String message, Throwable e) { + super(message, e); + } + + /** + * exception chain constructor. + * + * @param e + */ + public ISLookUpDocumentNotFoundException(Throwable e) { + super(e); + } + + /** + * exception message. 
+ * + * @param message + */ + public ISLookUpDocumentNotFoundException(String message) { + super(message); + } + + /** + * + */ + private static final long serialVersionUID = 2295995755165801937L; + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/lookup/rmi/ISLookUpException.java b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/lookup/rmi/ISLookUpException.java new file mode 100644 index 000000000..bd5c26ce7 --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/lookup/rmi/ISLookUpException.java @@ -0,0 +1,27 @@ + +package eu.dnetlib.enabling.is.lookup.rmi; + +import eu.dnetlib.common.rmi.RMIException; +import jakarta.xml.ws.WebFault; + +@WebFault +public class ISLookUpException extends RMIException { + + /** + * + */ + private static final long serialVersionUID = -5626136963653382533L; + + public ISLookUpException(Throwable e) { + super(e); + } + + public ISLookUpException(String message, Throwable e) { + super(message, e); + } + + public ISLookUpException(String message) { + super(message); + } + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/lookup/rmi/ISLookUpService.java b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/lookup/rmi/ISLookUpService.java new file mode 100644 index 000000000..a63ea36a6 --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/lookup/rmi/ISLookUpService.java @@ -0,0 +1,57 @@ + +package eu.dnetlib.enabling.is.lookup.rmi; + +import java.util.List; + +import eu.dnetlib.common.rmi.BaseService; +import jakarta.jws.WebParam; +import jakarta.jws.WebService; + +@WebService(targetNamespace = "http://services.dnetlib.eu/") +public interface ISLookUpService extends BaseService { + + Boolean flushCachedResultSets(); + + @Deprecated + String getCollection(@WebParam(name = "profId") String profId, @WebParam(name = "format") String format) + throws ISLookUpException; + + String retrieveCollection(@WebParam(name = "profId") String profId) throws ISLookUpException; + + String getResourceProfile(@WebParam(name = "profId") String profId) + throws ISLookUpException; + + String getResourceProfileByQuery(@WebParam(name = "XQuery") String XQuery) + throws ISLookUpException; + + String getResourceQoSParams(@WebParam(name = "id") String id) throws ISLookUpException; + + String getResourceTypeSchema(@WebParam(name = "resourceType") String resourceType) + throws ISLookUpException; + + List listCollections( + @WebParam(name = "format") String format, + @WebParam(name = "idfather") String idfather, + @WebParam(name = "owner") String owner) throws ISLookUpException; + + @Deprecated + List listDHNIDs() throws ISLookUpException; + + List listResourceTypes() throws ISLookUpException; + + @Deprecated + List listServiceIDs(@WebParam(name = "serviceType") String serviceType) throws ISLookUpException; + + @Deprecated + List listServiceTypes() throws ISLookUpException; + + /** + * Like searchProfile(), but bypassing the resultset. Useful for short xquery results. 
+ * + * @param xquery xquery to be executed + * @return list of strings (never null) + * @throws ISLookUpException could happen + */ + List quickSearchProfile(@WebParam(name = "XQuery") String xquery) throws ISLookUpException; + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/registry/ISRegistryDocumentNotFoundException.java b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/registry/ISRegistryDocumentNotFoundException.java new file mode 100644 index 000000000..62de1aaab --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/registry/ISRegistryDocumentNotFoundException.java @@ -0,0 +1,28 @@ + +package eu.dnetlib.enabling.is.registry; + +import eu.dnetlib.enabling.is.registry.rmi.ISRegistryException; + +public class ISRegistryDocumentNotFoundException extends ISRegistryException { + + /** + * + */ + private static final long serialVersionUID = -1304948213334188538L; + + public ISRegistryDocumentNotFoundException(String string, Throwable e) { + super(string, e); + // TODO Auto-generated constructor stub + } + + public ISRegistryDocumentNotFoundException(String string) { + super(string); + // TODO Auto-generated constructor stub + } + + public ISRegistryDocumentNotFoundException(Throwable e) { + super(e); + // TODO Auto-generated constructor stub + } + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/registry/rmi/ISRegistryException.java b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/registry/rmi/ISRegistryException.java new file mode 100644 index 000000000..e7795ab1b --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/registry/rmi/ISRegistryException.java @@ -0,0 +1,25 @@ + +package eu.dnetlib.enabling.is.registry.rmi; + +import eu.dnetlib.common.rmi.RMIException; + +public class ISRegistryException extends RMIException { + + /** + * + */ + private static final long serialVersionUID = -3347405941287624771L; + + public ISRegistryException(Throwable e) { + super(e); + } + + public ISRegistryException(String string) { + super(string); + } + + public ISRegistryException(String string, Throwable e) { + super(string, e); + } + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/registry/rmi/ISRegistryService.java b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/registry/rmi/ISRegistryService.java new file mode 100644 index 000000000..3fd10bcbf --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/registry/rmi/ISRegistryService.java @@ -0,0 +1,72 @@ + +package eu.dnetlib.enabling.is.registry.rmi; + +import java.util.List; + +import eu.dnetlib.common.rmi.BaseService; +import jakarta.jws.WebService; + +@WebService(targetNamespace = "http://services.dnetlib.eu/") +public interface ISRegistryService extends BaseService { + + boolean addOrUpdateResourceType(String resourceType, String resourceSchema) throws ISRegistryException; + + boolean addResourceType(String resourceType, String resourceSchema) throws ISRegistryException; + + boolean deleteProfile(String profId) throws ISRegistryException; + + @Deprecated + boolean deleteProfiles(List arrayprofId) throws ISRegistryException; + + /** + * @param resourceType + * @param hierarchical remove subscription topics + * @return + * @throws ISRegistryException + */ + boolean deleteResourceType(String resourceType, Boolean hierarchical) throws ISRegistryException; + + boolean executeXUpdate(String XQuery) throws ISRegistryException; + + String insertProfileForValidation(String resourceType, String resourceProfile) throws ISRegistryException; + + String 
invalidateProfile(String profId) throws ISRegistryException;
+
+	boolean refreshProfile(String profId, String resourceType) throws ISRegistryException;
+
+	/**
+	 * register an XML profile.
+	 *
+	 * @param resourceProfile xml profile
+	 * @return profile id
+	 * @throws ISRegistryException
+	 */
+	String registerProfile(String resourceProfile) throws ISRegistryException;
+
+	String registerSecureProfile(String resourceProfId, String secureProfId) throws ISRegistryException;
+
+	boolean updateProfile(String profId, String resourceProfile, String resourceType) throws ISRegistryException;
+
+	@Deprecated
+	String updateProfileDHN(String resourceProfile) throws ISRegistryException;
+
+	boolean addProfileNode(String profId, String xpath, String node) throws ISRegistryException;
+
+	boolean updateProfileNode(String profId, String xpath, String node) throws ISRegistryException;
+
+	boolean removeProfileNode(String profId, String nodeId) throws ISRegistryException;
+
+	@Deprecated
+	boolean updateRegionDescription(String profId, String resourceProfile) throws ISRegistryException;
+
+	String validateProfile(String profId) throws ISRegistryException;
+
+	@Deprecated
+	List<String> validateProfiles(List<String> profIds) throws ISRegistryException;
+
+	void addBlackBoardMessage(String profId, String messageId, String message) throws ISRegistryException;
+
+	void replyBlackBoardMessage(String profId, String message) throws ISRegistryException;
+
+	void deleteBlackBoardMessage(String profId, String messageId) throws ISRegistryException;
+}
diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/sn/rmi/ISSNException.java b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/sn/rmi/ISSNException.java
new file mode 100644
index 000000000..c81470911
--- /dev/null
+++ b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/sn/rmi/ISSNException.java
@@ -0,0 +1,25 @@
+
+package eu.dnetlib.enabling.is.sn.rmi;
+
+import eu.dnetlib.common.rmi.RMIException;
+
+public class ISSNException extends RMIException {
+
+	/**
+	 *
+	 */
+	private static final long serialVersionUID = -7384073901457430004L;
+
+	public ISSNException(final Throwable e) {
+		super(e);
+	}
+
+	public ISSNException(final String message) {
+		super(message);
+	}
+
+	public ISSNException(final String message, final Throwable e) {
+		super(message, e);
+	}
+
+}
diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/sn/rmi/ISSNService.java b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/sn/rmi/ISSNService.java
new file mode 100644
index 000000000..c53c4eda1
--- /dev/null
+++ b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/sn/rmi/ISSNService.java
@@ -0,0 +1,121 @@
+
+package eu.dnetlib.enabling.is.sn.rmi;
+
+import java.util.List;
+
+import eu.dnetlib.common.rmi.BaseService;
+import jakarta.jws.WebParam;
+import jakarta.jws.WebService;
+import jakarta.xml.ws.wsaddressing.W3CEndpointReference;
+
+@WebService(targetNamespace = "http://services.dnetlib.eu/")
+public interface ISSNService extends BaseService {
+
+	/**
+	 * fossil.
+	 *
+	 * @param topic
+	 * @return
+	 * @throws ISSNException
+	 */
+	String getCurrentMessage(@WebParam(name = "topic") String topic) throws ISSNException;
+
+	/**
+	 * puts a subscription in a paused state. Paused subscriptions are not notified even when triggered.
+	 *
+	 * @param subscrId subscription identifier
+	 * @return returns false if the subscription is already paused.
+ * @throws ISSNException may happen + */ + boolean pauseSubscription(@WebParam(name = "subscrId") String subscrId) throws ISSNException; + + /** + * Used to renew the subscription before it expires. + * + *
+	 * In practice it resets the ttl to another value, so it can be used to reset an infinite ttl subscription to a finite
+	 * value.
+	 *
+ * + * @param subscrId subscription id + * @param terminationTime new ttl (from now), or 0 (infinite) + * @return true if successful + * @throws ISSNException may happen + */ + boolean renew(@WebParam(name = "subscrId") String subscrId, @WebParam(name = "terminationTime") int terminationTime) + throws ISSNException; + + /** + * resumes a paused subscription. + * + * @param subscrId subscription id + * @return true if resumed. false if it was not paused. + * @throws ISSNException may happen + */ + boolean resumeSubscription(@WebParam(name = "subscrId") String subscrId) throws ISSNException; + + /** + * @param consumerReference epr to be called when the notification is triggered + * @param topicExpression topic expression to register + * @param initialTerminationTime ttl in seconds (0 = infinite) + * @return subscription id + * @throws ISSNException may happen + */ + String subscribe( + @WebParam(name = "consumerReference") W3CEndpointReference consumerReference, + @WebParam(name = "topicExpression") String topicExpression, + @WebParam(name = "initialTerminationTime") int initialTerminationTime) + throws ISSNException; + + boolean unsubscribe(@WebParam(name = "subscrId") String subscrId) throws ISSNException; + + /** + * fossil. + * + * @param resourceType + * @param profileId + * @param profile + * @return + * @throws ISSNException + */ + boolean actionCreatePerformed( + @WebParam(name = "resourceType") String resourceType, + @WebParam(name = "profileId") String profileId, + @WebParam(name = "profile") String profile) throws ISSNException; + + /** + * fossil. + * + * @param resourceType + * @param profileId + * @param profileBefore + * @param profileAfter + * @return + * @throws ISSNException + */ + boolean actionUpdatePerformed( + @WebParam(name = "resourceType") String resourceType, + @WebParam(name = "profileId") String profileId, + @WebParam(name = "profileBefore") String profileBefore, + @WebParam(name = "profileAfter") String profileAfter) throws ISSNException; + + /** + * fossil. + * + * @param resourceType + * @param profileId + * @return + * @throws ISSNException + */ + boolean actionDeletePerformed(@WebParam(name = "resourceType") String resourceType, + @WebParam(name = "profileId") String profileId) + throws ISSNException; + + /** + * list all subscriptions. Mostly for debug reasons. + * + * @return list of subscription ids. + */ + List listSubscriptions(); + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/sn/rmi/SubscriptionRequestRejectedException.java b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/sn/rmi/SubscriptionRequestRejectedException.java new file mode 100644 index 000000000..75928092e --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/sn/rmi/SubscriptionRequestRejectedException.java @@ -0,0 +1,21 @@ + +package eu.dnetlib.enabling.is.sn.rmi; + +/** + * Thrown when a subscription request is rejected. 
+ * + * @author claudio + */ +public class SubscriptionRequestRejectedException extends ISSNException { + + /** + * + */ + private static final long serialVersionUID = 263095606953662098L; + + public SubscriptionRequestRejectedException(String message) { + super(message); + // TODO Auto-generated constructor stub + } + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/store/rmi/ISStoreException.java b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/store/rmi/ISStoreException.java new file mode 100644 index 000000000..4f9530dfa --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/store/rmi/ISStoreException.java @@ -0,0 +1,25 @@ + +package eu.dnetlib.enabling.is.store.rmi; + +import eu.dnetlib.common.rmi.RMIException; + +public class ISStoreException extends RMIException { + + /** + * + */ + private static final long serialVersionUID = 8683126829156096420L; + + public ISStoreException(Throwable e) { + super(e); + } + + public ISStoreException(String message, Throwable e) { + super(message, e); + } + + public ISStoreException(String message) { + super(message); + } + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/store/rmi/ISStoreService.java b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/store/rmi/ISStoreService.java new file mode 100644 index 000000000..4b92b69c9 --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/is/store/rmi/ISStoreService.java @@ -0,0 +1,47 @@ + +package eu.dnetlib.enabling.is.store.rmi; + +import java.util.List; + +import eu.dnetlib.common.rmi.BaseService; +import jakarta.jws.WebParam; +import jakarta.jws.WebService; + +@WebService(targetNamespace = "http://services.dnetlib.eu/") +public interface ISStoreService extends BaseService { + + boolean createFileColl(@WebParam(name = "fileColl") String fileColl) throws ISStoreException; + + boolean deleteFileColl(@WebParam(name = "fileColl") String fileColl) throws ISStoreException; + + boolean deleteXML(@WebParam(name = "fileName") String fileName, @WebParam(name = "fileColl") String fileColl) + throws ISStoreException; + + boolean executeXUpdate(@WebParam(name = "query") String query) throws ISStoreException; + + List getFileColls() throws ISStoreException; + + List getFileNames(@WebParam(name = "fileColl") String fileColl) throws ISStoreException; + + String getXML(@WebParam(name = "fileName") String fileName, @WebParam(name = "fileColl") String fileColl) + throws ISStoreException; + + String getXMLbyQuery(@WebParam(name = "query") String query) throws ISStoreException; + + boolean insertXML(@WebParam(name = "fileName") String fileName, @WebParam(name = "fileColl") String fileColl, + @WebParam(name = "file") String file) + throws ISStoreException; + + boolean reindex(); + + List quickSearchXML(@WebParam(name = "query") String query) throws ISStoreException; + + boolean sync(); + + boolean updateXML(@WebParam(name = "fileName") String fileName, @WebParam(name = "fileColl") String fileColl, + @WebParam(name = "file") String file) + throws ISStoreException; + + String backup() throws ISStoreException; + +} diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/resultset/rmi/ResultSetException.java b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/resultset/rmi/ResultSetException.java new file mode 100644 index 000000000..f376a4625 --- /dev/null +++ b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/resultset/rmi/ResultSetException.java @@ -0,0 +1,22 @@ + +package eu.dnetlib.enabling.resultset.rmi; + +import eu.dnetlib.common.rmi.RMIException; + +public class 
ResultSetException extends RMIException {
+
+	/**
+	 *
+	 */
+	private static final long serialVersionUID = -7130554407601059627L;
+
+	public ResultSetException(Throwable e) {
+		super(e);
+	}
+
+	public ResultSetException(String string) {
+		super(string);
+	}
+
+}
diff --git a/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/resultset/rmi/ResultSetService.java b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/resultset/rmi/ResultSetService.java
new file mode 100644
index 000000000..10079c907
--- /dev/null
+++ b/dhp-rmi-api/src/main/java/eu/dnetlib/enabling/resultset/rmi/ResultSetService.java
@@ -0,0 +1,135 @@
+
+package eu.dnetlib.enabling.resultset.rmi;
+
+import java.util.List;
+
+import eu.dnetlib.common.rmi.BaseService;
+import jakarta.jws.WebMethod;
+import jakarta.jws.WebParam;
+import jakarta.jws.WebService;
+import jakarta.xml.ws.wsaddressing.W3CEndpointReference;
+
+/**
+ * ResultSet service interface.
+ *
+ * TODO: implement other compatibility methods as needed. + * + * @author marko + */ +@WebService(targetNamespace = "http://services.dnetlib.eu/") +public interface ResultSetService extends BaseService { + /** + * create a new pull rs. + * + * @param bdId bulk data identifier + * @param initialPageSize page size for the polling on the server side. + * @param expiryTime RS expiry time + * @return + */ + W3CEndpointReference createPullRSEPR( + @WebParam(name = "dataProviderServiceAddress") W3CEndpointReference dataProviderEPR, + @WebParam(name = "bdId") String bdId, + @WebParam(name = "initialPageSize") int initialPageSize, + @WebParam(name = "expiryTime") int expiryTime, + @WebParam(name = "styleSheet") String styleSheet, + @WebParam(name = "keepAliveTime") Integer keepAliveTime, + @WebParam(name = "total") Integer total); + + /** + * create a new pull rs. + *
+ * compatibility version + * + * @param bdId bulk data identifier + * @param initialPageSize page size for the polling on the server side. + * @param expiryTime RS expiry time + * @return + */ + W3CEndpointReference createPullRS( + @WebParam(name = "dataProviderServiceAddress") String dataProviderServiceAddress, + @WebParam(name = "bdId") String bdId, + @WebParam(name = "initialPageSize") int initialPageSize, + @WebParam(name = "expiryTime") int expiryTime, + @WebParam(name = "styleSheet") String styleSheet, + @WebParam(name = "keepAliveTime") Integer keepAliveTime, + @WebParam(name = "total") Integer total); + + /** + * close a result set. A closed resultset is guaranteed not to grow. + * + * @param rsId + */ + void closeRS(@WebParam(name = "rsId") String rsId); + + /** + * get one 'page' of results. + *
+ * TODO: define how results are returned when the range is not present in the result set. + * + * @param fromPosition counting from 1 + * @param toPosition included + * @param requestMode + * @return a page of data + */ + List getResult( + @WebParam(name = "rsId") String rsId, + @WebParam(name = "fromPosition") int fromPosition, + @WebParam(name = "toPosition") int toPosition, + @WebParam(name = "requestMode") String requestMode) throws ResultSetException; + + /** + * get the number of result elements present in the resultset. + * + * @param rsId result set identifier + * @return number of results available in the resultset + * @throws ResultSetException + */ + int getNumberOfElements(@WebParam(name = "rsId") String rsId) throws ResultSetException; + + /** + * create a new push resultset. + * + * @param expiryTime RS expiry time + * @param keepAliveTime keep alive time + * @return epr of new resultset + * @throws ResultSetException + */ + W3CEndpointReference createPushRS(@WebParam(name = "expiryTime") int expiryTime, + @WebParam(name = "keepAliveTime") int keepAliveTime) + throws ResultSetException; + + /** + * add new data to a push resultset. + * + * @param rsId resultset id + * @param elements list of elements to be addded + * @return dummy value + * @throws ResultSetException + */ + String populateRS(@WebParam(name = "rsId") String rsId, @WebParam(name = "elements") List elements) + throws ResultSetException; + + /** + * return current status of a resultset. + * + * @param rsId resultset id + * @return status + * @throws ResultSetException + */ + String getRSStatus(@WebParam(name = "rsId") String rsId) throws ResultSetException; + + /** + * read a resultset property. + * + * @param rsId resultset id + * @param name property value + * @return property value + * @throws ResultSetException + */ + String getProperty(@WebParam(name = "rsId") String rsId, @WebParam(name = "name") String name) + throws ResultSetException; + + @WebMethod(operationName = "identify") + String identify(); + +} diff --git a/dhp-shade-package/dependency-reduced-pom.xml b/dhp-shade-package/dependency-reduced-pom.xml deleted file mode 100644 index 04843072f..000000000 --- a/dhp-shade-package/dependency-reduced-pom.xml +++ /dev/null @@ -1,113 +0,0 @@ - - - - dhp - eu.dnetlib.dhp - 1.2.5-SNAPSHOT - - 4.0.0 - dhp-shade-package - This module create a jar of all module dependencies - - - - maven-shade-plugin - - - package - - shade - - - - - eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels - - - - META-INF/cxf/bus-extensions.txt - - - - - *:* - - META-INF/maven/** - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - com - repackaged.com.google.common - - com.google.common.** - - - - - - - - - - - - org.projectlombok - lombok - 1.18.28 - provided - - - org.junit.jupiter - junit-jupiter - 5.6.1 - test - - - junit-jupiter-api - org.junit.jupiter - - - junit-jupiter-params - org.junit.jupiter - - - junit-jupiter-engine - org.junit.jupiter - - - - - org.mockito - mockito-core - 3.3.3 - test - - - byte-buddy - net.bytebuddy - - - byte-buddy-agent - net.bytebuddy - - - - - org.mockito - mockito-junit-jupiter - 3.3.3 - test - - - - - DHPSite - ${dhp.site.stage.path}/dhp-common - - - diff --git a/dhp-shade-package/pom.xml b/dhp-shade-package/pom.xml index 42735f9d7..fe3b3c0d2 100644 --- a/dhp-shade-package/pom.xml +++ b/dhp-shade-package/pom.xml @@ -25,7 +25,6 @@ - diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java 
b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java index 1adad104e..8172456bb 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/collection/orcid/ORCIDExtractor.java @@ -160,7 +160,7 @@ public class ORCIDExtractor extends Thread { } } finally { for (SequenceFile.Writer k : fileMap.values()) { - log.info("Thread {}: Completed processed {} items", id, extractedItem); + log.info("Thread {}: Completed processed {} items", id, extractedItem); k.hflush(); k.close(); } diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index 1c331d126..88e92fb9a 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -40,7 +40,6 @@ dhp-usage-stats-build dhp-usage-raw-data-update dhp-broker-events - dhp-doiboost dhp-impact-indicators dhp-swh @@ -94,6 +93,12 @@ + + spark-24 + + dhp-doiboost + + attach-test-resources diff --git a/pom.xml b/pom.xml index 033d88b0b..9d0cac0e5 100644 --- a/pom.xml +++ b/pom.xml @@ -19,6 +19,7 @@ + dhp-rmi-api dhp-build dhp-pace-core dhp-common @@ -440,12 +441,6 @@ provided - - eu.dnetlib - cnr-rmi-api - ${cnr-rmi-api.version} - - eu.dnetlib.dhp dnet-openaire-broker-common @@ -926,7 +921,6 @@ 4.1.2 - [2.6.1] 1.20 1.8 1.8 @@ -936,6 +930,7 @@ 2.4 1.1.3 1.7 + 3.5.9 1.0.7 [10.0.0] cdh5.9.2 @@ -953,6 +948,7 @@ 3.5.3 4.13.0 5.6.1 + 3.0.3 3.3.3 3.4.2 4.7.2 @@ -1036,6 +1032,7 @@ 4.8.1 + 3.3.4 1.23.0 1.8 1.10.0 @@ -1049,7 +1046,7 @@ 14.0.1 8.11.0 4.0.4 - 3.5.1.openaire-SNAPSHOT + 3.5.3 2.15.2 3.12.0 2.20.0 @@ -1067,6 +1064,10 @@ [11 + + 4.0.5 + 4.0.4 +
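
Illustrative usage sketch (not part of the patch): a minimal client for one of the imported jakarta-based interfaces, built with CXF's JAX-WS frontend, i.e. the same org.apache.cxf:cxf-rt-frontend-jaxws dependency declared in dhp-rmi-api/pom.xml. The endpoint address and the XQuery below are hypothetical placeholders.

import java.util.List;

import org.apache.cxf.jaxws.JaxWsProxyFactoryBean;

import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

public class ISLookUpClientSketch {

	public static void main(String[] args) throws ISLookUpException {
		// build a dynamic JAX-WS proxy for the jakarta-annotated ISLookUpService interface
		JaxWsProxyFactoryBean factory = new JaxWsProxyFactoryBean();
		factory.setServiceClass(ISLookUpService.class);
		factory.setAddress("http://localhost:8280/is/services/isLookUp"); // hypothetical endpoint

		ISLookUpService lookUp = (ISLookUpService) factory.create();

		// quickSearchProfile() bypasses the resultset machinery, so short XQuery answers come back directly
		List<String> profiles = lookUp
			.quickSearchProfile("for $x in collection('/db/DRIVER/ServiceResources') return $x"); // hypothetical query
		profiles.forEach(System.out::println);
	}
}

On the publishing side, the org.glassfish.metro:webservices-rt dependency supplies a jakarta.xml.ws runtime, so a @WebService implementor can in principle be exposed through the standard Endpoint API. EchoService below is a toy implementor invented for this sketch:

import jakarta.jws.WebService;
import jakarta.xml.ws.Endpoint;

public class PublishSketch {

	// a trivial jakarta-annotated implementor, for illustration only
	@WebService(targetNamespace = "http://services.dnetlib.eu/")
	public static class EchoService {
		public String echo(String s) {
			return s;
		}
	}

	public static void main(String[] args) {
		// the Metro runtime backs this standard jakarta.xml.ws API call
		Endpoint.publish("http://localhost:9000/services/echo", new EchoService());
	}
}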