From 0e34b0ece13f4d6dfeb5a4f0cb274168327c954a Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 21 Oct 2024 09:05:13 +0200 Subject: [PATCH 1/3] Fix imports: point them from the main distribution packages --- .../src/test/java/eu/dnetlib/pace/util/UtilTest.java | 1 - .../dhp/actionmanager/personentity/ExtractPerson.java | 6 +++--- .../eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala | 2 +- .../scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala | 2 +- .../main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala | 2 +- .../src/main/java/eu/dnetlib/dhp/api/Utils.java | 4 ++-- .../java/eu/dnetlib/dhp/bulktag/community/Constraint.java | 2 +- .../dhp/oa/graph/hive/GraphHiveTableImporterJob.java | 6 +++--- .../main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java | 2 +- 9 files changed, 13 insertions(+), 14 deletions(-) diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java index be5c1ebb9..93db552c1 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/UtilTest.java @@ -11,7 +11,6 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import eu.dnetlib.pace.model.Person; -import jdk.nashorn.internal.ir.annotations.Ignore; public class UtilTest { diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java index e63a50984..debf7e38e 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/personentity/ExtractPerson.java @@ -11,6 +11,7 @@ import java.util.stream.Collectors; import org.apache.commons.cli.ParseException; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.BZip2Codec; import org.apache.hadoop.mapred.SequenceFileOutputFormat; @@ -20,7 +21,6 @@ import org.apache.spark.sql.*; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.spark_project.jetty.util.StringUtil; import com.fasterxml.jackson.databind.ObjectMapper; @@ -317,13 +317,13 @@ public class ExtractPerson implements Serializable { "0.91"), null); - if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtil.isNotBlank(row.getStartDate())) { + if (Optional.ofNullable(row.getStartDate()).isPresent() && StringUtils.isNotBlank(row.getStartDate())) { KeyValue kv = new KeyValue(); kv.setKey("startDate"); kv.setValue(row.getStartDate()); properties.add(kv); } - if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtil.isNotBlank(row.getEndDate())) { + if (Optional.ofNullable(row.getEndDate()).isPresent() && StringUtils.isNotBlank(row.getEndDate())) { KeyValue kv = new KeyValue(); kv.setKey("endDate"); kv.setValue(row.getEndDate()); diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala index 7c45234f6..e7d68920b 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/crossref/Crossref2Oaf.scala @@ -14,7 +14,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.{ PidType } import eu.dnetlib.dhp.utils.DHPUtils -import org.apache.commons.lang.StringUtils +import org.apache.commons.lang3.StringUtils import org.apache.spark.sql.Row import org.json4s import org.json4s.DefaultFormats diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala index d8292a631..a2c36041d 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/crossref/Crossref2Oaf.scala @@ -7,7 +7,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.{GraphCleaningFunctions, IdentifierFactor import eu.dnetlib.dhp.utils.DHPUtils import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil._ -import org.apache.commons.lang.StringUtils +import org.apache.commons.lang3.StringUtils import org.json4s import org.json4s.DefaultFormats import org.json4s.JsonAST._ diff --git a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala index 7c58afc09..6ec75f5c3 100644 --- a/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala +++ b/dhp-workflows/dhp-doiboost/src/main/scala/eu/dnetlib/doiboost/orcid/ORCIDToOAF.scala @@ -6,7 +6,7 @@ import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication} import eu.dnetlib.doiboost.DoiBoostMappingUtil import eu.dnetlib.doiboost.DoiBoostMappingUtil.{createSP, generateDataInfo} -import org.apache.commons.lang.StringUtils +import org.apache.commons.lang3.StringUtils import org.json4s import org.json4s.DefaultFormats import org.json4s.JsonAST._ diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java index 27fb37e5b..6079da365 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java @@ -6,11 +6,11 @@ import java.io.Serializable; import java.util.*; import java.util.stream.Collectors; +import org.apache.commons.lang3.StringUtils; import org.jetbrains.annotations.NotNull; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.amazonaws.util.StringUtils; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.collect.Maps; @@ -81,7 +81,7 @@ public class Utils implements Serializable { Community c = new Community(); c.setId(cm.getId()); c.setZenodoCommunities(cm.getOtherZenodoCommunities()); - if (!StringUtils.isNullOrEmpty(cm.getZenodoCommunity())) + if (StringUtils.isNotBlank(cm.getZenodoCommunity())) c.getZenodoCommunities().add(cm.getZenodoCommunity()); c.setSubjects(cm.getSubjects()); c.getSubjects().addAll(cm.getFos()); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java index 82a6a3b85..51525e4d3 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/community/Constraint.java @@ -4,7 +4,7 @@ package eu.dnetlib.dhp.bulktag.community; import java.io.Serializable; import java.lang.reflect.InvocationTargetException; -import org.apache.htrace.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonIgnore; import eu.dnetlib.dhp.bulktag.criteria.Selection; import eu.dnetlib.dhp.bulktag.criteria.VerbResolver; diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java index 73243dbc5..d4fec3f52 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/hive/GraphHiveTableImporterJob.java @@ -72,9 +72,9 @@ public class GraphHiveTableImporterJob { final Encoder clazzEncoder = Encoders.bean(clazz); Dataset dataset = spark - .read() - .schema(clazzEncoder.schema()) - .json(inputPath); + .read() + .schema(clazzEncoder.schema()) + .json(inputPath); if (numPartitions > 0) { log.info("repartitioning {} to {} partitions", clazz.getSimpleName(), numPartitions); diff --git a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java index 5e705716c..0461e2f94 100644 --- a/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java +++ b/dhp-workflows/dhp-swh/src/main/java/eu/dnetlib/dhp/swh/models/LastVisitData.java @@ -3,8 +3,8 @@ package eu.dnetlib.dhp.swh.models; import java.io.Serializable; -import com.cloudera.com.fasterxml.jackson.annotation.JsonProperty; import com.fasterxml.jackson.annotation.JsonIgnoreProperties; +import com.fasterxml.jackson.annotation.JsonProperty; @JsonIgnoreProperties(ignoreUnknown = true) public class LastVisitData implements Serializable { From aa7b8fd014f8b1a3855330806efa40cec1fc11d6 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 21 Oct 2024 18:05:01 +0200 Subject: [PATCH 2/3] Use workingDir parameter for temporary data of ORCID enrichment --- .../enrich/orcid/enrich_graph_orcid_parameters.json | 6 ++++++ .../orcid/SparkEnrichGraphWithOrcidAuthors.scala | 12 +++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/enrich_graph_orcid_parameters.json b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/enrich_graph_orcid_parameters.json index 765c0e8ff..772e1381f 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/enrich_graph_orcid_parameters.json +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/enrich/orcid/enrich_graph_orcid_parameters.json @@ -22,5 +22,11 @@ "paramLongName": "targetPath", "paramDescription": "the output path of the graph enriched", "paramRequired": true + }, + { + "paramName": "wp", + "paramLongName": "workingDir", + "paramDescription": "the working dir", + "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala index 0824c2a71..847a5f090 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala +++ b/dhp-workflows/dhp-graph-mapper/src/main/scala/eu/dnetlib/dhp/enrich/orcid/SparkEnrichGraphWithOrcidAuthors.scala @@ -47,13 +47,15 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String] log.info(s"orcidPath is '$orcidPath'") val targetPath = parser.get("targetPath") log.info(s"targetPath is '$targetPath'") + val workingDir = parser.get("workingDir") + log.info(s"targetPath is '$workingDir'") - createTemporaryData(graphPath, orcidPath, targetPath) - analisys(targetPath) - generateGraph(graphPath, targetPath) + createTemporaryData(graphPath, orcidPath, workingDir) + analisys(workingDir) + generateGraph(graphPath, workingDir, targetPath) } - private def generateGraph(graphPath: String, targetPath: String): Unit = { + private def generateGraph(graphPath: String, workingDir: String, targetPath: String): Unit = { ModelSupport.entityTypes.asScala .filter(e => ModelSupport.isResult(e._1)) @@ -63,7 +65,7 @@ class SparkEnrichGraphWithOrcidAuthors(propertyPath: String, args: Array[String] val matched = spark.read .schema(Encoders.bean(classOf[ORCIDAuthorEnricherResult]).schema) - .parquet(s"${targetPath}/${resultType}_matched") + .parquet(s"${workingDir}/${resultType}_matched") .selectExpr("id", "enriched_author") spark.read From 6bc741715c08a4b71e3b737b3d8befdbfa743aab Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Wed, 23 Oct 2024 14:01:12 +0200 Subject: [PATCH 3/3] Fix OafMapperUtilsTest.testMergePubs --- .../eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java index 9317c0ce4..1ee8e52de 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java @@ -179,7 +179,7 @@ class OafMapperUtilsTest { assertEquals( ModelConstants.DATASET_RESULTTYPE_CLASSID, ((Result) MergeUtils - .merge(p2, d1)) + .merge(p2, d1, true)) .getResulttype() .getClassid()); }