From 1f226d1dce74f4858be1f30d713c3d991b367836 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Wed, 20 Sep 2023 15:42:00 +0200 Subject: [PATCH 1/3] Fix defect #8997: GenerateEventsJob is generating huge amounts of logs because broker entity similarity calculation consistently failed --- .../eu/dnetlib/pace/model/SparkModel.scala | 4 ++-- .../dhp/broker/oa/util/TrustUtils.java | 22 +++++++++---------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala index 95325ace0..3ba36aa22 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala @@ -78,10 +78,10 @@ case class SparkModel(conf: DedupConfig) { uv case Type.List | Type.JSON => - MapDocumentUtil.truncateList( + Seq(MapDocumentUtil.truncateList( MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType), fdef.getSize - ).toArray + )) case Type.StringConcat => val jpaths = CONCAT_REGEX.split(fdef.getPath) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java index a6fa2b1a1..6f197a8ce 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java @@ -1,18 +1,18 @@ package eu.dnetlib.dhp.broker.oa.util; -import java.io.IOException; - -import org.apache.spark.sql.Row; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import com.fasterxml.jackson.databind.ObjectMapper; - import eu.dnetlib.broker.objects.OaBrokerMainEntity; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.SparkDeduper; import eu.dnetlib.pace.tree.support.TreeProcessor; +import org.apache.commons.io.IOUtils; +import org.apache.spark.sql.Row; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; public class TrustUtils { @@ -27,10 +27,8 @@ public class TrustUtils { static { mapper = new ObjectMapper(); try { - dedupConfig = mapper - .readValue( - DedupConfig.class.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json"), - DedupConfig.class); + dedupConfig = DedupConfig.load(IOUtils.toString(DedupConfig.class.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json"), StandardCharsets.UTF_8)); + deduper = new SparkDeduper(dedupConfig); } catch (final IOException e) { log.error("Error loading dedupConfig, e"); @@ -57,7 +55,7 @@ public class TrustUtils { return TrustUtils.rescale(score, threshold); } catch (final Exception e) { log.error("Error computing score between results", e); - return BrokerConstants.MIN_TRUST; + throw new RuntimeException(e); } } From 4853c19b5edd96cd3828e2d675a1b6ccd37b8d0d Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 20 Sep 2023 15:53:21 +0200 Subject: [PATCH 2/3] code formatting --- .../dhp/broker/oa/util/TrustUtils.java | 24 ++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java index 6f197a8ce..67468c6f9 100644 --- a/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java +++ b/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java @@ -1,18 +1,20 @@ package eu.dnetlib.dhp.broker.oa.util; -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.broker.objects.OaBrokerMainEntity; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.SparkDeduper; -import eu.dnetlib.pace.tree.support.TreeProcessor; +import java.io.IOException; +import java.nio.charset.StandardCharsets; + import org.apache.commons.io.IOUtils; import org.apache.spark.sql.Row; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.nio.charset.StandardCharsets; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.broker.objects.OaBrokerMainEntity; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.SparkDeduper; +import eu.dnetlib.pace.tree.support.TreeProcessor; public class TrustUtils { @@ -27,7 +29,13 @@ public class TrustUtils { static { mapper = new ObjectMapper(); try { - dedupConfig = DedupConfig.load(IOUtils.toString(DedupConfig.class.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json"), StandardCharsets.UTF_8)); + dedupConfig = DedupConfig + .load( + IOUtils + .toString( + DedupConfig.class + .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json"), + StandardCharsets.UTF_8)); deduper = new SparkDeduper(dedupConfig); } catch (final IOException e) { From 7152d47f84587d9e4a6af6357abb19f90d6be6af Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Wed, 20 Sep 2023 16:14:01 +0200 Subject: [PATCH 3/3] Use asScala to convert java List to Scala Sequence --- .../src/main/java/eu/dnetlib/pace/model/SparkModel.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala index 3ba36aa22..aa997c6e9 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala @@ -78,10 +78,10 @@ case class SparkModel(conf: DedupConfig) { uv case Type.List | Type.JSON => - Seq(MapDocumentUtil.truncateList( + MapDocumentUtil.truncateList( MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType), fdef.getSize - )) + ).asScala case Type.StringConcat => val jpaths = CONCAT_REGEX.split(fdef.getPath)