dnet-hadoop/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java

88 lines
2.3 KiB
Java
Raw Normal View History

2020-06-10 12:11:16 +02:00
2020-06-09 16:01:31 +02:00
package eu.dnetlib.dhp.broker.oa.util;
2023-09-20 15:53:21 +02:00
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;
import org.apache.spark.sql.Row;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
2023-09-20 15:53:21 +02:00
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.SparkDeduper;
import eu.dnetlib.pace.tree.support.TreeProcessor;
2020-07-09 12:53:46 +02:00
2020-06-09 16:01:31 +02:00
public class TrustUtils {
2020-07-09 12:53:46 +02:00
private static final Logger log = LoggerFactory.getLogger(TrustUtils.class);
private static DedupConfig dedupConfig;
private static SparkDeduper deduper;
private static final ObjectMapper mapper;
2020-07-09 12:53:46 +02:00
static {
mapper = new ObjectMapper();
2020-07-09 12:53:46 +02:00
try {
2023-09-20 15:53:21 +02:00
dedupConfig = DedupConfig
.load(
IOUtils
.toString(
DedupConfig.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json"),
StandardCharsets.UTF_8));
deduper = new SparkDeduper(dedupConfig);
2020-07-09 12:53:46 +02:00
} catch (final IOException e) {
log.error("Error loading dedupConfig, e");
}
2021-08-11 12:13:22 +02:00
}
2020-07-09 12:53:46 +02:00
2021-08-11 12:13:22 +02:00
private TrustUtils() {
2020-07-09 12:53:46 +02:00
}
protected static float calculateTrust(final OaBrokerMainEntity r1, final OaBrokerMainEntity r2) {
if (dedupConfig == null) {
return BrokerConstants.MIN_TRUST;
}
try {
final Row doc1 = deduper.model().rowFromJson(mapper.writeValueAsString(r1));
final Row doc2 = deduper.model().rowFromJson(mapper.writeValueAsString(r2));
2020-07-09 12:53:46 +02:00
final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2);
final double threshold = dedupConfig.getWf().getThreshold();
return TrustUtils.rescale(score, threshold);
} catch (final Exception e) {
log.error("Error computing score between results", e);
throw new RuntimeException(e);
2020-07-09 12:53:46 +02:00
}
}
2020-06-09 16:01:31 +02:00
public static float rescale(final double score, final double threshold) {
2020-06-10 12:11:16 +02:00
if (score >= BrokerConstants.MAX_TRUST) {
return BrokerConstants.MAX_TRUST;
}
2020-06-09 16:01:31 +02:00
2020-06-10 12:11:16 +02:00
final double val = (score - threshold) * (BrokerConstants.MAX_TRUST - BrokerConstants.MIN_TRUST)
/ (BrokerConstants.MAX_TRUST - threshold);
2020-06-09 16:01:31 +02:00
2020-06-10 12:11:16 +02:00
if (val < BrokerConstants.MIN_TRUST) {
return BrokerConstants.MIN_TRUST;
}
if (val > BrokerConstants.MAX_TRUST) {
return BrokerConstants.MAX_TRUST;
}
2020-06-09 16:01:31 +02:00
return (float) val;
}
}