dnet-hadoop/dhp-workflows/dhp-broker-events/src/main/java/eu/dnetlib/dhp/broker/oa/util/TrustUtils.java

88 lines
2.3 KiB
Java

package eu.dnetlib.dhp.broker.oa.util;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;
import org.apache.spark.sql.Row;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.SparkDeduper;
import eu.dnetlib.pace.tree.support.TreeProcessor;
public class TrustUtils {
private static final Logger log = LoggerFactory.getLogger(TrustUtils.class);
private static DedupConfig dedupConfig;
private static SparkDeduper deduper;
private static final ObjectMapper mapper;
static {
mapper = new ObjectMapper();
try {
dedupConfig = DedupConfig
.load(
IOUtils
.toString(
DedupConfig.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json"),
StandardCharsets.UTF_8));
deduper = new SparkDeduper(dedupConfig);
} catch (final IOException e) {
log.error("Error loading dedupConfig, e");
}
}
private TrustUtils() {
}
protected static float calculateTrust(final OaBrokerMainEntity r1, final OaBrokerMainEntity r2) {
if (dedupConfig == null) {
return BrokerConstants.MIN_TRUST;
}
try {
final Row doc1 = deduper.model().rowFromJson(mapper.writeValueAsString(r1));
final Row doc2 = deduper.model().rowFromJson(mapper.writeValueAsString(r2));
final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2);
final double threshold = dedupConfig.getWf().getThreshold();
return TrustUtils.rescale(score, threshold);
} catch (final Exception e) {
log.error("Error computing score between results", e);
throw new RuntimeException(e);
}
}
public static float rescale(final double score, final double threshold) {
if (score >= BrokerConstants.MAX_TRUST) {
return BrokerConstants.MAX_TRUST;
}
final double val = (score - threshold) * (BrokerConstants.MAX_TRUST - BrokerConstants.MIN_TRUST)
/ (BrokerConstants.MAX_TRUST - threshold);
if (val < BrokerConstants.MIN_TRUST) {
return BrokerConstants.MIN_TRUST;
}
if (val > BrokerConstants.MAX_TRUST) {
return BrokerConstants.MAX_TRUST;
}
return (float) val;
}
}