forked from D-Net/dnet-hadoop
DedupConfig as json file
This commit is contained in:
parent
a44b9b36b9
commit
2d742a84ae
|
@ -18,8 +18,6 @@ import org.apache.spark.util.LongAccumulator;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.broker.model.Event;
|
import eu.dnetlib.dhp.broker.model.Event;
|
||||||
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
import eu.dnetlib.dhp.broker.oa.matchers.UpdateMatcher;
|
||||||
|
@ -27,9 +25,6 @@ import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EventFinder;
|
import eu.dnetlib.dhp.broker.oa.util.EventFinder;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.EventGroup;
|
import eu.dnetlib.dhp.broker.oa.util.EventGroup;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
|
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
|
||||||
|
|
||||||
public class GenerateEventsJob {
|
public class GenerateEventsJob {
|
||||||
|
|
||||||
|
@ -52,12 +47,6 @@ public class GenerateEventsJob {
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingPath = parser.get("workingPath");
|
||||||
log.info("workingPath: {}", workingPath);
|
log.info("workingPath: {}", workingPath);
|
||||||
|
|
||||||
final String isLookupUrl = parser.get("isLookupUrl");
|
|
||||||
log.info("isLookupUrl: {}", isLookupUrl);
|
|
||||||
|
|
||||||
final String dedupConfigProfileId = parser.get("dedupConfProfile");
|
|
||||||
log.info("dedupConfigProfileId: {}", dedupConfigProfileId);
|
|
||||||
|
|
||||||
final String eventsPath = workingPath + "/events";
|
final String eventsPath = workingPath + "/events";
|
||||||
log.info("eventsPath: {}", eventsPath);
|
log.info("eventsPath: {}", eventsPath);
|
||||||
|
|
||||||
|
@ -72,10 +61,6 @@ public class GenerateEventsJob {
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
// TODO UNCOMMENT
|
|
||||||
// final DedupConfig dedupConfig = loadDedupConfig(isLookupUrl, dedupConfigProfileId);
|
|
||||||
final DedupConfig dedupConfig = null;
|
|
||||||
|
|
||||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||||
|
|
||||||
ClusterUtils.removeDir(spark, eventsPath);
|
ClusterUtils.removeDir(spark, eventsPath);
|
||||||
|
@ -90,7 +75,7 @@ public class GenerateEventsJob {
|
||||||
final Dataset<Event> dataset = groups
|
final Dataset<Event> dataset = groups
|
||||||
.map(
|
.map(
|
||||||
g -> EventFinder
|
g -> EventFinder
|
||||||
.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, dedupConfig, accumulators),
|
.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, accumulators),
|
||||||
Encoders
|
Encoders
|
||||||
.bean(EventGroup.class))
|
.bean(EventGroup.class))
|
||||||
.flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));
|
.flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));
|
||||||
|
@ -112,23 +97,4 @@ public class GenerateEventsJob {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static DedupConfig loadDedupConfig(final String isLookupUrl, final String profId) throws Exception {
|
|
||||||
|
|
||||||
final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
|
||||||
|
|
||||||
final String conf = isLookUpService
|
|
||||||
.getResourceProfileByQuery(
|
|
||||||
String
|
|
||||||
.format(
|
|
||||||
"for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()",
|
|
||||||
profId));
|
|
||||||
|
|
||||||
final DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class);
|
|
||||||
dedupConfig.getPace().initModel();
|
|
||||||
dedupConfig.getPace().initTranslationMap();
|
|
||||||
// dedupConfig.getWf().setConfigurationId("???");
|
|
||||||
|
|
||||||
return dedupConfig;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,7 +17,6 @@ import org.apache.spark.util.LongAccumulator;
|
||||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
import eu.dnetlib.dhp.broker.oa.util.UpdateInfo;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
|
||||||
|
|
||||||
public abstract class UpdateMatcher<T> {
|
public abstract class UpdateMatcher<T> {
|
||||||
|
|
||||||
|
@ -37,7 +36,6 @@ public abstract class UpdateMatcher<T> {
|
||||||
|
|
||||||
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final OaBrokerMainEntity res,
|
public Collection<UpdateInfo<T>> searchUpdatesForRecord(final OaBrokerMainEntity res,
|
||||||
final Collection<OaBrokerMainEntity> others,
|
final Collection<OaBrokerMainEntity> others,
|
||||||
final DedupConfig dedupConfig,
|
|
||||||
final Map<String, LongAccumulator> accumulators) {
|
final Map<String, LongAccumulator> accumulators) {
|
||||||
|
|
||||||
final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
|
final Map<String, UpdateInfo<T>> infoMap = new HashMap<>();
|
||||||
|
@ -49,7 +47,7 @@ public abstract class UpdateMatcher<T> {
|
||||||
if (topic != null) {
|
if (topic != null) {
|
||||||
final UpdateInfo<T> info = new UpdateInfo<>(topic, hl, source, res,
|
final UpdateInfo<T> info = new UpdateInfo<>(topic, hl, source, res,
|
||||||
getCompileHighlightFunction(),
|
getCompileHighlightFunction(),
|
||||||
getHighlightToStringFunction(), dedupConfig);
|
getHighlightToStringFunction());
|
||||||
|
|
||||||
final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
|
final String s = DigestUtils.md5Hex(info.getHighlightValueAsString());
|
||||||
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {
|
if (!infoMap.containsKey(s) || infoMap.get(s).getTrust() < info.getTrust()) {
|
||||||
|
|
|
@ -37,7 +37,6 @@ import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreOpenAccess;
|
||||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid;
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMorePid;
|
||||||
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject;
|
import eu.dnetlib.dhp.broker.oa.matchers.simple.EnrichMoreSubject;
|
||||||
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
|
import eu.dnetlib.dhp.broker.oa.util.aggregators.simple.ResultGroup;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
|
||||||
|
|
||||||
public class EventFinder {
|
public class EventFinder {
|
||||||
|
|
||||||
|
@ -76,7 +75,6 @@ public class EventFinder {
|
||||||
final Set<String> dsIdWhitelist,
|
final Set<String> dsIdWhitelist,
|
||||||
final Set<String> dsIdBlacklist,
|
final Set<String> dsIdBlacklist,
|
||||||
final Set<String> dsTypeWhitelist,
|
final Set<String> dsTypeWhitelist,
|
||||||
final DedupConfig dedupConfig,
|
|
||||||
final Map<String, LongAccumulator> accumulators) {
|
final Map<String, LongAccumulator> accumulators) {
|
||||||
|
|
||||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||||
|
@ -84,7 +82,7 @@ public class EventFinder {
|
||||||
for (final OaBrokerMainEntity target : results.getData()) {
|
for (final OaBrokerMainEntity target : results.getData()) {
|
||||||
if (verifyTarget(target, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) {
|
if (verifyTarget(target, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) {
|
||||||
for (final UpdateMatcher<?> matcher : matchers) {
|
for (final UpdateMatcher<?> matcher : matchers) {
|
||||||
list.addAll(matcher.searchUpdatesForRecord(target, results.getData(), dedupConfig, accumulators));
|
list.addAll(matcher.searchUpdatesForRecord(target, results.getData(), accumulators));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,62 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util;
|
package eu.dnetlib.dhp.broker.oa.util;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||||
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
import eu.dnetlib.pace.model.MapDocument;
|
||||||
|
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
||||||
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
|
|
||||||
public class TrustUtils {
|
public class TrustUtils {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(TrustUtils.class);
|
||||||
|
|
||||||
|
private static DedupConfig dedupConfig;
|
||||||
|
|
||||||
|
static {
|
||||||
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
try {
|
||||||
|
dedupConfig = mapper
|
||||||
|
.readValue(
|
||||||
|
DedupConfig.class.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json"),
|
||||||
|
DedupConfig.class);
|
||||||
|
} catch (final IOException e) {
|
||||||
|
log.error("Error loading dedupConfig, e");
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static float calculateTrust(final OaBrokerMainEntity r1, final OaBrokerMainEntity r2) {
|
||||||
|
|
||||||
|
if (dedupConfig == null) {
|
||||||
|
return BrokerConstants.MIN_TRUST;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
final ObjectMapper objectMapper = new ObjectMapper();
|
||||||
|
final MapDocument doc1 = MapDocumentUtil
|
||||||
|
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1));
|
||||||
|
final MapDocument doc2 = MapDocumentUtil
|
||||||
|
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2));
|
||||||
|
|
||||||
|
final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2);
|
||||||
|
|
||||||
|
final double threshold = dedupConfig.getWf().getThreshold();
|
||||||
|
|
||||||
|
return TrustUtils.rescale(score, threshold);
|
||||||
|
} catch (final Exception e) {
|
||||||
|
log.error("Error computing score between results", e);
|
||||||
|
return BrokerConstants.MIN_TRUST;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public static float rescale(final double score, final double threshold) {
|
public static float rescale(final double score, final double threshold) {
|
||||||
if (score >= BrokerConstants.MAX_TRUST) {
|
if (score >= BrokerConstants.MAX_TRUST) {
|
||||||
return BrokerConstants.MAX_TRUST;
|
return BrokerConstants.MAX_TRUST;
|
||||||
|
|
|
@ -4,20 +4,11 @@ package eu.dnetlib.dhp.broker.oa.util;
|
||||||
import java.util.function.BiConsumer;
|
import java.util.function.BiConsumer;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OaBrokerEventPayload;
|
import eu.dnetlib.broker.objects.OaBrokerEventPayload;
|
||||||
import eu.dnetlib.broker.objects.OaBrokerInstance;
|
import eu.dnetlib.broker.objects.OaBrokerInstance;
|
||||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||||
import eu.dnetlib.broker.objects.OaBrokerProvenance;
|
import eu.dnetlib.broker.objects.OaBrokerProvenance;
|
||||||
import eu.dnetlib.dhp.broker.model.Topic;
|
import eu.dnetlib.dhp.broker.model.Topic;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
|
||||||
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
|
||||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
|
||||||
|
|
||||||
public final class UpdateInfo<T> {
|
public final class UpdateInfo<T> {
|
||||||
|
|
||||||
|
@ -35,20 +26,17 @@ public final class UpdateInfo<T> {
|
||||||
|
|
||||||
private final float trust;
|
private final float trust;
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(UpdateInfo.class);
|
|
||||||
|
|
||||||
public UpdateInfo(final Topic topic, final T highlightValue, final OaBrokerMainEntity source,
|
public UpdateInfo(final Topic topic, final T highlightValue, final OaBrokerMainEntity source,
|
||||||
final OaBrokerMainEntity target,
|
final OaBrokerMainEntity target,
|
||||||
final BiConsumer<OaBrokerMainEntity, T> compileHighlight,
|
final BiConsumer<OaBrokerMainEntity, T> compileHighlight,
|
||||||
final Function<T, String> highlightToString,
|
final Function<T, String> highlightToString) {
|
||||||
final DedupConfig dedupConfig) {
|
|
||||||
this.topic = topic;
|
this.topic = topic;
|
||||||
this.highlightValue = highlightValue;
|
this.highlightValue = highlightValue;
|
||||||
this.source = source;
|
this.source = source;
|
||||||
this.target = target;
|
this.target = target;
|
||||||
this.compileHighlight = compileHighlight;
|
this.compileHighlight = compileHighlight;
|
||||||
this.highlightToString = highlightToString;
|
this.highlightToString = highlightToString;
|
||||||
this.trust = calculateTrust(dedupConfig, source, target);
|
this.trust = TrustUtils.calculateTrust(source, target);
|
||||||
}
|
}
|
||||||
|
|
||||||
public T getHighlightValue() {
|
public T getHighlightValue() {
|
||||||
|
@ -63,31 +51,6 @@ public final class UpdateInfo<T> {
|
||||||
return target;
|
return target;
|
||||||
}
|
}
|
||||||
|
|
||||||
private float calculateTrust(final DedupConfig dedupConfig,
|
|
||||||
final OaBrokerMainEntity r1,
|
|
||||||
final OaBrokerMainEntity r2) {
|
|
||||||
|
|
||||||
if (dedupConfig == null) {
|
|
||||||
return BrokerConstants.MIN_TRUST;
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
final ObjectMapper objectMapper = new ObjectMapper();
|
|
||||||
final MapDocument doc1 = MapDocumentUtil
|
|
||||||
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1));
|
|
||||||
final MapDocument doc2 = MapDocumentUtil
|
|
||||||
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2));
|
|
||||||
|
|
||||||
final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2);
|
|
||||||
final double threshold = dedupConfig.getWf().getThreshold();
|
|
||||||
|
|
||||||
return TrustUtils.rescale(score, threshold);
|
|
||||||
} catch (final Exception e) {
|
|
||||||
log.error("Error computing score between results", e);
|
|
||||||
return BrokerConstants.MIN_TRUST;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected Topic getTopic() {
|
protected Topic getTopic() {
|
||||||
return topic;
|
return topic;
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,122 @@
|
||||||
|
{
|
||||||
|
"wf": {
|
||||||
|
|
||||||
|
},
|
||||||
|
"pace": {
|
||||||
|
"clustering": [
|
||||||
|
{
|
||||||
|
"name": "wordssuffixprefix",
|
||||||
|
"fields": [
|
||||||
|
"title"
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
"max": "2",
|
||||||
|
"len": "3"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "lowercase",
|
||||||
|
"fields": [
|
||||||
|
"doi"
|
||||||
|
],
|
||||||
|
"params": {
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"decisionTree": {
|
||||||
|
"start": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "doi",
|
||||||
|
"comparator": "exactMatch",
|
||||||
|
"weight": 1.0,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 0.5,
|
||||||
|
"aggregation": "AVG",
|
||||||
|
"positive": "MATCH",
|
||||||
|
"negative": "layer1",
|
||||||
|
"undefined": "layer1",
|
||||||
|
"ignoreUndefined": "true"
|
||||||
|
},
|
||||||
|
"layer1": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "title",
|
||||||
|
"comparator": "titleVersionMatch",
|
||||||
|
"weight": 0.9,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {
|
||||||
|
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"field": "authors",
|
||||||
|
"comparator": "sizeMatch",
|
||||||
|
"weight": 0.9,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 0.5,
|
||||||
|
"aggregation": "AVG",
|
||||||
|
"positive": "MATCH",
|
||||||
|
"negative": "layer2",
|
||||||
|
"undefined": "layer2",
|
||||||
|
"ignoreUndefined": "true"
|
||||||
|
},
|
||||||
|
"layer2": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "title",
|
||||||
|
"comparator": "levensteinTitle",
|
||||||
|
"weight": 1.0,
|
||||||
|
"countIfUndefined": "true",
|
||||||
|
"params": {
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 0.99,
|
||||||
|
"aggregation": "AVG",
|
||||||
|
"positive": "MATCH",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "NO_MATCH",
|
||||||
|
"ignoreUndefined": "true"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"model": [
|
||||||
|
{
|
||||||
|
"name": "doi",
|
||||||
|
"type": "String",
|
||||||
|
"path": "$.pids[?(@.type == 'doi')].value"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "title",
|
||||||
|
"type": "String",
|
||||||
|
"path": "$.titles",
|
||||||
|
"length": 250,
|
||||||
|
"size": 5
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "authors",
|
||||||
|
"type": "List",
|
||||||
|
"path": "$.creators[*].fullname",
|
||||||
|
"size": 200
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"blacklists": {
|
||||||
|
|
||||||
|
},
|
||||||
|
"synonyms": {
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -8,14 +8,6 @@
|
||||||
<property>
|
<property>
|
||||||
<name>workingPath</name>
|
<name>workingPath</name>
|
||||||
<description>the path where the the generated data will be stored</description>
|
<description>the path where the the generated data will be stored</description>
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>isLookupUrl</name>
|
|
||||||
<description>the address of the lookUp service</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>dedupConfProfId</name>
|
|
||||||
<description>the id of a valid Dedup Configuration Profile</description>
|
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>datasourceIdWhitelist</name>
|
<name>datasourceIdWhitelist</name>
|
||||||
|
@ -427,8 +419,6 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
|
||||||
<arg>--dedupConfProfile</arg><arg>${dedupConfProfId}</arg>
|
|
||||||
<arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg>
|
<arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg>
|
||||||
<arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg>
|
<arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg>
|
||||||
<arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg>
|
<arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg>
|
||||||
|
|
|
@ -5,18 +5,6 @@
|
||||||
"paramDescription": "the path where the generated events will be stored",
|
"paramDescription": "the path where the generated events will be stored",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"paramName": "lu",
|
|
||||||
"paramLongName": "isLookupUrl",
|
|
||||||
"paramDescription": "the address of the ISLookUpService",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "d",
|
|
||||||
"paramLongName": "dedupConfProfile",
|
|
||||||
"paramDescription": "the id of a valid Dedup Configuration Profile",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"paramName": "datasourceIdWhitelist",
|
"paramName": "datasourceIdWhitelist",
|
||||||
"paramLongName": "datasourceIdWhitelist",
|
"paramLongName": "datasourceIdWhitelist",
|
||||||
|
|
|
@ -9,15 +9,6 @@
|
||||||
<name>workingPath</name>
|
<name>workingPath</name>
|
||||||
<description>the path where the the generated data will be stored</description>
|
<description>the path where the the generated data will be stored</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
|
||||||
<name>isLookupUrl</name>
|
|
||||||
<description>the address of the lookUp service</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>dedupConfProfId</name>
|
|
||||||
<description>the id of a valid Dedup Configuration Profile</description>
|
|
||||||
</property>
|
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>sparkDriverMemory</name>
|
<name>sparkDriverMemory</name>
|
||||||
<description>memory for driver process</description>
|
<description>memory for driver process</description>
|
||||||
|
|
|
@ -30,7 +30,7 @@ class UpdateMatcherTest {
|
||||||
final OaBrokerMainEntity p4 = new OaBrokerMainEntity();
|
final OaBrokerMainEntity p4 = new OaBrokerMainEntity();
|
||||||
|
|
||||||
final Collection<UpdateInfo<String>> list = matcher
|
final Collection<UpdateInfo<String>> list = matcher
|
||||||
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null);
|
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null);
|
||||||
|
|
||||||
assertTrue(list.isEmpty());
|
assertTrue(list.isEmpty());
|
||||||
}
|
}
|
||||||
|
@ -46,7 +46,7 @@ class UpdateMatcherTest {
|
||||||
res.setPublicationdate("2018");
|
res.setPublicationdate("2018");
|
||||||
|
|
||||||
final Collection<UpdateInfo<String>> list = matcher
|
final Collection<UpdateInfo<String>> list = matcher
|
||||||
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null);
|
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null);
|
||||||
|
|
||||||
assertTrue(list.isEmpty());
|
assertTrue(list.isEmpty());
|
||||||
}
|
}
|
||||||
|
@ -62,7 +62,7 @@ class UpdateMatcherTest {
|
||||||
p2.setPublicationdate("2018");
|
p2.setPublicationdate("2018");
|
||||||
|
|
||||||
final Collection<UpdateInfo<String>> list = matcher
|
final Collection<UpdateInfo<String>> list = matcher
|
||||||
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null);
|
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null);
|
||||||
|
|
||||||
assertTrue(list.size() == 1);
|
assertTrue(list.size() == 1);
|
||||||
}
|
}
|
||||||
|
@ -79,7 +79,7 @@ class UpdateMatcherTest {
|
||||||
p2.setPublicationdate("2018");
|
p2.setPublicationdate("2018");
|
||||||
|
|
||||||
final Collection<UpdateInfo<String>> list = matcher
|
final Collection<UpdateInfo<String>> list = matcher
|
||||||
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null);
|
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null);
|
||||||
|
|
||||||
assertTrue(list.isEmpty());
|
assertTrue(list.isEmpty());
|
||||||
}
|
}
|
||||||
|
@ -98,7 +98,7 @@ class UpdateMatcherTest {
|
||||||
p4.setPublicationdate("2018");
|
p4.setPublicationdate("2018");
|
||||||
|
|
||||||
final Collection<UpdateInfo<String>> list = matcher
|
final Collection<UpdateInfo<String>> list = matcher
|
||||||
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null);
|
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null);
|
||||||
|
|
||||||
assertTrue(list.isEmpty());
|
assertTrue(list.isEmpty());
|
||||||
}
|
}
|
||||||
|
@ -117,7 +117,7 @@ class UpdateMatcherTest {
|
||||||
p4.setPublicationdate("2018");
|
p4.setPublicationdate("2018");
|
||||||
|
|
||||||
final Collection<UpdateInfo<String>> list = matcher
|
final Collection<UpdateInfo<String>> list = matcher
|
||||||
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null, null);
|
.searchUpdatesForRecord(res, Arrays.asList(p1, p2, p3, p4), null);
|
||||||
|
|
||||||
assertTrue(list.size() == 1);
|
assertTrue(list.size() == 1);
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,6 +5,10 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import eu.dnetlib.broker.objects.OaBrokerAuthor;
|
||||||
|
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||||
|
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
||||||
|
|
||||||
public class TrustUtilsTest {
|
public class TrustUtilsTest {
|
||||||
|
|
||||||
private static final double THRESHOLD = 0.95;
|
private static final double THRESHOLD = 0.95;
|
||||||
|
@ -64,6 +68,23 @@ public class TrustUtilsTest {
|
||||||
verifyValue(2.00, BrokerConstants.MAX_TRUST);
|
verifyValue(2.00, BrokerConstants.MAX_TRUST);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test() throws Exception {
|
||||||
|
final OaBrokerMainEntity r1 = new OaBrokerMainEntity();
|
||||||
|
r1.getTitles().add("D-NET Service Package: Data Import");
|
||||||
|
r1.getPids().add(new OaBrokerTypedValue("doi", "123"));
|
||||||
|
r1.getCreators().add(new OaBrokerAuthor("Michele Artini", null));
|
||||||
|
r1.getCreators().add(new OaBrokerAuthor("Claudio Atzori", null));
|
||||||
|
|
||||||
|
final OaBrokerMainEntity r2 = new OaBrokerMainEntity();
|
||||||
|
r2.getTitles().add("D-NET Service Package: Data Import");
|
||||||
|
// r2.getPids().add(new OaBrokerTypedValue("doi", "123"));
|
||||||
|
r2.getCreators().add(new OaBrokerAuthor("Michele Artini", null));
|
||||||
|
// r2.getCreators().add(new OaBrokerAuthor("Claudio Atzori", null));
|
||||||
|
|
||||||
|
System.out.println("TRUST: " + TrustUtils.calculateTrust(r1, r2));
|
||||||
|
}
|
||||||
|
|
||||||
private void verifyValue(final double originalScore, final float expectedTrust) {
|
private void verifyValue(final double originalScore, final float expectedTrust) {
|
||||||
final float trust = TrustUtils.rescale(originalScore, THRESHOLD);
|
final float trust = TrustUtils.rescale(originalScore, THRESHOLD);
|
||||||
System.out.println(trust);
|
System.out.println(trust);
|
||||||
|
|
Loading…
Reference in New Issue