From e5a77f0a5367b4d8cf395e2015e42ffbb0d77200 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Mon, 19 Nov 2018 17:37:57 +0100 Subject: [PATCH] added new properties to FieldDef (size, length) to limit the information mapped onto each MapDocument --- dnet-dedup-test/pom.xml | 2 +- .../src/main/java/eu/dnetlib/SparkTest.java | 5 +- .../data/transform/AbstractProtoMapper.java | 150 ------------------ .../pace/model/ProtoDocumentBuilder.java | 36 ----- .../dnetlib/pace/organization.test2.pace.conf | 3 +- .../eu/dnetlib/pace/result.full.pace.conf | 4 +- .../resources/eu/dnetlib/pace/results.json | 10 ++ .../dnetlib/pace/distance/DetectorTest.java | 1 - .../pace/model/ProtoDocumentBuilderTest.java | 4 - .../java/eu/dnetlib/pace/model/FieldDef.java | 45 ++++-- .../eu/dnetlib/pace/util/BlockProcessor.java | 11 +- pom.xml | 2 +- 12 files changed, 52 insertions(+), 221 deletions(-) delete mode 100644 dnet-dedup-test/src/main/java/eu/dnetlib/data/transform/AbstractProtoMapper.java delete mode 100644 dnet-dedup-test/src/main/java/eu/dnetlib/pace/model/ProtoDocumentBuilder.java create mode 100644 dnet-dedup-test/src/main/resources/eu/dnetlib/pace/results.json diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml index b2101fe..3ae6742 100644 --- a/dnet-dedup-test/pom.xml +++ b/dnet-dedup-test/pom.xml @@ -63,7 +63,7 @@ eu.dnetlib dnet-openaireplus-mapping-utils - test + diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkTest.java b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkTest.java index 93c3bda..ff3a59b 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkTest.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkTest.java @@ -1,6 +1,5 @@ package eu.dnetlib; -import com.google.common.collect.Lists; import com.google.common.collect.Sets; import eu.dnetlib.graph.GraphProcessor; import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; @@ -34,13 +33,13 @@ public class SparkTest { public static void main(String[] args) { final JavaSparkContext context = new JavaSparkContext(new SparkConf().setAppName("Deduplication").setMaster("local[*]")); - final URL dataset = SparkTest.class.getResource("/eu/dnetlib/pace/orgs.json"); + final URL dataset = SparkTest.class.getResource("/eu/dnetlib/pace/results.json"); final JavaRDD dataRDD = context.textFile(dataset.getPath()); counter = new SparkCounter(context); //read the configuration from the classpath - final DedupConfig config = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.test2.pace.conf")); + final DedupConfig config = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.full.pace.conf")); BlockProcessor.constructAccumulator(config); BlockProcessor.accumulators.forEach(acc -> { diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/data/transform/AbstractProtoMapper.java b/dnet-dedup-test/src/main/java/eu/dnetlib/data/transform/AbstractProtoMapper.java deleted file mode 100644 index 2d5a6e0..0000000 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/data/transform/AbstractProtoMapper.java +++ /dev/null @@ -1,150 +0,0 @@ -package eu.dnetlib.data.transform; - -import java.util.List; - -import org.apache.commons.lang.StringUtils; - -import com.google.common.base.Splitter; -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; -import com.google.protobuf.Descriptors.EnumValueDescriptor; -import com.google.protobuf.Descriptors.FieldDescriptor; -import com.google.protobuf.GeneratedMessage; -import com.google.protobuf.Message; -import com.googlecode.protobuf.format.JsonFormat; - -import eu.dnetlib.pace.config.Type; - -/** - * AbstractProtoMapper provide common navigation methods on the protocolbuffers Messages. - * - * @author claudio - */ -public abstract class AbstractProtoMapper { - - private static final String COND_WRAPPER = "\\{|\\}"; - private static final String COND_SEPARATOR = "#"; - /** The Constant PATH_SEPARATOR. */ - private static final String PATH_SEPARATOR = "/"; - - /** - * Process multi path. - * - * @param proto - * the proto - * @param paths - * the paths - * @return the list - */ - protected List processMultiPath(final GeneratedMessage proto, final List paths, final Type type) { - final List response = Lists.newArrayList(); - for (final String pathElements : paths) { - response.addAll(processPath(proto, pathElements, type)); - } - return response; - } - - /** - * Process path. - * - * @param proto - * the proto - * @param path - * the path - * @return the list - */ - protected List processPath(final GeneratedMessage proto, final String path, final Type type) { - return processPath(proto, Lists.newLinkedList(Splitter.on(PATH_SEPARATOR).trimResults().split(path)), type); - } - - /** - * Process path. - * - * @param proto - * the proto - * @param pathElements - * the list - * @return the list - */ - protected List processPath(final GeneratedMessage proto, final List pathElements, final Type type) { - - final List response = Lists.newArrayList(); - - if (pathElements.isEmpty()) throw new RuntimeException("ProtoBuf navigation path is empty"); - - final String fieldPathCond = pathElements.get(0); - - final String fieldPath = StringUtils.substringBefore(fieldPathCond, "["); - final String cond = getCondition(fieldPathCond); - - final FieldDescriptor fd = proto.getDescriptorForType().findFieldByName(fieldPath); - if ((fd != null)) { - if (fd.isRepeated()) { - final int count = proto.getRepeatedFieldCount(fd); - for (int i = 0; i < count; i++) { - final Object field = proto.getRepeatedField(fd, i); - response.addAll(generateFields(fd, field, pathElements, cond, type)); - } - } else { - final Object field = proto.getField(fd); - response.addAll(generateFields(fd, field, pathElements, cond, type)); - } - } else throw new IllegalArgumentException("Invalid protobuf path (field not found): " + StringUtils.join(pathElements, ">") + "\nMessage:\n" + proto); - - return response; - } - - /** - * Generate fields. - * - * @param fd - * the fd - * @param field - * the field - * @param list - * the list - * @return the list - */ - private List generateFields(final FieldDescriptor fd, final Object field, final List list, final String cond, final Type type) { - - final List res = Lists.newArrayList(); - if (field instanceof GeneratedMessage) { - if (list.size() > 1) { - - if (StringUtils.isBlank(cond)) return processPath((GeneratedMessage) field, list.subList(1, list.size()), type); - else { - - final List condPath = - Lists.newLinkedList(Splitter.on(COND_SEPARATOR).trimResults().split(StringUtils.substringBefore(cond, "="))); - - final String val = (String) Iterables.getOnlyElement(processPath((GeneratedMessage) field, condPath, type)); - final String condVal = StringUtils.substringAfter(cond, "=").replaceAll(COND_WRAPPER, "").trim(); - - return val.equals(condVal) ? processPath((GeneratedMessage) field, list.subList(1, list.size()), type) : res; - } - } - else if (Type.JSON.equals(type)) { - res.add(JsonFormat.printToString((Message) field)); - return res; - } else throw new RuntimeException("No primitive type found"); - } else { - if (list.size() == 1) { - - switch (fd.getType()) { - case ENUM: - res.add(((EnumValueDescriptor) field).getName()); - break; - default: - res.add(field); - break; - } - return res; - } - else throw new RuntimeException("Found a primitive type before the path end"); - } - } - - private String getCondition(final String fieldPathCond) { - return fieldPathCond.contains("[") ? StringUtils.substringAfter(fieldPathCond, "[").replace("]", "") : ""; - } -} \ No newline at end of file diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/pace/model/ProtoDocumentBuilder.java b/dnet-dedup-test/src/main/java/eu/dnetlib/pace/model/ProtoDocumentBuilder.java deleted file mode 100644 index bb53424..0000000 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/pace/model/ProtoDocumentBuilder.java +++ /dev/null @@ -1,36 +0,0 @@ -package eu.dnetlib.pace.model; - -import java.util.List; -import java.util.Map; - -import com.google.common.collect.Maps; -import com.google.protobuf.GeneratedMessage; - -import eu.dnetlib.data.transform.AbstractProtoMapper; - -public class ProtoDocumentBuilder extends AbstractProtoMapper { - - public static MapDocument newInstance(final String id, final GeneratedMessage proto, final List fields) { - final Map fieldMap = new ProtoDocumentBuilder().generateFieldMap(proto, fields); - return new MapDocument(id, fieldMap); - } - - private Map generateFieldMap(final GeneratedMessage proto, final List fields) { - final Map fieldMap = Maps.newHashMap(); - - for (final FieldDef fd : fields) { - - final FieldList fl = new FieldListImpl(fd.getName(), fd.getType()); - - for (final Object o : processPath(proto, fd.getPathList(), fd.getType())) { - - fl.add(new FieldValueImpl(fd.getType(), fd.getName(), o)); - } - - fieldMap.put(fd.getName(), fl); - } - - return fieldMap; - } - -} \ No newline at end of file diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.test2.pace.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.test2.pace.conf index d828d6f..14737ef 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.test2.pace.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.test2.pace.conf @@ -21,10 +21,9 @@ { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] } ], "model" : [ - { "name" : "legalname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" }, { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" }, { "name" : "legalshortname", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.3", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, - { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.7", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" }, + { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.7", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "length" : 5 }, { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } } ], "blacklists" : { } diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.full.pace.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.full.pace.conf index b9aff4e..6f3a538 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.full.pace.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/result.full.pace.conf @@ -24,9 +24,9 @@ ], "model" : [ { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" }, - { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, + { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value", "length" : 10 }, { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } , - { "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" } + { "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname", "size" : 1 } ], "blacklists" : { "title" : [ diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/results.json b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/results.json new file mode 100644 index 0000000..b42c197 --- /dev/null +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/results.json @@ -0,0 +1,10 @@ +{"dateoftransformation":"2016-03-12T12:49:38.412Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1621890.1621915"}],"originalId":["1621915"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1621915"],"dateofacceptance":{"value":"2009-06-16"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Fermín Galán","rank":1},{"fullname":"Americo Sampaio","rank":2},{"fullname":"Luis Rodero-Merino","rank":3},{"fullname":"Irit Loy","rank":4},{"fullname":"Victor Gil","rank":5},{"fullname":"Luis Vaquero","rank":6}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Service specification in cloud environments based on extensions to open standards"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2009-06-16"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::0002c24f82c295e925a2bdf7bbf49bfc"} +{"dateoftransformation":"2016-03-12T12:49:38.413Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1109/PESOS.2009.5068828"}],"originalId":["1564735"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1564735"],"dateofacceptance":{"value":"2009-05-18"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Domenico Bianculli","rank":1},{"fullname":"Carlo Ghezzi","rank":2},{"fullname":"Cesare Pautasso","rank":3}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Embedding continuous lifelong verification in service life cycles"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2009-05-18"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::000e0060b89af1706db93e289527a88d"} +{"dateoftransformation":"2016-03-12T12:49:38.413Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1966913.1966935"}],"originalId":["1966935"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1966935"],"dateofacceptance":{"value":"2011-03-22"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Francesco Alberti","rank":1},{"fullname":"Alessandro Armando","rank":2},{"fullname":"Silvio Ranise","rank":3}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Efficient symbolic automated analysis of administrative attribute-based RBAC-policies"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2011-03-22"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::00680ab21c76269e780f5e9e7e636619"} +{"dateoftransformation":"2016-03-12T12:49:38.414Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1570433.1570486"}],"originalId":["1570486"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1570486"],"dateofacceptance":{"value":"2009-07-15"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Daniel Schreiber","rank":1},{"fullname":"Melanie Hartmann","rank":2},{"fullname":"Max Mühlhäuser","surname":"Hlh User","name":"Max M.","rank":3}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"MundoMonkey"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2009-07-15"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::008169b761b014b88105a9ed96bb0b4c"} +{"dateoftransformation":"2016-03-12T12:49:38.414Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/2002259.2002332"}],"originalId":["2002332"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=2002332"],"dateofacceptance":{"value":"2011-07-11"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Roland Stühmer","rank":1},{"fullname":"Nenad Stojanovic","rank":2}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Large-scale, situation-driven and quality-aware event marketplace"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2011-07-11"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::00b0f37683e305a90c3397f328fb558a"} +{"dateoftransformation":"2016-03-12T12:49:38.414Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1852658.1852664"}],"originalId":["1852664"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1852664"],"dateofacceptance":{"value":"2010-04-13"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Michal Kryczka","rank":1},{"fullname":"Ruben Cuevas","rank":2},{"fullname":"Carmen Guerrero","rank":3},{"fullname":"Eiko Yoneki","rank":4},{"fullname":"Arturo Azcorra","rank":5}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"A first step towards user assisted online social networks"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2010-04-13"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::00e918f80a81af40a5e5770024f9256f"} +{"dateoftransformation":"2016-03-12T12:49:38.415Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1978582.1978584"}],"originalId":["1978584"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1978584"],"dateofacceptance":{"value":"2011-05-11"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Salvatore D'Antonio","surname":"Antonio","name":"Salvatore D.","rank":1},{"fullname":"Luigi Coppolino","rank":2},{"fullname":"Ivano Elia","rank":3},{"fullname":"Valerio Formicola","rank":4}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Security issues of a phasor data concentrator for smart grid infrastructure"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2011-05-11"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::012f02c31a80f63a43772e662aca364f"} +{"dateoftransformation":"2016-03-12T12:49:38.415Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1809400.1809402"}],"originalId":["1809402"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1809402"],"dateofacceptance":{"value":"2010-05-27"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Kai Puolamäki","rank":1},{"fullname":"Alessio Bertone","rank":2}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Introduction to the special issue on visual analytics and knowledge discovery"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2010-05-27"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::020794cfeedb650987bf93d3e3e09011"} +{"dateoftransformation":"2016-03-12T12:49:38.416Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1851275.1851254"}],"originalId":["1851254"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1851254"],"dateofacceptance":{"value":"2010-08-30"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Sébastien Barré","surname":"Bastien Barr","name":"S.","rank":1},{"fullname":"Olivier Bonaventure","rank":2},{"fullname":"Costin Raiciu","rank":3},{"fullname":"Mark Handley","rank":4}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Experimenting with multipath TCP"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2010-08-30"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::023fa75845681e2812d97440d070fb69"} +{"dateoftransformation":"2016-03-12T12:49:38.416Z","originalId":["2043516"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=2043516"],"dateofacceptance":{"value":"2011-09-06"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Paolo Pileggi","rank":1},{"fullname":"Giuseppe Bianchi","rank":2}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Traffic-centric modeling of future wireless internet access technologies"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2011-09-06"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::02a8fbd0aa341df6dbb8323f453091f8"} diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java index 304fda5..a474a0e 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java @@ -17,7 +17,6 @@ import org.junit.Ignore; import org.junit.Test; import java.io.IOException; -import java.util.LinkedList; import java.util.List; import java.util.Set; diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java index 0e774b7..2139aec 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java @@ -5,10 +5,6 @@ import com.google.common.collect.Sets; import com.google.common.collect.Sets.SetView; import eu.dnetlib.pace.AbstractProtoPaceTest; import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.distance.DetectorTest; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.model.MapDocumentSerializer; -import eu.dnetlib.pace.model.ProtoDocumentBuilder; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.junit.Test; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java index bcc96c6..736a255 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java @@ -36,7 +36,15 @@ public class FieldDef implements Serializable { private double weight; - private int limit = -1; + /** + * Sets maximum size for the repeatable fields in the model. -1 for unbounded size. + */ + private int size = -1; + + /** + * Sets maximum length for field values in the model. -1 for unbounded length. + */ + private int length = -1; private Map params; @@ -73,7 +81,12 @@ public class FieldDef implements Serializable { if (params == null) { params = new HashMap<>(); } - params.put("limit", getLimit()); + + //TODO verify that the init signatures for the distance algos are all the same! + /* + params.put("size", getSize()); + params.put("length", getLength()); + */ params.put("weight", getWeight()); return PaceConfig.paceResolver.getDistanceAlgo(getAlgo(), params); } @@ -98,11 +111,6 @@ public class FieldDef implements Serializable { this.overrideMatch = overrideMatch; } - @Override - public String toString() { - return new Gson().toJson(this); - } - public double getWeight() { return weight; } @@ -119,12 +127,21 @@ public class FieldDef implements Serializable { this.algo = algo; } - public int getLimit() { - return limit; + + public int getSize() { + return size; } - public void setLimit(final int limit) { - this.limit = limit; + public void setSize(int size) { + this.size = size; + } + + public int getLength() { + return length; + } + + public void setLength(int length) { + this.length = length; } public Map getParams() { @@ -146,4 +163,10 @@ public class FieldDef implements Serializable { public void setIgnoreMissing(boolean ignoreMissing) { this.ignoreMissing = ignoreMissing; } + + @Override + public String toString() { + return new Gson().toJson(this); + } + } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index f5a41f5..2b2ddf0 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -149,7 +149,7 @@ public class BlockProcessor { if (!idCurr.equals(idPivot) && (fieldCurr != null)) { - final ScoreResult sr = similarity(algo, pivot, curr); + final ScoreResult sr = algo.between(pivot, curr, dedupConf); log.debug(sr.toString()+"SCORE "+ sr.getScore()); emitOutput(sr, idPivot, idCurr, context); i++; @@ -171,15 +171,6 @@ public class BlockProcessor { } } - private ScoreResult similarity(final PaceDocumentDistance algo, final MapDocument a, final MapDocument b) { - try { - return algo.between(a, b, dedupConf); - } catch(Throwable e) { - log.error(String.format("\nA: %s\n----------------------\nB: %s", a, b), e); - throw new IllegalArgumentException(e); - } - } - private boolean mustSkip(final String idPivot) { return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot)); } diff --git a/pom.xml b/pom.xml index e148733..2034447 100644 --- a/pom.xml +++ b/pom.xml @@ -122,7 +122,7 @@ eu.dnetlib dnet-openaireplus-mapping-utils - 6.2.18 + 6.2.22-SNAPSHOT