added new properties to FieldDef (size, length) to limit the information mapped onto each MapDocument

This commit is contained in:
Claudio Atzori 2018-11-19 17:37:57 +01:00
parent db37cce4a4
commit e5a77f0a53
12 changed files with 52 additions and 221 deletions

View File

@ -63,7 +63,7 @@
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
<scope>test</scope>
</dependency>
<dependency>

View File

@ -1,6 +1,5 @@
package eu.dnetlib;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
@ -34,13 +33,13 @@ public class SparkTest {
public static void main(String[] args) {
final JavaSparkContext context = new JavaSparkContext(new SparkConf().setAppName("Deduplication").setMaster("local[*]"));
final URL dataset = SparkTest.class.getResource("/eu/dnetlib/pace/orgs.json");
final URL dataset = SparkTest.class.getResource("/eu/dnetlib/pace/results.json");
final JavaRDD<String> dataRDD = context.textFile(dataset.getPath());
counter = new SparkCounter(context);
//read the configuration from the classpath
final DedupConfig config = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.test2.pace.conf"));
final DedupConfig config = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.full.pace.conf"));
BlockProcessor.constructAccumulator(config);
BlockProcessor.accumulators.forEach(acc -> {

View File

@ -1,150 +0,0 @@
package eu.dnetlib.data.transform;
import java.util.List;
import org.apache.commons.lang.StringUtils;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.protobuf.Descriptors.EnumValueDescriptor;
import com.google.protobuf.Descriptors.FieldDescriptor;
import com.google.protobuf.GeneratedMessage;
import com.google.protobuf.Message;
import com.googlecode.protobuf.format.JsonFormat;
import eu.dnetlib.pace.config.Type;
/**
 * AbstractProtoMapper provides common navigation methods on Protocol Buffers messages.
 *
 * <p>
 * A navigation path is a list of field names separated by {@code /}. A path element may carry a condition between
 * square brackets, e.g. {@code title[qualifier#classid = {main title}]}: the text before {@code =} is a sub-path
 * (elements separated by {@code #}) evaluated on the nested message, the text after {@code =} (curly braces removed,
 * trimmed) is the value it must be equal to for the navigation to continue through that nested message.
 *
 * @author claudio
 */
public abstract class AbstractProtoMapper {

    /** Regex matching the curly braces that wrap the expected value of a condition. */
    private static final String COND_WRAPPER = "\\{|\\}";

    /** Separator between the elements of the path used inside a condition. */
    private static final String COND_SEPARATOR = "#";

    /** Separator between the elements of a navigation path. */
    private static final String PATH_SEPARATOR = "/";

    /**
     * Evaluates several paths against the same message and concatenates the outcomes, in the order of the given paths.
     *
     * @param proto
     *            the message to navigate
     * @param paths
     *            the paths to evaluate, each one expressed as a {@code /}-separated string
     * @param type
     *            the type of the values to extract; {@code Type.JSON} allows a path to end on a nested message
     * @return the values found by all the paths
     */
    protected List<Object> processMultiPath(final GeneratedMessage proto, final List<String> paths, final Type type) {
        final List<Object> collected = Lists.newArrayList();
        for (final String path : paths) {
            collected.addAll(processPath(proto, path, type));
        }
        return collected;
    }

    /**
     * Splits the given path on {@code /} (trimming each element) and evaluates it against the message.
     *
     * @param proto
     *            the message to navigate
     * @param path
     *            the {@code /}-separated path
     * @param type
     *            the type of the values to extract
     * @return the values found at the end of the path
     */
    protected List<Object> processPath(final GeneratedMessage proto, final String path, final Type type) {
        final Iterable<String> tokens = Splitter.on(PATH_SEPARATOR).trimResults().split(path);
        return processPath(proto, Lists.newLinkedList(tokens), type);
    }

    /**
     * Resolves the first element of the path on the given message and delegates the remaining navigation to
     * {@link #generateFields}, once for a singular field or once per item of a repeated field.
     *
     * @param proto
     *            the message to navigate
     * @param pathElements
     *            the path elements, the first one possibly carrying a condition in square brackets
     * @param type
     *            the type of the values to extract
     * @return the values found at the end of the path
     * @throws RuntimeException
     *             when the path is empty
     * @throws IllegalArgumentException
     *             when the first path element does not name a field of the message
     */
    protected List<Object> processPath(final GeneratedMessage proto, final List<String> pathElements, final Type type) {
        if (pathElements.isEmpty()) {
            throw new RuntimeException("ProtoBuf navigation path is empty");
        }

        final String head = pathElements.get(0);
        final String fieldName = StringUtils.substringBefore(head, "[");
        final String cond = getCondition(head);

        final FieldDescriptor fd = proto.getDescriptorForType().findFieldByName(fieldName);
        if (fd == null) {
            throw new IllegalArgumentException("Invalid protobuf path (field not found): " + StringUtils.join(pathElements, ">") + "\nMessage:\n" + proto);
        }

        final List<Object> response = Lists.newArrayList();
        if (!fd.isRepeated()) {
            final Object single = proto.getField(fd);
            response.addAll(generateFields(fd, single, pathElements, cond, type));
            return response;
        }

        final int count = proto.getRepeatedFieldCount(fd);
        for (int i = 0; i < count; i++) {
            final Object item = proto.getRepeatedField(fd, i);
            response.addAll(generateFields(fd, item, pathElements, cond, type));
        }
        return response;
    }

    /**
     * Produces the values for one field value met along the path.
     *
     * @param fd
     *            the descriptor of the field the value belongs to
     * @param field
     *            the field value: either a nested message or a primitive/enum value
     * @param list
     *            the path elements still to be consumed, the first one being the current field
     * @param cond
     *            the condition attached to the current path element, blank when there is none
     * @param type
     *            the type of the values to extract
     * @return the extracted values; an empty list when the condition is not satisfied
     * @throws RuntimeException
     *             when a primitive value is met before the end of the path, or when the path ends on a nested message
     *             and the requested type is not {@code Type.JSON}
     */
    private List<Object> generateFields(final FieldDescriptor fd, final Object field, final List<String> list, final String cond, final Type type) {
        final List<Object> res = Lists.newArrayList();

        if (!(field instanceof GeneratedMessage)) {
            // primitive or enum value: legal only as the last step of the path
            if (list.size() != 1) {
                throw new RuntimeException("Found a primitive type before the path end");
            }
            switch (fd.getType()) {
            case ENUM:
                // enum values are represented by their name
                res.add(((EnumValueDescriptor) field).getName());
                break;
            default:
                res.add(field);
                break;
            }
            return res;
        }

        if (list.size() <= 1) {
            // the path ends on a nested message: only a JSON rendering of it can be returned
            if (!Type.JSON.equals(type)) {
                throw new RuntimeException("No primitive type found");
            }
            res.add(JsonFormat.printToString((Message) field));
            return res;
        }

        final GeneratedMessage nested = (GeneratedMessage) field;
        if (StringUtils.isBlank(cond)) {
            return processPath(nested, list.subList(1, list.size()), type);
        }

        // conditional step: evaluate the condition path on the nested message and compare it to the expected value
        final String condPathExpr = StringUtils.substringBefore(cond, "=");
        final List<String> condPath = Lists.newLinkedList(Splitter.on(COND_SEPARATOR).trimResults().split(condPathExpr));
        final String actual = (String) Iterables.getOnlyElement(processPath(nested, condPath, type));
        final String expected = StringUtils.substringAfter(cond, "=").replaceAll(COND_WRAPPER, "").trim();

        if (!actual.equals(expected)) {
            return res;
        }
        return processPath(nested, list.subList(1, list.size()), type);
    }

    /**
     * Extracts the condition from a path element.
     *
     * @param fieldPathCond
     *            the path element, e.g. {@code title[qualifier#classid = {main title}]}
     * @return the text following the first {@code [} with every {@code ]} removed, or the empty string when the
     *         element contains no {@code [}
     */
    private String getCondition(final String fieldPathCond) {
        if (!fieldPathCond.contains("[")) {
            return "";
        }
        return StringUtils.substringAfter(fieldPathCond, "[").replace("]", "");
    }
}

View File

@ -1,36 +0,0 @@
package eu.dnetlib.pace.model;
import java.util.List;
import java.util.Map;
import com.google.common.collect.Maps;
import com.google.protobuf.GeneratedMessage;
import eu.dnetlib.data.transform.AbstractProtoMapper;
/**
 * Builds {@link MapDocument} instances out of Protocol Buffers messages, extracting one field list per
 * {@link FieldDef} by means of the path navigation inherited from {@link AbstractProtoMapper}.
 */
public class ProtoDocumentBuilder extends AbstractProtoMapper {

    /**
     * Creates a document with the given identifier whose fields are read from the message.
     *
     * @param id
     *            the identifier of the resulting document
     * @param proto
     *            the message providing the values
     * @param fields
     *            the definitions of the fields to extract
     * @return the new document
     */
    public static MapDocument newInstance(final String id, final GeneratedMessage proto, final List<FieldDef> fields) {
        final ProtoDocumentBuilder builder = new ProtoDocumentBuilder();
        return new MapDocument(id, builder.generateFieldMap(proto, fields));
    }

    /**
     * Maps each field definition name to the list of values found in the message at the definition's path.
     * A later definition replaces an earlier one with the same name.
     *
     * @param proto
     *            the message providing the values
     * @param fields
     *            the definitions of the fields to extract
     * @return the extracted fields, indexed by field name
     */
    private Map<String, Field> generateFieldMap(final GeneratedMessage proto, final List<FieldDef> fields) {
        final Map<String, Field> byName = Maps.newHashMap();
        for (final FieldDef def : fields) {
            final FieldList values = new FieldListImpl(def.getName(), def.getType());
            final List<Object> extracted = processPath(proto, def.getPathList(), def.getType());
            for (final Object value : extracted) {
                values.add(new FieldValueImpl(def.getType(), def.getName(), value));
            }
            byName.put(def.getName(), values);
        }
        return byName;
    }
}

View File

@ -21,10 +21,9 @@
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }
],
"model" : [
{ "name" : "legalname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },
{ "name" : "legalshortname", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.3", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.7", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.7", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "length" : 5 },
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }
],
"blacklists" : { }

View File

@ -24,9 +24,9 @@
],
"model" : [
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value", "length" : 10 },
{ "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } ,
{ "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" }
{ "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname", "size" : 1 }
],
"blacklists" : {
"title" : [

View File

@ -0,0 +1,10 @@
{"dateoftransformation":"2016-03-12T12:49:38.412Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1621890.1621915"}],"originalId":["1621915"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1621915"],"dateofacceptance":{"value":"2009-06-16"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Ferm&#237;n Gal&#225;n","rank":1},{"fullname":"Americo Sampaio","rank":2},{"fullname":"Luis Rodero-Merino","rank":3},{"fullname":"Irit Loy","rank":4},{"fullname":"Victor Gil","rank":5},{"fullname":"Luis Vaquero","rank":6}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Service specification in cloud environments based on extensions to open standards"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2009-06-16"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::0002c24f82c295e925a2bdf7bbf49bfc"}
{"dateoftransformation":"2016-03-12T12:49:38.413Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1109/PESOS.2009.5068828"}],"originalId":["1564735"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1564735"],"dateofacceptance":{"value":"2009-05-18"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Domenico Bianculli","rank":1},{"fullname":"Carlo Ghezzi","rank":2},{"fullname":"Cesare Pautasso","rank":3}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Embedding continuous lifelong verification in service life cycles"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2009-05-18"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::000e0060b89af1706db93e289527a88d"}
{"dateoftransformation":"2016-03-12T12:49:38.413Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1966913.1966935"}],"originalId":["1966935"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1966935"],"dateofacceptance":{"value":"2011-03-22"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Francesco Alberti","rank":1},{"fullname":"Alessandro Armando","rank":2},{"fullname":"Silvio Ranise","rank":3}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Efficient symbolic automated analysis of administrative attribute-based RBAC-policies"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2011-03-22"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::00680ab21c76269e780f5e9e7e636619"}
{"dateoftransformation":"2016-03-12T12:49:38.414Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1570433.1570486"}],"originalId":["1570486"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1570486"],"dateofacceptance":{"value":"2009-07-15"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Daniel Schreiber","rank":1},{"fullname":"Melanie Hartmann","rank":2},{"fullname":"Max M&#252;hlh&#228;user","surname":"Hlh User","name":"Max M.","rank":3}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"MundoMonkey"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2009-07-15"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::008169b761b014b88105a9ed96bb0b4c"}
{"dateoftransformation":"2016-03-12T12:49:38.414Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/2002259.2002332"}],"originalId":["2002332"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=2002332"],"dateofacceptance":{"value":"2011-07-11"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Roland St&#252;hmer","rank":1},{"fullname":"Nenad Stojanovic","rank":2}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Large-scale, situation-driven and quality-aware event marketplace"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2011-07-11"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::00b0f37683e305a90c3397f328fb558a"}
{"dateoftransformation":"2016-03-12T12:49:38.414Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1852658.1852664"}],"originalId":["1852664"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1852664"],"dateofacceptance":{"value":"2010-04-13"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Michal Kryczka","rank":1},{"fullname":"Ruben Cuevas","rank":2},{"fullname":"Carmen Guerrero","rank":3},{"fullname":"Eiko Yoneki","rank":4},{"fullname":"Arturo Azcorra","rank":5}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"A first step towards user assisted online social networks"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2010-04-13"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::00e918f80a81af40a5e5770024f9256f"}
{"dateoftransformation":"2016-03-12T12:49:38.415Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1978582.1978584"}],"originalId":["1978584"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1978584"],"dateofacceptance":{"value":"2011-05-11"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Salvatore D'Antonio","surname":"Antonio","name":"Salvatore D.","rank":1},{"fullname":"Luigi Coppolino","rank":2},{"fullname":"Ivano Elia","rank":3},{"fullname":"Valerio Formicola","rank":4}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Security issues of a phasor data concentrator for smart grid infrastructure"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2011-05-11"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::012f02c31a80f63a43772e662aca364f"}
{"dateoftransformation":"2016-03-12T12:49:38.415Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1809400.1809402"}],"originalId":["1809402"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1809402"],"dateofacceptance":{"value":"2010-05-27"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Kai Puolam&#228;ki","rank":1},{"fullname":"Alessio Bertone","rank":2}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Introduction to the special issue on visual analytics and knowledge discovery"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2010-05-27"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::020794cfeedb650987bf93d3e3e09011"}
{"dateoftransformation":"2016-03-12T12:49:38.416Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1851275.1851254"}],"originalId":["1851254"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1851254"],"dateofacceptance":{"value":"2010-08-30"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"S&#233;bastien Barr&#233;","surname":"Bastien Barr","name":"S.","rank":1},{"fullname":"Olivier Bonaventure","rank":2},{"fullname":"Costin Raiciu","rank":3},{"fullname":"Mark Handley","rank":4}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Experimenting with multipath TCP"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2010-08-30"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::023fa75845681e2812d97440d070fb69"}
{"dateoftransformation":"2016-03-12T12:49:38.416Z","originalId":["2043516"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=2043516"],"dateofacceptance":{"value":"2011-09-06"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Paolo Pileggi","rank":1},{"fullname":"Giuseppe Bianchi","rank":2}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Traffic-centric modeling of future wireless internet access technologies"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2011-09-06"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::02a8fbd0aa341df6dbb8323f453091f8"}

View File

@ -17,7 +17,6 @@ import org.junit.Ignore;
import org.junit.Test;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

View File

@ -5,10 +5,6 @@ import com.google.common.collect.Sets;
import com.google.common.collect.Sets.SetView;
import eu.dnetlib.pace.AbstractProtoPaceTest;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.distance.DetectorTest;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.MapDocumentSerializer;
import eu.dnetlib.pace.model.ProtoDocumentBuilder;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.Test;

View File

@ -36,7 +36,15 @@ public class FieldDef implements Serializable {
private double weight;
private int limit = -1;
/**
* Sets maximum size for the repeatable fields in the model. -1 for unbounded size.
*/
private int size = -1;
/**
* Sets maximum length for field values in the model. -1 for unbounded length.
*/
private int length = -1;
private Map<String, Number> params;
@ -73,7 +81,12 @@ public class FieldDef implements Serializable {
if (params == null) {
params = new HashMap<>();
}
params.put("limit", getLimit());
//TODO verify that the init signatures for the distance algos are all the same!
/*
params.put("size", getSize());
params.put("length", getLength());
*/
params.put("weight", getWeight());
return PaceConfig.paceResolver.getDistanceAlgo(getAlgo(), params);
}
@ -98,11 +111,6 @@ public class FieldDef implements Serializable {
this.overrideMatch = overrideMatch;
}
@Override
public String toString() {
return new Gson().toJson(this);
}
public double getWeight() {
return weight;
}
@ -119,12 +127,21 @@ public class FieldDef implements Serializable {
this.algo = algo;
}
public int getLimit() {
return limit;
/**
 * Returns the maximum size for the repeatable fields in the model.
 *
 * @return the value of the {@code size} field, which defaults to -1 (documented on the field as unbounded)
 */
public int getSize() {
return size;
}
public void setLimit(final int limit) {
this.limit = limit;
/**
 * Sets the maximum size for the repeatable fields in the model.
 *
 * @param size
 *            the new value of the {@code size} field; the field documents -1 as unbounded
 */
public void setSize(int size) {
this.size = size;
}
/**
 * Returns the maximum length for field values in the model.
 *
 * @return the value of the {@code length} field, which defaults to -1 (documented on the field as unbounded)
 */
public int getLength() {
return length;
}
/**
 * Sets the maximum length for field values in the model.
 *
 * @param length
 *            the new value of the {@code length} field; the field documents -1 as unbounded
 */
public void setLength(int length) {
this.length = length;
}
public Map<String, Number> getParams() {
@ -146,4 +163,10 @@ public class FieldDef implements Serializable {
public void setIgnoreMissing(boolean ignoreMissing) {
this.ignoreMissing = ignoreMissing;
}
/**
 * Renders this field definition as JSON, using a new {@code Gson} instance on each call.
 *
 * @return the JSON produced by {@code Gson.toJson} for this object
 */
@Override
public String toString() {
return new Gson().toJson(this);
}
}

View File

@ -149,7 +149,7 @@ public class BlockProcessor {
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
final ScoreResult sr = similarity(algo, pivot, curr);
final ScoreResult sr = algo.between(pivot, curr, dedupConf);
log.debug(sr.toString()+"SCORE "+ sr.getScore());
emitOutput(sr, idPivot, idCurr, context);
i++;
@ -171,15 +171,6 @@ public class BlockProcessor {
}
}
/**
 * Computes the score between two documents by delegating to {@code algo.between} with the current
 * {@code dedupConf}.
 *
 * @param algo
 *            the distance algorithm to apply
 * @param a
 *            the first document
 * @param b
 *            the second document
 * @return the result returned by the algorithm
 * @throws IllegalArgumentException
 *             wrapping any {@code Throwable} raised by the algorithm
 */
private ScoreResult similarity(final PaceDocumentDistance algo, final MapDocument a, final MapDocument b) {
try {
return algo.between(a, b, dedupConf);
} catch(Throwable e) {
// log both documents next to the failure so the offending pair can be identified, then rethrow
log.error(String.format("\nA: %s\n----------------------\nB: %s", a, b), e);
throw new IllegalArgumentException(e);
}
}
/**
 * Tells whether the given pivot identifier is covered by the workflow skip list.
 *
 * @param idPivot
 *            the identifier of the pivot document
 * @return true when the skip list of {@code dedupConf.getWf()} contains the value returned by
 *         {@code getNsPrefix(idPivot)}
 */
private boolean mustSkip(final String idPivot) {
return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
}

View File

@ -122,7 +122,7 @@
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
<version>6.2.18</version>
<version>6.2.22-SNAPSHOT</version>
</dependency>
<dependency>