added new properties to FieldDef (size, length) to limit the information mapped onto each MapDocument
This commit is contained in:
parent
db37cce4a4
commit
e5a77f0a53
|
@ -63,7 +63,7 @@
|
|||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
|
||||
<scope>test</scope>
|
||||
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
package eu.dnetlib;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
import eu.dnetlib.graph.GraphProcessor;
|
||||
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
|
||||
|
@ -34,13 +33,13 @@ public class SparkTest {
|
|||
public static void main(String[] args) {
|
||||
final JavaSparkContext context = new JavaSparkContext(new SparkConf().setAppName("Deduplication").setMaster("local[*]"));
|
||||
|
||||
final URL dataset = SparkTest.class.getResource("/eu/dnetlib/pace/orgs.json");
|
||||
final URL dataset = SparkTest.class.getResource("/eu/dnetlib/pace/results.json");
|
||||
final JavaRDD<String> dataRDD = context.textFile(dataset.getPath());
|
||||
|
||||
counter = new SparkCounter(context);
|
||||
|
||||
//read the configuration from the classpath
|
||||
final DedupConfig config = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.test2.pace.conf"));
|
||||
final DedupConfig config = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.full.pace.conf"));
|
||||
|
||||
BlockProcessor.constructAccumulator(config);
|
||||
BlockProcessor.accumulators.forEach(acc -> {
|
||||
|
|
|
@ -1,150 +0,0 @@
|
|||
package eu.dnetlib.data.transform;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.protobuf.Descriptors.EnumValueDescriptor;
|
||||
import com.google.protobuf.Descriptors.FieldDescriptor;
|
||||
import com.google.protobuf.GeneratedMessage;
|
||||
import com.google.protobuf.Message;
|
||||
import com.googlecode.protobuf.format.JsonFormat;
|
||||
|
||||
import eu.dnetlib.pace.config.Type;
|
||||
|
||||
/**
|
||||
* AbstractProtoMapper provides common navigation methods for Protocol Buffers messages.
|
||||
*
|
||||
* @author claudio
|
||||
*/
|
||||
public abstract class AbstractProtoMapper {
|
||||
|
||||
private static final String COND_WRAPPER = "\\{|\\}";
|
||||
private static final String COND_SEPARATOR = "#";
|
||||
/** The Constant PATH_SEPARATOR. */
|
||||
private static final String PATH_SEPARATOR = "/";
|
||||
|
||||
/**
|
||||
* Process multi path.
|
||||
*
|
||||
* @param proto
|
||||
* the proto
|
||||
* @param paths
|
||||
* the paths
|
||||
* @return the list
|
||||
*/
|
||||
protected List<Object> processMultiPath(final GeneratedMessage proto, final List<String> paths, final Type type) {
|
||||
final List<Object> response = Lists.newArrayList();
|
||||
for (final String pathElements : paths) {
|
||||
response.addAll(processPath(proto, pathElements, type));
|
||||
}
|
||||
return response;
|
||||
}
|
||||
|
||||
/**
|
||||
* Process path.
|
||||
*
|
||||
* @param proto
|
||||
* the proto
|
||||
* @param path
|
||||
* the path
|
||||
* @return the list
|
||||
*/
|
||||
protected List<Object> processPath(final GeneratedMessage proto, final String path, final Type type) {
|
||||
return processPath(proto, Lists.newLinkedList(Splitter.on(PATH_SEPARATOR).trimResults().split(path)), type);
|
||||
}
|
||||
|
||||
/**
|
||||
* Process path.
|
||||
*
|
||||
* @param proto
|
||||
* the proto
|
||||
* @param pathElements
|
||||
* the list
|
||||
* @return the list
|
||||
*/
|
||||
protected List<Object> processPath(final GeneratedMessage proto, final List<String> pathElements, final Type type) {
|
||||
|
||||
final List<Object> response = Lists.newArrayList();
|
||||
|
||||
if (pathElements.isEmpty()) throw new RuntimeException("ProtoBuf navigation path is empty");
|
||||
|
||||
final String fieldPathCond = pathElements.get(0);
|
||||
|
||||
final String fieldPath = StringUtils.substringBefore(fieldPathCond, "[");
|
||||
final String cond = getCondition(fieldPathCond);
|
||||
|
||||
final FieldDescriptor fd = proto.getDescriptorForType().findFieldByName(fieldPath);
|
||||
if ((fd != null)) {
|
||||
if (fd.isRepeated()) {
|
||||
final int count = proto.getRepeatedFieldCount(fd);
|
||||
for (int i = 0; i < count; i++) {
|
||||
final Object field = proto.getRepeatedField(fd, i);
|
||||
response.addAll(generateFields(fd, field, pathElements, cond, type));
|
||||
}
|
||||
} else {
|
||||
final Object field = proto.getField(fd);
|
||||
response.addAll(generateFields(fd, field, pathElements, cond, type));
|
||||
}
|
||||
} else throw new IllegalArgumentException("Invalid protobuf path (field not found): " + StringUtils.join(pathElements, ">") + "\nMessage:\n" + proto);
|
||||
|
||||
return response;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate fields.
|
||||
*
|
||||
* @param fd
|
||||
* the fd
|
||||
* @param field
|
||||
* the field
|
||||
* @param list
|
||||
* the list
|
||||
* @return the list
|
||||
*/
|
||||
private List<Object> generateFields(final FieldDescriptor fd, final Object field, final List<String> list, final String cond, final Type type) {
|
||||
|
||||
final List<Object> res = Lists.newArrayList();
|
||||
if (field instanceof GeneratedMessage) {
|
||||
if (list.size() > 1) {
|
||||
|
||||
if (StringUtils.isBlank(cond)) return processPath((GeneratedMessage) field, list.subList(1, list.size()), type);
|
||||
else {
|
||||
|
||||
final List<String> condPath =
|
||||
Lists.newLinkedList(Splitter.on(COND_SEPARATOR).trimResults().split(StringUtils.substringBefore(cond, "=")));
|
||||
|
||||
final String val = (String) Iterables.getOnlyElement(processPath((GeneratedMessage) field, condPath, type));
|
||||
final String condVal = StringUtils.substringAfter(cond, "=").replaceAll(COND_WRAPPER, "").trim();
|
||||
|
||||
return val.equals(condVal) ? processPath((GeneratedMessage) field, list.subList(1, list.size()), type) : res;
|
||||
}
|
||||
}
|
||||
else if (Type.JSON.equals(type)) {
|
||||
res.add(JsonFormat.printToString((Message) field));
|
||||
return res;
|
||||
} else throw new RuntimeException("No primitive type found");
|
||||
} else {
|
||||
if (list.size() == 1) {
|
||||
|
||||
switch (fd.getType()) {
|
||||
case ENUM:
|
||||
res.add(((EnumValueDescriptor) field).getName());
|
||||
break;
|
||||
default:
|
||||
res.add(field);
|
||||
break;
|
||||
}
|
||||
return res;
|
||||
}
|
||||
else throw new RuntimeException("Found a primitive type before the path end");
|
||||
}
|
||||
}
|
||||
|
||||
private String getCondition(final String fieldPathCond) {
|
||||
return fieldPathCond.contains("[") ? StringUtils.substringAfter(fieldPathCond, "[").replace("]", "") : "";
|
||||
}
|
||||
}
|
|
@ -1,36 +0,0 @@
|
|||
package eu.dnetlib.pace.model;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.protobuf.GeneratedMessage;
|
||||
|
||||
import eu.dnetlib.data.transform.AbstractProtoMapper;
|
||||
|
||||
public class ProtoDocumentBuilder extends AbstractProtoMapper {
|
||||
|
||||
public static MapDocument newInstance(final String id, final GeneratedMessage proto, final List<FieldDef> fields) {
|
||||
final Map<String, Field> fieldMap = new ProtoDocumentBuilder().generateFieldMap(proto, fields);
|
||||
return new MapDocument(id, fieldMap);
|
||||
}
|
||||
|
||||
private Map<String, Field> generateFieldMap(final GeneratedMessage proto, final List<FieldDef> fields) {
|
||||
final Map<String, Field> fieldMap = Maps.newHashMap();
|
||||
|
||||
for (final FieldDef fd : fields) {
|
||||
|
||||
final FieldList fl = new FieldListImpl(fd.getName(), fd.getType());
|
||||
|
||||
for (final Object o : processPath(proto, fd.getPathList(), fd.getType())) {
|
||||
|
||||
fl.add(new FieldValueImpl(fd.getType(), fd.getName(), o));
|
||||
}
|
||||
|
||||
fieldMap.put(fd.getName(), fl);
|
||||
}
|
||||
|
||||
return fieldMap;
|
||||
}
|
||||
|
||||
}
|
|
@ -21,10 +21,9 @@
|
|||
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }
|
||||
],
|
||||
"model" : [
|
||||
{ "name" : "legalname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
|
||||
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },
|
||||
{ "name" : "legalshortname", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.3", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
|
||||
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.7", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
|
||||
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.7", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "length" : 5 },
|
||||
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }
|
||||
],
|
||||
"blacklists" : { }
|
||||
|
|
|
@ -24,9 +24,9 @@
|
|||
],
|
||||
"model" : [
|
||||
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
|
||||
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
|
||||
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value", "length" : 10 },
|
||||
{ "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } ,
|
||||
{ "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" }
|
||||
{ "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname", "size" : 1 }
|
||||
],
|
||||
"blacklists" : {
|
||||
"title" : [
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
{"dateoftransformation":"2016-03-12T12:49:38.412Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1621890.1621915"}],"originalId":["1621915"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1621915"],"dateofacceptance":{"value":"2009-06-16"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Fermín Galán","rank":1},{"fullname":"Americo Sampaio","rank":2},{"fullname":"Luis Rodero-Merino","rank":3},{"fullname":"Irit Loy","rank":4},{"fullname":"Victor Gil","rank":5},{"fullname":"Luis Vaquero","rank":6}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Service specification in cloud environments based on extensions to open standards"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2009-06-16"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::0002c24f82c295e925a2bdf7bbf49bfc"}
|
||||
{"dateoftransformation":"2016-03-12T12:49:38.413Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1109/PESOS.2009.5068828"}],"originalId":["1564735"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1564735"],"dateofacceptance":{"value":"2009-05-18"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Domenico Bianculli","rank":1},{"fullname":"Carlo Ghezzi","rank":2},{"fullname":"Cesare Pautasso","rank":3}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Embedding continuous lifelong verification in service life cycles"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2009-05-18"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::000e0060b89af1706db93e289527a88d"}
|
||||
{"dateoftransformation":"2016-03-12T12:49:38.413Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1966913.1966935"}],"originalId":["1966935"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1966935"],"dateofacceptance":{"value":"2011-03-22"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Francesco Alberti","rank":1},{"fullname":"Alessandro Armando","rank":2},{"fullname":"Silvio Ranise","rank":3}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Efficient symbolic automated analysis of administrative attribute-based RBAC-policies"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2011-03-22"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::00680ab21c76269e780f5e9e7e636619"}
|
||||
{"dateoftransformation":"2016-03-12T12:49:38.414Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1570433.1570486"}],"originalId":["1570486"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1570486"],"dateofacceptance":{"value":"2009-07-15"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Daniel Schreiber","rank":1},{"fullname":"Melanie Hartmann","rank":2},{"fullname":"Max Mühlhäuser","surname":"Hlh User","name":"Max M.","rank":3}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"MundoMonkey"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2009-07-15"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::008169b761b014b88105a9ed96bb0b4c"}
|
||||
{"dateoftransformation":"2016-03-12T12:49:38.414Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/2002259.2002332"}],"originalId":["2002332"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=2002332"],"dateofacceptance":{"value":"2011-07-11"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Roland Stühmer","rank":1},{"fullname":"Nenad Stojanovic","rank":2}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Large-scale, situation-driven and quality-aware event marketplace"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2011-07-11"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::00b0f37683e305a90c3397f328fb558a"}
|
||||
{"dateoftransformation":"2016-03-12T12:49:38.414Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1852658.1852664"}],"originalId":["1852664"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1852664"],"dateofacceptance":{"value":"2010-04-13"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Michal Kryczka","rank":1},{"fullname":"Ruben Cuevas","rank":2},{"fullname":"Carmen Guerrero","rank":3},{"fullname":"Eiko Yoneki","rank":4},{"fullname":"Arturo Azcorra","rank":5}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"A first step towards user assisted online social networks"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2010-04-13"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::00e918f80a81af40a5e5770024f9256f"}
|
||||
{"dateoftransformation":"2016-03-12T12:49:38.415Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1978582.1978584"}],"originalId":["1978584"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1978584"],"dateofacceptance":{"value":"2011-05-11"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Salvatore D'Antonio","surname":"Antonio","name":"Salvatore D.","rank":1},{"fullname":"Luigi Coppolino","rank":2},{"fullname":"Ivano Elia","rank":3},{"fullname":"Valerio Formicola","rank":4}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Security issues of a phasor data concentrator for smart grid infrastructure"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2011-05-11"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::012f02c31a80f63a43772e662aca364f"}
|
||||
{"dateoftransformation":"2016-03-12T12:49:38.415Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1809400.1809402"}],"originalId":["1809402"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1809402"],"dateofacceptance":{"value":"2010-05-27"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Kai Puolamäki","rank":1},{"fullname":"Alessio Bertone","rank":2}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Introduction to the special issue on visual analytics and knowledge discovery"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2010-05-27"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::020794cfeedb650987bf93d3e3e09011"}
|
||||
{"dateoftransformation":"2016-03-12T12:49:38.416Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1851275.1851254"}],"originalId":["1851254"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1851254"],"dateofacceptance":{"value":"2010-08-30"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Sébastien Barré","surname":"Bastien Barr","name":"S.","rank":1},{"fullname":"Olivier Bonaventure","rank":2},{"fullname":"Costin Raiciu","rank":3},{"fullname":"Mark Handley","rank":4}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Experimenting with multipath TCP"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2010-08-30"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::023fa75845681e2812d97440d070fb69"}
|
||||
{"dateoftransformation":"2016-03-12T12:49:38.416Z","originalId":["2043516"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=2043516"],"dateofacceptance":{"value":"2011-09-06"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Paolo Pileggi","rank":1},{"fullname":"Giuseppe Bianchi","rank":2}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Traffic-centric modeling of future wireless internet access technologies"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2011-09-06"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::02a8fbd0aa341df6dbb8323f453091f8"}
|
|
@ -17,7 +17,6 @@ import org.junit.Ignore;
|
|||
import org.junit.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
|
|
|
@ -5,10 +5,6 @@ import com.google.common.collect.Sets;
|
|||
import com.google.common.collect.Sets.SetView;
|
||||
import eu.dnetlib.pace.AbstractProtoPaceTest;
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.distance.DetectorTest;
|
||||
import eu.dnetlib.pace.model.MapDocument;
|
||||
import eu.dnetlib.pace.model.MapDocumentSerializer;
|
||||
import eu.dnetlib.pace.model.ProtoDocumentBuilder;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.junit.Test;
|
||||
|
|
|
@ -36,7 +36,15 @@ public class FieldDef implements Serializable {
|
|||
|
||||
private double weight;
|
||||
|
||||
private int limit = -1;
|
||||
/**
|
||||
* Maximum size for the repeatable fields in the model; -1 means unbounded size.
|
||||
*/
|
||||
private int size = -1;
|
||||
|
||||
/**
|
||||
* Maximum length for field values in the model; -1 means unbounded length.
|
||||
*/
|
||||
private int length = -1;
|
||||
|
||||
private Map<String, Number> params;
|
||||
|
||||
|
@ -73,7 +81,12 @@ public class FieldDef implements Serializable {
|
|||
if (params == null) {
|
||||
params = new HashMap<>();
|
||||
}
|
||||
params.put("limit", getLimit());
|
||||
|
||||
//TODO verify that the init signatures for the distance algos are all the same!
|
||||
/*
|
||||
params.put("size", getSize());
|
||||
params.put("length", getLength());
|
||||
*/
|
||||
params.put("weight", getWeight());
|
||||
return PaceConfig.paceResolver.getDistanceAlgo(getAlgo(), params);
|
||||
}
|
||||
|
@ -98,11 +111,6 @@ public class FieldDef implements Serializable {
|
|||
this.overrideMatch = overrideMatch;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
}
|
||||
|
||||
public double getWeight() {
|
||||
return weight;
|
||||
}
|
||||
|
@ -119,12 +127,21 @@ public class FieldDef implements Serializable {
|
|||
this.algo = algo;
|
||||
}
|
||||
|
||||
public int getLimit() {
|
||||
return limit;
|
||||
|
||||
public int getSize() {
|
||||
return size;
|
||||
}
|
||||
|
||||
public void setLimit(final int limit) {
|
||||
this.limit = limit;
|
||||
public void setSize(int size) {
|
||||
this.size = size;
|
||||
}
|
||||
|
||||
public int getLength() {
|
||||
return length;
|
||||
}
|
||||
|
||||
public void setLength(int length) {
|
||||
this.length = length;
|
||||
}
|
||||
|
||||
public Map<String, Number> getParams() {
|
||||
|
@ -146,4 +163,10 @@ public class FieldDef implements Serializable {
|
|||
public void setIgnoreMissing(boolean ignoreMissing) {
|
||||
this.ignoreMissing = ignoreMissing;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new Gson().toJson(this);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -149,7 +149,7 @@ public class BlockProcessor {
|
|||
|
||||
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
|
||||
|
||||
final ScoreResult sr = similarity(algo, pivot, curr);
|
||||
final ScoreResult sr = algo.between(pivot, curr, dedupConf);
|
||||
log.debug(sr.toString()+"SCORE "+ sr.getScore());
|
||||
emitOutput(sr, idPivot, idCurr, context);
|
||||
i++;
|
||||
|
@ -171,15 +171,6 @@ public class BlockProcessor {
|
|||
}
|
||||
}
|
||||
|
||||
private ScoreResult similarity(final PaceDocumentDistance algo, final MapDocument a, final MapDocument b) {
|
||||
try {
|
||||
return algo.between(a, b, dedupConf);
|
||||
} catch(Throwable e) {
|
||||
log.error(String.format("\nA: %s\n----------------------\nB: %s", a, b), e);
|
||||
throw new IllegalArgumentException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean mustSkip(final String idPivot) {
|
||||
return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue