update in the discovery of clustering, conditions and distance functions (annotated with custom annotations)
This commit is contained in:
parent
bc4505e0e6
commit
1d678ddc9c
|
@ -36,6 +36,7 @@ public class SparkTest {
|
||||||
|
|
||||||
counter = new SparkCounter(context);
|
counter = new SparkCounter(context);
|
||||||
|
|
||||||
|
//read the configuration from the classpath
|
||||||
final DedupConfig config = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
|
final DedupConfig config = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
|
||||||
|
|
||||||
BlockProcessor.constructAccumulator(config);
|
BlockProcessor.constructAccumulator(config);
|
||||||
|
@ -55,7 +56,7 @@ public class SparkTest {
|
||||||
|
|
||||||
//create relations between documents
|
//create relations between documents
|
||||||
final JavaPairRDD<String, String> relationRDD = mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id
|
final JavaPairRDD<String, String> relationRDD = mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id
|
||||||
//from <id, doc> to List<groupkey,doc>
|
//Clustering: from <id, doc> to List<groupkey,doc>
|
||||||
.flatMapToPair(a -> {
|
.flatMapToPair(a -> {
|
||||||
final MapDocument currentDocument = a._2();
|
final MapDocument currentDocument = a._2();
|
||||||
return getGroupingKeys(config, currentDocument).stream()
|
return getGroupingKeys(config, currentDocument).stream()
|
||||||
|
@ -83,7 +84,7 @@ public class SparkTest {
|
||||||
|
|
||||||
//print ids
|
//print ids
|
||||||
// ccs.foreach(cc -> System.out.println(cc.getId()));
|
// ccs.foreach(cc -> System.out.println(cc.getId()));
|
||||||
ccs.saveAsTextFile("file:///Users/miconis/Downloads/dumps/organizations_dedup");
|
// ccs.saveAsTextFile("file:///Users/miconis/Downloads/dumps/organizations_dedup");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,208 +1,208 @@
|
||||||
package eu.dnetlib.pace;
|
//package eu.dnetlib.pace;
|
||||||
|
//
|
||||||
import com.google.common.collect.Lists;
|
//import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Sets;
|
//import com.google.common.collect.Sets;
|
||||||
import com.google.gson.Gson;
|
//import com.google.gson.Gson;
|
||||||
import eu.dnetlib.data.proto.FieldTypeProtos.Author;
|
//import eu.dnetlib.data.proto.FieldTypeProtos.Author;
|
||||||
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
|
//import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
|
||||||
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
|
//import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
|
||||||
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder;
|
//import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder;
|
||||||
import eu.dnetlib.data.proto.OafProtos.Oaf;
|
//import eu.dnetlib.data.proto.OafProtos.Oaf;
|
||||||
import eu.dnetlib.data.proto.OafProtos.OafEntity;
|
//import eu.dnetlib.data.proto.OafProtos.OafEntity;
|
||||||
import eu.dnetlib.data.proto.OrganizationProtos.Organization;
|
//import eu.dnetlib.data.proto.OrganizationProtos.Organization;
|
||||||
import eu.dnetlib.data.proto.ResultProtos.Result;
|
//import eu.dnetlib.data.proto.ResultProtos.Result;
|
||||||
import eu.dnetlib.pace.config.Config;
|
//import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
//import eu.dnetlib.pace.config.DedupConfig;
|
||||||
import eu.dnetlib.pace.config.Type;
|
//import eu.dnetlib.pace.config.Type;
|
||||||
import eu.dnetlib.pace.model.Field;
|
//import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.FieldValueImpl;
|
//import eu.dnetlib.pace.model.FieldValueImpl;
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
//import eu.dnetlib.pace.model.MapDocument;
|
||||||
import eu.dnetlib.pace.model.ProtoDocumentBuilder;
|
//import eu.dnetlib.pace.model.ProtoDocumentBuilder;
|
||||||
import eu.dnetlib.pace.model.gt.GTAuthor;
|
//import eu.dnetlib.pace.model.gt.GTAuthor;
|
||||||
import org.apache.commons.io.IOUtils;
|
//import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang.RandomStringUtils;
|
//import org.apache.commons.lang.RandomStringUtils;
|
||||||
import org.apache.commons.lang.StringUtils;
|
//import org.apache.commons.lang.StringUtils;
|
||||||
import org.apache.commons.lang3.RandomUtils;
|
//import org.apache.commons.lang3.RandomUtils;
|
||||||
|
//
|
||||||
import java.io.IOException;
|
//import java.io.IOException;
|
||||||
import java.io.StringWriter;
|
//import java.io.StringWriter;
|
||||||
import java.util.ArrayList;
|
//import java.util.ArrayList;
|
||||||
import java.util.LinkedList;
|
//import java.util.LinkedList;
|
||||||
import java.util.List;
|
//import java.util.List;
|
||||||
import java.util.Set;
|
//import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
//import java.util.stream.Collectors;
|
||||||
import java.util.stream.IntStream;
|
//import java.util.stream.IntStream;
|
||||||
|
//
|
||||||
public abstract class AbstractProtoPaceTest extends OafTest {
|
//public abstract class AbstractProtoPaceTest extends OafTest {
|
||||||
|
//
|
||||||
protected DedupConfig getResultFullConf() {
|
// protected DedupConfig getResultFullConf() {
|
||||||
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.full.pace.conf"));
|
// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.full.pace.conf"));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected DedupConfig getResultSimpleConf() {
|
// protected DedupConfig getResultSimpleConf() {
|
||||||
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.simple.pace.conf"));
|
// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.simple.pace.conf"));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected DedupConfig getResultConf() {
|
// protected DedupConfig getResultConf() {
|
||||||
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.pace.conf"));
|
// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.pace.conf"));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected DedupConfig getOrganizationSimpleConf() {
|
// protected DedupConfig getOrganizationSimpleConf() {
|
||||||
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
|
// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected DedupConfig getResultAuthorsConf() {
|
// protected DedupConfig getResultAuthorsConf() {
|
||||||
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.authors.pace.conf"));
|
// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.authors.pace.conf"));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected DedupConfig getResultProdConf() {
|
// protected DedupConfig getResultProdConf() {
|
||||||
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.conf"));
|
// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.conf"));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected MapDocument author(final Config conf, final String id, final Oaf oaf) {
|
// protected MapDocument author(final Config conf, final String id, final Oaf oaf) {
|
||||||
return ProtoDocumentBuilder.newInstance(id, oaf.getEntity(), conf.model());
|
// return ProtoDocumentBuilder.newInstance(id, oaf.getEntity(), conf.model());
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected GTAuthor getGTAuthor(final String path) {
|
// protected GTAuthor getGTAuthor(final String path) {
|
||||||
|
//
|
||||||
final Gson gson = new Gson();
|
// final Gson gson = new Gson();
|
||||||
|
//
|
||||||
final String json = readFromClasspath(path);
|
// final String json = readFromClasspath(path);
|
||||||
|
//
|
||||||
final GTAuthor gta = gson.fromJson(json, GTAuthor.class);
|
// final GTAuthor gta = gson.fromJson(json, GTAuthor.class);
|
||||||
|
//
|
||||||
return gta;
|
// return gta;
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected String readFromClasspath(final String filename) {
|
// protected String readFromClasspath(final String filename) {
|
||||||
final StringWriter sw = new StringWriter();
|
// final StringWriter sw = new StringWriter();
|
||||||
try {
|
// try {
|
||||||
IOUtils.copy(getClass().getResourceAsStream(filename), sw);
|
// IOUtils.copy(getClass().getResourceAsStream(filename), sw);
|
||||||
return sw.toString();
|
// return sw.toString();
|
||||||
} catch (final IOException e) {
|
// } catch (final IOException e) {
|
||||||
throw new RuntimeException("cannot load resource from classpath: " + filename);
|
// throw new RuntimeException("cannot load resource from classpath: " + filename);
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected MapDocument result(final Config config, final String id, final String title) {
|
// protected MapDocument result(final Config config, final String id, final String title) {
|
||||||
return result(config, id, title, null, new ArrayList<>(), null);
|
// return result(config, id, title, null, new ArrayList<>(), null);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected MapDocument result(final Config config, final String id, final String title, final String date) {
|
// protected MapDocument result(final Config config, final String id, final String title, final String date) {
|
||||||
return result(config, id, title, date, new ArrayList<>(), null);
|
// return result(config, id, title, date, new ArrayList<>(), null);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid) {
|
// protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid) {
|
||||||
return result(config, id, title, date, pid, null);
|
// return result(config, id, title, date, pid, null);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid) {
|
// protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid) {
|
||||||
return result(config, id, title, date, pid, null);
|
// return result(config, id, title, date, pid, null);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid, final List<String> authors) {
|
// protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid, final List<String> authors) {
|
||||||
return result(config, id, title, date, Lists.newArrayList(pid), authors);
|
// return result(config, id, title, date, Lists.newArrayList(pid), authors);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
static List<String> pidTypes = Lists.newArrayList();
|
// static List<String> pidTypes = Lists.newArrayList();
|
||||||
static {
|
// static {
|
||||||
pidTypes.add("doi");
|
// pidTypes.add("doi");
|
||||||
//pidTypes.add("oai");
|
// //pidTypes.add("oai");
|
||||||
//pidTypes.add("pmid");
|
// //pidTypes.add("pmid");
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid, final List<String> authors) {
|
// protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid, final List<String> authors) {
|
||||||
final Result.Metadata.Builder metadata = Result.Metadata.newBuilder();
|
// final Result.Metadata.Builder metadata = Result.Metadata.newBuilder();
|
||||||
if (!StringUtils.isBlank(title)) {
|
// if (!StringUtils.isBlank(title)) {
|
||||||
metadata.addTitle(getStruct(title, getQualifier("main title", "dnet:titles")));
|
// metadata.addTitle(getStruct(title, getQualifier("main title", "dnet:titles")));
|
||||||
metadata.addTitle(getStruct(RandomStringUtils.randomAlphabetic(10), getQualifier("alternative title", "dnet:titles")));
|
// metadata.addTitle(getStruct(RandomStringUtils.randomAlphabetic(10), getQualifier("alternative title", "dnet:titles")));
|
||||||
}
|
// }
|
||||||
if (!StringUtils.isBlank(date)) {
|
// if (!StringUtils.isBlank(date)) {
|
||||||
metadata.setDateofacceptance(sf(date));
|
// metadata.setDateofacceptance(sf(date));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result);
|
// final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result);
|
||||||
final Result.Builder result = Result.newBuilder().setMetadata(metadata);
|
// final Result.Builder result = Result.newBuilder().setMetadata(metadata);
|
||||||
|
//
|
||||||
if (authors != null) {
|
// if (authors != null) {
|
||||||
result.getMetadataBuilder().addAllAuthor(
|
// result.getMetadataBuilder().addAllAuthor(
|
||||||
IntStream.range(0, authors.size())
|
// IntStream.range(0, authors.size())
|
||||||
.mapToObj(i -> author(authors.get(i), i))
|
// .mapToObj(i -> author(authors.get(i), i))
|
||||||
.collect(Collectors.toCollection(LinkedList::new)));
|
// .collect(Collectors.toCollection(LinkedList::new)));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
entity.setResult(result);
|
// entity.setResult(result);
|
||||||
|
//
|
||||||
if (pid != null) {
|
// if (pid != null) {
|
||||||
for(String p : pid) {
|
// for(String p : pid) {
|
||||||
if (!StringUtils.isBlank(p)) {
|
// if (!StringUtils.isBlank(p)) {
|
||||||
entity.addPid(sp(p, pidTypes.get(RandomUtils.nextInt(0, pidTypes.size() - 1))));
|
// entity.addPid(sp(p, pidTypes.get(RandomUtils.nextInt(0, pidTypes.size() - 1))));
|
||||||
//entity.addPid(sp(RandomStringUtils.randomAlphabetic(10), "oai"));
|
// //entity.addPid(sp(RandomStringUtils.randomAlphabetic(10), "oai"));
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
final OafEntity build = entity.build();
|
// final OafEntity build = entity.build();
|
||||||
return ProtoDocumentBuilder.newInstance(id, build, config.model());
|
// return ProtoDocumentBuilder.newInstance(id, build, config.model());
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
private Author author(final String s, int rank) {
|
// private Author author(final String s, int rank) {
|
||||||
final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(s, false);
|
// final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(s, false);
|
||||||
final Author.Builder author = Author.newBuilder();
|
// final Author.Builder author = Author.newBuilder();
|
||||||
if (p.isAccurate()) {
|
// if (p.isAccurate()) {
|
||||||
author.setName(p.getNormalisedFirstName());
|
// author.setName(p.getNormalisedFirstName());
|
||||||
author.setSurname(p.getNormalisedSurname());
|
// author.setSurname(p.getNormalisedSurname());
|
||||||
}
|
// }
|
||||||
author.setFullname(p.getNormalisedFullname());
|
// author.setFullname(p.getNormalisedFullname());
|
||||||
author.setRank(rank);
|
// author.setRank(rank);
|
||||||
|
//
|
||||||
return author.build();
|
// return author.build();
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
private OafEntity.Builder oafEntity(final String id, final eu.dnetlib.data.proto.TypeProtos.Type type) {
|
// private OafEntity.Builder oafEntity(final String id, final eu.dnetlib.data.proto.TypeProtos.Type type) {
|
||||||
final OafEntity.Builder entity = OafEntity.newBuilder().setId(id).setType(type);
|
// final OafEntity.Builder entity = OafEntity.newBuilder().setId(id).setType(type);
|
||||||
return entity;
|
// return entity;
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected MapDocument organization(final Config config, final String id, final String legalName) {
|
// protected MapDocument organization(final Config config, final String id, final String legalName) {
|
||||||
return organization(config, id, legalName, null);
|
// return organization(config, id, legalName, null);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected MapDocument organization(final Config config, final String id, final String legalName, final String legalShortName) {
|
// protected MapDocument organization(final Config config, final String id, final String legalName, final String legalShortName) {
|
||||||
final Organization.Metadata.Builder metadata = Organization.Metadata.newBuilder();
|
// final Organization.Metadata.Builder metadata = Organization.Metadata.newBuilder();
|
||||||
if (legalName != null) {
|
// if (legalName != null) {
|
||||||
metadata.setLegalname(sf(legalName));
|
// metadata.setLegalname(sf(legalName));
|
||||||
}
|
// }
|
||||||
if (legalShortName != null) {
|
// if (legalShortName != null) {
|
||||||
metadata.setLegalshortname(sf(legalShortName));
|
// metadata.setLegalshortname(sf(legalShortName));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result);
|
// final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result);
|
||||||
entity.setOrganization(Organization.newBuilder().setMetadata(metadata));
|
// entity.setOrganization(Organization.newBuilder().setMetadata(metadata));
|
||||||
|
//
|
||||||
return ProtoDocumentBuilder.newInstance(id, entity.build(), config.model());
|
// return ProtoDocumentBuilder.newInstance(id, entity.build(), config.model());
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
private StructuredProperty sp(final String pid, final String type) {
|
// private StructuredProperty sp(final String pid, final String type) {
|
||||||
final Builder pidSp =
|
// final Builder pidSp =
|
||||||
StructuredProperty.newBuilder().setValue(pid)
|
// StructuredProperty.newBuilder().setValue(pid)
|
||||||
.setQualifier(Qualifier.newBuilder().setClassid(type).setClassname(type).setSchemeid("dnet:pid_types").setSchemename("dnet:pid_types"));
|
// .setQualifier(Qualifier.newBuilder().setClassid(type).setClassname(type).setSchemeid("dnet:pid_types").setSchemename("dnet:pid_types"));
|
||||||
return pidSp.build();
|
// return pidSp.build();
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected Field title(final String s) {
|
// protected Field title(final String s) {
|
||||||
return new FieldValueImpl(Type.String, "title", s);
|
// return new FieldValueImpl(Type.String, "title", s);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
protected static Builder getStruct(final String value, final Qualifier.Builder qualifier) {
|
// protected static Builder getStruct(final String value, final Qualifier.Builder qualifier) {
|
||||||
return StructuredProperty.newBuilder().setValue(value).setQualifier(qualifier);
|
// return StructuredProperty.newBuilder().setValue(value).setQualifier(qualifier);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
/*
|
// /*
|
||||||
* protected static StringField.Builder sf(final String s) { return StringField.newBuilder().setValue(s); }
|
// * protected static StringField.Builder sf(final String s) { return StringField.newBuilder().setValue(s); }
|
||||||
*
|
// *
|
||||||
* protected static Qualifier.Builder getQualifier(final String classname, final String schemename) { return
|
// * protected static Qualifier.Builder getQualifier(final String classname, final String schemename) { return
|
||||||
* Qualifier.newBuilder().setClassid(classname).setClassname(classname).setSchemeid(schemename).setSchemename(schemename); }
|
// * Qualifier.newBuilder().setClassid(classname).setClassname(classname).setSchemeid(schemename).setSchemename(schemename); }
|
||||||
*/
|
// */
|
||||||
|
//
|
||||||
}
|
//}
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -1,42 +1,42 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
//package eu.dnetlib.pace.clustering;
|
||||||
|
//
|
||||||
import eu.dnetlib.pace.AbstractProtoPaceTest;
|
//import eu.dnetlib.pace.AbstractProtoPaceTest;
|
||||||
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
|
//import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
|
||||||
import eu.dnetlib.pace.config.Config;
|
//import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.config.Type;
|
//import eu.dnetlib.pace.config.Type;
|
||||||
import eu.dnetlib.pace.model.FieldListImpl;
|
//import eu.dnetlib.pace.model.FieldListImpl;
|
||||||
import eu.dnetlib.pace.model.FieldValueImpl;
|
//import eu.dnetlib.pace.model.FieldValueImpl;
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
//import eu.dnetlib.pace.model.MapDocument;
|
||||||
import org.apache.commons.logging.Log;
|
//import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
//import org.apache.commons.logging.LogFactory;
|
||||||
import org.junit.Before;
|
//import org.junit.Before;
|
||||||
import org.junit.Test;
|
//import org.junit.Test;
|
||||||
|
//
|
||||||
public class BlacklistAwareClusteringCombinerTest extends AbstractProtoPaceTest {
|
//public class BlacklistAwareClusteringCombinerTest extends AbstractProtoPaceTest {
|
||||||
|
//
|
||||||
private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombinerTest.class);
|
// private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombinerTest.class);
|
||||||
|
//
|
||||||
private Config config;
|
// private Config config;
|
||||||
|
//
|
||||||
@Before
|
// @Before
|
||||||
public void setUp() {
|
// public void setUp() {
|
||||||
config = getResultFullConf();
|
// config = getResultFullConf();
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testCombine() {
|
// public void testCombine() {
|
||||||
final MapDocument result =
|
// final MapDocument result =
|
||||||
result(config, "A", "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission", "2013");
|
// result(config, "A", "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission", "2013");
|
||||||
final FieldListImpl fl = new FieldListImpl();
|
// final FieldListImpl fl = new FieldListImpl();
|
||||||
fl.add(new FieldValueImpl(Type.String, "desc", "hello world description pipeline"));
|
// fl.add(new FieldValueImpl(Type.String, "desc", "hello world description pipeline"));
|
||||||
|
//
|
||||||
result.getFieldMap().put("desc", fl);
|
// result.getFieldMap().put("desc", fl);
|
||||||
|
//
|
||||||
fl.clear();
|
// fl.clear();
|
||||||
fl.add(new FieldValueImpl(Type.String, "title", "lorem ipsum cabalie qwerty"));
|
// fl.add(new FieldValueImpl(Type.String, "title", "lorem ipsum cabalie qwerty"));
|
||||||
final FieldListImpl field = (FieldListImpl) result.getFieldMap().get("title");
|
// final FieldListImpl field = (FieldListImpl) result.getFieldMap().get("title");
|
||||||
field.add(fl);
|
// field.add(fl);
|
||||||
|
//
|
||||||
log.info(BlacklistAwareClusteringCombiner.filterAndCombine(result, config));
|
// log.info(BlacklistAwareClusteringCombiner.filterAndCombine(result, config));
|
||||||
}
|
// }
|
||||||
}
|
//}
|
||||||
|
|
|
@ -1,39 +1,39 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
//package eu.dnetlib.pace.clustering;
|
||||||
|
//
|
||||||
import eu.dnetlib.pace.AbstractProtoPaceTest;
|
//import eu.dnetlib.pace.AbstractProtoPaceTest;
|
||||||
import eu.dnetlib.pace.clustering.ClusteringCombiner;
|
//import eu.dnetlib.pace.clustering.ClusteringCombiner;
|
||||||
import eu.dnetlib.pace.config.Config;
|
//import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.config.Type;
|
//import eu.dnetlib.pace.config.Type;
|
||||||
import eu.dnetlib.pace.model.FieldListImpl;
|
//import eu.dnetlib.pace.model.FieldListImpl;
|
||||||
import eu.dnetlib.pace.model.FieldValueImpl;
|
//import eu.dnetlib.pace.model.FieldValueImpl;
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
//import eu.dnetlib.pace.model.MapDocument;
|
||||||
import org.apache.commons.logging.Log;
|
//import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
//import org.apache.commons.logging.LogFactory;
|
||||||
import org.junit.Before;
|
//import org.junit.Before;
|
||||||
import org.junit.Test;
|
//import org.junit.Test;
|
||||||
|
//
|
||||||
public class ClusteringCombinerTest extends AbstractProtoPaceTest {
|
//public class ClusteringCombinerTest extends AbstractProtoPaceTest {
|
||||||
|
//
|
||||||
private static final Log log = LogFactory.getLog(ClusteringCombinerTest.class);
|
// private static final Log log = LogFactory.getLog(ClusteringCombinerTest.class);
|
||||||
|
//
|
||||||
private Config config;
|
// private Config config;
|
||||||
|
//
|
||||||
@Before
|
// @Before
|
||||||
public void setUp() {
|
// public void setUp() {
|
||||||
config = getResultFullConf();
|
// config = getResultFullConf();
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testCombine() {
|
// public void testCombine() {
|
||||||
String title = "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission";
|
// String title = "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission";
|
||||||
MapDocument result = result(config, "A", title, "2013");
|
// MapDocument result = result(config, "A", title, "2013");
|
||||||
|
//
|
||||||
FieldListImpl fl = new FieldListImpl();
|
// FieldListImpl fl = new FieldListImpl();
|
||||||
fl.add(new FieldValueImpl(Type.String, "desc", "lorem ipsum cabalie qwerty"));
|
// fl.add(new FieldValueImpl(Type.String, "desc", "lorem ipsum cabalie qwerty"));
|
||||||
|
//
|
||||||
result.getFieldMap().put("desc", fl);
|
// result.getFieldMap().put("desc", fl);
|
||||||
log.info(title);
|
// log.info(title);
|
||||||
log.info(ClusteringCombiner.combine(result, config));
|
// log.info(ClusteringCombiner.combine(result, config));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
}
|
//}
|
||||||
|
|
|
@ -1,450 +1,450 @@
|
||||||
package eu.dnetlib.pace.distance;
|
//package eu.dnetlib.pace.distance;
|
||||||
|
//
|
||||||
import com.google.common.collect.Lists;
|
//import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Maps;
|
//import com.google.common.collect.Maps;
|
||||||
import com.google.common.collect.Sets;
|
//import com.google.common.collect.Sets;
|
||||||
import com.googlecode.protobuf.format.JsonFormat;
|
//import com.googlecode.protobuf.format.JsonFormat;
|
||||||
import eu.dnetlib.data.proto.OafProtos;
|
//import eu.dnetlib.data.proto.OafProtos;
|
||||||
import eu.dnetlib.pace.AbstractProtoPaceTest;
|
//import eu.dnetlib.pace.AbstractProtoPaceTest;
|
||||||
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
|
//import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
|
||||||
import eu.dnetlib.pace.config.Config;
|
//import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
//import eu.dnetlib.pace.config.DedupConfig;
|
||||||
import eu.dnetlib.pace.distance.eval.ScoreResult;
|
//import eu.dnetlib.pace.distance.eval.ScoreResult;
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
//import eu.dnetlib.pace.model.MapDocument;
|
||||||
import eu.dnetlib.pace.model.ProtoDocumentBuilder;
|
//import eu.dnetlib.pace.model.ProtoDocumentBuilder;
|
||||||
import org.apache.commons.io.IOUtils;
|
//import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.logging.Log;
|
//import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
//import org.apache.commons.logging.LogFactory;
|
||||||
import org.junit.Ignore;
|
//import org.junit.Ignore;
|
||||||
import org.junit.Test;
|
//import org.junit.Test;
|
||||||
|
//
|
||||||
import java.io.IOException;
|
//import java.io.IOException;
|
||||||
import java.util.List;
|
//import java.util.List;
|
||||||
import java.util.Map;
|
//import java.util.Map;
|
||||||
import java.util.Set;
|
//import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
//import java.util.stream.Collectors;
|
||||||
|
//
|
||||||
import static org.junit.Assert.assertFalse;
|
//import static org.junit.Assert.assertFalse;
|
||||||
import static org.junit.Assert.assertTrue;
|
//import static org.junit.Assert.assertTrue;
|
||||||
|
//
|
||||||
public class DetectorTest extends AbstractProtoPaceTest {
|
//public class DetectorTest extends AbstractProtoPaceTest {
|
||||||
|
//
|
||||||
private static final Log log = LogFactory.getLog(DetectorTest.class);
|
// private static final Log log = LogFactory.getLog(DetectorTest.class);
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultSimple() {
|
// public void testDistanceResultSimple() {
|
||||||
final Config config = getResultSimpleConf();
|
// final Config config = getResultSimpleConf();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "Recent results from CDF");
|
// final MapDocument resA = result(config, "A", "Recent results from CDF");
|
||||||
final MapDocument resB = result(config, "B", "Recent results from CDF");
|
// final MapDocument resB = result(config, "B", "Recent results from CDF");
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
final double d = sr.getScore();
|
// final double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue(d == 1.0);
|
// assertTrue(d == 1.0);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultSimpleMissingDates() {
|
// public void testDistanceResultSimpleMissingDates() {
|
||||||
final Config config = getResultSimpleConf();
|
// final Config config = getResultSimpleConf();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "Recent results from BES");
|
// final MapDocument resA = result(config, "A", "Recent results from BES");
|
||||||
final MapDocument resB = result(config, "A", "Recent results from CES");
|
// final MapDocument resB = result(config, "A", "Recent results from CES");
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
final double d = sr.getScore();
|
// final double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue(d > 0.97);
|
// assertTrue(d > 0.97);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultInvalidDate() {
|
// public void testDistanceResultInvalidDate() {
|
||||||
final Config config = getResultConf();
|
// final Config config = getResultConf();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "title title title 6BESR", "2013-01-05");
|
// final MapDocument resA = result(config, "A", "title title title 6BESR", "2013-01-05");
|
||||||
final MapDocument resB = result(config, "B", "title title title 6BESR", "qwerty");
|
// final MapDocument resB = result(config, "B", "title title title 6BESR", "qwerty");
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
final double d = sr.getScore();
|
// final double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue(d == 1.0);
|
// assertTrue(d == 1.0);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Ignore
|
// @Ignore
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultMissingOneDate() {
|
// public void testDistanceResultMissingOneDate() {
|
||||||
final Config config = getResultConf();
|
// final Config config = getResultConf();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "title title title 6BESR", null);
|
// final MapDocument resA = result(config, "A", "title title title 6BESR", null);
|
||||||
final MapDocument resB = result(config, "B", "title title title 6CLER", "2012-02");
|
// final MapDocument resB = result(config, "B", "title title title 6CLER", "2012-02");
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
double d = sr.getScore();
|
// double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue((d > 0.9) && (d < 1.0));
|
// assertTrue((d > 0.9) && (d < 1.0));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Ignore
|
// @Ignore
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResult() {
|
// public void testDistanceResult() {
|
||||||
final Config config = getResultConf();
|
// final Config config = getResultConf();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "title title title BES", "");
|
// final MapDocument resA = result(config, "A", "title title title BES", "");
|
||||||
final MapDocument resB = result(config, "B", "title title title CLEO");
|
// final MapDocument resB = result(config, "B", "title title title CLEO");
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
double d = sr.getScore();
|
// double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue((d > 0.9) && (d < 1.0));
|
// assertTrue((d > 0.9) && (d < 1.0));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Ignore
|
// @Ignore
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultMissingTwoDate() {
|
// public void testDistanceResultMissingTwoDate() {
|
||||||
final Config config = getResultConf();
|
// final Config config = getResultConf();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "bellaciao");
|
// final MapDocument resA = result(config, "A", "bellaciao");
|
||||||
final MapDocument resB = result(config, "B", "bellocioa");
|
// final MapDocument resB = result(config, "B", "bellocioa");
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
double d = sr.getScore();
|
// double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue((d > 0.9) && (d < 1.0));
|
// assertTrue((d > 0.9) && (d < 1.0));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Ignore
|
// @Ignore
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceOrganizationIgnoreMissing() {
|
// public void testDistanceOrganizationIgnoreMissing() {
|
||||||
|
//
|
||||||
final Config config = getOrganizationSimpleConf();
|
// final Config config = getOrganizationSimpleConf();
|
||||||
|
//
|
||||||
final MapDocument orgA = organization(config, "A", "CONSIGLIO NAZIONALE DELLE RICERCHE");
|
// final MapDocument orgA = organization(config, "A", "CONSIGLIO NAZIONALE DELLE RICERCHE");
|
||||||
final MapDocument orgB = organization(config, "B", "CONSIGLIO NAZIONALE DELLE RICERCHE", "CNR");
|
// final MapDocument orgB = organization(config, "B", "CONSIGLIO NAZIONALE DELLE RICERCHE", "CNR");
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(orgA, orgB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(orgA, orgB, config);
|
||||||
final double d = sr.getScore();
|
// final double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue(d > 0.99);
|
// assertTrue(d > 0.99);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultCase1() {
|
// public void testDistanceResultCase1() {
|
||||||
|
//
|
||||||
final Config config = getResultConf();
|
// final Config config = getResultConf();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003");
|
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003");
|
||||||
final MapDocument resB = result(config, "B", "Search for the Standard Model Higgs Boson", "2003");
|
// final MapDocument resB = result(config, "B", "Search for the Standard Model Higgs Boson", "2003");
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
double d = sr.getScore();
|
// double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue((d > 0.9) && (d < 1.0));
|
// assertTrue((d > 0.9) && (d < 1.0));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultCaseDoiMatch1() {
|
// public void testDistanceResultCaseDoiMatch1() {
|
||||||
final Config config = getResultConf();
|
// final Config config = getResultConf();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003", "10.1594/PANGAEA.726855");
|
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003", "10.1594/PANGAEA.726855");
|
||||||
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", "10.1594/PANGAEA.726855");
|
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", "10.1594/PANGAEA.726855");
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
double d = sr.getScore();
|
// double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue("exact DOIs will produce an exact match", d == 1.0);
|
// assertTrue("exact DOIs will produce an exact match", d == 1.0);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultCaseDoiMatch2() {
|
// public void testDistanceResultCaseDoiMatch2() {
|
||||||
final Config config = getResultConf();
|
// final Config config = getResultConf();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1594/PANGAEA.726855");
|
// final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1594/PANGAEA.726855");
|
||||||
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2005", "10.1594/PANGAEA.726855");
|
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2005", "10.1594/PANGAEA.726855");
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
double d = sr.getScore();
|
// double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue("exact DOIs will produce an exact match, regardless of different titles or publication years", d == 1.0);
|
// assertTrue("exact DOIs will produce an exact match, regardless of different titles or publication years", d == 1.0);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultCaseDoiMatch3() {
|
// public void testDistanceResultCaseDoiMatch3() {
|
||||||
final Config config = getResultConf();
|
// final Config config = getResultConf();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
|
// final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
|
||||||
final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003");
|
// final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003");
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
double d = sr.getScore();
|
// double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue("a missing DOI will casue the comparsion to continue with the following conditions", d == 1.0);
|
// assertTrue("a missing DOI will casue the comparsion to continue with the following conditions", d == 1.0);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultCaseDoiMatch4() {
|
// public void testDistanceResultCaseDoiMatch4() {
|
||||||
final Config config = getResultConf();
|
// final Config config = getResultConf();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
|
// final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
|
||||||
final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2005");
|
// final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2005");
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
double d = sr.getScore();
|
// double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue("a missing DOI, comparsion continues with the following conditions, different publication years will drop the score to 0", d == 0.0);
|
// assertTrue("a missing DOI, comparsion continues with the following conditions, different publication years will drop the score to 0", d == 0.0);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultCaseDoiMatch5() {
|
// public void testDistanceResultCaseDoiMatch5() {
|
||||||
|
//
|
||||||
final Config config = getResultConf();
|
// final Config config = getResultConf();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "Search for the Standard Model Higgs Boson", "2003", "10.1016/j.jmb.2010.12.020");
|
// final MapDocument resA = result(config, "A", "Search for the Standard Model Higgs Boson", "2003", "10.1016/j.jmb.2010.12.020");
|
||||||
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003");
|
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003");
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
double d = sr.getScore();
|
// double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue("a missing DOI, comparsion continues with the following conditions", (d > 0.9) && (d < 1.0));
|
// assertTrue("a missing DOI, comparsion continues with the following conditions", (d > 0.9) && (d < 1.0));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultCaseDoiMatch6() {
|
// public void testDistanceResultCaseDoiMatch6() {
|
||||||
final Config config = getResultConf();
|
// final Config config = getResultConf();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
|
// final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
|
||||||
final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003", "anotherDifferentDOI");
|
// final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003", "anotherDifferentDOI");
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
double d = sr.getScore();
|
// double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue("different DOIs will NOT drop the score to 0, then evaluate other fields", d == 1.0);
|
// assertTrue("different DOIs will NOT drop the score to 0, then evaluate other fields", d == 1.0);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultCaseDoiMatch7() {
|
// public void testDistanceResultCaseDoiMatch7() {
|
||||||
final Config config = getResultConf();
|
// final Config config = getResultConf();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "Adrenal Insufficiency asd asd", "1951", Lists.newArrayList("PMC2037944", "axdsds"));
|
// final MapDocument resA = result(config, "A", "Adrenal Insufficiency asd asd", "1951", Lists.newArrayList("PMC2037944", "axdsds"));
|
||||||
final MapDocument resB = result(config, "B", "Adrenal Insufficiency", "1951", "PMC2037944");
|
// final MapDocument resB = result(config, "B", "Adrenal Insufficiency", "1951", "PMC2037944");
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
double d = sr.getScore();
|
// double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue("different DOIs will drop the score to 0, regardless of the other fields", d > 0.9 & d < 1);
|
// assertTrue("different DOIs will drop the score to 0, regardless of the other fields", d > 0.9 & d < 1);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
// http://dx.doi.org/10.1594/PANGAEA.726855 doi:10.1594/PANGAEA.726855
|
// // http://dx.doi.org/10.1594/PANGAEA.726855 doi:10.1594/PANGAEA.726855
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultCaseAuthor1() {
|
// public void testDistanceResultCaseAuthor1() {
|
||||||
|
//
|
||||||
final Config config = getResultAuthorsConf();
|
// final Config config = getResultAuthorsConf();
|
||||||
|
//
|
||||||
final List<String> authorsA = Lists.newArrayList("a", "b", "c", "d");
|
// final List<String> authorsA = Lists.newArrayList("a", "b", "c", "d");
|
||||||
final List<String> authorsB = Lists.newArrayList("a", "b", "c");
|
// final List<String> authorsB = Lists.newArrayList("a", "b", "c");
|
||||||
final List<String> pid = Lists.newArrayList();
|
// final List<String> pid = Lists.newArrayList();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
|
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
|
||||||
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
|
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
final double d = sr.getScore();
|
// final double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue(d == 0.0);
|
// assertTrue(d == 0.0);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultCaseAuthor2() {
|
// public void testDistanceResultCaseAuthor2() {
|
||||||
|
//
|
||||||
final Config config = getResultAuthorsConf();
|
// final Config config = getResultAuthorsConf();
|
||||||
|
//
|
||||||
final List<String> authorsA = Lists.newArrayList("a", "b", "c");
|
// final List<String> authorsA = Lists.newArrayList("a", "b", "c");
|
||||||
final List<String> authorsB = Lists.newArrayList("a", "b", "c");
|
// final List<String> authorsB = Lists.newArrayList("a", "b", "c");
|
||||||
final List<String> pid = Lists.newArrayList();
|
// final List<String> pid = Lists.newArrayList();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
|
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
|
||||||
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
|
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
final double d = sr.getScore();
|
// final double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue(d == 1.0);
|
// assertTrue(d == 1.0);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultCaseAuthor3() {
|
// public void testDistanceResultCaseAuthor3() {
|
||||||
|
//
|
||||||
final Config config = getResultAuthorsConf();
|
// final Config config = getResultAuthorsConf();
|
||||||
|
//
|
||||||
final List<String> authorsA = Lists.newArrayList("Bardi, A.", "Manghi, P.", "Artini, M.");
|
// final List<String> authorsA = Lists.newArrayList("Bardi, A.", "Manghi, P.", "Artini, M.");
|
||||||
final List<String> authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
|
// final List<String> authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
|
||||||
final List<String> pid = Lists.newArrayList();
|
// final List<String> pid = Lists.newArrayList();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
|
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
|
||||||
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
|
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
double d = sr.getScore();
|
// double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
assertTrue((d > 0.9) && (d < 1.0));
|
// assertTrue((d > 0.9) && (d < 1.0));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultCaseAuthor4() {
|
// public void testDistanceResultCaseAuthor4() {
|
||||||
|
//
|
||||||
final Config config = getResultAuthorsConf();
|
// final Config config = getResultAuthorsConf();
|
||||||
|
//
|
||||||
final List<String> authorsA = Lists.newArrayList("Bardi, Alessia", "Manghi, Paolo", "Artini, Michele", "a");
|
// final List<String> authorsA = Lists.newArrayList("Bardi, Alessia", "Manghi, Paolo", "Artini, Michele", "a");
|
||||||
final List<String> authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
|
// final List<String> authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
|
||||||
final List<String> pid = Lists.newArrayList();
|
// final List<String> pid = Lists.newArrayList();
|
||||||
|
//
|
||||||
final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
|
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
|
||||||
final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
|
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
final double d = sr.getScore();
|
// final double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
// assertTrue(d.getScore() == 0.0);
|
// // assertTrue(d.getScore() == 0.0);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultNoPidsConf() {
|
// public void testDistanceResultNoPidsConf() {
|
||||||
|
//
|
||||||
final Config config = getResultFullConf();
|
// final Config config = getResultFullConf();
|
||||||
|
//
|
||||||
final MapDocument resA =
|
// final MapDocument resA =
|
||||||
result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010");
|
// result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010");
|
||||||
|
//
|
||||||
final MapDocument resB =
|
// final MapDocument resB =
|
||||||
result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010");
|
// result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010");
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
final double s = sr.getScore();
|
// final double s = sr.getScore();
|
||||||
|
//
|
||||||
log.info(sr.toString());
|
// log.info(sr.toString());
|
||||||
log.info(String.format(" s ---> %s", s));
|
// log.info(String.format(" s ---> %s", s));
|
||||||
// assertTrue(d.getScore() == 0.0);
|
// // assertTrue(d.getScore() == 0.0);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultPidsConf() {
|
// public void testDistanceResultPidsConf() {
|
||||||
|
//
|
||||||
final Config config = getResultFullConf();
|
// final Config config = getResultFullConf();
|
||||||
|
//
|
||||||
final List<String> authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
|
// final List<String> authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
|
||||||
final List<String> authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");
|
// final List<String> authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");
|
||||||
|
//
|
||||||
final List<String> pidA = Lists.newArrayList("10.1186/1752-1947-4-299", "a", "b");
|
// final List<String> pidA = Lists.newArrayList("10.1186/1752-1947-4-299", "a", "b");
|
||||||
final MapDocument resA =
|
// final MapDocument resA =
|
||||||
result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
|
// result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
|
||||||
pidA, authorsA);
|
// pidA, authorsA);
|
||||||
|
//
|
||||||
final List<String> pidB = Lists.newArrayList("c", "a", "10.1186/1752-1947-4-299", "d");
|
// final List<String> pidB = Lists.newArrayList("c", "a", "10.1186/1752-1947-4-299", "d");
|
||||||
final MapDocument resB =
|
// final MapDocument resB =
|
||||||
result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010",
|
// result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010",
|
||||||
pidB, authorsB);
|
// pidB, authorsB);
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
final double s = sr.getScore();
|
// final double s = sr.getScore();
|
||||||
log.info(sr.toString());
|
// log.info(sr.toString());
|
||||||
log.info(String.format(" s ---> %s", s));
|
// log.info(String.format(" s ---> %s", s));
|
||||||
|
//
|
||||||
// assertTrue(d.getScore() == 0.0);
|
// // assertTrue(d.getScore() == 0.0);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceResultFullConf() {
|
// public void testDistanceResultFullConf() {
|
||||||
|
//
|
||||||
final Config config = getResultFullConf();
|
// final Config config = getResultFullConf();
|
||||||
|
//
|
||||||
final List<String> authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
|
// final List<String> authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
|
||||||
final List<String> authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");
|
// final List<String> authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");
|
||||||
|
//
|
||||||
final MapDocument resA =
|
// final MapDocument resA =
|
||||||
result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
|
// result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
|
||||||
"10.1186/1752-1947-4-299", authorsA);
|
// "10.1186/1752-1947-4-299", authorsA);
|
||||||
|
//
|
||||||
final MapDocument resB =
|
// final MapDocument resB =
|
||||||
result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
|
// result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
|
||||||
"10.1186/1752-1947-4-299", authorsB);
|
// "10.1186/1752-1947-4-299", authorsB);
|
||||||
|
//
|
||||||
final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
|
||||||
final double d = sr.getScore();
|
// final double d = sr.getScore();
|
||||||
log.info(String.format(" d ---> %s", d));
|
// log.info(String.format(" d ---> %s", d));
|
||||||
|
//
|
||||||
// assertTrue(d.getScore() == 0.0);
|
// // assertTrue(d.getScore() == 0.0);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Ignore
|
// @Ignore
|
||||||
@Test
|
// @Test
|
||||||
public void testDistance() throws IOException {
|
// public void testDistance() throws IOException {
|
||||||
|
//
|
||||||
final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.json"));
|
// final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.json"));
|
||||||
|
//
|
||||||
final MapDocument crossref = asMapDocument(conf, "/eu/dnetlib/pace/crossref.json");
|
// final MapDocument crossref = asMapDocument(conf, "/eu/dnetlib/pace/crossref.json");
|
||||||
final MapDocument alicante = asMapDocument(conf, "/eu/dnetlib/pace/alicante.json");
|
// final MapDocument alicante = asMapDocument(conf, "/eu/dnetlib/pace/alicante.json");
|
||||||
|
//
|
||||||
final ScoreResult result = new PaceDocumentDistance().between(crossref, alicante, conf);
|
// final ScoreResult result = new PaceDocumentDistance().between(crossref, alicante, conf);
|
||||||
|
//
|
||||||
log.info("score = " + result);
|
// log.info("score = " + result);
|
||||||
|
//
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
@Ignore
|
// @Ignore
|
||||||
@Test
|
// @Test
|
||||||
public void testDistanceOrgs() throws IOException {
|
// public void testDistanceOrgs() throws IOException {
|
||||||
|
//
|
||||||
final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
|
// final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
|
||||||
|
//
|
||||||
final MapDocument orgA = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization1.json"));
|
// final MapDocument orgA = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization1.json"));
|
||||||
final MapDocument orgB = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization2.json"));
|
// final MapDocument orgB = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization2.json"));
|
||||||
|
//
|
||||||
Set<String> keysA = getGroupingKeys(conf, orgA);
|
// Set<String> keysA = getGroupingKeys(conf, orgA);
|
||||||
Set<String> keysB = getGroupingKeys(conf, orgB);
|
// Set<String> keysB = getGroupingKeys(conf, orgB);
|
||||||
|
//
|
||||||
assertFalse(String.format("A: %s\nB: %s", keysA, keysB), Sets.intersection(keysA, keysB).isEmpty());
|
// assertFalse(String.format("A: %s\nB: %s", keysA, keysB), Sets.intersection(keysA, keysB).isEmpty());
|
||||||
|
//
|
||||||
log.info("clustering keys A = " + getGroupingKeys(conf, orgA));
|
// log.info("clustering keys A = " + getGroupingKeys(conf, orgA));
|
||||||
log.info("clustering keys B = " + getGroupingKeys(conf, orgB));
|
// log.info("clustering keys B = " + getGroupingKeys(conf, orgB));
|
||||||
|
//
|
||||||
final ScoreResult result = new PaceDocumentDistance().between(orgA, orgB, conf);
|
// final ScoreResult result = new PaceDocumentDistance().between(orgA, orgB, conf);
|
||||||
|
//
|
||||||
log.info("score = " + result);
|
// log.info("score = " + result);
|
||||||
log.info("distance = " + result.getScore());
|
// log.info("distance = " + result.getScore());
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
private Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
|
// private Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
|
||||||
return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
|
// return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
private MapDocument asMapDocument(DedupConfig conf, final String json) {
|
// private MapDocument asMapDocument(DedupConfig conf, final String json) {
|
||||||
OafProtos.OafEntity.Builder b = OafProtos.OafEntity.newBuilder();
|
// OafProtos.OafEntity.Builder b = OafProtos.OafEntity.newBuilder();
|
||||||
try {
|
// try {
|
||||||
JsonFormat.merge(json, b);
|
// JsonFormat.merge(json, b);
|
||||||
} catch (JsonFormat.ParseException e) {
|
// } catch (JsonFormat.ParseException e) {
|
||||||
throw new IllegalArgumentException(e);
|
// throw new IllegalArgumentException(e);
|
||||||
}
|
// }
|
||||||
return ProtoDocumentBuilder.newInstance(b.getId(), b.build(), conf.getPace().getModel());
|
// return ProtoDocumentBuilder.newInstance(b.getId(), b.build(), conf.getPace().getModel());
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
|
//
|
||||||
}
|
//}
|
||||||
|
|
|
@ -1,50 +1,50 @@
|
||||||
package eu.dnetlib.pace.model;
|
//package eu.dnetlib.pace.model;
|
||||||
|
//
|
||||||
import com.google.common.collect.Iterables;
|
//import com.google.common.collect.Iterables;
|
||||||
import com.google.common.collect.Sets;
|
//import com.google.common.collect.Sets;
|
||||||
import com.google.common.collect.Sets.SetView;
|
//import com.google.common.collect.Sets.SetView;
|
||||||
import eu.dnetlib.pace.AbstractProtoPaceTest;
|
//import eu.dnetlib.pace.AbstractProtoPaceTest;
|
||||||
import eu.dnetlib.pace.config.Config;
|
//import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.distance.DetectorTest;
|
//import eu.dnetlib.pace.distance.DetectorTest;
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
//import eu.dnetlib.pace.model.MapDocument;
|
||||||
import eu.dnetlib.pace.model.MapDocumentSerializer;
|
//import eu.dnetlib.pace.model.MapDocumentSerializer;
|
||||||
import eu.dnetlib.pace.model.ProtoDocumentBuilder;
|
//import eu.dnetlib.pace.model.ProtoDocumentBuilder;
|
||||||
import org.apache.commons.logging.Log;
|
//import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
//import org.apache.commons.logging.LogFactory;
|
||||||
import org.junit.Test;
|
//import org.junit.Test;
|
||||||
|
//
|
||||||
import static org.junit.Assert.assertFalse;
|
//import static org.junit.Assert.assertFalse;
|
||||||
import static org.junit.Assert.assertTrue;
|
//import static org.junit.Assert.assertTrue;
|
||||||
|
//
|
||||||
public class ProtoDocumentBuilderTest extends AbstractProtoPaceTest {
|
//public class ProtoDocumentBuilderTest extends AbstractProtoPaceTest {
|
||||||
|
//
|
||||||
private static final Log log = LogFactory.getLog(ProtoDocumentBuilderTest.class);
|
// private static final Log log = LogFactory.getLog(ProtoDocumentBuilderTest.class);
|
||||||
|
//
|
||||||
@Test
|
// @Test
|
||||||
public void test_serialise1() {
|
// public void test_serialise1() {
|
||||||
|
//
|
||||||
final String id = "12345";
|
// final String id = "12345";
|
||||||
|
//
|
||||||
final Config config = getResultFullConf();
|
// final Config config = getResultFullConf();
|
||||||
|
//
|
||||||
final MapDocument document = ProtoDocumentBuilder.newInstance(id, getResult(id), config.model());
|
// final MapDocument document = ProtoDocumentBuilder.newInstance(id, getResult(id), config.model());
|
||||||
|
//
|
||||||
assertFalse(document.fieldNames().isEmpty());
|
// assertFalse(document.fieldNames().isEmpty());
|
||||||
assertFalse(Iterables.isEmpty(document.fields()));
|
// assertFalse(Iterables.isEmpty(document.fields()));
|
||||||
|
//
|
||||||
log.info("original:\n" + document);
|
// log.info("original:\n" + document);
|
||||||
|
//
|
||||||
final String stringDoc = MapDocumentSerializer.toString(document);
|
// final String stringDoc = MapDocumentSerializer.toString(document);
|
||||||
|
//
|
||||||
log.info("srialization:\n" + stringDoc);
|
// log.info("srialization:\n" + stringDoc);
|
||||||
|
//
|
||||||
final MapDocument decoded = MapDocumentSerializer.decode(stringDoc.getBytes());
|
// final MapDocument decoded = MapDocumentSerializer.decode(stringDoc.getBytes());
|
||||||
|
//
|
||||||
final SetView<String> diff = Sets.difference(document.fieldNames(), decoded.fieldNames());
|
// final SetView<String> diff = Sets.difference(document.fieldNames(), decoded.fieldNames());
|
||||||
|
//
|
||||||
assertTrue(diff.isEmpty());
|
// assertTrue(diff.isEmpty());
|
||||||
|
//
|
||||||
log.info("decoded:\n" + decoded);
|
// log.info("decoded:\n" + decoded);
|
||||||
}
|
// }
|
||||||
|
//
|
||||||
}
|
//}
|
||||||
|
|
|
@ -55,6 +55,12 @@
|
||||||
<artifactId>junit</artifactId>
|
<artifactId>junit</artifactId>
|
||||||
<scope>test</scope>
|
<scope>test</scope>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.reflections</groupId>
|
||||||
|
<artifactId>reflections</artifactId>
|
||||||
|
<version>0.9.10</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -18,7 +18,13 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i
|
||||||
public AbstractClusteringFunction(final Map<String, Integer> params) {
|
public AbstractClusteringFunction(final Map<String, Integer> params) {
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public AbstractClusteringFunction(){}
|
||||||
|
|
||||||
|
public void setParams(Map<String, Integer> params){
|
||||||
|
this.params = params;
|
||||||
|
}
|
||||||
|
|
||||||
protected abstract Collection<String> doApply(String s);
|
protected abstract Collection<String> doApply(String s);
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -7,12 +7,17 @@ import java.util.StringTokenizer;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
|
@ClusteringClass("acronyms")
|
||||||
public class Acronyms extends AbstractClusteringFunction {
|
public class Acronyms extends AbstractClusteringFunction {
|
||||||
|
|
||||||
public Acronyms(Map<String, Integer> params) {
|
public Acronyms(Map<String, Integer> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Acronyms(){
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(String s) {
|
||||||
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
|
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));
|
||||||
|
|
|
@ -22,9 +22,6 @@ public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombiner.class);
|
private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombiner.class);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
public static Collection<String> filterAndCombine(final MapDocument a, final Config conf) {
|
public static Collection<String> filterAndCombine(final MapDocument a, final Config conf) {
|
||||||
|
|
||||||
final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, conf.blacklists());
|
final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, conf.blacklists());
|
||||||
|
|
|
@ -1,5 +0,0 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
|
||||||
|
|
||||||
public enum Clustering {
|
|
||||||
acronyms, ngrams, ngrampairs, sortedngrampairs, suffixprefix, spacetrimmingfieldvalue, immutablefieldvalue, personhash, personclustering, lowercase, urlclustering
|
|
||||||
}
|
|
|
@ -0,0 +1,13 @@
|
||||||
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
|
import java.lang.annotation.ElementType;
|
||||||
|
import java.lang.annotation.Retention;
|
||||||
|
import java.lang.annotation.RetentionPolicy;
|
||||||
|
import java.lang.annotation.Target;
|
||||||
|
|
||||||
|
/**
 * Type-level marker that gives a clustering function implementation a symbolic name.
 *
 * <p>The annotation is retained at runtime so that it can be read reflectively; the
 * {@link #value()} is the name under which the annotated type is looked up.
 */
@Target(ElementType.TYPE)
@Retention(RetentionPolicy.RUNTIME)
public @interface ClusteringClass {

    /**
     * @return the symbolic name of the annotated clustering function
     */
    String value();
}
|
|
@ -12,4 +12,5 @@ public interface ClusteringFunction {
|
||||||
|
|
||||||
public Map<String, Integer> getParams();
|
public Map<String, Integer> getParams();
|
||||||
|
|
||||||
|
public void setParams(Map<String, Integer> params);
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,24 @@
|
||||||
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.reflections.Reflections;
|
||||||
|
|
||||||
|
public class ClusteringResolver implements Serializable {
|
||||||
|
private final Map<String, Class<ClusteringFunction>> functionMap;
|
||||||
|
|
||||||
|
public ClusteringResolver() {
|
||||||
|
|
||||||
|
this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ClusteringClass.class).stream()
|
||||||
|
.filter(ClusteringFunction.class::isAssignableFrom)
|
||||||
|
.collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>)cl));
|
||||||
|
}
|
||||||
|
|
||||||
|
public ClusteringFunction resolve(String clusteringFunction) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
|
||||||
|
|
||||||
|
return functionMap.get(clusteringFunction).newInstance();
|
||||||
|
}
|
||||||
|
}
|
|
@ -6,12 +6,17 @@ import java.util.Map;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
|
@ClusteringClass("immutablefieldvalue")
|
||||||
public class ImmutableFieldValue extends AbstractClusteringFunction {
|
public class ImmutableFieldValue extends AbstractClusteringFunction {
|
||||||
|
|
||||||
public ImmutableFieldValue(final Map<String, Integer> params) {
|
public ImmutableFieldValue(final Map<String, Integer> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public ImmutableFieldValue() {
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(final String s) {
|
protected Collection<String> doApply(final String s) {
|
||||||
final List<String> res = Lists.newArrayList();
|
final List<String> res = Lists.newArrayList();
|
||||||
|
|
|
@ -9,12 +9,17 @@ import com.google.common.collect.Sets;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
|
@ClusteringClass("lowercase")
|
||||||
public class LowercaseClustering extends AbstractClusteringFunction {
|
public class LowercaseClustering extends AbstractClusteringFunction {
|
||||||
|
|
||||||
public LowercaseClustering(final Map<String, Integer> params) {
|
public LowercaseClustering(final Map<String, Integer> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public LowercaseClustering(){
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(List<Field> fields) {
|
public Collection<String> apply(List<Field> fields) {
|
||||||
Collection<String> c = Sets.newLinkedHashSet();
|
Collection<String> c = Sets.newLinkedHashSet();
|
||||||
|
|
|
@ -6,8 +6,13 @@ import java.util.Map;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
|
@ClusteringClass("ngrampairs")
|
||||||
public class NgramPairs extends Ngrams {
|
public class NgramPairs extends Ngrams {
|
||||||
|
|
||||||
|
public NgramPairs() {
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
public NgramPairs(Map<String, Integer> params) {
|
public NgramPairs(Map<String, Integer> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,12 +5,17 @@ import java.util.LinkedHashSet;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.StringTokenizer;
|
import java.util.StringTokenizer;
|
||||||
|
|
||||||
|
@ClusteringClass("ngrams")
|
||||||
public class Ngrams extends AbstractClusteringFunction {
|
public class Ngrams extends AbstractClusteringFunction {
|
||||||
|
|
||||||
public Ngrams(Map<String, Integer> params) {
|
public Ngrams(Map<String, Integer> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Ngrams() {
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(String s) {
|
||||||
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
|
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
@ -18,6 +19,7 @@ import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.gt.Author;
|
import eu.dnetlib.pace.model.gt.Author;
|
||||||
import eu.dnetlib.pace.model.gt.GTAuthor;
|
import eu.dnetlib.pace.model.gt.GTAuthor;
|
||||||
|
|
||||||
|
@ClusteringClass("personclustering")
|
||||||
public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
||||||
|
|
||||||
private Map<String, Integer> params;
|
private Map<String, Integer> params;
|
||||||
|
@ -28,6 +30,10 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void setParams(Map<String, Integer> params){
|
||||||
|
this.params = params;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(final List<Field> fields) {
|
public Collection<String> apply(final List<Field> fields) {
|
||||||
final Set<String> hashes = Sets.newHashSet();
|
final Set<String> hashes = Sets.newHashSet();
|
||||||
|
|
|
@ -8,6 +8,7 @@ import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.pace.model.Person;
|
import eu.dnetlib.pace.model.Person;
|
||||||
|
|
||||||
|
@ClusteringClass("personhash")
|
||||||
public class PersonHash extends AbstractClusteringFunction {
|
public class PersonHash extends AbstractClusteringFunction {
|
||||||
|
|
||||||
private boolean DEFAULT_AGGRESSIVE = false;
|
private boolean DEFAULT_AGGRESSIVE = false;
|
||||||
|
@ -16,6 +17,10 @@ public class PersonHash extends AbstractClusteringFunction {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public PersonHash(){
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(final String s) {
|
protected Collection<String> doApply(final String s) {
|
||||||
final List<String> res = Lists.newArrayList();
|
final List<String> res = Lists.newArrayList();
|
||||||
|
|
|
@ -9,6 +9,10 @@ public class RandomClusteringFunction extends AbstractClusteringFunction {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public RandomClusteringFunction(){
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(String s) {
|
||||||
// TODO Auto-generated method stub
|
// TODO Auto-generated method stub
|
||||||
|
|
|
@ -9,12 +9,17 @@ import com.google.common.base.Joiner;
|
||||||
import com.google.common.base.Splitter;
|
import com.google.common.base.Splitter;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
|
@ClusteringClass("sortedngrampairs")
|
||||||
public class SortedNgramPairs extends NgramPairs {
|
public class SortedNgramPairs extends NgramPairs {
|
||||||
|
|
||||||
public SortedNgramPairs(Map<String, Integer> params) {
|
public SortedNgramPairs(Map<String, Integer> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public SortedNgramPairs(){
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(String s) {
|
||||||
|
|
||||||
|
|
|
@ -9,12 +9,17 @@ import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
|
@ClusteringClass("spacetrimmingfieldvalue")
|
||||||
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
|
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
|
||||||
|
|
||||||
public SpaceTrimmingFieldValue(final Map<String, Integer> params) {
|
public SpaceTrimmingFieldValue(final Map<String, Integer> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public SpaceTrimmingFieldValue(){
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(final String s) {
|
protected Collection<String> doApply(final String s) {
|
||||||
final List<String> res = Lists.newArrayList();
|
final List<String> res = Lists.newArrayList();
|
||||||
|
|
|
@ -6,12 +6,17 @@ import java.util.Set;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
|
@ClusteringClass("suffixprefix")
|
||||||
public class SuffixPrefix extends AbstractClusteringFunction {
|
public class SuffixPrefix extends AbstractClusteringFunction {
|
||||||
|
|
||||||
public SuffixPrefix(Map<String, Integer> params) {
|
public SuffixPrefix(Map<String, Integer> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public SuffixPrefix(){
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected Collection<String> doApply(String s) {
|
protected Collection<String> doApply(String s) {
|
||||||
return suffixPrefix(s, param("len"), param("max"));
|
return suffixPrefix(s, param("len"), param("max"));
|
||||||
|
|
|
@ -11,6 +11,7 @@ import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
@ClusteringClass("urlclustering")
|
||||||
public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
||||||
|
|
||||||
protected Map<String, Integer> params;
|
protected Map<String, Integer> params;
|
||||||
|
@ -19,6 +20,14 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public UrlClustering() {
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setParams(Map<String, Integer> params){
|
||||||
|
this.params = params;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(List<Field> fields) {
|
public Collection<String> apply(List<Field> fields) {
|
||||||
return fields.stream()
|
return fields.stream()
|
||||||
|
|
|
@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
import eu.dnetlib.pace.config.Cond;
|
|
||||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||||
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
||||||
import eu.dnetlib.pace.model.Document;
|
import eu.dnetlib.pace.model.Document;
|
||||||
|
@ -17,15 +16,25 @@ import eu.dnetlib.pace.model.FieldDef;
|
||||||
*/
|
*/
|
||||||
public abstract class AbstractCondition extends AbstractPaceFunctions implements ConditionAlgo {
|
public abstract class AbstractCondition extends AbstractPaceFunctions implements ConditionAlgo {
|
||||||
|
|
||||||
protected Cond cond;
|
protected String cond;
|
||||||
|
|
||||||
protected List<FieldDef> fields;
|
protected List<FieldDef> fields;
|
||||||
|
|
||||||
public AbstractCondition(final Cond cond, final List<FieldDef> fields) {
|
public AbstractCondition(final String cond, final List<FieldDef> fields) {
|
||||||
this.cond = cond;
|
this.cond = cond;
|
||||||
this.fields = fields;
|
this.fields = fields;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public AbstractCondition(){}
|
||||||
|
|
||||||
|
public void setCond(String cond){
|
||||||
|
this.cond = cond;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setFields(List<FieldDef> fields){
|
||||||
|
this.fields = fields;
|
||||||
|
}
|
||||||
|
|
||||||
protected abstract ConditionEval verify(FieldDef fd, Field a, Field b);
|
protected abstract ConditionEval verify(FieldDef fd, Field a, Field b);
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
package eu.dnetlib.pace.condition;
|
package eu.dnetlib.pace.condition;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import eu.dnetlib.pace.config.Cond;
|
|
||||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.FieldDef;
|
import eu.dnetlib.pace.model.FieldDef;
|
||||||
|
@ -11,12 +10,16 @@ import eu.dnetlib.pace.model.FieldDef;
|
||||||
*
|
*
|
||||||
* @author claudio
|
* @author claudio
|
||||||
*/
|
*/
|
||||||
|
@ConditionClass("alwaystruecondition")
|
||||||
public class AlwaysTrueCondition extends AbstractCondition {
|
public class AlwaysTrueCondition extends AbstractCondition {
|
||||||
|
|
||||||
public AlwaysTrueCondition(final Cond cond, final List<FieldDef> fields) {
|
public AlwaysTrueCondition(final String cond, final List<FieldDef> fields) {
|
||||||
super(cond, fields);
|
super(cond, fields);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public AlwaysTrueCondition(){
|
||||||
|
super();
|
||||||
|
}
|
||||||
@Override
|
@Override
|
||||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||||
return new ConditionEval(cond, a, b, 1);
|
return new ConditionEval(cond, a, b, 1);
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
package eu.dnetlib.pace.condition;
|
package eu.dnetlib.pace.condition;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.List;
|
||||||
|
|
||||||
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
|
||||||
import eu.dnetlib.pace.model.Document;
|
import eu.dnetlib.pace.model.Document;
|
||||||
|
import eu.dnetlib.pace.model.FieldDef;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Allows to express general conditions to be satisfied or not between two Documents.
|
* Allows to express general conditions to be satisfied or not between two Documents.
|
||||||
|
@ -24,4 +24,7 @@ public interface ConditionAlgo {
|
||||||
*/
|
*/
|
||||||
public abstract ConditionEvalMap verify(Document a, Document b);
|
public abstract ConditionEvalMap verify(Document a, Document b);
|
||||||
|
|
||||||
|
public void setFields(List<FieldDef> fields);
|
||||||
|
public void setCond(String name);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,13 @@
|
||||||
|
package eu.dnetlib.pace.condition;
|
||||||
|
|
||||||
|
import java.lang.annotation.ElementType;
|
||||||
|
import java.lang.annotation.Retention;
|
||||||
|
import java.lang.annotation.RetentionPolicy;
|
||||||
|
import java.lang.annotation.Target;
|
||||||
|
|
||||||
|
/**
 * Type-level marker that gives a condition implementation a symbolic name.
 *
 * <p>The annotation is retained at runtime so that it can be read reflectively; the
 * {@link #value()} is the name under which the annotated type is looked up.
 */
@Target(ElementType.TYPE)
@Retention(RetentionPolicy.RUNTIME)
public @interface ConditionClass {

    /**
     * @return the symbolic name of the annotated condition
     */
    String value();
}
|
|
@ -0,0 +1,22 @@
|
||||||
|
package eu.dnetlib.pace.condition;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.reflections.Reflections;
|
||||||
|
|
||||||
|
public class ConditionResolver implements Serializable {
|
||||||
|
private final Map<String, Class<ConditionAlgo>> functionMap;
|
||||||
|
|
||||||
|
public ConditionResolver() {
|
||||||
|
|
||||||
|
this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ConditionClass.class).stream()
|
||||||
|
.filter(ConditionAlgo.class::isAssignableFrom)
|
||||||
|
.collect(Collectors.toMap(cl -> cl.getAnnotation(ConditionClass.class).value(), cl -> (Class<ConditionAlgo>)cl));
|
||||||
|
}
|
||||||
|
|
||||||
|
public ConditionAlgo resolve(String name) throws IllegalAccessException, InstantiationException {
|
||||||
|
return functionMap.get(name).newInstance();
|
||||||
|
}
|
||||||
|
}
|
|
@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Cond;
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.FieldDef;
|
import eu.dnetlib.pace.model.FieldDef;
|
||||||
|
|
||||||
|
@ -11,11 +10,12 @@ import eu.dnetlib.pace.model.FieldDef;
|
||||||
*
|
*
|
||||||
* @author claudio
|
* @author claudio
|
||||||
*/
|
*/
|
||||||
|
@ConditionClass("doiExactMatch")
|
||||||
public class DoiExactMatch extends ExactMatchIgnoreCase {
|
public class DoiExactMatch extends ExactMatchIgnoreCase {
|
||||||
|
|
||||||
public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
||||||
|
|
||||||
public DoiExactMatch(final Cond cond, final List<FieldDef> fields) {
|
public DoiExactMatch(final String cond, final List<FieldDef> fields) {
|
||||||
super(cond, fields);
|
super(cond, fields);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Cond;
|
|
||||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.FieldDef;
|
import eu.dnetlib.pace.model.FieldDef;
|
||||||
|
@ -13,12 +12,17 @@ import org.apache.commons.lang.StringUtils;
|
||||||
*
|
*
|
||||||
* @author claudio
|
* @author claudio
|
||||||
*/
|
*/
|
||||||
|
@ConditionClass("exactMatch")
|
||||||
public class ExactMatch extends AbstractCondition {
|
public class ExactMatch extends AbstractCondition {
|
||||||
|
|
||||||
public ExactMatch(final Cond cond, final List<FieldDef> fields) {
|
public ExactMatch(final String cond, final List<FieldDef> fields) {
|
||||||
super(cond, fields);
|
super(cond, fields);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public ExactMatch(){
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Cond;
|
|
||||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.FieldDef;
|
import eu.dnetlib.pace.model.FieldDef;
|
||||||
|
@ -12,9 +11,10 @@ import eu.dnetlib.pace.model.FieldDef;
|
||||||
*
|
*
|
||||||
* @author claudio
|
* @author claudio
|
||||||
*/
|
*/
|
||||||
|
@ConditionClass("exactMatchIgnoreCase")
|
||||||
public class ExactMatchIgnoreCase extends AbstractCondition {
|
public class ExactMatchIgnoreCase extends AbstractCondition {
|
||||||
|
|
||||||
public ExactMatchIgnoreCase(final Cond cond, final List<FieldDef> fields) {
|
public ExactMatchIgnoreCase(final String cond, final List<FieldDef> fields) {
|
||||||
super(cond, fields);
|
super(cond, fields);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,6 @@ package eu.dnetlib.pace.condition;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.google.common.collect.Iterables;
|
import com.google.common.collect.Iterables;
|
||||||
import eu.dnetlib.pace.config.Cond;
|
|
||||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.FieldDef;
|
import eu.dnetlib.pace.model.FieldDef;
|
||||||
|
@ -13,6 +12,7 @@ import eu.dnetlib.pace.model.FieldDef;
|
||||||
*
|
*
|
||||||
* @author claudio
|
* @author claudio
|
||||||
*/
|
*/
|
||||||
|
@ConditionClass("mustBeDifferent")
|
||||||
public class MustBeDifferent extends AbstractCondition {
|
public class MustBeDifferent extends AbstractCondition {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -20,7 +20,7 @@ public class MustBeDifferent extends AbstractCondition {
|
||||||
*
|
*
|
||||||
* @param fields the fields
|
* @param fields the fields
|
||||||
*/
|
*/
|
||||||
public MustBeDifferent(final Cond cond, final List<FieldDef> fields) {
|
public MustBeDifferent(final String cond, final List<FieldDef> fields) {
|
||||||
super(cond, fields);
|
super(cond, fields);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,6 @@ import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
import eu.dnetlib.pace.config.Cond;
|
|
||||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.FieldDef;
|
import eu.dnetlib.pace.model.FieldDef;
|
||||||
|
@ -20,11 +19,12 @@ import org.apache.commons.logging.LogFactory;
|
||||||
*
|
*
|
||||||
* @author claudio
|
* @author claudio
|
||||||
*/
|
*/
|
||||||
|
@ConditionClass("pidMatch")
|
||||||
public class PidMatch extends AbstractCondition {
|
public class PidMatch extends AbstractCondition {
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(PidMatch.class);
|
private static final Log log = LogFactory.getLog(PidMatch.class);
|
||||||
|
|
||||||
public PidMatch(final Cond cond, final List<FieldDef> fields) {
|
public PidMatch(final String cond, final List<FieldDef> fields) {
|
||||||
super(cond, fields);
|
super(cond, fields);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,6 @@ import java.util.List;
|
||||||
|
|
||||||
import com.google.common.collect.Iterables;
|
import com.google.common.collect.Iterables;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Cond;
|
|
||||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.FieldDef;
|
import eu.dnetlib.pace.model.FieldDef;
|
||||||
|
@ -14,6 +13,7 @@ import eu.dnetlib.pace.model.FieldDef;
|
||||||
*
|
*
|
||||||
* @author claudio
|
* @author claudio
|
||||||
*/
|
*/
|
||||||
|
@ConditionClass("sizeMatch")
|
||||||
public class SizeMatch extends AbstractCondition {
|
public class SizeMatch extends AbstractCondition {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -22,7 +22,7 @@ public class SizeMatch extends AbstractCondition {
|
||||||
* @param fields
|
* @param fields
|
||||||
* the fields
|
* the fields
|
||||||
*/
|
*/
|
||||||
public SizeMatch(final Cond cond, final List<FieldDef> fields) {
|
public SizeMatch(final String cond, final List<FieldDef> fields) {
|
||||||
super(cond, fields);
|
super(cond, fields);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Cond;
|
|
||||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.FieldDef;
|
import eu.dnetlib.pace.model.FieldDef;
|
||||||
|
@ -13,9 +12,10 @@ import eu.dnetlib.pace.model.FieldDef;
|
||||||
* @author claudio
|
* @author claudio
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
|
@ConditionClass("titleVersionMatch")
|
||||||
public class TitleVersionMatch extends AbstractCondition {
|
public class TitleVersionMatch extends AbstractCondition {
|
||||||
|
|
||||||
public TitleVersionMatch(final Cond cond, final List<FieldDef> fields) {
|
public TitleVersionMatch(final String cond, final List<FieldDef> fields) {
|
||||||
super(cond, fields);
|
super(cond, fields);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
package eu.dnetlib.pace.condition;
|
package eu.dnetlib.pace.condition;
|
||||||
|
|
||||||
|
import java.time.Year;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Cond;
|
|
||||||
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
import eu.dnetlib.pace.distance.eval.ConditionEval;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
|
@ -14,14 +14,17 @@ import eu.dnetlib.pace.model.FieldDef;
|
||||||
*
|
*
|
||||||
* @author claudio
|
* @author claudio
|
||||||
*/
|
*/
|
||||||
|
@ConditionClass("yearMatch")
|
||||||
public class YearMatch extends AbstractCondition {
|
public class YearMatch extends AbstractCondition {
|
||||||
|
|
||||||
private int limit = 4;
|
private int limit = 4;
|
||||||
|
|
||||||
public YearMatch(final Cond cond, final List<FieldDef> fields) {
|
public YearMatch(final String cond, final List<FieldDef> fields) {
|
||||||
super(cond, fields);
|
super(cond, fields);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public YearMatch(){}
|
||||||
|
|
||||||
// @Override
|
// @Override
|
||||||
// public boolean verify(final Document a, final Document b) {
|
// public boolean verify(final Document a, final Document b) {
|
||||||
// boolean res = true;
|
// boolean res = true;
|
||||||
|
|
|
@ -1,46 +0,0 @@
|
||||||
package eu.dnetlib.pace.config;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Enumerates the distance Algos.
|
|
||||||
*/
|
|
||||||
public enum Algo {
|
|
||||||
|
|
||||||
/** The Jaro winkler. */
|
|
||||||
JaroWinkler,
|
|
||||||
/** The Jaro winkler title. */
|
|
||||||
JaroWinklerTitle,
|
|
||||||
/** The Levenstein. */
|
|
||||||
Levenstein,
|
|
||||||
/** The Levenstein distance for title matching */
|
|
||||||
LevensteinTitle,
|
|
||||||
/** The Level2 jaro winkler. */
|
|
||||||
Level2JaroWinkler,
|
|
||||||
/** The Level2 jaro winkler for title matching */
|
|
||||||
Level2JaroWinklerTitle,
|
|
||||||
/** The Level2 levenstein. */
|
|
||||||
Level2Levenstein,
|
|
||||||
/** The Sub string levenstein. */
|
|
||||||
SubStringLevenstein,
|
|
||||||
/** The Year levenstein. */
|
|
||||||
YearLevenstein,
|
|
||||||
/** The Sorted jaro winkler. */
|
|
||||||
SortedJaroWinkler,
|
|
||||||
/** The Sorted level2 jaro winkler. */
|
|
||||||
SortedLevel2JaroWinkler,
|
|
||||||
/** Compares two urls */
|
|
||||||
urlMatcher,
|
|
||||||
/** Exact match algo. */
|
|
||||||
ExactMatch,
|
|
||||||
/**
|
|
||||||
* Returns 0 for equal strings, 1 for different strings.
|
|
||||||
*/
|
|
||||||
MustBeDifferent,
|
|
||||||
/** Always return 1.0 as distance. */
|
|
||||||
AlwaysMatch,
|
|
||||||
/** Person distance */
|
|
||||||
PersonCoAuthorSurnamesDistance,
|
|
||||||
PersonCoAnchorsDistance,
|
|
||||||
PersonDistance,
|
|
||||||
/** The Null. */
|
|
||||||
Null
|
|
||||||
}
|
|
|
@ -1,28 +0,0 @@
|
||||||
package eu.dnetlib.pace.config;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The Enum Cond.
|
|
||||||
*/
|
|
||||||
public enum Cond {
|
|
||||||
|
|
||||||
/** The year match. */
|
|
||||||
yearMatch,
|
|
||||||
/** The title version match. */
|
|
||||||
titleVersionMatch,
|
|
||||||
/** The size match. */
|
|
||||||
sizeMatch,
|
|
||||||
/**
|
|
||||||
* Returns true if the field values are different
|
|
||||||
*/
|
|
||||||
mustBeDifferent,
|
|
||||||
/** The Exact match. */
|
|
||||||
exactMatch,
|
|
||||||
/**
|
|
||||||
* The Exact match ignore case.
|
|
||||||
*/
|
|
||||||
exactMatchIgnoreCase,
|
|
||||||
/** The Exact match specialized to recognize DOI values. */
|
|
||||||
doiExactMatch,
|
|
||||||
/** The Exact match that checks if pid type and value are the same */
|
|
||||||
pidMatch
|
|
||||||
}
|
|
|
@ -2,6 +2,8 @@ package eu.dnetlib.pace.distance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two
|
* Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two
|
||||||
* objects.
|
* objects.
|
||||||
|
@ -11,5 +13,9 @@ public interface DistanceAlgo {
|
||||||
public abstract double distance(Field a, Field b);
|
public abstract double distance(Field a, Field b);
|
||||||
|
|
||||||
public double getWeight();
|
public double getWeight();
|
||||||
|
public Map<String, Number> getParams();
|
||||||
|
|
||||||
|
public void setWeight(double w);
|
||||||
|
public void setParams(Map<String, Number> params);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,13 @@
|
||||||
|
package eu.dnetlib.pace.distance;
|
||||||
|
|
||||||
|
import java.lang.annotation.ElementType;
|
||||||
|
import java.lang.annotation.Retention;
|
||||||
|
import java.lang.annotation.RetentionPolicy;
|
||||||
|
import java.lang.annotation.Target;
|
||||||
|
|
||||||
|
@Retention(RetentionPolicy.RUNTIME)
|
||||||
|
@Target(ElementType.TYPE)
|
||||||
|
public @interface DistanceClass {
|
||||||
|
|
||||||
|
public String value();
|
||||||
|
}
|
|
@ -0,0 +1,24 @@
|
||||||
|
package eu.dnetlib.pace.distance;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.reflections.Reflections;
|
||||||
|
|
||||||
|
public class DistanceResolver implements Serializable {
|
||||||
|
private final Map<String, Class<DistanceAlgo>> functionMap;
|
||||||
|
|
||||||
|
public DistanceResolver() {
|
||||||
|
|
||||||
|
this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(DistanceClass.class).stream()
|
||||||
|
.filter(DistanceAlgo.class::isAssignableFrom)
|
||||||
|
.collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class<DistanceAlgo>)cl));
|
||||||
|
}
|
||||||
|
|
||||||
|
public DistanceAlgo resolve(String algo) throws IllegalAccessException, InstantiationException {
|
||||||
|
|
||||||
|
return functionMap.get(algo).newInstance();
|
||||||
|
}
|
||||||
|
}
|
|
@ -25,7 +25,7 @@ public class DistanceScorer {
|
||||||
}
|
}
|
||||||
|
|
||||||
public ScoreResult distance(final Document a, final Document b) {
|
public ScoreResult distance(final Document a, final Document b) {
|
||||||
final ScoreResult sr = new ScoreResult();
|
final ScoreResult sr = new ScoreResult(); //to keep track of the result of the comparison
|
||||||
|
|
||||||
sr.setStrictConditions(verify(a, b, config.strictConditions()));
|
sr.setStrictConditions(verify(a, b, config.strictConditions()));
|
||||||
sr.setConditions(verify(a, b, config.conditions()));
|
sr.setConditions(verify(a, b, config.conditions()));
|
||||||
|
|
|
@ -1,6 +1,8 @@
|
||||||
package eu.dnetlib.pace.distance;
|
package eu.dnetlib.pace.distance;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
|
@ -24,6 +26,27 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
|
||||||
/** The weight. */
|
/** The weight. */
|
||||||
protected double weight = 0.0;
|
protected double weight = 0.0;
|
||||||
|
|
||||||
|
private Map<String, Number> params;
|
||||||
|
|
||||||
|
protected SecondStringDistanceAlgo(){
|
||||||
|
}
|
||||||
|
|
||||||
|
protected SecondStringDistanceAlgo(Map<String, Number> params){
|
||||||
|
this.params = params;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setWeight(double w){
|
||||||
|
this.weight = w;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Map<String, Number> getParams(){
|
||||||
|
return this.params;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setParams(Map<String, Number> params){
|
||||||
|
this.params = params;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instantiates a new second string distance algo.
|
* Instantiates a new second string distance algo.
|
||||||
*
|
*
|
||||||
|
@ -37,6 +60,10 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
|
||||||
this.weight = weight;
|
this.weight = weight;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected SecondStringDistanceAlgo(final AbstractStringDistance ssalgo){
|
||||||
|
this.ssalgo = ssalgo;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Normalize.
|
* Normalize.
|
||||||
*
|
*
|
||||||
|
|
|
@ -1,10 +1,22 @@
|
||||||
package eu.dnetlib.pace.distance.algo;
|
package eu.dnetlib.pace.distance.algo;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
@DistanceClass("AlwaysMatch")
|
||||||
public class AlwaysMatch extends SecondStringDistanceAlgo {
|
public class AlwaysMatch extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
|
public AlwaysMatch(){
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
|
public AlwaysMatch(final Map<String, Number> params){
|
||||||
|
super(params);
|
||||||
|
}
|
||||||
|
|
||||||
public AlwaysMatch(final double weight) {
|
public AlwaysMatch(final double weight) {
|
||||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,22 @@
|
||||||
package eu.dnetlib.pace.distance.algo;
|
package eu.dnetlib.pace.distance.algo;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
@DistanceClass("ExactMatch")
|
||||||
public class ExactMatch extends SecondStringDistanceAlgo {
|
public class ExactMatch extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
|
public ExactMatch(){
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
|
public ExactMatch(Map<String, Number> params){
|
||||||
|
super(params);
|
||||||
|
}
|
||||||
|
|
||||||
public ExactMatch(final double weight) {
|
public ExactMatch(final double weight) {
|
||||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,11 +1,23 @@
|
||||||
package eu.dnetlib.pace.distance.algo;
|
package eu.dnetlib.pace.distance.algo;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||||
|
@DistanceClass("JaroWinkler")
|
||||||
public class JaroWinkler extends SecondStringDistanceAlgo {
|
public class JaroWinkler extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
|
public JaroWinkler(){
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
|
public JaroWinkler(Map<String, Number> params){
|
||||||
|
super(params);
|
||||||
|
}
|
||||||
|
|
||||||
public JaroWinkler(double weight) {
|
public JaroWinkler(double weight) {
|
||||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,11 +1,23 @@
|
||||||
package eu.dnetlib.pace.distance.algo;
|
package eu.dnetlib.pace.distance.algo;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||||
|
@DistanceClass("JaroWinklerTitle")
|
||||||
public class JaroWinklerTitle extends SecondStringDistanceAlgo {
|
public class JaroWinklerTitle extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
|
public JaroWinklerTitle(){
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
|
public JaroWinklerTitle(Map<String, Number> params){
|
||||||
|
super(params);
|
||||||
|
}
|
||||||
|
|
||||||
public JaroWinklerTitle(double weight) {
|
public JaroWinklerTitle(double weight) {
|
||||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
package eu.dnetlib.pace.distance.algo;
|
package eu.dnetlib.pace.distance.algo;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
|
@DistanceClass("Level2JaroWinkler")
|
||||||
public class Level2JaroWinkler extends SecondStringDistanceAlgo {
|
public class Level2JaroWinkler extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
public Level2JaroWinkler(double w) {
|
public Level2JaroWinkler(double w) {
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
package eu.dnetlib.pace.distance.algo;
|
package eu.dnetlib.pace.distance.algo;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
|
@DistanceClass("Level2JaroWinklerTitle")
|
||||||
public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo {
|
public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
public Level2JaroWinklerTitle(final double w) {
|
public Level2JaroWinklerTitle(final double w) {
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
package eu.dnetlib.pace.distance.algo;
|
package eu.dnetlib.pace.distance.algo;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
|
@DistanceClass("Level2Levenstein")
|
||||||
public class Level2Levenstein extends SecondStringDistanceAlgo {
|
public class Level2Levenstein extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
public Level2Levenstein(double w) {
|
public Level2Levenstein(double w) {
|
||||||
|
|
|
@ -1,10 +1,16 @@
|
||||||
package eu.dnetlib.pace.distance.algo;
|
package eu.dnetlib.pace.distance.algo;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
|
@DistanceClass("Levenstein")
|
||||||
public class Levenstein extends SecondStringDistanceAlgo {
|
public class Levenstein extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
|
public Levenstein(){
|
||||||
|
super(new com.wcohen.ss.Levenstein());
|
||||||
|
}
|
||||||
|
|
||||||
public Levenstein(double w) {
|
public Levenstein(double w) {
|
||||||
super(w, new com.wcohen.ss.Levenstein());
|
super(w, new com.wcohen.ss.Levenstein());
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,16 @@
|
||||||
package eu.dnetlib.pace.distance.algo;
|
package eu.dnetlib.pace.distance.algo;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
|
@DistanceClass("LevensteinTitle")
|
||||||
public class LevensteinTitle extends SecondStringDistanceAlgo {
|
public class LevensteinTitle extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
|
public LevensteinTitle(){
|
||||||
|
super(new com.wcohen.ss.Levenstein());
|
||||||
|
}
|
||||||
|
|
||||||
public LevensteinTitle(final double w) {
|
public LevensteinTitle(final double w) {
|
||||||
super(w, new com.wcohen.ss.Levenstein());
|
super(w, new com.wcohen.ss.Levenstein());
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,10 @@
|
||||||
package eu.dnetlib.pace.distance.algo;
|
package eu.dnetlib.pace.distance.algo;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
|
|
||||||
|
@DistanceClass("MustBeDifferent")
|
||||||
public class MustBeDifferent extends SecondStringDistanceAlgo {
|
public class MustBeDifferent extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
public MustBeDifferent(final double weight) {
|
public MustBeDifferent(final double weight) {
|
||||||
|
|
|
@ -1,12 +1,16 @@
|
||||||
package eu.dnetlib.pace.distance.algo;
|
package eu.dnetlib.pace.distance.algo;
|
||||||
|
|
||||||
import eu.dnetlib.pace.distance.DistanceAlgo;
|
import eu.dnetlib.pace.distance.DistanceAlgo;
|
||||||
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Not all fields of a document need to participate in the distance measure. We model those fields as having a
|
* Not all fields of a document need to participate in the distance measure. We model those fields as having a
|
||||||
* NullDistanceAlgo.
|
* NullDistanceAlgo.
|
||||||
*/
|
*/
|
||||||
|
@DistanceClass("Null")
|
||||||
public class NullDistanceAlgo implements DistanceAlgo {
|
public class NullDistanceAlgo implements DistanceAlgo {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -19,4 +23,16 @@ public class NullDistanceAlgo implements DistanceAlgo {
|
||||||
return 0.0;
|
return 0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setWeight(double w){
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Map<String, Number> getParams() {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setParams(Map<String, Number> params) {
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,12 @@
|
||||||
package eu.dnetlib.pace.distance.algo;
|
package eu.dnetlib.pace.distance.algo;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The Class SortedJaroWinkler.
|
* The Class SortedJaroWinkler.
|
||||||
*/
|
*/
|
||||||
|
@DistanceClass("SortedJaroWinkler")
|
||||||
public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
|
public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -1,10 +1,12 @@
|
||||||
package eu.dnetlib.pace.distance.algo;
|
package eu.dnetlib.pace.distance.algo;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The Class SortedLevel2JaroWinkler.
|
* The Class SortedLevel2JaroWinkler.
|
||||||
*/
|
*/
|
||||||
|
@DistanceClass("Sorted2JaroWinkler")
|
||||||
public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
|
public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
package eu.dnetlib.pace.distance.algo;
|
package eu.dnetlib.pace.distance.algo;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
|
@ -8,14 +9,21 @@ import com.wcohen.ss.AbstractStringDistance;
|
||||||
import eu.dnetlib.pace.config.Type;
|
import eu.dnetlib.pace.config.Type;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The Class SubStringLevenstein.
|
* The Class SubStringLevenstein.
|
||||||
*/
|
*/
|
||||||
|
@DistanceClass("SubStringLevenstein")
|
||||||
public class SubStringLevenstein extends SecondStringDistanceAlgo {
|
public class SubStringLevenstein extends SecondStringDistanceAlgo {
|
||||||
|
|
||||||
/** The limit. */
|
/** The limit. */
|
||||||
protected int limit;
|
protected int limit;
|
||||||
|
|
||||||
|
public SubStringLevenstein() {
|
||||||
|
super(new com.wcohen.ss.Levenstein());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instantiates a new sub string levenstein.
|
* Instantiates a new sub string levenstein.
|
||||||
*
|
*
|
||||||
|
@ -87,4 +95,9 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
|
||||||
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
return 1 / Math.pow(Math.abs(d) + 1, 0.1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void setParams(Map<String, Number> params){
|
||||||
|
this.limit = params.get("limit").intValue(); //necessary because this class needs also the limit
|
||||||
|
super.setParams(params);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
package eu.dnetlib.pace.distance.algo;
|
package eu.dnetlib.pace.distance.algo;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.distance.DistanceClass;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
|
@ -7,15 +8,24 @@ import java.net.MalformedURLException;
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
@DistanceClass("urlMatcher")
|
||||||
public class UrlMatcher extends Levenstein {
|
public class UrlMatcher extends Levenstein {
|
||||||
|
|
||||||
private Map<String, Number> params;
|
private Map<String, Number> params;
|
||||||
|
|
||||||
|
public UrlMatcher(){
|
||||||
|
super();
|
||||||
|
}
|
||||||
|
|
||||||
public UrlMatcher(double weight, Map<String, Number> params) {
|
public UrlMatcher(double weight, Map<String, Number> params) {
|
||||||
super(weight);
|
super(weight);
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void setParams(Map<String, Number> params) {
|
||||||
|
this.params = params;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double distance(Field a, Field b) {
|
public double distance(Field a, Field b) {
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
package eu.dnetlib.pace.distance.eval;
|
package eu.dnetlib.pace.distance.eval;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Cond;
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -8,7 +7,7 @@ import eu.dnetlib.pace.model.Field;
|
||||||
*/
|
*/
|
||||||
public class ConditionEval {
|
public class ConditionEval {
|
||||||
|
|
||||||
private Cond cond;
|
private String cond;
|
||||||
|
|
||||||
private Field a;
|
private Field a;
|
||||||
|
|
||||||
|
@ -16,7 +15,7 @@ public class ConditionEval {
|
||||||
|
|
||||||
private int result;
|
private int result;
|
||||||
|
|
||||||
public ConditionEval(final Cond cond, final Field a, final Field b, final int result) {
|
public ConditionEval(final String cond, final Field a, final Field b, final int result) {
|
||||||
this.cond = cond;
|
this.cond = cond;
|
||||||
this.a = a;
|
this.a = a;
|
||||||
this.b = b;
|
this.b = b;
|
||||||
|
@ -47,11 +46,11 @@ public class ConditionEval {
|
||||||
this.result = result;
|
this.result = result;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Cond getCond() {
|
public String getCond() {
|
||||||
return cond;
|
return cond;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setCond(final Cond cond) {
|
public void setCond(final String cond) {
|
||||||
this.cond = cond;
|
this.cond = cond;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
package eu.dnetlib.pace.distance.eval;
|
package eu.dnetlib.pace.distance.eval;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Algo;
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.model.Field;
|
||||||
import eu.dnetlib.pace.model.FieldDef;
|
import eu.dnetlib.pace.model.FieldDef;
|
||||||
|
|
||||||
|
|
|
@ -2,10 +2,12 @@ package eu.dnetlib.pace.distance.eval;
|
||||||
|
|
||||||
import com.google.gson.GsonBuilder;
|
import com.google.gson.GsonBuilder;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Created by claudio on 09/03/16.
|
* Created by claudio on 09/03/16.
|
||||||
*/
|
*/
|
||||||
public class ScoreResult {
|
public class ScoreResult implements Serializable {
|
||||||
|
|
||||||
private ConditionEvalMap strictConditions;
|
private ConditionEvalMap strictConditions;
|
||||||
|
|
||||||
|
@ -49,8 +51,12 @@ public class ScoreResult {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
final GsonBuilder b = new GsonBuilder();
|
//TODO cannot print: why?
|
||||||
b.serializeSpecialFloatingPointValues();
|
// final GsonBuilder b = new GsonBuilder()
|
||||||
return b.setPrettyPrinting().create().toJson(this);
|
// .serializeSpecialFloatingPointValues()
|
||||||
|
// .serializeNulls();
|
||||||
|
//
|
||||||
|
// return b.setPrettyPrinting().create().toJson(this);
|
||||||
|
return "{}";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package eu.dnetlib.pace.model;
|
package eu.dnetlib.pace.model;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.lang.reflect.InvocationTargetException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
@ -9,49 +10,36 @@ import eu.dnetlib.pace.clustering.*;
|
||||||
|
|
||||||
public class ClusteringDef implements Serializable {
|
public class ClusteringDef implements Serializable {
|
||||||
|
|
||||||
private Clustering name;
|
private String name;
|
||||||
|
|
||||||
private List<String> fields;
|
private List<String> fields;
|
||||||
|
|
||||||
private Map<String, Integer> params;
|
private Map<String, Integer> params;
|
||||||
|
|
||||||
|
private ClusteringResolver clusteringResolver = new ClusteringResolver();
|
||||||
|
|
||||||
public ClusteringDef() {}
|
public ClusteringDef() {}
|
||||||
|
|
||||||
public Clustering getName() {
|
public String getName() {
|
||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setName(final Clustering name) {
|
public void setName(final String name) {
|
||||||
this.name = name;
|
this.name = name;
|
||||||
}
|
}
|
||||||
|
|
||||||
public ClusteringFunction getClusteringFunction() {
|
public ClusteringFunction getClusteringFunction() {
|
||||||
switch (getName()) {
|
|
||||||
case acronyms:
|
try {
|
||||||
return new Acronyms(getParams());
|
ClusteringFunction clusteringFunction = clusteringResolver.resolve(getName());
|
||||||
case ngrams:
|
clusteringFunction.setParams(params);
|
||||||
return new Ngrams(getParams());
|
return clusteringFunction;
|
||||||
case ngrampairs:
|
|
||||||
return new NgramPairs(getParams());
|
} catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
|
||||||
case sortedngrampairs:
|
e.printStackTrace();
|
||||||
return new SortedNgramPairs(getParams());
|
|
||||||
case suffixprefix:
|
|
||||||
return new SuffixPrefix(getParams());
|
|
||||||
case spacetrimmingfieldvalue:
|
|
||||||
return new SpaceTrimmingFieldValue(getParams());
|
|
||||||
case immutablefieldvalue:
|
|
||||||
return new ImmutableFieldValue(getParams());
|
|
||||||
case personhash:
|
|
||||||
return new PersonHash(getParams());
|
|
||||||
case personclustering:
|
|
||||||
return new PersonClustering(getParams());
|
|
||||||
case lowercase:
|
|
||||||
return new LowercaseClustering(getParams());
|
|
||||||
case urlclustering:
|
|
||||||
return new UrlClustering(getParams());
|
|
||||||
default:
|
|
||||||
return new RandomClusteringFunction(getParams());
|
return new RandomClusteringFunction(getParams());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<String> getFields() {
|
public List<String> getFields() {
|
||||||
|
|
|
@ -5,44 +5,36 @@ import java.util.List;
|
||||||
|
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
import eu.dnetlib.pace.condition.*;
|
import eu.dnetlib.pace.condition.*;
|
||||||
import eu.dnetlib.pace.config.Cond;
|
|
||||||
|
|
||||||
public class CondDef implements Serializable {
|
public class CondDef implements Serializable {
|
||||||
|
|
||||||
private Cond name;
|
private String name;
|
||||||
|
|
||||||
private List<String> fields;
|
private List<String> fields;
|
||||||
|
|
||||||
|
private ConditionResolver conditionResolver = new ConditionResolver();
|
||||||
|
|
||||||
public CondDef() {}
|
public CondDef() {}
|
||||||
|
|
||||||
public ConditionAlgo getConditionAlgo(final List<FieldDef> fields) {
|
public ConditionAlgo getConditionAlgo(final List<FieldDef> fields) {
|
||||||
switch (getName()) {
|
|
||||||
case yearMatch:
|
try {
|
||||||
return new YearMatch(getName(), fields);
|
ConditionAlgo conditionAlgo = conditionResolver.resolve(getName());
|
||||||
case titleVersionMatch:
|
conditionAlgo.setFields(fields);
|
||||||
return new TitleVersionMatch(getName(), fields);
|
conditionAlgo.setCond(getName());
|
||||||
case sizeMatch:
|
return conditionAlgo;
|
||||||
return new SizeMatch(getName(), fields);
|
} catch (IllegalAccessException | InstantiationException e) {
|
||||||
case exactMatch:
|
e.printStackTrace();
|
||||||
return new ExactMatch(getName(), fields);
|
|
||||||
case mustBeDifferent:
|
|
||||||
return new MustBeDifferent(getName(), fields);
|
|
||||||
case exactMatchIgnoreCase:
|
|
||||||
return new ExactMatchIgnoreCase(getName(), fields);
|
|
||||||
case doiExactMatch:
|
|
||||||
return new DoiExactMatch(getName(), fields);
|
|
||||||
case pidMatch:
|
|
||||||
return new PidMatch(getName(), fields);
|
|
||||||
default:
|
|
||||||
return new AlwaysTrueCondition(getName(), fields);
|
return new AlwaysTrueCondition(getName(), fields);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public Cond getName() {
|
public String getName() {
|
||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setName(final Cond name) {
|
public void setName(final String name) {
|
||||||
this.name = name;
|
this.name = name;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,13 +1,13 @@
|
||||||
package eu.dnetlib.pace.model;
|
package eu.dnetlib.pace.model;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import com.google.common.base.Splitter;
|
import com.google.common.base.Splitter;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
import eu.dnetlib.pace.config.Algo;
|
|
||||||
import eu.dnetlib.pace.config.Type;
|
import eu.dnetlib.pace.config.Type;
|
||||||
import eu.dnetlib.pace.distance.*;
|
import eu.dnetlib.pace.distance.*;
|
||||||
import eu.dnetlib.pace.distance.algo.*;
|
import eu.dnetlib.pace.distance.algo.*;
|
||||||
|
@ -19,7 +19,7 @@ public class FieldDef implements Serializable {
|
||||||
|
|
||||||
public final static String PATH_SEPARATOR = "/";
|
public final static String PATH_SEPARATOR = "/";
|
||||||
|
|
||||||
private Algo algo;
|
private String algo;
|
||||||
|
|
||||||
private String name;
|
private String name;
|
||||||
|
|
||||||
|
@ -37,6 +37,8 @@ public class FieldDef implements Serializable {
|
||||||
|
|
||||||
private Map<String, Number> params;
|
private Map<String, Number> params;
|
||||||
|
|
||||||
|
private DistanceResolver distanceResolver = new DistanceResolver();
|
||||||
|
|
||||||
public FieldDef() {}
|
public FieldDef() {}
|
||||||
|
|
||||||
// def apply(s: String): Field[A]
|
// def apply(s: String): Field[A]
|
||||||
|
@ -66,40 +68,22 @@ public class FieldDef implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
public DistanceAlgo getDistanceAlgo() {
|
public DistanceAlgo getDistanceAlgo() {
|
||||||
switch (getAlgo()) {
|
|
||||||
case JaroWinkler:
|
try {
|
||||||
return new JaroWinkler(getWeight());
|
if (params == null) {
|
||||||
case JaroWinklerTitle:
|
params = new HashMap<>();
|
||||||
return new JaroWinklerTitle(getWeight());
|
}
|
||||||
case Level2JaroWinkler:
|
params.put("limit", getLimit());
|
||||||
return new Level2JaroWinkler(getWeight());
|
params.put("weight", getWeight());
|
||||||
case Level2JaroWinklerTitle:
|
DistanceAlgo distanceAlgo = distanceResolver.resolve(getAlgo());
|
||||||
return new Level2JaroWinklerTitle(getWeight());
|
distanceAlgo.setParams(params);
|
||||||
case Level2Levenstein:
|
distanceAlgo.setWeight(getWeight());
|
||||||
return new Level2Levenstein(getWeight());
|
return distanceAlgo;
|
||||||
case Levenstein:
|
} catch (IllegalAccessException | InstantiationException e) {
|
||||||
return new Levenstein(getWeight());
|
e.printStackTrace();
|
||||||
case LevensteinTitle:
|
|
||||||
return new LevensteinTitle(getWeight());
|
|
||||||
case SubStringLevenstein:
|
|
||||||
return new SubStringLevenstein(getWeight(), getLimit());
|
|
||||||
case SortedJaroWinkler:
|
|
||||||
return new SortedJaroWinkler(getWeight());
|
|
||||||
case SortedLevel2JaroWinkler:
|
|
||||||
return new SortedLevel2JaroWinkler(getWeight());
|
|
||||||
case urlMatcher:
|
|
||||||
return new UrlMatcher(getWeight(), getParams());
|
|
||||||
case ExactMatch:
|
|
||||||
return new ExactMatch(getWeight());
|
|
||||||
case MustBeDifferent:
|
|
||||||
return new MustBeDifferent(getWeight());
|
|
||||||
case AlwaysMatch:
|
|
||||||
return new AlwaysMatch(getWeight());
|
|
||||||
case Null:
|
|
||||||
return new NullDistanceAlgo();
|
|
||||||
default:
|
|
||||||
return new NullDistanceAlgo();
|
return new NullDistanceAlgo();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isIgnoreMissing() {
|
public boolean isIgnoreMissing() {
|
||||||
|
@ -135,11 +119,11 @@ public class FieldDef implements Serializable {
|
||||||
this.weight = weight;
|
this.weight = weight;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Algo getAlgo() {
|
public String getAlgo() {
|
||||||
return algo;
|
return algo;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setAlgo(final Algo algo) {
|
public void setAlgo(final String algo) {
|
||||||
this.algo = algo;
|
this.algo = algo;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -23,7 +23,6 @@ public class BlockProcessor {
|
||||||
|
|
||||||
private DedupConfig dedupConf;
|
private DedupConfig dedupConf;
|
||||||
|
|
||||||
|
|
||||||
public static void constructAccumulator( final DedupConfig dedupConf) {
|
public static void constructAccumulator( final DedupConfig dedupConf) {
|
||||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1"));
|
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1"));
|
||||||
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));
|
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));
|
||||||
|
|
Binary file not shown.
Loading…
Reference in New Issue