update in the discovery of clustering, conditions and distance functions (annotated with custom annotations)

This commit is contained in:
Michele De Bonis 2018-10-24 12:09:41 +02:00
parent bc4505e0e6
commit 1d678ddc9c
69 changed files with 1679 additions and 1441 deletions

View File

@ -36,6 +36,7 @@ public class SparkTest {
counter = new SparkCounter(context); counter = new SparkCounter(context);
//read the configuration from the classpath
final DedupConfig config = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf")); final DedupConfig config = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
BlockProcessor.constructAccumulator(config); BlockProcessor.constructAccumulator(config);
@ -55,7 +56,7 @@ public class SparkTest {
//create relations between documents //create relations between documents
final JavaPairRDD<String, String> relationRDD = mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id final JavaPairRDD<String, String> relationRDD = mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id
//from <id, doc> to List<groupkey,doc> //Clustering: from <id, doc> to List<groupkey,doc>
.flatMapToPair(a -> { .flatMapToPair(a -> {
final MapDocument currentDocument = a._2(); final MapDocument currentDocument = a._2();
return getGroupingKeys(config, currentDocument).stream() return getGroupingKeys(config, currentDocument).stream()
@ -83,7 +84,7 @@ public class SparkTest {
//print ids //print ids
// ccs.foreach(cc -> System.out.println(cc.getId())); // ccs.foreach(cc -> System.out.println(cc.getId()));
ccs.saveAsTextFile("file:///Users/miconis/Downloads/dumps/organizations_dedup"); // ccs.saveAsTextFile("file:///Users/miconis/Downloads/dumps/organizations_dedup");
} }

View File

@ -1,208 +1,208 @@
package eu.dnetlib.pace; //package eu.dnetlib.pace;
//
import com.google.common.collect.Lists; //import com.google.common.collect.Lists;
import com.google.common.collect.Sets; //import com.google.common.collect.Sets;
import com.google.gson.Gson; //import com.google.gson.Gson;
import eu.dnetlib.data.proto.FieldTypeProtos.Author; //import eu.dnetlib.data.proto.FieldTypeProtos.Author;
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; //import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty; //import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty;
import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder; //import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder;
import eu.dnetlib.data.proto.OafProtos.Oaf; //import eu.dnetlib.data.proto.OafProtos.Oaf;
import eu.dnetlib.data.proto.OafProtos.OafEntity; //import eu.dnetlib.data.proto.OafProtos.OafEntity;
import eu.dnetlib.data.proto.OrganizationProtos.Organization; //import eu.dnetlib.data.proto.OrganizationProtos.Organization;
import eu.dnetlib.data.proto.ResultProtos.Result; //import eu.dnetlib.data.proto.ResultProtos.Result;
import eu.dnetlib.pace.config.Config; //import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.DedupConfig; //import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type; //import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field; //import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldValueImpl; //import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument; //import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.ProtoDocumentBuilder; //import eu.dnetlib.pace.model.ProtoDocumentBuilder;
import eu.dnetlib.pace.model.gt.GTAuthor; //import eu.dnetlib.pace.model.gt.GTAuthor;
import org.apache.commons.io.IOUtils; //import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.RandomStringUtils; //import org.apache.commons.lang.RandomStringUtils;
import org.apache.commons.lang.StringUtils; //import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.RandomUtils; //import org.apache.commons.lang3.RandomUtils;
//
import java.io.IOException; //import java.io.IOException;
import java.io.StringWriter; //import java.io.StringWriter;
import java.util.ArrayList; //import java.util.ArrayList;
import java.util.LinkedList; //import java.util.LinkedList;
import java.util.List; //import java.util.List;
import java.util.Set; //import java.util.Set;
import java.util.stream.Collectors; //import java.util.stream.Collectors;
import java.util.stream.IntStream; //import java.util.stream.IntStream;
//
public abstract class AbstractProtoPaceTest extends OafTest { //public abstract class AbstractProtoPaceTest extends OafTest {
//
protected DedupConfig getResultFullConf() { // protected DedupConfig getResultFullConf() {
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.full.pace.conf")); // return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.full.pace.conf"));
} // }
//
protected DedupConfig getResultSimpleConf() { // protected DedupConfig getResultSimpleConf() {
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.simple.pace.conf")); // return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.simple.pace.conf"));
} // }
//
protected DedupConfig getResultConf() { // protected DedupConfig getResultConf() {
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.pace.conf")); // return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.pace.conf"));
} // }
//
protected DedupConfig getOrganizationSimpleConf() { // protected DedupConfig getOrganizationSimpleConf() {
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf")); // return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
} // }
//
protected DedupConfig getResultAuthorsConf() { // protected DedupConfig getResultAuthorsConf() {
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.authors.pace.conf")); // return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.authors.pace.conf"));
} // }
//
protected DedupConfig getResultProdConf() { // protected DedupConfig getResultProdConf() {
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.conf")); // return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.conf"));
} // }
//
protected MapDocument author(final Config conf, final String id, final Oaf oaf) { // protected MapDocument author(final Config conf, final String id, final Oaf oaf) {
return ProtoDocumentBuilder.newInstance(id, oaf.getEntity(), conf.model()); // return ProtoDocumentBuilder.newInstance(id, oaf.getEntity(), conf.model());
} // }
//
protected GTAuthor getGTAuthor(final String path) { // protected GTAuthor getGTAuthor(final String path) {
//
final Gson gson = new Gson(); // final Gson gson = new Gson();
//
final String json = readFromClasspath(path); // final String json = readFromClasspath(path);
//
final GTAuthor gta = gson.fromJson(json, GTAuthor.class); // final GTAuthor gta = gson.fromJson(json, GTAuthor.class);
//
return gta; // return gta;
} // }
//
protected String readFromClasspath(final String filename) { // protected String readFromClasspath(final String filename) {
final StringWriter sw = new StringWriter(); // final StringWriter sw = new StringWriter();
try { // try {
IOUtils.copy(getClass().getResourceAsStream(filename), sw); // IOUtils.copy(getClass().getResourceAsStream(filename), sw);
return sw.toString(); // return sw.toString();
} catch (final IOException e) { // } catch (final IOException e) {
throw new RuntimeException("cannot load resource from classpath: " + filename); // throw new RuntimeException("cannot load resource from classpath: " + filename);
} // }
} // }
//
protected MapDocument result(final Config config, final String id, final String title) { // protected MapDocument result(final Config config, final String id, final String title) {
return result(config, id, title, null, new ArrayList<>(), null); // return result(config, id, title, null, new ArrayList<>(), null);
} // }
//
protected MapDocument result(final Config config, final String id, final String title, final String date) { // protected MapDocument result(final Config config, final String id, final String title, final String date) {
return result(config, id, title, date, new ArrayList<>(), null); // return result(config, id, title, date, new ArrayList<>(), null);
} // }
//
protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid) { // protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid) {
return result(config, id, title, date, pid, null); // return result(config, id, title, date, pid, null);
} // }
//
protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid) { // protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid) {
return result(config, id, title, date, pid, null); // return result(config, id, title, date, pid, null);
} // }
//
protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid, final List<String> authors) { // protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid, final List<String> authors) {
return result(config, id, title, date, Lists.newArrayList(pid), authors); // return result(config, id, title, date, Lists.newArrayList(pid), authors);
} // }
//
static List<String> pidTypes = Lists.newArrayList(); // static List<String> pidTypes = Lists.newArrayList();
static { // static {
pidTypes.add("doi"); // pidTypes.add("doi");
//pidTypes.add("oai"); // //pidTypes.add("oai");
//pidTypes.add("pmid"); // //pidTypes.add("pmid");
} // }
//
protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid, final List<String> authors) { // protected MapDocument result(final Config config, final String id, final String title, final String date, final List<String> pid, final List<String> authors) {
final Result.Metadata.Builder metadata = Result.Metadata.newBuilder(); // final Result.Metadata.Builder metadata = Result.Metadata.newBuilder();
if (!StringUtils.isBlank(title)) { // if (!StringUtils.isBlank(title)) {
metadata.addTitle(getStruct(title, getQualifier("main title", "dnet:titles"))); // metadata.addTitle(getStruct(title, getQualifier("main title", "dnet:titles")));
metadata.addTitle(getStruct(RandomStringUtils.randomAlphabetic(10), getQualifier("alternative title", "dnet:titles"))); // metadata.addTitle(getStruct(RandomStringUtils.randomAlphabetic(10), getQualifier("alternative title", "dnet:titles")));
} // }
if (!StringUtils.isBlank(date)) { // if (!StringUtils.isBlank(date)) {
metadata.setDateofacceptance(sf(date)); // metadata.setDateofacceptance(sf(date));
} // }
//
final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result); // final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result);
final Result.Builder result = Result.newBuilder().setMetadata(metadata); // final Result.Builder result = Result.newBuilder().setMetadata(metadata);
//
if (authors != null) { // if (authors != null) {
result.getMetadataBuilder().addAllAuthor( // result.getMetadataBuilder().addAllAuthor(
IntStream.range(0, authors.size()) // IntStream.range(0, authors.size())
.mapToObj(i -> author(authors.get(i), i)) // .mapToObj(i -> author(authors.get(i), i))
.collect(Collectors.toCollection(LinkedList::new))); // .collect(Collectors.toCollection(LinkedList::new)));
} // }
//
entity.setResult(result); // entity.setResult(result);
//
if (pid != null) { // if (pid != null) {
for(String p : pid) { // for(String p : pid) {
if (!StringUtils.isBlank(p)) { // if (!StringUtils.isBlank(p)) {
entity.addPid(sp(p, pidTypes.get(RandomUtils.nextInt(0, pidTypes.size() - 1)))); // entity.addPid(sp(p, pidTypes.get(RandomUtils.nextInt(0, pidTypes.size() - 1))));
//entity.addPid(sp(RandomStringUtils.randomAlphabetic(10), "oai")); // //entity.addPid(sp(RandomStringUtils.randomAlphabetic(10), "oai"));
} // }
} // }
} // }
//
final OafEntity build = entity.build(); // final OafEntity build = entity.build();
return ProtoDocumentBuilder.newInstance(id, build, config.model()); // return ProtoDocumentBuilder.newInstance(id, build, config.model());
} // }
//
private Author author(final String s, int rank) { // private Author author(final String s, int rank) {
final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(s, false); // final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(s, false);
final Author.Builder author = Author.newBuilder(); // final Author.Builder author = Author.newBuilder();
if (p.isAccurate()) { // if (p.isAccurate()) {
author.setName(p.getNormalisedFirstName()); // author.setName(p.getNormalisedFirstName());
author.setSurname(p.getNormalisedSurname()); // author.setSurname(p.getNormalisedSurname());
} // }
author.setFullname(p.getNormalisedFullname()); // author.setFullname(p.getNormalisedFullname());
author.setRank(rank); // author.setRank(rank);
//
return author.build(); // return author.build();
} // }
//
private OafEntity.Builder oafEntity(final String id, final eu.dnetlib.data.proto.TypeProtos.Type type) { // private OafEntity.Builder oafEntity(final String id, final eu.dnetlib.data.proto.TypeProtos.Type type) {
final OafEntity.Builder entity = OafEntity.newBuilder().setId(id).setType(type); // final OafEntity.Builder entity = OafEntity.newBuilder().setId(id).setType(type);
return entity; // return entity;
} // }
//
protected MapDocument organization(final Config config, final String id, final String legalName) { // protected MapDocument organization(final Config config, final String id, final String legalName) {
return organization(config, id, legalName, null); // return organization(config, id, legalName, null);
} // }
//
protected MapDocument organization(final Config config, final String id, final String legalName, final String legalShortName) { // protected MapDocument organization(final Config config, final String id, final String legalName, final String legalShortName) {
final Organization.Metadata.Builder metadata = Organization.Metadata.newBuilder(); // final Organization.Metadata.Builder metadata = Organization.Metadata.newBuilder();
if (legalName != null) { // if (legalName != null) {
metadata.setLegalname(sf(legalName)); // metadata.setLegalname(sf(legalName));
} // }
if (legalShortName != null) { // if (legalShortName != null) {
metadata.setLegalshortname(sf(legalShortName)); // metadata.setLegalshortname(sf(legalShortName));
} // }
//
final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result); // final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result);
entity.setOrganization(Organization.newBuilder().setMetadata(metadata)); // entity.setOrganization(Organization.newBuilder().setMetadata(metadata));
//
return ProtoDocumentBuilder.newInstance(id, entity.build(), config.model()); // return ProtoDocumentBuilder.newInstance(id, entity.build(), config.model());
} // }
//
private StructuredProperty sp(final String pid, final String type) { // private StructuredProperty sp(final String pid, final String type) {
final Builder pidSp = // final Builder pidSp =
StructuredProperty.newBuilder().setValue(pid) // StructuredProperty.newBuilder().setValue(pid)
.setQualifier(Qualifier.newBuilder().setClassid(type).setClassname(type).setSchemeid("dnet:pid_types").setSchemename("dnet:pid_types")); // .setQualifier(Qualifier.newBuilder().setClassid(type).setClassname(type).setSchemeid("dnet:pid_types").setSchemename("dnet:pid_types"));
return pidSp.build(); // return pidSp.build();
} // }
//
protected Field title(final String s) { // protected Field title(final String s) {
return new FieldValueImpl(Type.String, "title", s); // return new FieldValueImpl(Type.String, "title", s);
} // }
//
protected static Builder getStruct(final String value, final Qualifier.Builder qualifier) { // protected static Builder getStruct(final String value, final Qualifier.Builder qualifier) {
return StructuredProperty.newBuilder().setValue(value).setQualifier(qualifier); // return StructuredProperty.newBuilder().setValue(value).setQualifier(qualifier);
} // }
//
/* // /*
* protected static StringField.Builder sf(final String s) { return StringField.newBuilder().setValue(s); } // * protected static StringField.Builder sf(final String s) { return StringField.newBuilder().setValue(s); }
* // *
* protected static Qualifier.Builder getQualifier(final String classname, final String schemename) { return // * protected static Qualifier.Builder getQualifier(final String classname, final String schemename) { return
* Qualifier.newBuilder().setClassid(classname).setClassname(classname).setSchemeid(schemename).setSchemename(schemename); } // * Qualifier.newBuilder().setClassid(classname).setClassname(classname).setSchemeid(schemename).setSchemename(schemename); }
*/ // */
//
} //}

File diff suppressed because one or more lines are too long

View File

@ -1,42 +1,42 @@
package eu.dnetlib.pace.clustering; //package eu.dnetlib.pace.clustering;
//
import eu.dnetlib.pace.AbstractProtoPaceTest; //import eu.dnetlib.pace.AbstractProtoPaceTest;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; //import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.config.Config; //import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.Type; //import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.FieldListImpl; //import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl; //import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument; //import eu.dnetlib.pace.model.MapDocument;
import org.apache.commons.logging.Log; //import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; //import org.apache.commons.logging.LogFactory;
import org.junit.Before; //import org.junit.Before;
import org.junit.Test; //import org.junit.Test;
//
/**
 * Smoke test for {@link BlacklistAwareClusteringCombiner#filterAndCombine}: builds a result
 * document, enriches it with a description and an extra title, and logs the combiner output.
 * It makes no assertions.
 */
public class BlacklistAwareClusteringCombinerTest extends AbstractProtoPaceTest {

    private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombinerTest.class);

    /** Configuration shared by the tests, reloaded before each one. */
    private Config config;

    @Before
    public void setUp() {
        config = getResultFullConf();
    }

    @Test
    public void testCombine() {
        final MapDocument result =
                result(config, "A", "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission", "2013");

        final FieldListImpl description = new FieldListImpl();
        description.add(new FieldValueImpl(Type.String, "desc", "hello world description pipeline"));
        result.getFieldMap().put("desc", description);

        // Use a separate list for the extra title: clearing and refilling the list already stored
        // under "desc" would replace the description through the shared reference.
        final FieldListImpl extraTitle = new FieldListImpl();
        extraTitle.add(new FieldValueImpl(Type.String, "title", "lorem ipsum cabalie qwerty"));
        final FieldListImpl titles = (FieldListImpl) result.getFieldMap().get("title");
        titles.add(extraTitle);

        log.info(BlacklistAwareClusteringCombiner.filterAndCombine(result, config));
    }
}

View File

@ -1,39 +1,39 @@
package eu.dnetlib.pace.clustering; //package eu.dnetlib.pace.clustering;
//
import eu.dnetlib.pace.AbstractProtoPaceTest; //import eu.dnetlib.pace.AbstractProtoPaceTest;
import eu.dnetlib.pace.clustering.ClusteringCombiner; //import eu.dnetlib.pace.clustering.ClusteringCombiner;
import eu.dnetlib.pace.config.Config; //import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.Type; //import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.FieldListImpl; //import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl; //import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument; //import eu.dnetlib.pace.model.MapDocument;
import org.apache.commons.logging.Log; //import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; //import org.apache.commons.logging.LogFactory;
import org.junit.Before; //import org.junit.Before;
import org.junit.Test; //import org.junit.Test;
//
/**
 * Smoke test for {@link ClusteringCombiner#combine}: logs the title of a generated result and
 * the output the combiner produces for it. It makes no assertions.
 */
public class ClusteringCombinerTest extends AbstractProtoPaceTest {

    private static final Log log = LogFactory.getLog(ClusteringCombinerTest.class);

    /** Configuration shared by the tests, reloaded before each one. */
    private Config config;

    @Before
    public void setUp() {
        config = getResultFullConf();
    }

    @Test
    public void testCombine() {
        final String title = "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission";
        final MapDocument document = result(config, "A", title, "2013");

        // attach a description field on top of what result(...) generated
        final FieldListImpl description = new FieldListImpl();
        description.add(new FieldValueImpl(Type.String, "desc", "lorem ipsum cabalie qwerty"));
        document.getFieldMap().put("desc", description);

        log.info(title);
        log.info(ClusteringCombiner.combine(document, config));
    }

}

View File

@ -1,450 +1,450 @@
package eu.dnetlib.pace.distance; //package eu.dnetlib.pace.distance;
//
import com.google.common.collect.Lists; //import com.google.common.collect.Lists;
import com.google.common.collect.Maps; //import com.google.common.collect.Maps;
import com.google.common.collect.Sets; //import com.google.common.collect.Sets;
import com.googlecode.protobuf.format.JsonFormat; //import com.googlecode.protobuf.format.JsonFormat;
import eu.dnetlib.data.proto.OafProtos; //import eu.dnetlib.data.proto.OafProtos;
import eu.dnetlib.pace.AbstractProtoPaceTest; //import eu.dnetlib.pace.AbstractProtoPaceTest;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; //import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
import eu.dnetlib.pace.config.Config; //import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.DedupConfig; //import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.distance.eval.ScoreResult; //import eu.dnetlib.pace.distance.eval.ScoreResult;
import eu.dnetlib.pace.model.MapDocument; //import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.ProtoDocumentBuilder; //import eu.dnetlib.pace.model.ProtoDocumentBuilder;
import org.apache.commons.io.IOUtils; //import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log; //import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; //import org.apache.commons.logging.LogFactory;
import org.junit.Ignore; //import org.junit.Ignore;
import org.junit.Test; //import org.junit.Test;
//
import java.io.IOException; //import java.io.IOException;
import java.util.List; //import java.util.List;
import java.util.Map; //import java.util.Map;
import java.util.Set; //import java.util.Set;
import java.util.stream.Collectors; //import java.util.stream.Collectors;
//
import static org.junit.Assert.assertFalse; //import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue; //import static org.junit.Assert.assertTrue;
//
public class DetectorTest extends AbstractProtoPaceTest { //public class DetectorTest extends AbstractProtoPaceTest {
//
private static final Log log = LogFactory.getLog(DetectorTest.class); // private static final Log log = LogFactory.getLog(DetectorTest.class);
//
/** Two results with the same title, scored under the simple result configuration, must score exactly 1.0. */
@Test
public void testDistanceResultSimple() {
    final Config conf = getResultSimpleConf();
    final MapDocument left = result(conf, "A", "Recent results from CDF");
    final MapDocument right = result(conf, "B", "Recent results from CDF");

    final double score = new PaceDocumentDistance().between(left, right, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    assertTrue(score == 1.0);
}
//
/** Two results without dates whose titles differ in one letter ("BES" vs "CES") must score above 0.97. */
@Test
public void testDistanceResultSimpleMissingDates() {
    final Config conf = getResultSimpleConf();
    // NOTE(review): both documents are created with id "A" -- confirm whether the second was meant to be "B"
    final MapDocument left = result(conf, "A", "Recent results from BES");
    final MapDocument right = result(conf, "A", "Recent results from CES");

    final double score = new PaceDocumentDistance().between(left, right, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    assertTrue(score > 0.97);
}
//
/** Same title, one date "2013-01-05" and one non-date string "qwerty": the score must be exactly 1.0. */
@Test
public void testDistanceResultInvalidDate() {
    final Config conf = getResultConf();
    final MapDocument left = result(conf, "A", "title title title 6BESR", "2013-01-05");
    final MapDocument right = result(conf, "B", "title title title 6BESR", "qwerty");

    final double score = new PaceDocumentDistance().between(left, right, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    assertTrue(score == 1.0);
}
//
/** Ignored: similar titles where only one result has a date; expects a score strictly between 0.9 and 1.0. */
@Ignore
@Test
public void testDistanceResultMissingOneDate() {
    final Config conf = getResultConf();
    final MapDocument left = result(conf, "A", "title title title 6BESR", null);
    final MapDocument right = result(conf, "B", "title title title 6CLER", "2012-02");

    final double score = new PaceDocumentDistance().between(left, right, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    assertTrue(score > 0.9 && score < 1.0);
}
//
/** Ignored: similar titles, one result with an empty date and one with none; expects a score strictly between 0.9 and 1.0. */
@Ignore
@Test
public void testDistanceResult() {
    final Config conf = getResultConf();
    final MapDocument left = result(conf, "A", "title title title BES", "");
    final MapDocument right = result(conf, "B", "title title title CLEO");

    final double score = new PaceDocumentDistance().between(left, right, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    assertTrue(score > 0.9 && score < 1.0);
}
//
/** Ignored: two dateless results with near-identical one-word titles; expects a score strictly between 0.9 and 1.0. */
@Ignore
@Test
public void testDistanceResultMissingTwoDate() {
    final Config conf = getResultConf();
    final MapDocument left = result(conf, "A", "bellaciao");
    final MapDocument right = result(conf, "B", "bellocioa");

    final double score = new PaceDocumentDistance().between(left, right, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    assertTrue(score > 0.9 && score < 1.0);
}
//
/** Ignored: two organizations with the same legal name, only one with a short name; expects a score above 0.99. */
@Ignore
@Test
public void testDistanceOrganizationIgnoreMissing() {
    final Config conf = getOrganizationSimpleConf();
    final MapDocument withoutShortName = organization(conf, "A", "CONSIGLIO NAZIONALE DELLE RICERCHE");
    final MapDocument withShortName = organization(conf, "B", "CONSIGLIO NAZIONALE DELLE RICERCHE", "CNR");

    final double score = new PaceDocumentDistance().between(withoutShortName, withShortName, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    assertTrue(score > 0.99);
}
//
/** Same date, titles differing by the word "for" and by letter case; expects a score strictly between 0.9 and 1.0. */
@Test
public void testDistanceResultCase1() {
    final Config conf = getResultConf();
    final MapDocument left = result(conf, "A", "Search the Standard Model Higgs boson", "2003");
    final MapDocument right = result(conf, "B", "Search for the Standard Model Higgs Boson", "2003");

    final double score = new PaceDocumentDistance().between(left, right, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    assertTrue(score > 0.9 && score < 1.0);
}
//
/**
 * Two results carrying the same DOI (titles differ only by letter case): the score must be
 * exactly 1.0.
 */
@Test
public void testDistanceResultCaseDoiMatch1() {
    final Config conf = getResultConf();
    final String doi = "10.1594/PANGAEA.726855";

    final MapDocument first = result(conf, "A", "Search the Standard Model Higgs boson", "2003", doi);
    final MapDocument second = result(conf, "B", "Search the Standard Model Higgs Boson", "2003", doi);

    final double score = new PaceDocumentDistance().between(first, second, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    assertTrue("exact DOIs will produce an exact match", score == 1.0);
}
//
/**
 * Two results with different titles and years but the same DOI: the score must be exactly 1.0.
 */
@Test
public void testDistanceResultCaseDoiMatch2() {
    final Config conf = getResultConf();
    final String doi = "10.1594/PANGAEA.726855";

    final MapDocument first = result(conf, "A", "Conference proceedings on X. Appendix", "2003", doi);
    final MapDocument second = result(conf, "B", "Search the Standard Model Higgs Boson", "2005", doi);

    final double score = new PaceDocumentDistance().between(first, second, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    assertTrue("exact DOIs will produce an exact match, regardless of different titles or publication years", score == 1.0);
}
//
/**
 * Same title and year, DOI present on the first result only: the score must be exactly 1.0.
 */
@Test
public void testDistanceResultCaseDoiMatch3() {
    final Config conf = getResultConf();
    final String title = "Conference proceedings on X. Appendix";

    final MapDocument withDoi = result(conf, "A", title, "2003", "10.1016/j.jmb.2010.12.024");
    final MapDocument withoutDoi = result(conf, "B", title, "2003");

    final double score = new PaceDocumentDistance().between(withDoi, withoutDoi, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    assertTrue("a missing DOI will casue the comparsion to continue with the following conditions", score == 1.0);
}
//
/**
 * Same title, DOI present on the first result only, different years: the score must be
 * exactly 0.0.
 */
@Test
public void testDistanceResultCaseDoiMatch4() {
    final Config conf = getResultConf();
    final String title = "Conference proceedings on X. Appendix";

    final MapDocument withDoi = result(conf, "A", title, "2003", "10.1016/j.jmb.2010.12.024");
    final MapDocument withoutDoi = result(conf, "B", title, "2005");

    final double score = new PaceDocumentDistance().between(withDoi, withoutDoi, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    assertTrue("a missing DOI, comparsion continues with the following conditions, different publication years will drop the score to 0", score == 0.0);
}
//
/**
 * Slightly different titles, same year, DOI present on the first result only: the score is
 * expected to be strictly between 0.9 and 1.0.
 */
@Test
public void testDistanceResultCaseDoiMatch5() {
    final Config conf = getResultConf();

    final MapDocument withDoi = result(conf, "A", "Search for the Standard Model Higgs Boson", "2003", "10.1016/j.jmb.2010.12.020");
    final MapDocument withoutDoi = result(conf, "B", "Search the Standard Model Higgs Boson", "2003");

    final double score = new PaceDocumentDistance().between(withDoi, withoutDoi, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    assertTrue("a missing DOI, comparsion continues with the following conditions", 0.9 < score && score < 1.0);
}
//
/**
 * Same title and year, but two different DOI values: the score must still be exactly 1.0.
 */
@Test
public void testDistanceResultCaseDoiMatch6() {
    final Config conf = getResultConf();
    final String title = "Conference proceedings on X. Appendix";

    final MapDocument first = result(conf, "A", title, "2003", "10.1016/j.jmb.2010.12.024");
    final MapDocument second = result(conf, "B", title, "2003", "anotherDifferentDOI");

    final double score = new PaceDocumentDistance().between(first, second, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    assertTrue("different DOIs will NOT drop the score to 0, then evaluate other fields", score == 1.0);
}
//
/**
 * The first result carries two pids, one of which is the only pid of the second result; the
 * titles differ and the year is the same. The score is expected to be strictly between 0.9
 * and 1.0.
 */
@Test
public void testDistanceResultCaseDoiMatch7() {
    final Config config = getResultConf();

    final MapDocument resA = result(config, "A", "Adrenal Insufficiency asd asd", "1951", Lists.newArrayList("PMC2037944", "axdsds"));
    final MapDocument resB = result(config, "B", "Adrenal Insufficiency", "1951", "PMC2037944");

    final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
    final double d = sr.getScore();
    log.info(String.format(" d ---> %s", d));

    // The previous failure message claimed the score drops to 0, which contradicts the asserted
    // range; the condition also used the non-short-circuit '&' instead of '&&'.
    assertTrue("one shared pid with different titles: the score is expected in (0.9, 1.0)", (d > 0.9) && (d < 1.0));
}
//
// http://dx.doi.org/10.1594/PANGAEA.726855 doi:10.1594/PANGAEA.726855 // // http://dx.doi.org/10.1594/PANGAEA.726855 doi:10.1594/PANGAEA.726855
//
/**
 * Identical title and year, no pids, author lists of different length ("a".."d" versus
 * "a".."c"): the score must be exactly 0.0.
 */
@Test
public void testDistanceResultCaseAuthor1() {
    final Config conf = getResultAuthorsConf();
    final String title = "Search the Standard Model Higgs Boson";

    final List<String> fourAuthors = Lists.newArrayList("a", "b", "c", "d");
    final List<String> threeAuthors = Lists.newArrayList("a", "b", "c");
    final List<String> noPids = Lists.newArrayList();

    final MapDocument first = result(conf, "A", title, "2003", noPids, fourAuthors);
    final MapDocument second = result(conf, "B", title, "2003", noPids, threeAuthors);

    final double score = new PaceDocumentDistance().between(first, second, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    assertTrue(score == 0.0);
}
//
/**
 * Identical title, year and author lists, no pids: the score must be exactly 1.0.
 */
@Test
public void testDistanceResultCaseAuthor2() {
    final Config conf = getResultAuthorsConf();
    final String title = "Search the Standard Model Higgs Boson";

    final List<String> authorsOfFirst = Lists.newArrayList("a", "b", "c");
    final List<String> authorsOfSecond = Lists.newArrayList("a", "b", "c");
    final List<String> noPids = Lists.newArrayList();

    final MapDocument first = result(conf, "A", title, "2003", noPids, authorsOfFirst);
    final MapDocument second = result(conf, "B", title, "2003", noPids, authorsOfSecond);

    final double score = new PaceDocumentDistance().between(first, second, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    assertTrue(score == 1.0);
}
//
/**
 * Identical title and year, no pids, the same three authors written once with initials and
 * once with full first names: the score is expected to be strictly between 0.9 and 1.0.
 */
@Test
public void testDistanceResultCaseAuthor3() {
    final Config conf = getResultAuthorsConf();
    final String title = "Search the Standard Model Higgs Boson";

    final List<String> abbreviated = Lists.newArrayList("Bardi, A.", "Manghi, P.", "Artini, M.");
    final List<String> spelledOut = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
    final List<String> noPids = Lists.newArrayList();

    final MapDocument first = result(conf, "A", title, "2003", noPids, abbreviated);
    final MapDocument second = result(conf, "B", title, "2003", noPids, spelledOut);

    final double score = new PaceDocumentDistance().between(first, second, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    assertTrue(0.9 < score && score < 1.0);
}
//
/**
 * Identical title and year, no pids; the first author list has one extra entry ("a") and uses
 * a comma between surname and first name. The score is only logged.
 */
@Test
public void testDistanceResultCaseAuthor4() {
    final Config conf = getResultAuthorsConf();
    final String title = "Search the Standard Model Higgs Boson";

    final List<String> withExtraAuthor = Lists.newArrayList("Bardi, Alessia", "Manghi, Paolo", "Artini, Michele", "a");
    final List<String> plainAuthors = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
    final List<String> noPids = Lists.newArrayList();

    final MapDocument first = result(conf, "A", title, "2003", noPids, withExtraAuthor);
    final MapDocument second = result(conf, "B", title, "2003", noPids, plainAuthors);

    final double score = new PaceDocumentDistance().between(first, second, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    // NOTE(review): no assertion is made here; the expected score still has to be defined.
}
//
/**
 * Same year, titles that differ only by a trailing "X", no pids and no authors: the score
 * result and its numeric score are only logged.
 */
@Test
public void testDistanceResultNoPidsConf() {
    final Config conf = getResultFullConf();

    final MapDocument first = result(conf, "A",
            "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010");
    final MapDocument second = result(conf, "B",
            "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010");

    final ScoreResult outcome = new PaceDocumentDistance().between(first, second, conf);
    final double score = outcome.getScore();

    log.info(outcome.toString());
    log.info(String.format(" s ---> %s", score));

    // NOTE(review): no assertion is made here; the expected score still has to be defined.
}
//
/**
 * Same year, titles that differ only by a trailing "X", author names in swapped order, and
 * pid lists that share "10.1186/1752-1947-4-299" and "a": the score result and its numeric
 * score are only logged.
 */
@Test
public void testDistanceResultPidsConf() {
    final Config conf = getResultFullConf();

    final List<String> authorsOfFirst = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
    final List<String> authorsOfSecond = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");

    final List<String> pidsOfFirst = Lists.newArrayList("10.1186/1752-1947-4-299", "a", "b");
    final MapDocument first = result(conf, "A",
            "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
            pidsOfFirst, authorsOfFirst);

    final List<String> pidsOfSecond = Lists.newArrayList("c", "a", "10.1186/1752-1947-4-299", "d");
    final MapDocument second = result(conf, "B",
            "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010",
            pidsOfSecond, authorsOfSecond);

    final ScoreResult outcome = new PaceDocumentDistance().between(first, second, conf);
    final double score = outcome.getScore();
    log.info(outcome.toString());
    log.info(String.format(" s ---> %s", score));

    // NOTE(review): no assertion is made here; the expected score still has to be defined.
}
//
/**
 * Same title, year and DOI, author names in swapped order: the score is only logged.
 */
@Test
public void testDistanceResultFullConf() {
    final Config conf = getResultFullConf();
    final String title = "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports";

    final List<String> authorsOfFirst = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
    final List<String> authorsOfSecond = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");

    final MapDocument first = result(conf, "A", title, "2010", "10.1186/1752-1947-4-299", authorsOfFirst);
    final MapDocument second = result(conf, "B", title, "2010", "10.1186/1752-1947-4-299", authorsOfSecond);

    final double score = new PaceDocumentDistance().between(first, second, conf).getScore();
    log.info(String.format(" d ---> %s", score));

    // NOTE(review): no assertion is made here; the expected score still has to be defined.
}
//
/**
 * Loads the production result configuration and two JSON documents from the classpath, then
 * logs the score computed between them. Ignored by default.
 *
 * @throws IOException if a classpath resource cannot be read
 */
@Ignore
@Test
public void testDistance() throws IOException {

    final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.json"));

    // asMapDocument parses its argument as JSON text, so the resource content has to be read
    // first; passing the bare classpath path would fail with a parse error.
    final MapDocument crossref = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/crossref.json"));
    final MapDocument alicante = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/alicante.json"));

    final ScoreResult result = new PaceDocumentDistance().between(crossref, alicante, conf);

    log.info("score = " + result);
}
//
/**
 * Loads two organization documents from the classpath, checks that their grouping keys
 * overlap, then logs the keys and the score computed between the documents. Ignored by default.
 *
 * @throws IOException if a classpath resource cannot be read
 */
@Ignore
@Test
public void testDistanceOrgs() throws IOException {

    final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));

    final MapDocument orgA = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization1.json"));
    final MapDocument orgB = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization2.json"));

    final Set<String> keysA = getGroupingKeys(conf, orgA);
    final Set<String> keysB = getGroupingKeys(conf, orgB);

    assertFalse(String.format("A: %s\nB: %s", keysA, keysB), Sets.intersection(keysA, keysB).isEmpty());

    // Reuse the key sets computed above instead of running the clustering again for logging.
    log.info("clustering keys A = " + keysA);
    log.info("clustering keys B = " + keysB);

    final ScoreResult result = new PaceDocumentDistance().between(orgA, orgB, conf);

    log.info("score = " + result);
    log.info("distance = " + result.getScore());
}
//
/**
 * Returns the keys produced by {@code BlacklistAwareClusteringCombiner.filterAndCombine} for
 * the given document, collected into a hash set.
 */
private Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
    final Iterable<String> combinedKeys = BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf);
    return Sets.newHashSet(combinedKeys);
}
//
/**
 * Parses the given JSON text into an {@code OafEntity} builder and wraps the built entity in a
 * {@code MapDocument} using the model of the given configuration.
 *
 * @throws IllegalArgumentException if the JSON text cannot be parsed
 */
private MapDocument asMapDocument(DedupConfig conf, final String json) {
    final OafProtos.OafEntity.Builder entityBuilder = OafProtos.OafEntity.newBuilder();
    try {
        JsonFormat.merge(json, entityBuilder);
    } catch (JsonFormat.ParseException parseFailure) {
        // keep the parse failure as the cause
        throw new IllegalArgumentException(parseFailure);
    }
    return ProtoDocumentBuilder.newInstance(
            entityBuilder.getId(),
            entityBuilder.build(),
            conf.getPace().getModel());
}
//
//
} //}

View File

@ -1,50 +1,50 @@
package eu.dnetlib.pace.model; //package eu.dnetlib.pace.model;
//
import com.google.common.collect.Iterables; //import com.google.common.collect.Iterables;
import com.google.common.collect.Sets; //import com.google.common.collect.Sets;
import com.google.common.collect.Sets.SetView; //import com.google.common.collect.Sets.SetView;
import eu.dnetlib.pace.AbstractProtoPaceTest; //import eu.dnetlib.pace.AbstractProtoPaceTest;
import eu.dnetlib.pace.config.Config; //import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.distance.DetectorTest; //import eu.dnetlib.pace.distance.DetectorTest;
import eu.dnetlib.pace.model.MapDocument; //import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.MapDocumentSerializer; //import eu.dnetlib.pace.model.MapDocumentSerializer;
import eu.dnetlib.pace.model.ProtoDocumentBuilder; //import eu.dnetlib.pace.model.ProtoDocumentBuilder;
import org.apache.commons.logging.Log; //import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; //import org.apache.commons.logging.LogFactory;
import org.junit.Test; //import org.junit.Test;
//
import static org.junit.Assert.assertFalse; //import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue; //import static org.junit.Assert.assertTrue;
//
/**
 * Checks that a {@link MapDocument} built by {@link ProtoDocumentBuilder} keeps its field names
 * after being serialized to a string and decoded again.
 */
public class ProtoDocumentBuilderTest extends AbstractProtoPaceTest {

    private static final Log log = LogFactory.getLog(ProtoDocumentBuilderTest.class);

    /**
     * Builds a document, serializes it, decodes it and verifies that no field name of the
     * original document is missing from the decoded one.
     */
    @Test
    public void test_serialise1() {
        final String id = "12345";
        final Config conf = getResultFullConf();

        final MapDocument original = ProtoDocumentBuilder.newInstance(id, getResult(id), conf.model());

        assertFalse(original.fieldNames().isEmpty());
        assertFalse(Iterables.isEmpty(original.fields()));
        log.info("original:\n" + original);

        final String serialized = MapDocumentSerializer.toString(original);
        log.info("srialization:\n" + serialized);

        final MapDocument decoded = MapDocumentSerializer.decode(serialized.getBytes());

        // field names present in the original document but absent after the round trip
        final SetView<String> lostFieldNames = Sets.difference(original.fieldNames(), decoded.fieldNames());
        assertTrue(lostFieldNames.isEmpty());

        log.info("decoded:\n" + decoded);
    }

}

View File

@ -55,6 +55,12 @@
<artifactId>junit</artifactId> <artifactId>junit</artifactId>
<scope>test</scope> <scope>test</scope>
</dependency> </dependency>
<dependency>
<groupId>org.reflections</groupId>
<artifactId>reflections</artifactId>
<version>0.9.10</version>
</dependency>
</dependencies> </dependencies>
</project> </project>

View File

@ -18,7 +18,13 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i
public AbstractClusteringFunction(final Map<String, Integer> params) { public AbstractClusteringFunction(final Map<String, Integer> params) {
this.params = params; this.params = params;
} }
public AbstractClusteringFunction(){}
public void setParams(Map<String, Integer> params){
this.params = params;
}
protected abstract Collection<String> doApply(String s); protected abstract Collection<String> doApply(String s);
@Override @Override

View File

@ -7,12 +7,17 @@ import java.util.StringTokenizer;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
@ClusteringClass("acronyms")
public class Acronyms extends AbstractClusteringFunction { public class Acronyms extends AbstractClusteringFunction {
public Acronyms(Map<String, Integer> params) { public Acronyms(Map<String, Integer> params) {
super(params); super(params);
} }
public Acronyms(){
super();
}
@Override @Override
protected Collection<String> doApply(String s) { protected Collection<String> doApply(String s) {
return extractAcronyms(s, param("max"), param("minLen"), param("maxLen")); return extractAcronyms(s, param("max"), param("minLen"), param("maxLen"));

View File

@ -22,9 +22,6 @@ public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {
private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombiner.class); private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombiner.class);
public static Collection<String> filterAndCombine(final MapDocument a, final Config conf) { public static Collection<String> filterAndCombine(final MapDocument a, final Config conf) {
final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, conf.blacklists()); final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, conf.blacklists());

View File

@ -1,5 +0,0 @@
package eu.dnetlib.pace.clustering;
/** Names of the available clustering functions; the declaration order is kept as is. */
public enum Clustering {
    acronyms,
    ngrams,
    ngrampairs,
    sortedngrampairs,
    suffixprefix,
    spacetrimmingfieldvalue,
    immutablefieldvalue,
    personhash,
    personclustering,
    lowercase,
    urlclustering
}

View File

@ -0,0 +1,13 @@
package eu.dnetlib.pace.clustering;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Type-level marker, retained at runtime, that attaches a name to the annotated type.
 */
@Target(ElementType.TYPE)
@Retention(RetentionPolicy.RUNTIME)
public @interface ClusteringClass {

    /** The name associated with the annotated type. */
    String value();
}

View File

@ -12,4 +12,5 @@ public interface ClusteringFunction {
public Map<String, Integer> getParams(); public Map<String, Integer> getParams();
public void setParams(Map<String, Integer> params);
} }

View File

@ -0,0 +1,24 @@
package eu.dnetlib.pace.clustering;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.Map;
import java.util.stream.Collectors;
import org.reflections.Reflections;
/**
 * Resolves {@link ClusteringFunction} implementations by the name declared in their
 * {@link ClusteringClass} annotation.
 *
 * The lookup table is built once, at construction time, by scanning the "eu.dnetlib" package
 * for annotated types that implement {@link ClusteringFunction}.
 */
public class ClusteringResolver implements Serializable {

    /** Annotation value mapped to the annotated implementation class. */
    private final Map<String, Class<ClusteringFunction>> functionMap;

    /**
     * Scans the classpath and indexes every annotated clustering function by its declared name.
     *
     * @throws IllegalStateException if two implementations declare the same name
     *                               (raised by {@code Collectors.toMap})
     */
    public ClusteringResolver() {
        this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ClusteringClass.class).stream()
                .filter(ClusteringFunction.class::isAssignableFrom)
                // NOTE(review): the scan may also return subtypes that do not carry the
                // annotation themselves; skip them so the key lookup below cannot hit a null.
                .filter(cl -> cl.isAnnotationPresent(ClusteringClass.class))
                .collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class<ClusteringFunction>) cl));
    }

    /**
     * Creates a new instance of the clustering function registered under the given name,
     * using its no-argument constructor.
     *
     * @param clusteringFunction the name declared in the {@link ClusteringClass} annotation
     * @return a fresh, not yet parameterized instance
     * @throws IllegalArgumentException if no function is registered under that name
     * @throws NoSuchMethodException if the implementation has no no-argument constructor
     * @throws InvocationTargetException if the constructor itself throws
     */
    public ClusteringFunction resolve(String clusteringFunction) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
        final Class<ClusteringFunction> implementation = functionMap.get(clusteringFunction);
        if (implementation == null) {
            throw new IllegalArgumentException(
                    "unknown clustering function '" + clusteringFunction + "', available: " + functionMap.keySet());
        }
        // Class.newInstance() is deprecated and lets checked constructor exceptions escape
        // undeclared; going through the constructor wraps them in InvocationTargetException.
        return implementation.getDeclaredConstructor().newInstance();
    }
}

View File

@ -6,12 +6,17 @@ import java.util.Map;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
@ClusteringClass("immutablefieldvalue")
public class ImmutableFieldValue extends AbstractClusteringFunction { public class ImmutableFieldValue extends AbstractClusteringFunction {
public ImmutableFieldValue(final Map<String, Integer> params) { public ImmutableFieldValue(final Map<String, Integer> params) {
super(params); super(params);
} }
public ImmutableFieldValue() {
super();
}
@Override @Override
protected Collection<String> doApply(final String s) { protected Collection<String> doApply(final String s) {
final List<String> res = Lists.newArrayList(); final List<String> res = Lists.newArrayList();

View File

@ -9,12 +9,17 @@ import com.google.common.collect.Sets;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
@ClusteringClass("lowercase")
public class LowercaseClustering extends AbstractClusteringFunction { public class LowercaseClustering extends AbstractClusteringFunction {
public LowercaseClustering(final Map<String, Integer> params) { public LowercaseClustering(final Map<String, Integer> params) {
super(params); super(params);
} }
public LowercaseClustering(){
super();
}
@Override @Override
public Collection<String> apply(List<Field> fields) { public Collection<String> apply(List<Field> fields) {
Collection<String> c = Sets.newLinkedHashSet(); Collection<String> c = Sets.newLinkedHashSet();

View File

@ -6,8 +6,13 @@ import java.util.Map;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
@ClusteringClass("ngrampairs")
public class NgramPairs extends Ngrams { public class NgramPairs extends Ngrams {
public NgramPairs() {
super();
}
public NgramPairs(Map<String, Integer> params) { public NgramPairs(Map<String, Integer> params) {
super(params); super(params);
} }

View File

@ -5,12 +5,17 @@ import java.util.LinkedHashSet;
import java.util.Map; import java.util.Map;
import java.util.StringTokenizer; import java.util.StringTokenizer;
@ClusteringClass("ngrams")
public class Ngrams extends AbstractClusteringFunction { public class Ngrams extends AbstractClusteringFunction {
public Ngrams(Map<String, Integer> params) { public Ngrams(Map<String, Integer> params) {
super(params); super(params);
} }
public Ngrams() {
super();
}
@Override @Override
protected Collection<String> doApply(String s) { protected Collection<String> doApply(String s) {
return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen")); return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen"));

View File

@ -1,5 +1,6 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import java.io.Serializable;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -18,6 +19,7 @@ import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.gt.Author; import eu.dnetlib.pace.model.gt.Author;
import eu.dnetlib.pace.model.gt.GTAuthor; import eu.dnetlib.pace.model.gt.GTAuthor;
@ClusteringClass("personclustering")
public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction { public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {
private Map<String, Integer> params; private Map<String, Integer> params;
@ -28,6 +30,10 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
this.params = params; this.params = params;
} }
public void setParams(Map<String, Integer> params){
this.params = params;
}
@Override @Override
public Collection<String> apply(final List<Field> fields) { public Collection<String> apply(final List<Field> fields) {
final Set<String> hashes = Sets.newHashSet(); final Set<String> hashes = Sets.newHashSet();

View File

@ -8,6 +8,7 @@ import com.google.common.collect.Lists;
import eu.dnetlib.pace.model.Person; import eu.dnetlib.pace.model.Person;
@ClusteringClass("personhash")
public class PersonHash extends AbstractClusteringFunction { public class PersonHash extends AbstractClusteringFunction {
private boolean DEFAULT_AGGRESSIVE = false; private boolean DEFAULT_AGGRESSIVE = false;
@ -16,6 +17,10 @@ public class PersonHash extends AbstractClusteringFunction {
super(params); super(params);
} }
public PersonHash(){
super();
}
@Override @Override
protected Collection<String> doApply(final String s) { protected Collection<String> doApply(final String s) {
final List<String> res = Lists.newArrayList(); final List<String> res = Lists.newArrayList();

View File

@ -9,6 +9,10 @@ public class RandomClusteringFunction extends AbstractClusteringFunction {
super(params); super(params);
} }
public RandomClusteringFunction(){
super();
}
@Override @Override
protected Collection<String> doApply(String s) { protected Collection<String> doApply(String s) {
// TODO Auto-generated method stub // TODO Auto-generated method stub

View File

@ -9,12 +9,17 @@ import com.google.common.base.Joiner;
import com.google.common.base.Splitter; import com.google.common.base.Splitter;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
@ClusteringClass("sortedngrampairs")
public class SortedNgramPairs extends NgramPairs { public class SortedNgramPairs extends NgramPairs {
public SortedNgramPairs(Map<String, Integer> params) { public SortedNgramPairs(Map<String, Integer> params) {
super(params); super(params);
} }
public SortedNgramPairs(){
super();
}
@Override @Override
protected Collection<String> doApply(String s) { protected Collection<String> doApply(String s) {

View File

@ -9,12 +9,17 @@ import org.apache.commons.lang.StringUtils;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
@ClusteringClass("spacetrimmingfieldvalue")
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction { public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
public SpaceTrimmingFieldValue(final Map<String, Integer> params) { public SpaceTrimmingFieldValue(final Map<String, Integer> params) {
super(params); super(params);
} }
public SpaceTrimmingFieldValue(){
super();
}
@Override @Override
protected Collection<String> doApply(final String s) { protected Collection<String> doApply(final String s) {
final List<String> res = Lists.newArrayList(); final List<String> res = Lists.newArrayList();

View File

@ -6,12 +6,17 @@ import java.util.Set;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
@ClusteringClass("suffixprefix")
public class SuffixPrefix extends AbstractClusteringFunction { public class SuffixPrefix extends AbstractClusteringFunction {
public SuffixPrefix(Map<String, Integer> params) { public SuffixPrefix(Map<String, Integer> params) {
super(params); super(params);
} }
public SuffixPrefix(){
super();
}
@Override @Override
protected Collection<String> doApply(String s) { protected Collection<String> doApply(String s) {
return suffixPrefix(s, param("len"), param("max")); return suffixPrefix(s, param("len"), param("max"));

View File

@ -11,6 +11,7 @@ import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ClusteringClass("urlclustering")
public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction { public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
protected Map<String, Integer> params; protected Map<String, Integer> params;
@ -19,6 +20,14 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu
this.params = params; this.params = params;
} }
public UrlClustering() {
super();
}
public void setParams(Map<String, Integer> params){
this.params = params;
}
@Override @Override
public Collection<String> apply(List<Field> fields) { public Collection<String> apply(List<Field> fields) {
return fields.stream() return fields.stream()

View File

@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
import java.util.List; import java.util.List;
import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.distance.eval.ConditionEvalMap; import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
import eu.dnetlib.pace.model.Document; import eu.dnetlib.pace.model.Document;
@ -17,15 +16,25 @@ import eu.dnetlib.pace.model.FieldDef;
*/ */
public abstract class AbstractCondition extends AbstractPaceFunctions implements ConditionAlgo { public abstract class AbstractCondition extends AbstractPaceFunctions implements ConditionAlgo {
protected Cond cond; protected String cond;
protected List<FieldDef> fields; protected List<FieldDef> fields;
public AbstractCondition(final Cond cond, final List<FieldDef> fields) { public AbstractCondition(final String cond, final List<FieldDef> fields) {
this.cond = cond; this.cond = cond;
this.fields = fields; this.fields = fields;
} }
public AbstractCondition(){}
public void setCond(String cond){
this.cond = cond;
}
public void setFields(List<FieldDef> fields){
this.fields = fields;
}
protected abstract ConditionEval verify(FieldDef fd, Field a, Field b); protected abstract ConditionEval verify(FieldDef fd, Field a, Field b);
@Override @Override

View File

@ -1,7 +1,6 @@
package eu.dnetlib.pace.condition; package eu.dnetlib.pace.condition;
import java.util.List; import java.util.List;
import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
@ -11,12 +10,16 @@ import eu.dnetlib.pace.model.FieldDef;
* *
* @author claudio * @author claudio
*/ */
@ConditionClass("alwaystruecondition")
public class AlwaysTrueCondition extends AbstractCondition { public class AlwaysTrueCondition extends AbstractCondition {
public AlwaysTrueCondition(final Cond cond, final List<FieldDef> fields) { public AlwaysTrueCondition(final String cond, final List<FieldDef> fields) {
super(cond, fields); super(cond, fields);
} }
public AlwaysTrueCondition(){
super();
}
@Override @Override
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {
return new ConditionEval(cond, a, b, 1); return new ConditionEval(cond, a, b, 1);

View File

@ -1,9 +1,9 @@
package eu.dnetlib.pace.condition; package eu.dnetlib.pace.condition;
import java.util.Map; import java.util.List;
import eu.dnetlib.pace.distance.eval.ConditionEvalMap; import eu.dnetlib.pace.distance.eval.ConditionEvalMap;
import eu.dnetlib.pace.model.Document; import eu.dnetlib.pace.model.Document;
import eu.dnetlib.pace.model.FieldDef;
/** /**
* Allows to express general conditions to be satisfied or not between two Documents. * Allows to express general conditions to be satisfied or not between two Documents.
@ -24,4 +24,7 @@ public interface ConditionAlgo {
*/ */
public abstract ConditionEvalMap verify(Document a, Document b); public abstract ConditionEvalMap verify(Document a, Document b);
public void setFields(List<FieldDef> fields);
public void setCond(String name);
} }

View File

@ -0,0 +1,13 @@
package eu.dnetlib.pace.condition;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Marks a class as a condition implementation and gives it the name under which
 * it can be looked up.
 *
 * <p>Retained at runtime so that it can be read reflectively; applicable to types only.
 */
@Target(ElementType.TYPE)
@Retention(RetentionPolicy.RUNTIME)
public @interface ConditionClass {

    /** The name associated with the annotated class. */
    String value();
}

View File

@ -0,0 +1,22 @@
package eu.dnetlib.pace.condition;
import java.io.Serializable;
import java.util.Map;
import java.util.stream.Collectors;
import org.reflections.Reflections;
/**
 * Maps the symbolic name of a condition to its implementing class.
 *
 * <p>The mapping is built once, in the constructor, by scanning the {@code eu.dnetlib}
 * package for types annotated with {@link ConditionClass} that are also assignable to
 * {@link ConditionAlgo}. The annotation value is used as the lookup key.
 */
public class ConditionResolver implements Serializable {

    /** Annotation value -> implementing class. */
    private final Map<String, Class<ConditionAlgo>> functionMap;

    /**
     * Scans the classpath and registers every annotated condition.
     *
     * @throws IllegalStateException if two scanned classes declare the same annotation value
     *         (raised by {@link Collectors#toMap})
     */
    // The cast is safe: the stream is filtered with ConditionAlgo.class::isAssignableFrom first.
    @SuppressWarnings("unchecked")
    public ConditionResolver() {
        this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ConditionClass.class).stream()
                .filter(ConditionAlgo.class::isAssignableFrom)
                // Guard: skip any scanned type that does not carry the annotation itself,
                // otherwise getAnnotation(...) below would return null.
                .filter(cl -> cl.isAnnotationPresent(ConditionClass.class))
                .collect(Collectors.toMap(
                        cl -> cl.getAnnotation(ConditionClass.class).value(),
                        cl -> (Class<ConditionAlgo>) cl));
    }

    /**
     * Creates a new instance of the condition registered under the given name.
     *
     * @param name the name declared in the {@link ConditionClass} annotation
     * @return a fresh instance created through the no-argument constructor
     * @throws IllegalArgumentException if no condition is registered under that name
     * @throws InstantiationException if the class cannot be instantiated, e.g. it is abstract
     *         or has no no-argument constructor
     * @throws IllegalAccessException if the class or its no-argument constructor is not accessible
     */
    public ConditionAlgo resolve(String name) throws IllegalAccessException, InstantiationException {
        final Class<ConditionAlgo> type = functionMap.get(name);
        if (type == null) {
            // Fail with a meaningful message instead of a bare NullPointerException.
            throw new IllegalArgumentException(
                    "Unknown condition '" + name + "'; available: " + functionMap.keySet());
        }
        // newInstance() is kept so that the checked exceptions in the signature stay unchanged for callers.
        return type.newInstance();
    }
}

View File

@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
import java.util.List; import java.util.List;
import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
@ -11,11 +10,12 @@ import eu.dnetlib.pace.model.FieldDef;
* *
* @author claudio * @author claudio
*/ */
@ConditionClass("doiExactMatch")
public class DoiExactMatch extends ExactMatchIgnoreCase { public class DoiExactMatch extends ExactMatchIgnoreCase {
public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
public DoiExactMatch(final Cond cond, final List<FieldDef> fields) { public DoiExactMatch(final String cond, final List<FieldDef> fields) {
super(cond, fields); super(cond, fields);
} }

View File

@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
import java.util.List; import java.util.List;
import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
@ -13,12 +12,17 @@ import org.apache.commons.lang.StringUtils;
* *
* @author claudio * @author claudio
*/ */
@ConditionClass("exactMatch")
public class ExactMatch extends AbstractCondition { public class ExactMatch extends AbstractCondition {
public ExactMatch(final Cond cond, final List<FieldDef> fields) { public ExactMatch(final String cond, final List<FieldDef> fields) {
super(cond, fields); super(cond, fields);
} }
public ExactMatch(){
super();
}
@Override @Override
protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) {

View File

@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
import java.util.List; import java.util.List;
import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
@ -12,9 +11,10 @@ import eu.dnetlib.pace.model.FieldDef;
* *
* @author claudio * @author claudio
*/ */
@ConditionClass("exactMatchIgnoreCase")
public class ExactMatchIgnoreCase extends AbstractCondition { public class ExactMatchIgnoreCase extends AbstractCondition {
public ExactMatchIgnoreCase(final Cond cond, final List<FieldDef> fields) { public ExactMatchIgnoreCase(final String cond, final List<FieldDef> fields) {
super(cond, fields); super(cond, fields);
} }

View File

@ -3,7 +3,6 @@ package eu.dnetlib.pace.condition;
import java.util.List; import java.util.List;
import com.google.common.collect.Iterables; import com.google.common.collect.Iterables;
import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
@ -13,6 +12,7 @@ import eu.dnetlib.pace.model.FieldDef;
* *
* @author claudio * @author claudio
*/ */
@ConditionClass("mustBeDifferent")
public class MustBeDifferent extends AbstractCondition { public class MustBeDifferent extends AbstractCondition {
/** /**
@ -20,7 +20,7 @@ public class MustBeDifferent extends AbstractCondition {
* *
* @param fields the fields * @param fields the fields
*/ */
public MustBeDifferent(final Cond cond, final List<FieldDef> fields) { public MustBeDifferent(final String cond, final List<FieldDef> fields) {
super(cond, fields); super(cond, fields);
} }

View File

@ -6,7 +6,6 @@ import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
@ -20,11 +19,12 @@ import org.apache.commons.logging.LogFactory;
* *
* @author claudio * @author claudio
*/ */
@ConditionClass("pidMatch")
public class PidMatch extends AbstractCondition { public class PidMatch extends AbstractCondition {
private static final Log log = LogFactory.getLog(PidMatch.class); private static final Log log = LogFactory.getLog(PidMatch.class);
public PidMatch(final Cond cond, final List<FieldDef> fields) { public PidMatch(final String cond, final List<FieldDef> fields) {
super(cond, fields); super(cond, fields);
} }

View File

@ -4,7 +4,6 @@ import java.util.List;
import com.google.common.collect.Iterables; import com.google.common.collect.Iterables;
import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
@ -14,6 +13,7 @@ import eu.dnetlib.pace.model.FieldDef;
* *
* @author claudio * @author claudio
*/ */
@ConditionClass("sizeMatch")
public class SizeMatch extends AbstractCondition { public class SizeMatch extends AbstractCondition {
/** /**
@ -22,7 +22,7 @@ public class SizeMatch extends AbstractCondition {
* @param fields * @param fields
* the fields * the fields
*/ */
public SizeMatch(final Cond cond, final List<FieldDef> fields) { public SizeMatch(final String cond, final List<FieldDef> fields) {
super(cond, fields); super(cond, fields);
} }

View File

@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition;
import java.util.List; import java.util.List;
import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.distance.eval.ConditionEval;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;
@ -13,9 +12,10 @@ import eu.dnetlib.pace.model.FieldDef;
* @author claudio * @author claudio
* *
*/ */
@ConditionClass("titleVersionMatch")
public class TitleVersionMatch extends AbstractCondition { public class TitleVersionMatch extends AbstractCondition {
public TitleVersionMatch(final Cond cond, final List<FieldDef> fields) { public TitleVersionMatch(final String cond, final List<FieldDef> fields) {
super(cond, fields); super(cond, fields);
} }

View File

@ -1,8 +1,8 @@
package eu.dnetlib.pace.condition; package eu.dnetlib.pace.condition;
import java.time.Year;
import java.util.List; import java.util.List;
import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.distance.eval.ConditionEval;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
@ -14,14 +14,17 @@ import eu.dnetlib.pace.model.FieldDef;
* *
* @author claudio * @author claudio
*/ */
@ConditionClass("yearMatch")
public class YearMatch extends AbstractCondition { public class YearMatch extends AbstractCondition {
private int limit = 4; private int limit = 4;
public YearMatch(final Cond cond, final List<FieldDef> fields) { public YearMatch(final String cond, final List<FieldDef> fields) {
super(cond, fields); super(cond, fields);
} }
public YearMatch(){}
// @Override // @Override
// public boolean verify(final Document a, final Document b) { // public boolean verify(final Document a, final Document b) {
// boolean res = true; // boolean res = true;

View File

@ -1,46 +0,0 @@
package eu.dnetlib.pace.config;
/**
 * Enumerates the distance algorithms by name.
 *
 * <p>NOTE(review): constant names are presumably matched against configuration values,
 * so they must not be renamed or re-cased (note that {@code urlMatcher} is the only one
 * starting with a lower-case letter) — confirm against the configuration loader.
 */
public enum Algo {
/** Jaro-Winkler distance. */
JaroWinkler,
/** Jaro-Winkler distance for title matching. */
JaroWinklerTitle,
/** Levenstein distance (spelling as used throughout the code base). */
Levenstein,
/** Levenstein distance for title matching. */
LevensteinTitle,
/** Level2 Jaro-Winkler distance. */
Level2JaroWinkler,
/** Level2 Jaro-Winkler distance for title matching. */
Level2JaroWinklerTitle,
/** Level2 Levenstein distance. */
Level2Levenstein,
/** Sub-string Levenstein distance. */
SubStringLevenstein,
/** Year Levenstein distance. */
YearLevenstein,
/** Sorted Jaro-Winkler distance. */
SortedJaroWinkler,
/** Sorted Level2 Jaro-Winkler distance. */
SortedLevel2JaroWinkler,
/** Compares two URLs. */
urlMatcher,
/** Exact match algorithm. */
ExactMatch,
/**
 * Returns 0 for equal strings, 1 for different strings.
 */
MustBeDifferent,
/** Always returns 1.0 as distance. */
AlwaysMatch,
/** Person distances (this and the two constants that follow). */
PersonCoAuthorSurnamesDistance,
PersonCoAnchorsDistance,
PersonDistance,
/** The null algorithm; presumably for fields that take no part in the distance measure — TODO confirm. */
Null
}

View File

@ -1,28 +0,0 @@
package eu.dnetlib.pace.config;
/**
 * Enumerates the available conditions by name.
 *
 * <p>NOTE(review): the lower-camel-case constant names are presumably matched against
 * configuration values, so they must not be renamed or re-cased — confirm against the
 * configuration loader.
 */
public enum Cond {
/** The year match. */
yearMatch,
/** The title version match. */
titleVersionMatch,
/** The size match. */
sizeMatch,
/**
 * Returns true if the field values are different.
 */
mustBeDifferent,
/** The exact match. */
exactMatch,
/**
 * The exact match, ignoring case.
 */
exactMatchIgnoreCase,
/** The exact match specialized to recognize DOI values. */
doiExactMatch,
/** The exact match that checks whether pid type and value are the same. */
pidMatch
}

View File

@ -2,6 +2,8 @@ package eu.dnetlib.pace.distance;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import java.util.Map;
/** /**
* Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two * Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two
* objects. * objects.
@ -11,5 +13,9 @@ public interface DistanceAlgo {
public abstract double distance(Field a, Field b); public abstract double distance(Field a, Field b);
public double getWeight(); public double getWeight();
public Map<String, Number> getParams();
public void setWeight(double w);
public void setParams(Map<String, Number> params);
} }

View File

@ -0,0 +1,13 @@
package eu.dnetlib.pace.distance;
import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;
/**
 * Marks a class as a distance algorithm implementation and gives it the name under
 * which it can be looked up.
 *
 * <p>Retained at runtime so that it can be read reflectively; applicable to types only.
 */
@Target(ElementType.TYPE)
@Retention(RetentionPolicy.RUNTIME)
public @interface DistanceClass {

    /** The name associated with the annotated class. */
    String value();
}

View File

@ -0,0 +1,24 @@
package eu.dnetlib.pace.distance;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.Map;
import java.util.stream.Collectors;
import org.reflections.Reflections;
/**
 * Maps the symbolic name of a distance algorithm to its implementing class.
 *
 * <p>The mapping is built once, in the constructor, by scanning the {@code eu.dnetlib}
 * package for types annotated with {@link DistanceClass} that are also assignable to
 * {@link DistanceAlgo}. The annotation value is used as the lookup key.
 */
public class DistanceResolver implements Serializable {

    /** Annotation value -> implementing class. */
    private final Map<String, Class<DistanceAlgo>> functionMap;

    /**
     * Scans the classpath and registers every annotated distance algorithm.
     *
     * @throws IllegalStateException if two scanned classes declare the same annotation value
     *         (raised by {@link Collectors#toMap})
     */
    // The cast is safe: the stream is filtered with DistanceAlgo.class::isAssignableFrom first.
    @SuppressWarnings("unchecked")
    public DistanceResolver() {
        this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(DistanceClass.class).stream()
                .filter(DistanceAlgo.class::isAssignableFrom)
                // Guard: skip any scanned type that does not carry the annotation itself,
                // otherwise getAnnotation(...) below would return null.
                .filter(cl -> cl.isAnnotationPresent(DistanceClass.class))
                .collect(Collectors.toMap(
                        cl -> cl.getAnnotation(DistanceClass.class).value(),
                        cl -> (Class<DistanceAlgo>) cl));
    }

    /**
     * Creates a new instance of the distance algorithm registered under the given name.
     *
     * @param algo the name declared in the {@link DistanceClass} annotation
     * @return a fresh instance created through the no-argument constructor
     * @throws IllegalArgumentException if no algorithm is registered under that name
     * @throws InstantiationException if the class cannot be instantiated, e.g. it is abstract
     *         or has no no-argument constructor
     * @throws IllegalAccessException if the class or its no-argument constructor is not accessible
     */
    public DistanceAlgo resolve(String algo) throws IllegalAccessException, InstantiationException {
        final Class<DistanceAlgo> type = functionMap.get(algo);
        if (type == null) {
            // Fail with a meaningful message instead of a bare NullPointerException.
            throw new IllegalArgumentException(
                    "Unknown distance algorithm '" + algo + "'; available: " + functionMap.keySet());
        }
        // newInstance() is kept so that the checked exceptions in the signature stay unchanged for callers.
        return type.newInstance();
    }
}

View File

@ -25,7 +25,7 @@ public class DistanceScorer {
} }
public ScoreResult distance(final Document a, final Document b) { public ScoreResult distance(final Document a, final Document b) {
final ScoreResult sr = new ScoreResult(); final ScoreResult sr = new ScoreResult(); //to keep track of the result of the comparison
sr.setStrictConditions(verify(a, b, config.strictConditions())); sr.setStrictConditions(verify(a, b, config.strictConditions()));
sr.setConditions(verify(a, b, config.conditions())); sr.setConditions(verify(a, b, config.conditions()));

View File

@ -1,6 +1,8 @@
package eu.dnetlib.pace.distance; package eu.dnetlib.pace.distance;
import java.io.Serializable;
import java.util.List; import java.util.List;
import java.util.Map;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
@ -24,6 +26,27 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
/** The weight. */ /** The weight. */
protected double weight = 0.0; protected double weight = 0.0;
private Map<String, Number> params;
protected SecondStringDistanceAlgo(){
}
protected SecondStringDistanceAlgo(Map<String, Number> params){
this.params = params;
}
public void setWeight(double w){
this.weight = w;
}
public Map<String, Number> getParams(){
return this.params;
}
public void setParams(Map<String, Number> params){
this.params = params;
}
/** /**
* Instantiates a new second string distance algo. * Instantiates a new second string distance algo.
* *
@ -37,6 +60,10 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp
this.weight = weight; this.weight = weight;
} }
protected SecondStringDistanceAlgo(final AbstractStringDistance ssalgo){
this.ssalgo = ssalgo;
}
/** /**
* Normalize. * Normalize.
* *

View File

@ -1,10 +1,22 @@
package eu.dnetlib.pace.distance.algo; package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
@DistanceClass("AlwaysMatch")
public class AlwaysMatch extends SecondStringDistanceAlgo { public class AlwaysMatch extends SecondStringDistanceAlgo {
public AlwaysMatch(){
super();
}
public AlwaysMatch(final Map<String, Number> params){
super(params);
}
public AlwaysMatch(final double weight) { public AlwaysMatch(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler()); super(weight, new com.wcohen.ss.JaroWinkler());
} }

View File

@ -1,10 +1,22 @@
package eu.dnetlib.pace.distance.algo; package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
@DistanceClass("ExactMatch")
public class ExactMatch extends SecondStringDistanceAlgo { public class ExactMatch extends SecondStringDistanceAlgo {
public ExactMatch(){
super();
}
public ExactMatch(Map<String, Number> params){
super(params);
}
public ExactMatch(final double weight) { public ExactMatch(final double weight) {
super(weight, new com.wcohen.ss.JaroWinkler()); super(weight, new com.wcohen.ss.JaroWinkler());
} }

View File

@ -1,11 +1,23 @@
package eu.dnetlib.pace.distance.algo; package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) //case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
@DistanceClass("JaroWinkler")
public class JaroWinkler extends SecondStringDistanceAlgo { public class JaroWinkler extends SecondStringDistanceAlgo {
public JaroWinkler(){
super();
}
public JaroWinkler(Map<String, Number> params){
super(params);
}
public JaroWinkler(double weight) { public JaroWinkler(double weight) {
super(weight, new com.wcohen.ss.JaroWinkler()); super(weight, new com.wcohen.ss.JaroWinkler());
} }

View File

@ -1,11 +1,23 @@
package eu.dnetlib.pace.distance.algo; package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import java.util.Map;
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) //case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
@DistanceClass("JaroWinklerTitle")
public class JaroWinklerTitle extends SecondStringDistanceAlgo { public class JaroWinklerTitle extends SecondStringDistanceAlgo {
public JaroWinklerTitle(){
super();
}
public JaroWinklerTitle(Map<String, Number> params){
super(params);
}
public JaroWinklerTitle(double weight) { public JaroWinklerTitle(double weight) {
super(weight, new com.wcohen.ss.JaroWinkler()); super(weight, new com.wcohen.ss.JaroWinkler());
} }

View File

@ -1,8 +1,10 @@
package eu.dnetlib.pace.distance.algo; package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
@DistanceClass("Level2JaroWinkler")
public class Level2JaroWinkler extends SecondStringDistanceAlgo { public class Level2JaroWinkler extends SecondStringDistanceAlgo {
public Level2JaroWinkler(double w) { public Level2JaroWinkler(double w) {

View File

@ -1,8 +1,10 @@
package eu.dnetlib.pace.distance.algo; package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
@DistanceClass("Level2JaroWinklerTitle")
public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo { public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo {
public Level2JaroWinklerTitle(final double w) { public Level2JaroWinklerTitle(final double w) {

View File

@ -1,8 +1,10 @@
package eu.dnetlib.pace.distance.algo; package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
@DistanceClass("Level2Levenstein")
public class Level2Levenstein extends SecondStringDistanceAlgo { public class Level2Levenstein extends SecondStringDistanceAlgo {
public Level2Levenstein(double w) { public Level2Levenstein(double w) {

View File

@ -1,10 +1,16 @@
package eu.dnetlib.pace.distance.algo; package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
@DistanceClass("Levenstein")
public class Levenstein extends SecondStringDistanceAlgo { public class Levenstein extends SecondStringDistanceAlgo {
public Levenstein(){
super(new com.wcohen.ss.Levenstein());
}
public Levenstein(double w) { public Levenstein(double w) {
super(w, new com.wcohen.ss.Levenstein()); super(w, new com.wcohen.ss.Levenstein());
} }

View File

@ -1,10 +1,16 @@
package eu.dnetlib.pace.distance.algo; package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
@DistanceClass("LevensteinTitle")
public class LevensteinTitle extends SecondStringDistanceAlgo { public class LevensteinTitle extends SecondStringDistanceAlgo {
public LevensteinTitle(){
super(new com.wcohen.ss.Levenstein());
}
public LevensteinTitle(final double w) { public LevensteinTitle(final double w) {
super(w, new com.wcohen.ss.Levenstein()); super(w, new com.wcohen.ss.Levenstein());
} }

View File

@ -1,8 +1,10 @@
package eu.dnetlib.pace.distance.algo; package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
@DistanceClass("MustBeDifferent")
public class MustBeDifferent extends SecondStringDistanceAlgo { public class MustBeDifferent extends SecondStringDistanceAlgo {
public MustBeDifferent(final double weight) { public MustBeDifferent(final double weight) {

View File

@ -1,12 +1,16 @@
package eu.dnetlib.pace.distance.algo; package eu.dnetlib.pace.distance.algo;
import eu.dnetlib.pace.distance.DistanceAlgo; import eu.dnetlib.pace.distance.DistanceAlgo;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import java.util.Map;
/** /**
* Not all fields of a document need to participate in the distance measure. We model those fields as having a * Not all fields of a document need to participate in the distance measure. We model those fields as having a
* NullDistanceAlgo. * NullDistanceAlgo.
*/ */
@DistanceClass("Null")
public class NullDistanceAlgo implements DistanceAlgo { public class NullDistanceAlgo implements DistanceAlgo {
@Override @Override
@ -19,4 +23,16 @@ public class NullDistanceAlgo implements DistanceAlgo {
return 0.0; return 0.0;
} }
@Override
public void setWeight(double w){
}
@Override
public Map<String, Number> getParams() {
return null;
}
@Override
public void setParams(Map<String, Number> params) {
}
} }

View File

@ -1,10 +1,12 @@
package eu.dnetlib.pace.distance.algo; package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
/** /**
* The Class SortedJaroWinkler. * The Class SortedJaroWinkler.
*/ */
@DistanceClass("SortedJaroWinkler")
public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo { public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo {
/** /**

View File

@ -1,10 +1,12 @@
package eu.dnetlib.pace.distance.algo; package eu.dnetlib.pace.distance.algo;
import com.wcohen.ss.AbstractStringDistance; import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.distance.DistanceClass;
/** /**
* The Class SortedLevel2JaroWinkler. * The Class SortedLevel2JaroWinkler.
*/ */
@DistanceClass("Sorted2JaroWinkler")
public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo { public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo {
/** /**

View File

@ -1,5 +1,6 @@
package eu.dnetlib.pace.distance.algo; package eu.dnetlib.pace.distance.algo;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
@ -8,14 +9,21 @@ import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import java.util.Map;
/** /**
* The Class SubStringLevenstein. * The Class SubStringLevenstein.
*/ */
@DistanceClass("SubStringLevenstein")
public class SubStringLevenstein extends SecondStringDistanceAlgo { public class SubStringLevenstein extends SecondStringDistanceAlgo {
/** The limit. */ /** The limit. */
protected int limit; protected int limit;
public SubStringLevenstein() {
super(new com.wcohen.ss.Levenstein());
}
/** /**
* Instantiates a new sub string levenstein. * Instantiates a new sub string levenstein.
* *
@ -87,4 +95,9 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo {
return 1 / Math.pow(Math.abs(d) + 1, 0.1); return 1 / Math.pow(Math.abs(d) + 1, 0.1);
} }
public void setParams(Map<String, Number> params){
this.limit = params.get("limit").intValue(); //necessary because this class needs also the limit
super.setParams(params);
}
} }

View File

@ -1,5 +1,6 @@
package eu.dnetlib.pace.distance.algo; package eu.dnetlib.pace.distance.algo;
import eu.dnetlib.pace.distance.DistanceClass;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
@ -7,15 +8,24 @@ import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.util.Map; import java.util.Map;
@DistanceClass("urlMatcher")
public class UrlMatcher extends Levenstein { public class UrlMatcher extends Levenstein {
private Map<String, Number> params; private Map<String, Number> params;
public UrlMatcher(){
super();
}
public UrlMatcher(double weight, Map<String, Number> params) { public UrlMatcher(double weight, Map<String, Number> params) {
super(weight); super(weight);
this.params = params; this.params = params;
} }
public void setParams(Map<String, Number> params) {
this.params = params;
}
@Override @Override
public double distance(Field a, Field b) { public double distance(Field a, Field b) {

View File

@ -1,6 +1,5 @@
package eu.dnetlib.pace.distance.eval; package eu.dnetlib.pace.distance.eval;
import eu.dnetlib.pace.config.Cond;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
/** /**
@ -8,7 +7,7 @@ import eu.dnetlib.pace.model.Field;
*/ */
public class ConditionEval { public class ConditionEval {
private Cond cond; private String cond;
private Field a; private Field a;
@ -16,7 +15,7 @@ public class ConditionEval {
private int result; private int result;
public ConditionEval(final Cond cond, final Field a, final Field b, final int result) { public ConditionEval(final String cond, final Field a, final Field b, final int result) {
this.cond = cond; this.cond = cond;
this.a = a; this.a = a;
this.b = b; this.b = b;
@ -47,11 +46,11 @@ public class ConditionEval {
this.result = result; this.result = result;
} }
public Cond getCond() { public String getCond() {
return cond; return cond;
} }
public void setCond(final Cond cond) { public void setCond(final String cond) {
this.cond = cond; this.cond = cond;
} }
} }

View File

@ -1,6 +1,5 @@
package eu.dnetlib.pace.distance.eval; package eu.dnetlib.pace.distance.eval;
import eu.dnetlib.pace.config.Algo;
import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldDef; import eu.dnetlib.pace.model.FieldDef;

View File

@ -2,10 +2,12 @@ package eu.dnetlib.pace.distance.eval;
import com.google.gson.GsonBuilder; import com.google.gson.GsonBuilder;
import java.io.Serializable;
/** /**
* Created by claudio on 09/03/16. * Created by claudio on 09/03/16.
*/ */
public class ScoreResult { public class ScoreResult implements Serializable {
private ConditionEvalMap strictConditions; private ConditionEvalMap strictConditions;
@ -49,8 +51,12 @@ public class ScoreResult {
@Override @Override
public String toString() { public String toString() {
final GsonBuilder b = new GsonBuilder(); //TODO cannot print: why?
b.serializeSpecialFloatingPointValues(); // final GsonBuilder b = new GsonBuilder()
return b.setPrettyPrinting().create().toJson(this); // .serializeSpecialFloatingPointValues()
// .serializeNulls();
//
// return b.setPrettyPrinting().create().toJson(this);
return "{}";
} }
} }

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace.model; package eu.dnetlib.pace.model;
import java.io.Serializable; import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -9,49 +10,36 @@ import eu.dnetlib.pace.clustering.*;
public class ClusteringDef implements Serializable { public class ClusteringDef implements Serializable {
private Clustering name; private String name;
private List<String> fields; private List<String> fields;
private Map<String, Integer> params; private Map<String, Integer> params;
private ClusteringResolver clusteringResolver = new ClusteringResolver();
public ClusteringDef() {} public ClusteringDef() {}
public Clustering getName() { public String getName() {
return name; return name;
} }
public void setName(final Clustering name) { public void setName(final String name) {
this.name = name; this.name = name;
} }
public ClusteringFunction getClusteringFunction() { public ClusteringFunction getClusteringFunction() {
switch (getName()) {
case acronyms: try {
return new Acronyms(getParams()); ClusteringFunction clusteringFunction = clusteringResolver.resolve(getName());
case ngrams: clusteringFunction.setParams(params);
return new Ngrams(getParams()); return clusteringFunction;
case ngrampairs:
return new NgramPairs(getParams()); } catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) {
case sortedngrampairs: e.printStackTrace();
return new SortedNgramPairs(getParams());
case suffixprefix:
return new SuffixPrefix(getParams());
case spacetrimmingfieldvalue:
return new SpaceTrimmingFieldValue(getParams());
case immutablefieldvalue:
return new ImmutableFieldValue(getParams());
case personhash:
return new PersonHash(getParams());
case personclustering:
return new PersonClustering(getParams());
case lowercase:
return new LowercaseClustering(getParams());
case urlclustering:
return new UrlClustering(getParams());
default:
return new RandomClusteringFunction(getParams()); return new RandomClusteringFunction(getParams());
} }
} }
public List<String> getFields() { public List<String> getFields() {

View File

@ -5,44 +5,36 @@ import java.util.List;
import com.google.gson.Gson; import com.google.gson.Gson;
import eu.dnetlib.pace.condition.*; import eu.dnetlib.pace.condition.*;
import eu.dnetlib.pace.config.Cond;
public class CondDef implements Serializable { public class CondDef implements Serializable {
private Cond name; private String name;
private List<String> fields; private List<String> fields;
private ConditionResolver conditionResolver = new ConditionResolver();
public CondDef() {} public CondDef() {}
public ConditionAlgo getConditionAlgo(final List<FieldDef> fields) { public ConditionAlgo getConditionAlgo(final List<FieldDef> fields) {
switch (getName()) {
case yearMatch: try {
return new YearMatch(getName(), fields); ConditionAlgo conditionAlgo = conditionResolver.resolve(getName());
case titleVersionMatch: conditionAlgo.setFields(fields);
return new TitleVersionMatch(getName(), fields); conditionAlgo.setCond(getName());
case sizeMatch: return conditionAlgo;
return new SizeMatch(getName(), fields); } catch (IllegalAccessException | InstantiationException e) {
case exactMatch: e.printStackTrace();
return new ExactMatch(getName(), fields);
case mustBeDifferent:
return new MustBeDifferent(getName(), fields);
case exactMatchIgnoreCase:
return new ExactMatchIgnoreCase(getName(), fields);
case doiExactMatch:
return new DoiExactMatch(getName(), fields);
case pidMatch:
return new PidMatch(getName(), fields);
default:
return new AlwaysTrueCondition(getName(), fields); return new AlwaysTrueCondition(getName(), fields);
} }
} }
public Cond getName() { public String getName() {
return name; return name;
} }
public void setName(final Cond name) { public void setName(final String name) {
this.name = name; this.name = name;
} }

View File

@ -1,13 +1,13 @@
package eu.dnetlib.pace.model; package eu.dnetlib.pace.model;
import java.io.Serializable; import java.io.Serializable;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import com.google.common.base.Splitter; import com.google.common.base.Splitter;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.gson.Gson; import com.google.gson.Gson;
import eu.dnetlib.pace.config.Algo;
import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.distance.*; import eu.dnetlib.pace.distance.*;
import eu.dnetlib.pace.distance.algo.*; import eu.dnetlib.pace.distance.algo.*;
@ -19,7 +19,7 @@ public class FieldDef implements Serializable {
public final static String PATH_SEPARATOR = "/"; public final static String PATH_SEPARATOR = "/";
private Algo algo; private String algo;
private String name; private String name;
@ -37,6 +37,8 @@ public class FieldDef implements Serializable {
private Map<String, Number> params; private Map<String, Number> params;
private DistanceResolver distanceResolver = new DistanceResolver();
public FieldDef() {} public FieldDef() {}
// def apply(s: String): Field[A] // def apply(s: String): Field[A]
@ -66,40 +68,22 @@ public class FieldDef implements Serializable {
} }
public DistanceAlgo getDistanceAlgo() { public DistanceAlgo getDistanceAlgo() {
switch (getAlgo()) {
case JaroWinkler: try {
return new JaroWinkler(getWeight()); if (params == null) {
case JaroWinklerTitle: params = new HashMap<>();
return new JaroWinklerTitle(getWeight()); }
case Level2JaroWinkler: params.put("limit", getLimit());
return new Level2JaroWinkler(getWeight()); params.put("weight", getWeight());
case Level2JaroWinklerTitle: DistanceAlgo distanceAlgo = distanceResolver.resolve(getAlgo());
return new Level2JaroWinklerTitle(getWeight()); distanceAlgo.setParams(params);
case Level2Levenstein: distanceAlgo.setWeight(getWeight());
return new Level2Levenstein(getWeight()); return distanceAlgo;
case Levenstein: } catch (IllegalAccessException | InstantiationException e) {
return new Levenstein(getWeight()); e.printStackTrace();
case LevensteinTitle:
return new LevensteinTitle(getWeight());
case SubStringLevenstein:
return new SubStringLevenstein(getWeight(), getLimit());
case SortedJaroWinkler:
return new SortedJaroWinkler(getWeight());
case SortedLevel2JaroWinkler:
return new SortedLevel2JaroWinkler(getWeight());
case urlMatcher:
return new UrlMatcher(getWeight(), getParams());
case ExactMatch:
return new ExactMatch(getWeight());
case MustBeDifferent:
return new MustBeDifferent(getWeight());
case AlwaysMatch:
return new AlwaysMatch(getWeight());
case Null:
return new NullDistanceAlgo();
default:
return new NullDistanceAlgo(); return new NullDistanceAlgo();
} }
} }
public boolean isIgnoreMissing() { public boolean isIgnoreMissing() {
@ -135,11 +119,11 @@ public class FieldDef implements Serializable {
this.weight = weight; this.weight = weight;
} }
public Algo getAlgo() { public String getAlgo() {
return algo; return algo;
} }
public void setAlgo(final Algo algo) { public void setAlgo(final String algo) {
this.algo = algo; this.algo = algo;
} }

View File

@ -23,7 +23,6 @@ public class BlockProcessor {
private DedupConfig dedupConf; private DedupConfig dedupConf;
public static void constructAccumulator( final DedupConfig dedupConf) { public static void constructAccumulator( final DedupConfig dedupConf) {
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1")); accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1"));
accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField())); accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));

Binary file not shown.