From 1d678ddc9cecea8355fc11852c7d958b583406e6 Mon Sep 17 00:00:00 2001 From: Michele De Bonis Date: Wed, 24 Oct 2018 12:09:41 +0200 Subject: [PATCH] update in the discovery of clustering, conditions and distance functions (annotated with custom annotations) --- .../src/main/java/eu/dnetlib/SparkTest.java | 5 +- .../dnetlib/pace/AbstractProtoPaceTest.java | 416 ++++---- .../test/java/eu/dnetlib/pace/OafTest.java | 892 ++++++++--------- .../BlacklistAwareClusteringCombinerTest.java | 84 +- .../clustering/ClusteringCombinerTest.java | 78 +- .../dnetlib/pace/distance/DetectorTest.java | 900 +++++++++--------- .../pace/model/ProtoDocumentBuilderTest.java | 100 +- dnet-pace-core/pom.xml | 6 + .../AbstractClusteringFunction.java | 8 +- .../eu/dnetlib/pace/clustering/Acronyms.java | 5 + .../BlacklistAwareClusteringCombiner.java | 3 - .../dnetlib/pace/clustering/Clustering.java | 5 - .../pace/clustering/ClusteringClass.java | 13 + .../pace/clustering/ClusteringFunction.java | 1 + .../pace/clustering/ClusteringResolver.java | 24 + .../pace/clustering/ImmutableFieldValue.java | 5 + .../pace/clustering/LowercaseClustering.java | 5 + .../dnetlib/pace/clustering/NgramPairs.java | 5 + .../eu/dnetlib/pace/clustering/Ngrams.java | 7 +- .../pace/clustering/PersonClustering.java | 6 + .../dnetlib/pace/clustering/PersonHash.java | 5 + .../clustering/RandomClusteringFunction.java | 4 + .../pace/clustering/SortedNgramPairs.java | 5 + .../clustering/SpaceTrimmingFieldValue.java | 5 + .../dnetlib/pace/clustering/SuffixPrefix.java | 5 + .../pace/clustering/UrlClustering.java | 9 + .../pace/condition/AbstractCondition.java | 15 +- .../pace/condition/AlwaysTrueCondition.java | 7 +- .../dnetlib/pace/condition/ConditionAlgo.java | 7 +- .../pace/condition/ConditionClass.java | 13 + .../pace/condition/ConditionResolver.java | 22 + .../dnetlib/pace/condition/DoiExactMatch.java | 4 +- .../eu/dnetlib/pace/condition/ExactMatch.java | 8 +- .../pace/condition/ExactMatchIgnoreCase.java | 4 +- .../pace/condition/MustBeDifferent.java | 4 +- .../eu/dnetlib/pace/condition/PidMatch.java | 4 +- .../eu/dnetlib/pace/condition/SizeMatch.java | 4 +- .../pace/condition/TitleVersionMatch.java | 4 +- .../eu/dnetlib/pace/condition/YearMatch.java | 7 +- .../java/eu/dnetlib/pace/config/Algo.java | 46 - .../java/eu/dnetlib/pace/config/Cond.java | 28 - .../dnetlib/pace/distance/DistanceAlgo.java | 6 + .../dnetlib/pace/distance/DistanceClass.java | 13 + .../pace/distance/DistanceResolver.java | 24 + .../dnetlib/pace/distance/DistanceScorer.java | 2 +- .../distance/SecondStringDistanceAlgo.java | 27 + .../pace/distance/algo/AlwaysMatch.java | 12 + .../pace/distance/algo/ExactMatch.java | 12 + .../pace/distance/algo/JaroWinkler.java | 12 + .../pace/distance/algo/JaroWinklerTitle.java | 12 + .../pace/distance/algo/Level2JaroWinkler.java | 2 + .../distance/algo/Level2JaroWinklerTitle.java | 2 + .../pace/distance/algo/Level2Levenstein.java | 2 + .../pace/distance/algo/Levenstein.java | 6 + .../pace/distance/algo/LevensteinTitle.java | 6 + .../pace/distance/algo/MustBeDifferent.java | 2 + .../pace/distance/algo/NullDistanceAlgo.java | 16 + .../pace/distance/algo/SortedJaroWinkler.java | 2 + .../algo/SortedLevel2JaroWinkler.java | 2 + .../distance/algo/SubStringLevenstein.java | 13 + .../pace/distance/algo/UrlMatcher.java | 10 + .../pace/distance/eval/ConditionEval.java | 9 +- .../pace/distance/eval/DistanceEval.java | 1 - .../pace/distance/eval/ScoreResult.java | 14 +- .../eu/dnetlib/pace/model/ClusteringDef.java | 42 +- .../java/eu/dnetlib/pace/model/CondDef.java | 36 +- .../java/eu/dnetlib/pace/model/FieldDef.java | 56 +- .../eu/dnetlib/pace/util/BlockProcessor.java | 1 - .../main/resources/eu/dnetlib/pace/.DS_Store | Bin 0 -> 6148 bytes 69 files changed, 1679 insertions(+), 1441 deletions(-) delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringResolver.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionClass.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionResolver.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Algo.java delete mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Cond.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceClass.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceResolver.java create mode 100644 dnet-pace-core/src/main/resources/eu/dnetlib/pace/.DS_Store diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkTest.java b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkTest.java index ce07444..b78ba82 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkTest.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkTest.java @@ -36,6 +36,7 @@ public class SparkTest { counter = new SparkCounter(context); + //read the configuration from the classpath final DedupConfig config = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf")); BlockProcessor.constructAccumulator(config); @@ -55,7 +56,7 @@ public class SparkTest { //create relations between documents final JavaPairRDD relationRDD = mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id - //from to List + //Clustering: from to List .flatMapToPair(a -> { final MapDocument currentDocument = a._2(); return getGroupingKeys(config, currentDocument).stream() @@ -83,7 +84,7 @@ public class SparkTest { //print ids // ccs.foreach(cc -> System.out.println(cc.getId())); - ccs.saveAsTextFile("file:///Users/miconis/Downloads/dumps/organizations_dedup"); +// ccs.saveAsTextFile("file:///Users/miconis/Downloads/dumps/organizations_dedup"); } diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/AbstractProtoPaceTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/AbstractProtoPaceTest.java index 838836b..30be387 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/AbstractProtoPaceTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/AbstractProtoPaceTest.java @@ -1,208 +1,208 @@ -package eu.dnetlib.pace; - -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import com.google.gson.Gson; -import eu.dnetlib.data.proto.FieldTypeProtos.Author; -import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; -import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty; -import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder; -import eu.dnetlib.data.proto.OafProtos.Oaf; -import eu.dnetlib.data.proto.OafProtos.OafEntity; -import eu.dnetlib.data.proto.OrganizationProtos.Organization; -import eu.dnetlib.data.proto.ResultProtos.Result; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.Field; -import eu.dnetlib.pace.model.FieldValueImpl; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.model.ProtoDocumentBuilder; -import eu.dnetlib.pace.model.gt.GTAuthor; -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang.RandomStringUtils; -import org.apache.commons.lang.StringUtils; -import org.apache.commons.lang3.RandomUtils; - -import java.io.IOException; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; -import java.util.stream.IntStream; - -public abstract class AbstractProtoPaceTest extends OafTest { - - protected DedupConfig getResultFullConf() { - return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.full.pace.conf")); - } - - protected DedupConfig getResultSimpleConf() { - return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.simple.pace.conf")); - } - - protected DedupConfig getResultConf() { - return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.pace.conf")); - } - - protected DedupConfig getOrganizationSimpleConf() { - return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf")); - } - - protected DedupConfig getResultAuthorsConf() { - return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.authors.pace.conf")); - } - - protected DedupConfig getResultProdConf() { - return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.conf")); - } - - protected MapDocument author(final Config conf, final String id, final Oaf oaf) { - return ProtoDocumentBuilder.newInstance(id, oaf.getEntity(), conf.model()); - } - - protected GTAuthor getGTAuthor(final String path) { - - final Gson gson = new Gson(); - - final String json = readFromClasspath(path); - - final GTAuthor gta = gson.fromJson(json, GTAuthor.class); - - return gta; - } - - protected String readFromClasspath(final String filename) { - final StringWriter sw = new StringWriter(); - try { - IOUtils.copy(getClass().getResourceAsStream(filename), sw); - return sw.toString(); - } catch (final IOException e) { - throw new RuntimeException("cannot load resource from classpath: " + filename); - } - } - - protected MapDocument result(final Config config, final String id, final String title) { - return result(config, id, title, null, new ArrayList<>(), null); - } - - protected MapDocument result(final Config config, final String id, final String title, final String date) { - return result(config, id, title, date, new ArrayList<>(), null); - } - - protected MapDocument result(final Config config, final String id, final String title, final String date, final List pid) { - return result(config, id, title, date, pid, null); - } - - protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid) { - return result(config, id, title, date, pid, null); - } - - protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid, final List authors) { - return result(config, id, title, date, Lists.newArrayList(pid), authors); - } - - static List pidTypes = Lists.newArrayList(); - static { - pidTypes.add("doi"); - //pidTypes.add("oai"); - //pidTypes.add("pmid"); - } - - protected MapDocument result(final Config config, final String id, final String title, final String date, final List pid, final List authors) { - final Result.Metadata.Builder metadata = Result.Metadata.newBuilder(); - if (!StringUtils.isBlank(title)) { - metadata.addTitle(getStruct(title, getQualifier("main title", "dnet:titles"))); - metadata.addTitle(getStruct(RandomStringUtils.randomAlphabetic(10), getQualifier("alternative title", "dnet:titles"))); - } - if (!StringUtils.isBlank(date)) { - metadata.setDateofacceptance(sf(date)); - } - - final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result); - final Result.Builder result = Result.newBuilder().setMetadata(metadata); - - if (authors != null) { - result.getMetadataBuilder().addAllAuthor( - IntStream.range(0, authors.size()) - .mapToObj(i -> author(authors.get(i), i)) - .collect(Collectors.toCollection(LinkedList::new))); - } - - entity.setResult(result); - - if (pid != null) { - for(String p : pid) { - if (!StringUtils.isBlank(p)) { - entity.addPid(sp(p, pidTypes.get(RandomUtils.nextInt(0, pidTypes.size() - 1)))); - //entity.addPid(sp(RandomStringUtils.randomAlphabetic(10), "oai")); - } - } - } - - final OafEntity build = entity.build(); - return ProtoDocumentBuilder.newInstance(id, build, config.model()); - } - - private Author author(final String s, int rank) { - final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(s, false); - final Author.Builder author = Author.newBuilder(); - if (p.isAccurate()) { - author.setName(p.getNormalisedFirstName()); - author.setSurname(p.getNormalisedSurname()); - } - author.setFullname(p.getNormalisedFullname()); - author.setRank(rank); - - return author.build(); - } - - private OafEntity.Builder oafEntity(final String id, final eu.dnetlib.data.proto.TypeProtos.Type type) { - final OafEntity.Builder entity = OafEntity.newBuilder().setId(id).setType(type); - return entity; - } - - protected MapDocument organization(final Config config, final String id, final String legalName) { - return organization(config, id, legalName, null); - } - - protected MapDocument organization(final Config config, final String id, final String legalName, final String legalShortName) { - final Organization.Metadata.Builder metadata = Organization.Metadata.newBuilder(); - if (legalName != null) { - metadata.setLegalname(sf(legalName)); - } - if (legalShortName != null) { - metadata.setLegalshortname(sf(legalShortName)); - } - - final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result); - entity.setOrganization(Organization.newBuilder().setMetadata(metadata)); - - return ProtoDocumentBuilder.newInstance(id, entity.build(), config.model()); - } - - private StructuredProperty sp(final String pid, final String type) { - final Builder pidSp = - StructuredProperty.newBuilder().setValue(pid) - .setQualifier(Qualifier.newBuilder().setClassid(type).setClassname(type).setSchemeid("dnet:pid_types").setSchemename("dnet:pid_types")); - return pidSp.build(); - } - - protected Field title(final String s) { - return new FieldValueImpl(Type.String, "title", s); - } - - protected static Builder getStruct(final String value, final Qualifier.Builder qualifier) { - return StructuredProperty.newBuilder().setValue(value).setQualifier(qualifier); - } - - /* - * protected static StringField.Builder sf(final String s) { return StringField.newBuilder().setValue(s); } - * - * protected static Qualifier.Builder getQualifier(final String classname, final String schemename) { return - * Qualifier.newBuilder().setClassid(classname).setClassname(classname).setSchemeid(schemename).setSchemename(schemename); } - */ - -} +//package eu.dnetlib.pace; +// +//import com.google.common.collect.Lists; +//import com.google.common.collect.Sets; +//import com.google.gson.Gson; +//import eu.dnetlib.data.proto.FieldTypeProtos.Author; +//import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier; +//import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty; +//import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder; +//import eu.dnetlib.data.proto.OafProtos.Oaf; +//import eu.dnetlib.data.proto.OafProtos.OafEntity; +//import eu.dnetlib.data.proto.OrganizationProtos.Organization; +//import eu.dnetlib.data.proto.ResultProtos.Result; +//import eu.dnetlib.pace.config.Config; +//import eu.dnetlib.pace.config.DedupConfig; +//import eu.dnetlib.pace.config.Type; +//import eu.dnetlib.pace.model.Field; +//import eu.dnetlib.pace.model.FieldValueImpl; +//import eu.dnetlib.pace.model.MapDocument; +//import eu.dnetlib.pace.model.ProtoDocumentBuilder; +//import eu.dnetlib.pace.model.gt.GTAuthor; +//import org.apache.commons.io.IOUtils; +//import org.apache.commons.lang.RandomStringUtils; +//import org.apache.commons.lang.StringUtils; +//import org.apache.commons.lang3.RandomUtils; +// +//import java.io.IOException; +//import java.io.StringWriter; +//import java.util.ArrayList; +//import java.util.LinkedList; +//import java.util.List; +//import java.util.Set; +//import java.util.stream.Collectors; +//import java.util.stream.IntStream; +// +//public abstract class AbstractProtoPaceTest extends OafTest { +// +// protected DedupConfig getResultFullConf() { +// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.full.pace.conf")); +// } +// +// protected DedupConfig getResultSimpleConf() { +// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.simple.pace.conf")); +// } +// +// protected DedupConfig getResultConf() { +// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.pace.conf")); +// } +// +// protected DedupConfig getOrganizationSimpleConf() { +// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf")); +// } +// +// protected DedupConfig getResultAuthorsConf() { +// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.authors.pace.conf")); +// } +// +// protected DedupConfig getResultProdConf() { +// return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.conf")); +// } +// +// protected MapDocument author(final Config conf, final String id, final Oaf oaf) { +// return ProtoDocumentBuilder.newInstance(id, oaf.getEntity(), conf.model()); +// } +// +// protected GTAuthor getGTAuthor(final String path) { +// +// final Gson gson = new Gson(); +// +// final String json = readFromClasspath(path); +// +// final GTAuthor gta = gson.fromJson(json, GTAuthor.class); +// +// return gta; +// } +// +// protected String readFromClasspath(final String filename) { +// final StringWriter sw = new StringWriter(); +// try { +// IOUtils.copy(getClass().getResourceAsStream(filename), sw); +// return sw.toString(); +// } catch (final IOException e) { +// throw new RuntimeException("cannot load resource from classpath: " + filename); +// } +// } +// +// protected MapDocument result(final Config config, final String id, final String title) { +// return result(config, id, title, null, new ArrayList<>(), null); +// } +// +// protected MapDocument result(final Config config, final String id, final String title, final String date) { +// return result(config, id, title, date, new ArrayList<>(), null); +// } +// +// protected MapDocument result(final Config config, final String id, final String title, final String date, final List pid) { +// return result(config, id, title, date, pid, null); +// } +// +// protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid) { +// return result(config, id, title, date, pid, null); +// } +// +// protected MapDocument result(final Config config, final String id, final String title, final String date, final String pid, final List authors) { +// return result(config, id, title, date, Lists.newArrayList(pid), authors); +// } +// +// static List pidTypes = Lists.newArrayList(); +// static { +// pidTypes.add("doi"); +// //pidTypes.add("oai"); +// //pidTypes.add("pmid"); +// } +// +// protected MapDocument result(final Config config, final String id, final String title, final String date, final List pid, final List authors) { +// final Result.Metadata.Builder metadata = Result.Metadata.newBuilder(); +// if (!StringUtils.isBlank(title)) { +// metadata.addTitle(getStruct(title, getQualifier("main title", "dnet:titles"))); +// metadata.addTitle(getStruct(RandomStringUtils.randomAlphabetic(10), getQualifier("alternative title", "dnet:titles"))); +// } +// if (!StringUtils.isBlank(date)) { +// metadata.setDateofacceptance(sf(date)); +// } +// +// final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result); +// final Result.Builder result = Result.newBuilder().setMetadata(metadata); +// +// if (authors != null) { +// result.getMetadataBuilder().addAllAuthor( +// IntStream.range(0, authors.size()) +// .mapToObj(i -> author(authors.get(i), i)) +// .collect(Collectors.toCollection(LinkedList::new))); +// } +// +// entity.setResult(result); +// +// if (pid != null) { +// for(String p : pid) { +// if (!StringUtils.isBlank(p)) { +// entity.addPid(sp(p, pidTypes.get(RandomUtils.nextInt(0, pidTypes.size() - 1)))); +// //entity.addPid(sp(RandomStringUtils.randomAlphabetic(10), "oai")); +// } +// } +// } +// +// final OafEntity build = entity.build(); +// return ProtoDocumentBuilder.newInstance(id, build, config.model()); +// } +// +// private Author author(final String s, int rank) { +// final eu.dnetlib.pace.model.Person p = new eu.dnetlib.pace.model.Person(s, false); +// final Author.Builder author = Author.newBuilder(); +// if (p.isAccurate()) { +// author.setName(p.getNormalisedFirstName()); +// author.setSurname(p.getNormalisedSurname()); +// } +// author.setFullname(p.getNormalisedFullname()); +// author.setRank(rank); +// +// return author.build(); +// } +// +// private OafEntity.Builder oafEntity(final String id, final eu.dnetlib.data.proto.TypeProtos.Type type) { +// final OafEntity.Builder entity = OafEntity.newBuilder().setId(id).setType(type); +// return entity; +// } +// +// protected MapDocument organization(final Config config, final String id, final String legalName) { +// return organization(config, id, legalName, null); +// } +// +// protected MapDocument organization(final Config config, final String id, final String legalName, final String legalShortName) { +// final Organization.Metadata.Builder metadata = Organization.Metadata.newBuilder(); +// if (legalName != null) { +// metadata.setLegalname(sf(legalName)); +// } +// if (legalShortName != null) { +// metadata.setLegalshortname(sf(legalShortName)); +// } +// +// final OafEntity.Builder entity = oafEntity(id, eu.dnetlib.data.proto.TypeProtos.Type.result); +// entity.setOrganization(Organization.newBuilder().setMetadata(metadata)); +// +// return ProtoDocumentBuilder.newInstance(id, entity.build(), config.model()); +// } +// +// private StructuredProperty sp(final String pid, final String type) { +// final Builder pidSp = +// StructuredProperty.newBuilder().setValue(pid) +// .setQualifier(Qualifier.newBuilder().setClassid(type).setClassname(type).setSchemeid("dnet:pid_types").setSchemename("dnet:pid_types")); +// return pidSp.build(); +// } +// +// protected Field title(final String s) { +// return new FieldValueImpl(Type.String, "title", s); +// } +// +// protected static Builder getStruct(final String value, final Qualifier.Builder qualifier) { +// return StructuredProperty.newBuilder().setValue(value).setQualifier(qualifier); +// } +// +// /* +// * protected static StringField.Builder sf(final String s) { return StringField.newBuilder().setValue(s); } +// * +// * protected static Qualifier.Builder getQualifier(final String classname, final String schemename) { return +// * Qualifier.newBuilder().setClassid(classname).setClassname(classname).setSchemeid(schemename).setSchemename(schemename); } +// */ +// +//} diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/OafTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/OafTest.java index 590c416..3ec495b 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/OafTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/OafTest.java @@ -1,446 +1,446 @@ -package eu.dnetlib.pace; - -import com.google.protobuf.GeneratedMessage; -import com.google.protobuf.InvalidProtocolBufferException; -import eu.dnetlib.data.mapreduce.util.OafDecoder; -import eu.dnetlib.data.proto.DatasourceOrganizationProtos.DatasourceOrganization; -import eu.dnetlib.data.proto.DatasourceOrganizationProtos.DatasourceOrganization.Provision; -import eu.dnetlib.data.proto.DatasourceProtos.Datasource; -import eu.dnetlib.data.proto.DedupProtos.Dedup; -import eu.dnetlib.data.proto.FieldTypeProtos.*; -import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder; -import eu.dnetlib.data.proto.KindProtos.Kind; -import eu.dnetlib.data.proto.OafProtos.Oaf; -import eu.dnetlib.data.proto.OafProtos.OafEntity; -import eu.dnetlib.data.proto.OafProtos.OafRel; -import eu.dnetlib.data.proto.OrganizationOrganizationProtos.OrganizationOrganization; -import eu.dnetlib.data.proto.OrganizationProtos.Organization; -import eu.dnetlib.data.proto.ProjectOrganizationProtos.ProjectOrganization; -import eu.dnetlib.data.proto.ProjectOrganizationProtos.ProjectOrganization.Participation; -import eu.dnetlib.data.proto.ProjectProtos.Project; -import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata; -import eu.dnetlib.data.proto.RelTypeProtos.RelType; -import eu.dnetlib.data.proto.RelTypeProtos.SubRelType; -import eu.dnetlib.data.proto.ResultProjectProtos.ResultProject; -import eu.dnetlib.data.proto.ResultProjectProtos.ResultProject.Outcome; -import eu.dnetlib.data.proto.ResultProtos.Result; -import eu.dnetlib.data.proto.ResultProtos.Result.Context; -import eu.dnetlib.data.proto.ResultProtos.Result.Instance; -import eu.dnetlib.data.proto.ResultResultProtos.ResultResult; -import eu.dnetlib.data.proto.ResultResultProtos.ResultResult.Similarity; -import eu.dnetlib.data.proto.TypeProtos.Type; - -public class OafTest { - - public static final String CITATION_JSON = - "\n \n [10] M. Foret et al., Phys. Rev. B 66, 024204 (2002).\n \n \n [11] B. Ru\175404\264e et al., Phys. Rev. Lett. 90, 095502 (2003).\n \n \n [12] U. Buchenau et al., Phys. Rev. B 34, 5665 (1986).\n \n \n [13] S.N. Taraskin and S.R. Elliott, J. Phys.: Condens. Mat- ter 11, A219 (1999).\n \n \n [14] B. Hehlen et al., Phys. Rev. Lett. 84, 5355 (2000).\n \n \n [15] N.V. Surotsev et al., J. Phys.: Condens. Matter 10, L113 (1998).\n \n \n [16] D.A. Parshin and C. Laermans, Phys. Rev. B 63, 132203 (2001).\n \n \n [17] V.L. Gurevich et al., Phys. Rev. B 67, 094203 (2003).\n \n \n [18] A. Matic et al., Phys. Rev. Lett. 86, 3803 (2001).\n \n \n [19] E. Rat et al., arXiv:cond-mat/0505558, 23 May 2005.\n \n \n [1] R.C. Zeller and R.O. Pohl, Phys. Rev. B 4, 2029 (1971).\n \n \n [20] C.A. Angell, J. Non-Cryst. Solids 131\20023133, 13 (1991).\n \n \n [21] A.P. Sokolov et al., Phys. Rev. Lett. 71, 2062 (1993).\n \n \n [22] T. Matsuo et al., Solid State Ionics 154-155, 759 (2002).\n \n \n [23] V.K. Malinovsky et al., Europhys. Lett. 11, 43 (1990).\n \n \n [24] J. Lor\250osch et al., J. Non-Cryst. Solids 69, 1 (1984).\n \n \n [25] U. Buchenau, Z. Phys. B 58, 181 (1985).\n \n \n [26] A.F. Io\175400e and A.R. Regel, Prog. Semicond. 4, 237 (1960).\n \n \n [27] R. Dell\20031Anna et al., Phys. Rev. Lett. 80, 1236 (1998).\n \n \n [28] D. Fioretto et al., Phys. Rev. E 59, 4470 (1999).\n \n \n [29] U. Buchenau et al., Phys. Rev. Lett. 77, 4035 (1996).\n \n \n [2] M. Rothenfusser et al., Phys. Rev. B 27, 5196 (1983).\n \n \n [30] J. Mattsson et al., J. Phys.: Condens. Matter 15, S1259 (2003).\n \n \n [31] T. Scopigno et al., Phys. Rev. Lett. 92, 025503 (2004).\n \n \n [32] M. Foret et al., Phys. Rev. Lett. 81, 2100 (1998).\n \n \n [33] F. Sette et al., Science 280, 1550 (1998).\n \n \n [34] J. Wuttke et al., Phys. Rev. E 52, 4026 (1995).\n \n \n [35] M.A. Ramos et al., Phys. Rev. Lett. 78, 82 (1997).\n \n \n [36] G. Monaco et al., Phys. Rev. Lett. 80, 2161 (1998).\n \n \n [37] A. T\250olle, Rep. Prog. Phys. 64, 1473 (2001).\n \n \n [38] As the straight lines do not cross the origin, this does not 2 imply \1623 \21035 \1651 .\n \n \n [39] A. Matic et al., Europhys. Lett. 54, 77 (2001).\n \n \n [3] S. Hunklinger and W. Arnold, in Physical Acoustics, Vol. XII, W.P. Mason and R.N. Thurston Eds. (Academic Press, N.Y. 1976), p. 155.\n \n \n [40] IXS data are usually not available below \1651co, mostly for experimental reasons. E.g., that the rapid onset was not evidenced in vitreous silica [27], is not indicative of its absence but rather of a low qco \21074 1 nm\210221.\n \n \n [41] G. Ruocco et al., Phys. Rev. Lett. 83, 5583 (1999).\n \n \n [42] D. C\1307 iplys et al., J. Physique (Paris) 42, C6-184 (1981).\n \n \n [43] R. Vacher et al., Rev. Sci. Instrum. 51, 288 (1980).\n \n \n [44] R. Vacher et al., arXiv:cond-mat/0505560, 23 May 2005.\n \n \n [45] T.N. Claytor et al., Phys. Rev. B 18, 5842 (1978).\n \n \n [46] M. Arai et al., Physica B 263-264, 268 (1999).\n \n \n [4] R. Vacher et al., J. Non-Cryst. Solids 45, 397 (1981); T.C. Zhu et al., Phys. Rev. B 44, 4281 (1991).\n \n \n [5] J.E. Graebner et al., Phys. Rev. B 34, 5696 (1986).\n \n \n [6] E. Duval and A. Mermet, Phys. Rev. B 58, 8159 (1998).\n \n \n [7] A. Matic et al., Phys. Rev. Lett. 93, 145502 (2004).\n \n \n [8] Often alluded to, e.g. in the Encyclopedia of Materials: Science and Technology, K.H.J. Buschow et al., Eds., Vol. 1 (Elsevier, Oxford, 2001), articles by S.R. Elliott on pp. 171-174 and U. Buchenau on pp. 212-215.\n \n \n [9] E. Rat et al., Phys. Rev. Lett. 83, 1355 (1999).\n \n"; - - public static final String STATISTICS_JSON = - "[{ \"citationsPerYear\": \"many\", \"anotherCoolStatistic\": \"WoW\", \"nestedStat\": { \"firstNestedStat\" : \"value 1\", \"secondNestedStat\" : \"value 2\"}, \"listingStat\" : [ \"one\", \"two\" ] }]"; - - public static Builder getStructuredproperty(final String value, final String classname, final String schemename) { - return getStructuredproperty(value, classname, schemename, null); - } - - public static Builder getStructuredproperty(final String value, final String classname, final String schemename, final DataInfo dataInfo) { - final Builder sp = StructuredProperty.newBuilder().setValue(value).setQualifier(getQualifier(classname, schemename)); - if (dataInfo != null) { - sp.setDataInfo(dataInfo); - } - return sp; - } - - public static Qualifier.Builder getQualifier(final String classname, final String schemename) { - return Qualifier.newBuilder().setClassid(classname).setClassname(classname).setSchemeid(schemename).setSchemename(schemename); - } - - public static KeyValue getKV(final String id, final String name) { - return KeyValue.newBuilder().setKey(id).setValue(name).build(); - } - - public static OafEntity getDatasource(final String datasourceId) { - return OafEntity - .newBuilder() - .setType(Type.datasource) - .setId(datasourceId) - .setDatasource( - Datasource.newBuilder().setMetadata( - Datasource.Metadata.newBuilder().setOfficialname(sf("officialname")).setEnglishname(sf("englishname")) - .setWebsiteurl(sf("websiteurl")).setContactemail(sf("contactemail")).addAccessinfopackage(sf("accessinforpackage")) - .setNamespaceprefix(sf("namespaceprofix")).setDescription(sf("description")).setOdnumberofitems(sf("numberofitems")) - .setOdnumberofitemsdate(sf("numberofitems date")) - // .addOdsubjects("subjects") - .setOdpolicies(sf("policies")).addOdlanguages(sf("languages")).addOdcontenttypes(sf("contenttypes")) - .setDatasourcetype(getQualifier("type class", "type scheme")))).build(); - } - - public static OafEntity getResult(final String id) { - return getResultBuilder(id).build(); - } - - public static OafEntity.Builder getResultBuilder(final String id) { - return OafEntity - .newBuilder() - .setType(Type.result) - .setId(id) - .setResult( - Result.newBuilder() - .setMetadata( - Result.Metadata - .newBuilder() - .addTitle( - getStructuredproperty( - "Analysis of cell viability in intervertebral disc: Effect of endplate permeability on cell population", - "main title", "dnet:result_titles", getDataInfo())) - .addTitle(getStructuredproperty("Another title", "alternative title", "dnet:result_titles", getDataInfo())) - .addSubject(getStructuredproperty("Biophysics", "subject", "dnet:result_sujects")) - .setDateofacceptance(sf("2010-01-01")).addSource(sf("sourceA")).addSource(sf("sourceB")) - .addContext(Context.newBuilder().setId("egi::virtual::970")) - .addContext(Context.newBuilder().setId("egi::classification::natsc::math::applied")) - .addContext(Context.newBuilder().setId("egi::classification::natsc::math")) - .addContext(Context.newBuilder().setId("egi::classification::natsc")) - .addContext(Context.newBuilder().setId("egi::classification")).addContext(Context.newBuilder().setId("egi")) - .addDescription(sf("Responsible for making and maintaining the extracellular matrix ...")) - .addDescription(sf("Another description ...")).setPublisher(sf("ELSEVIER SCI LTD")) - .setResulttype(getQualifier("publication", "dnet:result_types")) - .setLanguage(getQualifier("eng", "dnet:languages"))).addInstance(getInstance("10|od__10", "Uk pubmed")) - .addInstance(getInstance("10|od__10", "arxiv"))) - .addCollectedfrom(getKV("opendoar____::1064", "Oxford University Research Archive")) - .addPid(getStructuredproperty("doi:74293", "doi", "dnet:pids")).addPid(getStructuredproperty("oai:74295", "oai", "dnet:pids")) - .setDateofcollection(""); - } - - public static DataInfo getDataInfo() { - return getDataInfo("0.4"); - } - - public static DataInfo getDataInfo(final String trust) { - return DataInfo.newBuilder().setDeletedbyinference(false).setTrust("0.4").setInferenceprovenance("algo").setProvenanceaction(getQualifier("xx", "yy")) - .build(); - } - - public static Instance.Builder getInstance(final String hostedbyId, final String hostedbyName) { - return Instance.newBuilder().setHostedby(getKV(hostedbyId, hostedbyName)).setAccessright(getQualifier("OpenAccess", "dnet:access_modes")) - .setInstancetype(getQualifier("publication", "dnet:result_typologies")).addUrl("webresource url"); - - } - - public static OafRel getDedupRel(final String source, final String target, final RelType relType, final String relClass) { - return OafRel.newBuilder().setSource(source).setTarget(target).setRelType(relType).setSubRelType(SubRelType.dedup).setRelClass(relClass) - .setChild(false).setCachedTarget(getResult(target)) - .setResultResult(ResultResult.newBuilder().setDedup(Dedup.newBuilder().setRelMetadata(RelMetadata.getDefaultInstance()))) - .build(); - } - - public static OafRel getProjectOrganization(final String source, final String target, final String relClass) throws InvalidProtocolBufferException { - final OafRel.Builder oafRel = OafRel - .newBuilder() - .setSource(source) - .setTarget(target) - .setRelType(RelType.projectOrganization) - .setSubRelType(SubRelType.participation) - .setRelClass(relClass) - .setChild(false) - .setProjectOrganization( - ProjectOrganization.newBuilder().setParticipation( - Participation.newBuilder().setParticipantnumber("" + 1) - .setRelMetadata(relMetadata(relClass, "dnet:project_organization_relations")))); - switch (Participation.RelName.valueOf(relClass)) { - case hasParticipant: - oafRel.setCachedTarget(getProjectFP7(target, "SP3")); - break; - case isParticipant: - oafRel.setCachedTarget(getOrganization(target)); - break; - default: - break; - } - return oafRel.build(); - } - - public static GeneratedMessage getOrganizationOrganization(final String source, final String target, final String relClass) { - final OafRel.Builder oafRel = OafRel - .newBuilder() - .setSource(source) - .setTarget(target) - .setRelType(RelType.organizationOrganization) - .setSubRelType(SubRelType.dedup) - .setRelClass(relClass) - .setChild(true) - .setOrganizationOrganization( - OrganizationOrganization.newBuilder().setDedup( - Dedup.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:organization_organization_relations")))); - - switch (Dedup.RelName.valueOf(relClass)) { - case isMergedIn: - oafRel.setCachedTarget(getOrganization(source)); - break; - case merges: - oafRel.setCachedTarget(getOrganization(target)); - break; - default: - break; - } - return oafRel.build(); - } - - public static OafRel getDatasourceOrganization(final String source, final String target, final String relClass) throws InvalidProtocolBufferException { - final OafRel.Builder oafRel = OafRel - .newBuilder() - .setSource(source) - .setTarget(target) - .setRelType(RelType.datasourceOrganization) - .setSubRelType(SubRelType.provision) - .setRelClass(relClass) - .setChild(false) - .setDatasourceOrganization( - DatasourceOrganization.newBuilder().setProvision( - Provision.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:datasource_organization_relations")))); - switch (Provision.RelName.valueOf(relClass)) { - case isProvidedBy: - oafRel.setCachedTarget(getOrganization(target)); - break; - case provides: - oafRel.setCachedTarget(getDatasource(target)); - break; - default: - break; - } - return oafRel.build(); - } - - public static OafRel getSimilarityRel(final String sourceId, final String targetId, final OafEntity result, final String relClass) { - return OafRel - .newBuilder() - .setSource(sourceId) - .setTarget(targetId) - .setRelType(RelType.resultResult) - .setSubRelType(SubRelType.similarity) - .setRelClass(relClass) - .setChild(false) - .setCachedTarget(result) - .setResultResult( - ResultResult.newBuilder().setSimilarity( - Similarity.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:resultResult_relations")).setSimilarity(.4f) - .setType(Similarity.Type.STANDARD))).build(); - } - - public static RelMetadata.Builder relMetadata(final String classname, final String schemename) { - return RelMetadata.newBuilder().setSemantics(getQualifier(classname, schemename)); - } - - public static OafEntity getOrganization(final String orgId) { - return OafEntity - .newBuilder() - .setType(Type.organization) - .setId(orgId) - .addCollectedfrom(getKV("opendoar_1234", "UK pubmed")) - .setOrganization( - Organization.newBuilder().setMetadata( - Organization.Metadata.newBuilder().setLegalname(sf("CENTRE D'APPUI A LA RECHERCHE ET A LA FORMATION GIE")) - .setLegalshortname(sf("CAREF")).setWebsiteurl(sf("www.caref-mali.org")) - .setCountry(getQualifier("ML", "dnet:countries")))).build(); - } - - public static OafRel getResultProject(final String from, final String to, final OafEntity project, final String relClass) - throws InvalidProtocolBufferException { - return OafRel - .newBuilder() - .setSource(from) - .setTarget(to) - .setRelType(RelType.resultProject) - .setSubRelType(SubRelType.outcome) - .setRelClass(relClass) - .setChild(false) - .setResultProject( - ResultProject.newBuilder().setOutcome(Outcome.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:result_project_relations")))) - .setCachedTarget(project).build(); - } - - public static OafEntity getProjectFP7(final String projectId, final String fundingProgram) throws InvalidProtocolBufferException { - return OafEntity - .newBuilder() - .setType(Type.project) - .setId(projectId) - .addCollectedfrom(getKV("opendoar_1234", "UK pubmed")) - .setProject( - Project.newBuilder() - .setMetadata( - Project.Metadata - .newBuilder() - .setAcronym(sf("5CYRQOL")) - .setTitle(sf("Cypriot Researchers Contribute to our Quality of Life")) - .setStartdate(sf("2007-05-01")) - .setEnddate(sf("2007-10-31")) - .setEcsc39(sf("false")) - .setContracttype(getQualifier("CSA", "ec:FP7contractTypes")) - .addFundingtree( - sf("ec__________::ECECEuropean Commissionec__________::EC::FP7::" - + fundingProgram - + "::PEOPLEMarie-Curie ActionsPEOPLEec:programec__________::EC::FP7::" - + fundingProgram - + "" - + fundingProgram - + "-People" - + fundingProgram - + "ec:specificprogramec__________::EC::FP7SEVENTH FRAMEWORK PROGRAMMEFP7ec:frameworkprogram")))) - .build(); - } - - public static OafEntity getProjectWT() throws InvalidProtocolBufferException { - return OafEntity - .newBuilder() - .setType(Type.project) - .setId("project|wt::087536") - .addCollectedfrom(getKV("wellcomeTrust", "wellcome trust")) - .setProject( - Project.newBuilder() - .setMetadata( - Project.Metadata - .newBuilder() - .setAcronym(sf("UNKNOWN")) - .setTitle(sf("Research Institute for Infectious Diseases of Poverty (IIDP).")) - .setStartdate(sf("2007-05-01")) - .setEnddate(sf("2007-10-31")) - .setEcsc39(sf("false")) - .setContracttype(getQualifier("UNKNOWN", "wt:contractTypes")) - .addFundingtree( - sf("wt__________::WTWTWellcome Trustwt__________::WT::UNKNOWNUNKNOWNUNKNOWNwt:fundingStream")) - .addFundingtree( - sf("wt__________::WTWTWellcome Trustwt__________::WT::Technology TransferTechnology TransferTechnology Transferwt:fundingStream")))) - .build(); - } - - public static ExtraInfo extraInfo(final String name, final String provenance, final String trust, final String typology, final String value) { - final ExtraInfo.Builder e = ExtraInfo.newBuilder().setName(name).setProvenance(provenance).setTrust(trust).setTypology(typology).setValue(value); - return e.build(); - } - - // public static DocumentClasses documentClasses() { - // DocumentClasses.Builder builder = DocumentClasses.newBuilder(); - // for (int i = 0; i < RandomUtils.nextInt(N_DOCUMENT_CLASSES) + 1; i++) { - // builder.addArXivClasses(getDocumentClass()).addDdcClasses(getDocumentClass()).addWosClasses(getDocumentClass()) - // .addMeshEuroPMCClasses(getDocumentClass()); - // } - // return builder.build(); - // } - // - // private static DocumentClass getDocumentClass() { - // DocumentClass.Builder builder = DocumentClass.newBuilder(); - // for (int i = 0; i < RandomUtils.nextInt(N_DOCUMENT_CLASS_LABELS) + 1; i++) { - // builder.addClassLabels("test_class_" + i); - // } - // return builder.setConfidenceLevel(0.5F).build(); - // } - // - // public static DocumentStatistics documentStatistics() { - // return - // DocumentStatistics.newBuilder().setCitationsFromAllPapers(basicCitationStatistics()).setCitationsFromPublishedPapers(basicCitationStatistics()) - // .build(); - // } - // - // private static BasicCitationStatistics basicCitationStatistics() { - // BasicCitationStatistics.Builder builder = BasicCitationStatistics.newBuilder(); - // for (int i = 0; i < N_CITATION_STATS; i++) { - // builder.addNumberOfCitationsPerYear(statisticsKeyValue()); - // builder.setNumberOfCitations(RandomUtils.nextInt(5) + 1); - // } - // return builder.build(); - // } - // - // private static StatisticsKeyValue statisticsKeyValue() { - // return StatisticsKeyValue.newBuilder().setKey((RandomUtils.nextInt(30) + 1980) + "").setValue(RandomUtils.nextInt(5) + 1).build(); - // } - // - // public static AuthorStatistics authorStatistics() { - // AuthorStatistics.Builder builder = AuthorStatistics.newBuilder(); - // builder.setCore(commonCoreStatistics()); - // for (int i = 0; i < N_COAUTHORS; i++) { - // builder.addCoAuthors(coAuthor()); - // } - // return builder.build(); - // } - // - // private static CoAuthor coAuthor() { - // CoAuthor.Builder builder = CoAuthor.newBuilder(); - // builder.setId("30|od______2345::" + Hashing.md5(RandomStringUtils.random(10))); - // builder.setCoauthoredPapersCount(RandomUtils.nextInt(5) + 1); - // return builder.build(); - // } - // - // public static CommonCoreStatistics commonCoreStatistics() { - // CommonCoreStatistics.Builder builder = CommonCoreStatistics.newBuilder(); - // - // builder.setAllPapers(coreStatistics()); - // builder.setPublishedPapers(coreStatistics()); - // - // return builder.build(); - // } - // - // private static CoreStatistics coreStatistics() { - // CoreStatistics.Builder builder = CoreStatistics.newBuilder(); - // - // builder.setNumberOfPapers(RandomUtils.nextInt(10)); - // builder.setCitationsFromAllPapers(extendedStatistics()); - // builder.setCitationsFromPublishedPapers(extendedStatistics()); - // - // return builder.build(); - // } - // - // private static ExtendedStatistics extendedStatistics() { - // ExtendedStatistics.Builder builder = ExtendedStatistics.newBuilder(); - // - // builder.setBasic(basicCitationStatistics()); - // builder.setAverageNumberOfCitationsPerPaper(RandomUtils.nextFloat()); - // for (int i = 0; i < N_CITATION_STATS; i++) { - // builder.addNumberOfPapersCitedAtLeastXTimes(statisticsKeyValue()); - // } - // - // return builder.build(); - // } - - public static StringField sf(final String s) { - return sf(s, null); - } - - public static StringField sf(final String s, final DataInfo dataInfo) { - final StringField.Builder sf = StringField.newBuilder().setValue(s); - if (dataInfo != null) { - sf.setDataInfo(dataInfo); - } - return sf.build(); - } - - public static OafDecoder embed(final GeneratedMessage msg, - final Kind kind, - final boolean deletedByInference, - final boolean inferred, - final String provenance, - final String action) { - - final Oaf.Builder oaf = Oaf - .newBuilder() - .setKind(kind) - .setLastupdatetimestamp(System.currentTimeMillis()) - .setDataInfo( - DataInfo.newBuilder().setDeletedbyinference(deletedByInference).setInferred(inferred).setTrust("0.5") - .setInferenceprovenance(provenance).setProvenanceaction(getQualifier(action, action))); - switch (kind) { - case entity: - oaf.setEntity((OafEntity) msg); - break; - case relation: - oaf.setRel((OafRel) msg); - break; - default: - break; - } - - return OafDecoder.decode(oaf.build()); - } - - public static OafDecoder embed(final GeneratedMessage msg, final Kind kind) { - return embed(msg, kind, false, false, "inference_provenance", "provenance_action"); - } - -} +//package eu.dnetlib.pace; +// +//import com.google.protobuf.GeneratedMessage; +//import com.google.protobuf.InvalidProtocolBufferException; +//import eu.dnetlib.data.mapreduce.util.OafDecoder; +//import eu.dnetlib.data.proto.DatasourceOrganizationProtos.DatasourceOrganization; +//import eu.dnetlib.data.proto.DatasourceOrganizationProtos.DatasourceOrganization.Provision; +//import eu.dnetlib.data.proto.DatasourceProtos.Datasource; +//import eu.dnetlib.data.proto.DedupProtos.Dedup; +//import eu.dnetlib.data.proto.FieldTypeProtos.*; +//import eu.dnetlib.data.proto.FieldTypeProtos.StructuredProperty.Builder; +//import eu.dnetlib.data.proto.KindProtos.Kind; +//import eu.dnetlib.data.proto.OafProtos.Oaf; +//import eu.dnetlib.data.proto.OafProtos.OafEntity; +//import eu.dnetlib.data.proto.OafProtos.OafRel; +//import eu.dnetlib.data.proto.OrganizationOrganizationProtos.OrganizationOrganization; +//import eu.dnetlib.data.proto.OrganizationProtos.Organization; +//import eu.dnetlib.data.proto.ProjectOrganizationProtos.ProjectOrganization; +//import eu.dnetlib.data.proto.ProjectOrganizationProtos.ProjectOrganization.Participation; +//import eu.dnetlib.data.proto.ProjectProtos.Project; +//import eu.dnetlib.data.proto.RelMetadataProtos.RelMetadata; +//import eu.dnetlib.data.proto.RelTypeProtos.RelType; +//import eu.dnetlib.data.proto.RelTypeProtos.SubRelType; +//import eu.dnetlib.data.proto.ResultProjectProtos.ResultProject; +//import eu.dnetlib.data.proto.ResultProjectProtos.ResultProject.Outcome; +//import eu.dnetlib.data.proto.ResultProtos.Result; +//import eu.dnetlib.data.proto.ResultProtos.Result.Context; +//import eu.dnetlib.data.proto.ResultProtos.Result.Instance; +//import eu.dnetlib.data.proto.ResultResultProtos.ResultResult; +//import eu.dnetlib.data.proto.ResultResultProtos.ResultResult.Similarity; +//import eu.dnetlib.data.proto.TypeProtos.Type; +// +//public class OafTest { +// +// public static final String CITATION_JSON = +// "\n \n [10] M. Foret et al., Phys. Rev. B 66, 024204 (2002).\n \n \n [11] B. Ru\175404\264e et al., Phys. Rev. Lett. 90, 095502 (2003).\n \n \n [12] U. Buchenau et al., Phys. Rev. B 34, 5665 (1986).\n \n \n [13] S.N. Taraskin and S.R. Elliott, J. Phys.: Condens. Mat- ter 11, A219 (1999).\n \n \n [14] B. Hehlen et al., Phys. Rev. Lett. 84, 5355 (2000).\n \n \n [15] N.V. Surotsev et al., J. Phys.: Condens. Matter 10, L113 (1998).\n \n \n [16] D.A. Parshin and C. Laermans, Phys. Rev. B 63, 132203 (2001).\n \n \n [17] V.L. Gurevich et al., Phys. Rev. B 67, 094203 (2003).\n \n \n [18] A. Matic et al., Phys. Rev. Lett. 86, 3803 (2001).\n \n \n [19] E. Rat et al., arXiv:cond-mat/0505558, 23 May 2005.\n \n \n [1] R.C. Zeller and R.O. Pohl, Phys. Rev. B 4, 2029 (1971).\n \n \n [20] C.A. Angell, J. Non-Cryst. Solids 131\20023133, 13 (1991).\n \n \n [21] A.P. Sokolov et al., Phys. Rev. Lett. 71, 2062 (1993).\n \n \n [22] T. Matsuo et al., Solid State Ionics 154-155, 759 (2002).\n \n \n [23] V.K. Malinovsky et al., Europhys. Lett. 11, 43 (1990).\n \n \n [24] J. Lor\250osch et al., J. Non-Cryst. Solids 69, 1 (1984).\n \n \n [25] U. Buchenau, Z. Phys. B 58, 181 (1985).\n \n \n [26] A.F. Io\175400e and A.R. Regel, Prog. Semicond. 4, 237 (1960).\n \n \n [27] R. Dell\20031Anna et al., Phys. Rev. Lett. 80, 1236 (1998).\n \n \n [28] D. Fioretto et al., Phys. Rev. E 59, 4470 (1999).\n \n \n [29] U. Buchenau et al., Phys. Rev. Lett. 77, 4035 (1996).\n \n \n [2] M. Rothenfusser et al., Phys. Rev. B 27, 5196 (1983).\n \n \n [30] J. Mattsson et al., J. Phys.: Condens. Matter 15, S1259 (2003).\n \n \n [31] T. Scopigno et al., Phys. Rev. Lett. 92, 025503 (2004).\n \n \n [32] M. Foret et al., Phys. Rev. Lett. 81, 2100 (1998).\n \n \n [33] F. Sette et al., Science 280, 1550 (1998).\n \n \n [34] J. Wuttke et al., Phys. Rev. E 52, 4026 (1995).\n \n \n [35] M.A. Ramos et al., Phys. Rev. Lett. 78, 82 (1997).\n \n \n [36] G. Monaco et al., Phys. Rev. Lett. 80, 2161 (1998).\n \n \n [37] A. T\250olle, Rep. Prog. Phys. 64, 1473 (2001).\n \n \n [38] As the straight lines do not cross the origin, this does not 2 imply \1623 \21035 \1651 .\n \n \n [39] A. Matic et al., Europhys. Lett. 54, 77 (2001).\n \n \n [3] S. Hunklinger and W. Arnold, in Physical Acoustics, Vol. XII, W.P. Mason and R.N. Thurston Eds. (Academic Press, N.Y. 1976), p. 155.\n \n \n [40] IXS data are usually not available below \1651co, mostly for experimental reasons. E.g., that the rapid onset was not evidenced in vitreous silica [27], is not indicative of its absence but rather of a low qco \21074 1 nm\210221.\n \n \n [41] G. Ruocco et al., Phys. Rev. Lett. 83, 5583 (1999).\n \n \n [42] D. C\1307 iplys et al., J. Physique (Paris) 42, C6-184 (1981).\n \n \n [43] R. Vacher et al., Rev. Sci. Instrum. 51, 288 (1980).\n \n \n [44] R. Vacher et al., arXiv:cond-mat/0505560, 23 May 2005.\n \n \n [45] T.N. Claytor et al., Phys. Rev. B 18, 5842 (1978).\n \n \n [46] M. Arai et al., Physica B 263-264, 268 (1999).\n \n \n [4] R. Vacher et al., J. Non-Cryst. Solids 45, 397 (1981); T.C. Zhu et al., Phys. Rev. B 44, 4281 (1991).\n \n \n [5] J.E. Graebner et al., Phys. Rev. B 34, 5696 (1986).\n \n \n [6] E. Duval and A. Mermet, Phys. Rev. B 58, 8159 (1998).\n \n \n [7] A. Matic et al., Phys. Rev. Lett. 93, 145502 (2004).\n \n \n [8] Often alluded to, e.g. in the Encyclopedia of Materials: Science and Technology, K.H.J. Buschow et al., Eds., Vol. 1 (Elsevier, Oxford, 2001), articles by S.R. Elliott on pp. 171-174 and U. Buchenau on pp. 212-215.\n \n \n [9] E. Rat et al., Phys. Rev. Lett. 83, 1355 (1999).\n \n"; +// +// public static final String STATISTICS_JSON = +// "[{ \"citationsPerYear\": \"many\", \"anotherCoolStatistic\": \"WoW\", \"nestedStat\": { \"firstNestedStat\" : \"value 1\", \"secondNestedStat\" : \"value 2\"}, \"listingStat\" : [ \"one\", \"two\" ] }]"; +// +// public static Builder getStructuredproperty(final String value, final String classname, final String schemename) { +// return getStructuredproperty(value, classname, schemename, null); +// } +// +// public static Builder getStructuredproperty(final String value, final String classname, final String schemename, final DataInfo dataInfo) { +// final Builder sp = StructuredProperty.newBuilder().setValue(value).setQualifier(getQualifier(classname, schemename)); +// if (dataInfo != null) { +// sp.setDataInfo(dataInfo); +// } +// return sp; +// } +// +// public static Qualifier.Builder getQualifier(final String classname, final String schemename) { +// return Qualifier.newBuilder().setClassid(classname).setClassname(classname).setSchemeid(schemename).setSchemename(schemename); +// } +// +// public static KeyValue getKV(final String id, final String name) { +// return KeyValue.newBuilder().setKey(id).setValue(name).build(); +// } +// +// public static OafEntity getDatasource(final String datasourceId) { +// return OafEntity +// .newBuilder() +// .setType(Type.datasource) +// .setId(datasourceId) +// .setDatasource( +// Datasource.newBuilder().setMetadata( +// Datasource.Metadata.newBuilder().setOfficialname(sf("officialname")).setEnglishname(sf("englishname")) +// .setWebsiteurl(sf("websiteurl")).setContactemail(sf("contactemail")).addAccessinfopackage(sf("accessinforpackage")) +// .setNamespaceprefix(sf("namespaceprofix")).setDescription(sf("description")).setOdnumberofitems(sf("numberofitems")) +// .setOdnumberofitemsdate(sf("numberofitems date")) +// // .addOdsubjects("subjects") +// .setOdpolicies(sf("policies")).addOdlanguages(sf("languages")).addOdcontenttypes(sf("contenttypes")) +// .setDatasourcetype(getQualifier("type class", "type scheme")))).build(); +// } +// +// public static OafEntity getResult(final String id) { +// return getResultBuilder(id).build(); +// } +// +// public static OafEntity.Builder getResultBuilder(final String id) { +// return OafEntity +// .newBuilder() +// .setType(Type.result) +// .setId(id) +// .setResult( +// Result.newBuilder() +// .setMetadata( +// Result.Metadata +// .newBuilder() +// .addTitle( +// getStructuredproperty( +// "Analysis of cell viability in intervertebral disc: Effect of endplate permeability on cell population", +// "main title", "dnet:result_titles", getDataInfo())) +// .addTitle(getStructuredproperty("Another title", "alternative title", "dnet:result_titles", getDataInfo())) +// .addSubject(getStructuredproperty("Biophysics", "subject", "dnet:result_sujects")) +// .setDateofacceptance(sf("2010-01-01")).addSource(sf("sourceA")).addSource(sf("sourceB")) +// .addContext(Context.newBuilder().setId("egi::virtual::970")) +// .addContext(Context.newBuilder().setId("egi::classification::natsc::math::applied")) +// .addContext(Context.newBuilder().setId("egi::classification::natsc::math")) +// .addContext(Context.newBuilder().setId("egi::classification::natsc")) +// .addContext(Context.newBuilder().setId("egi::classification")).addContext(Context.newBuilder().setId("egi")) +// .addDescription(sf("Responsible for making and maintaining the extracellular matrix ...")) +// .addDescription(sf("Another description ...")).setPublisher(sf("ELSEVIER SCI LTD")) +// .setResulttype(getQualifier("publication", "dnet:result_types")) +// .setLanguage(getQualifier("eng", "dnet:languages"))).addInstance(getInstance("10|od__10", "Uk pubmed")) +// .addInstance(getInstance("10|od__10", "arxiv"))) +// .addCollectedfrom(getKV("opendoar____::1064", "Oxford University Research Archive")) +// .addPid(getStructuredproperty("doi:74293", "doi", "dnet:pids")).addPid(getStructuredproperty("oai:74295", "oai", "dnet:pids")) +// .setDateofcollection(""); +// } +// +// public static DataInfo getDataInfo() { +// return getDataInfo("0.4"); +// } +// +// public static DataInfo getDataInfo(final String trust) { +// return DataInfo.newBuilder().setDeletedbyinference(false).setTrust("0.4").setInferenceprovenance("algo").setProvenanceaction(getQualifier("xx", "yy")) +// .build(); +// } +// +// public static Instance.Builder getInstance(final String hostedbyId, final String hostedbyName) { +// return Instance.newBuilder().setHostedby(getKV(hostedbyId, hostedbyName)).setAccessright(getQualifier("OpenAccess", "dnet:access_modes")) +// .setInstancetype(getQualifier("publication", "dnet:result_typologies")).addUrl("webresource url"); +// +// } +// +// public static OafRel getDedupRel(final String source, final String target, final RelType relType, final String relClass) { +// return OafRel.newBuilder().setSource(source).setTarget(target).setRelType(relType).setSubRelType(SubRelType.dedup).setRelClass(relClass) +// .setChild(false).setCachedTarget(getResult(target)) +// .setResultResult(ResultResult.newBuilder().setDedup(Dedup.newBuilder().setRelMetadata(RelMetadata.getDefaultInstance()))) +// .build(); +// } +// +// public static OafRel getProjectOrganization(final String source, final String target, final String relClass) throws InvalidProtocolBufferException { +// final OafRel.Builder oafRel = OafRel +// .newBuilder() +// .setSource(source) +// .setTarget(target) +// .setRelType(RelType.projectOrganization) +// .setSubRelType(SubRelType.participation) +// .setRelClass(relClass) +// .setChild(false) +// .setProjectOrganization( +// ProjectOrganization.newBuilder().setParticipation( +// Participation.newBuilder().setParticipantnumber("" + 1) +// .setRelMetadata(relMetadata(relClass, "dnet:project_organization_relations")))); +// switch (Participation.RelName.valueOf(relClass)) { +// case hasParticipant: +// oafRel.setCachedTarget(getProjectFP7(target, "SP3")); +// break; +// case isParticipant: +// oafRel.setCachedTarget(getOrganization(target)); +// break; +// default: +// break; +// } +// return oafRel.build(); +// } +// +// public static GeneratedMessage getOrganizationOrganization(final String source, final String target, final String relClass) { +// final OafRel.Builder oafRel = OafRel +// .newBuilder() +// .setSource(source) +// .setTarget(target) +// .setRelType(RelType.organizationOrganization) +// .setSubRelType(SubRelType.dedup) +// .setRelClass(relClass) +// .setChild(true) +// .setOrganizationOrganization( +// OrganizationOrganization.newBuilder().setDedup( +// Dedup.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:organization_organization_relations")))); +// +// switch (Dedup.RelName.valueOf(relClass)) { +// case isMergedIn: +// oafRel.setCachedTarget(getOrganization(source)); +// break; +// case merges: +// oafRel.setCachedTarget(getOrganization(target)); +// break; +// default: +// break; +// } +// return oafRel.build(); +// } +// +// public static OafRel getDatasourceOrganization(final String source, final String target, final String relClass) throws InvalidProtocolBufferException { +// final OafRel.Builder oafRel = OafRel +// .newBuilder() +// .setSource(source) +// .setTarget(target) +// .setRelType(RelType.datasourceOrganization) +// .setSubRelType(SubRelType.provision) +// .setRelClass(relClass) +// .setChild(false) +// .setDatasourceOrganization( +// DatasourceOrganization.newBuilder().setProvision( +// Provision.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:datasource_organization_relations")))); +// switch (Provision.RelName.valueOf(relClass)) { +// case isProvidedBy: +// oafRel.setCachedTarget(getOrganization(target)); +// break; +// case provides: +// oafRel.setCachedTarget(getDatasource(target)); +// break; +// default: +// break; +// } +// return oafRel.build(); +// } +// +// public static OafRel getSimilarityRel(final String sourceId, final String targetId, final OafEntity result, final String relClass) { +// return OafRel +// .newBuilder() +// .setSource(sourceId) +// .setTarget(targetId) +// .setRelType(RelType.resultResult) +// .setSubRelType(SubRelType.similarity) +// .setRelClass(relClass) +// .setChild(false) +// .setCachedTarget(result) +// .setResultResult( +// ResultResult.newBuilder().setSimilarity( +// Similarity.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:resultResult_relations")).setSimilarity(.4f) +// .setType(Similarity.Type.STANDARD))).build(); +// } +// +// public static RelMetadata.Builder relMetadata(final String classname, final String schemename) { +// return RelMetadata.newBuilder().setSemantics(getQualifier(classname, schemename)); +// } +// +// public static OafEntity getOrganization(final String orgId) { +// return OafEntity +// .newBuilder() +// .setType(Type.organization) +// .setId(orgId) +// .addCollectedfrom(getKV("opendoar_1234", "UK pubmed")) +// .setOrganization( +// Organization.newBuilder().setMetadata( +// Organization.Metadata.newBuilder().setLegalname(sf("CENTRE D'APPUI A LA RECHERCHE ET A LA FORMATION GIE")) +// .setLegalshortname(sf("CAREF")).setWebsiteurl(sf("www.caref-mali.org")) +// .setCountry(getQualifier("ML", "dnet:countries")))).build(); +// } +// +// public static OafRel getResultProject(final String from, final String to, final OafEntity project, final String relClass) +// throws InvalidProtocolBufferException { +// return OafRel +// .newBuilder() +// .setSource(from) +// .setTarget(to) +// .setRelType(RelType.resultProject) +// .setSubRelType(SubRelType.outcome) +// .setRelClass(relClass) +// .setChild(false) +// .setResultProject( +// ResultProject.newBuilder().setOutcome(Outcome.newBuilder().setRelMetadata(relMetadata(relClass, "dnet:result_project_relations")))) +// .setCachedTarget(project).build(); +// } +// +// public static OafEntity getProjectFP7(final String projectId, final String fundingProgram) throws InvalidProtocolBufferException { +// return OafEntity +// .newBuilder() +// .setType(Type.project) +// .setId(projectId) +// .addCollectedfrom(getKV("opendoar_1234", "UK pubmed")) +// .setProject( +// Project.newBuilder() +// .setMetadata( +// Project.Metadata +// .newBuilder() +// .setAcronym(sf("5CYRQOL")) +// .setTitle(sf("Cypriot Researchers Contribute to our Quality of Life")) +// .setStartdate(sf("2007-05-01")) +// .setEnddate(sf("2007-10-31")) +// .setEcsc39(sf("false")) +// .setContracttype(getQualifier("CSA", "ec:FP7contractTypes")) +// .addFundingtree( +// sf("ec__________::ECECEuropean Commissionec__________::EC::FP7::" +// + fundingProgram +// + "::PEOPLEMarie-Curie ActionsPEOPLEec:programec__________::EC::FP7::" +// + fundingProgram +// + "" +// + fundingProgram +// + "-People" +// + fundingProgram +// + "ec:specificprogramec__________::EC::FP7SEVENTH FRAMEWORK PROGRAMMEFP7ec:frameworkprogram")))) +// .build(); +// } +// +// public static OafEntity getProjectWT() throws InvalidProtocolBufferException { +// return OafEntity +// .newBuilder() +// .setType(Type.project) +// .setId("project|wt::087536") +// .addCollectedfrom(getKV("wellcomeTrust", "wellcome trust")) +// .setProject( +// Project.newBuilder() +// .setMetadata( +// Project.Metadata +// .newBuilder() +// .setAcronym(sf("UNKNOWN")) +// .setTitle(sf("Research Institute for Infectious Diseases of Poverty (IIDP).")) +// .setStartdate(sf("2007-05-01")) +// .setEnddate(sf("2007-10-31")) +// .setEcsc39(sf("false")) +// .setContracttype(getQualifier("UNKNOWN", "wt:contractTypes")) +// .addFundingtree( +// sf("wt__________::WTWTWellcome Trustwt__________::WT::UNKNOWNUNKNOWNUNKNOWNwt:fundingStream")) +// .addFundingtree( +// sf("wt__________::WTWTWellcome Trustwt__________::WT::Technology TransferTechnology TransferTechnology Transferwt:fundingStream")))) +// .build(); +// } +// +// public static ExtraInfo extraInfo(final String name, final String provenance, final String trust, final String typology, final String value) { +// final ExtraInfo.Builder e = ExtraInfo.newBuilder().setName(name).setProvenance(provenance).setTrust(trust).setTypology(typology).setValue(value); +// return e.build(); +// } +// +// // public static DocumentClasses documentClasses() { +// // DocumentClasses.Builder builder = DocumentClasses.newBuilder(); +// // for (int i = 0; i < RandomUtils.nextInt(N_DOCUMENT_CLASSES) + 1; i++) { +// // builder.addArXivClasses(getDocumentClass()).addDdcClasses(getDocumentClass()).addWosClasses(getDocumentClass()) +// // .addMeshEuroPMCClasses(getDocumentClass()); +// // } +// // return builder.build(); +// // } +// // +// // private static DocumentClass getDocumentClass() { +// // DocumentClass.Builder builder = DocumentClass.newBuilder(); +// // for (int i = 0; i < RandomUtils.nextInt(N_DOCUMENT_CLASS_LABELS) + 1; i++) { +// // builder.addClassLabels("test_class_" + i); +// // } +// // return builder.setConfidenceLevel(0.5F).build(); +// // } +// // +// // public static DocumentStatistics documentStatistics() { +// // return +// // DocumentStatistics.newBuilder().setCitationsFromAllPapers(basicCitationStatistics()).setCitationsFromPublishedPapers(basicCitationStatistics()) +// // .build(); +// // } +// // +// // private static BasicCitationStatistics basicCitationStatistics() { +// // BasicCitationStatistics.Builder builder = BasicCitationStatistics.newBuilder(); +// // for (int i = 0; i < N_CITATION_STATS; i++) { +// // builder.addNumberOfCitationsPerYear(statisticsKeyValue()); +// // builder.setNumberOfCitations(RandomUtils.nextInt(5) + 1); +// // } +// // return builder.build(); +// // } +// // +// // private static StatisticsKeyValue statisticsKeyValue() { +// // return StatisticsKeyValue.newBuilder().setKey((RandomUtils.nextInt(30) + 1980) + "").setValue(RandomUtils.nextInt(5) + 1).build(); +// // } +// // +// // public static AuthorStatistics authorStatistics() { +// // AuthorStatistics.Builder builder = AuthorStatistics.newBuilder(); +// // builder.setCore(commonCoreStatistics()); +// // for (int i = 0; i < N_COAUTHORS; i++) { +// // builder.addCoAuthors(coAuthor()); +// // } +// // return builder.build(); +// // } +// // +// // private static CoAuthor coAuthor() { +// // CoAuthor.Builder builder = CoAuthor.newBuilder(); +// // builder.setId("30|od______2345::" + Hashing.md5(RandomStringUtils.random(10))); +// // builder.setCoauthoredPapersCount(RandomUtils.nextInt(5) + 1); +// // return builder.build(); +// // } +// // +// // public static CommonCoreStatistics commonCoreStatistics() { +// // CommonCoreStatistics.Builder builder = CommonCoreStatistics.newBuilder(); +// // +// // builder.setAllPapers(coreStatistics()); +// // builder.setPublishedPapers(coreStatistics()); +// // +// // return builder.build(); +// // } +// // +// // private static CoreStatistics coreStatistics() { +// // CoreStatistics.Builder builder = CoreStatistics.newBuilder(); +// // +// // builder.setNumberOfPapers(RandomUtils.nextInt(10)); +// // builder.setCitationsFromAllPapers(extendedStatistics()); +// // builder.setCitationsFromPublishedPapers(extendedStatistics()); +// // +// // return builder.build(); +// // } +// // +// // private static ExtendedStatistics extendedStatistics() { +// // ExtendedStatistics.Builder builder = ExtendedStatistics.newBuilder(); +// // +// // builder.setBasic(basicCitationStatistics()); +// // builder.setAverageNumberOfCitationsPerPaper(RandomUtils.nextFloat()); +// // for (int i = 0; i < N_CITATION_STATS; i++) { +// // builder.addNumberOfPapersCitedAtLeastXTimes(statisticsKeyValue()); +// // } +// // +// // return builder.build(); +// // } +// +// public static StringField sf(final String s) { +// return sf(s, null); +// } +// +// public static StringField sf(final String s, final DataInfo dataInfo) { +// final StringField.Builder sf = StringField.newBuilder().setValue(s); +// if (dataInfo != null) { +// sf.setDataInfo(dataInfo); +// } +// return sf.build(); +// } +// +// public static OafDecoder embed(final GeneratedMessage msg, +// final Kind kind, +// final boolean deletedByInference, +// final boolean inferred, +// final String provenance, +// final String action) { +// +// final Oaf.Builder oaf = Oaf +// .newBuilder() +// .setKind(kind) +// .setLastupdatetimestamp(System.currentTimeMillis()) +// .setDataInfo( +// DataInfo.newBuilder().setDeletedbyinference(deletedByInference).setInferred(inferred).setTrust("0.5") +// .setInferenceprovenance(provenance).setProvenanceaction(getQualifier(action, action))); +// switch (kind) { +// case entity: +// oaf.setEntity((OafEntity) msg); +// break; +// case relation: +// oaf.setRel((OafRel) msg); +// break; +// default: +// break; +// } +// +// return OafDecoder.decode(oaf.build()); +// } +// +// public static OafDecoder embed(final GeneratedMessage msg, final Kind kind) { +// return embed(msg, kind, false, false, "inference_provenance", "provenance_action"); +// } +// +//} diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombinerTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombinerTest.java index c9fa084..3d4f306 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombinerTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombinerTest.java @@ -1,42 +1,42 @@ -package eu.dnetlib.pace.clustering; - -import eu.dnetlib.pace.AbstractProtoPaceTest; -import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.FieldListImpl; -import eu.dnetlib.pace.model.FieldValueImpl; -import eu.dnetlib.pace.model.MapDocument; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.junit.Before; -import org.junit.Test; - -public class BlacklistAwareClusteringCombinerTest extends AbstractProtoPaceTest { - - private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombinerTest.class); - - private Config config; - - @Before - public void setUp() { - config = getResultFullConf(); - } - - @Test - public void testCombine() { - final MapDocument result = - result(config, "A", "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission", "2013"); - final FieldListImpl fl = new FieldListImpl(); - fl.add(new FieldValueImpl(Type.String, "desc", "hello world description pipeline")); - - result.getFieldMap().put("desc", fl); - - fl.clear(); - fl.add(new FieldValueImpl(Type.String, "title", "lorem ipsum cabalie qwerty")); - final FieldListImpl field = (FieldListImpl) result.getFieldMap().get("title"); - field.add(fl); - - log.info(BlacklistAwareClusteringCombiner.filterAndCombine(result, config)); - } -} +//package eu.dnetlib.pace.clustering; +// +//import eu.dnetlib.pace.AbstractProtoPaceTest; +//import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; +//import eu.dnetlib.pace.config.Config; +//import eu.dnetlib.pace.config.Type; +//import eu.dnetlib.pace.model.FieldListImpl; +//import eu.dnetlib.pace.model.FieldValueImpl; +//import eu.dnetlib.pace.model.MapDocument; +//import org.apache.commons.logging.Log; +//import org.apache.commons.logging.LogFactory; +//import org.junit.Before; +//import org.junit.Test; +// +//public class BlacklistAwareClusteringCombinerTest extends AbstractProtoPaceTest { +// +// private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombinerTest.class); +// +// private Config config; +// +// @Before +// public void setUp() { +// config = getResultFullConf(); +// } +// +// @Test +// public void testCombine() { +// final MapDocument result = +// result(config, "A", "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission", "2013"); +// final FieldListImpl fl = new FieldListImpl(); +// fl.add(new FieldValueImpl(Type.String, "desc", "hello world description pipeline")); +// +// result.getFieldMap().put("desc", fl); +// +// fl.clear(); +// fl.add(new FieldValueImpl(Type.String, "title", "lorem ipsum cabalie qwerty")); +// final FieldListImpl field = (FieldListImpl) result.getFieldMap().get("title"); +// field.add(fl); +// +// log.info(BlacklistAwareClusteringCombiner.filterAndCombine(result, config)); +// } +//} diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/ClusteringCombinerTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/ClusteringCombinerTest.java index 125bf63..74ba8e4 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/ClusteringCombinerTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/clustering/ClusteringCombinerTest.java @@ -1,39 +1,39 @@ -package eu.dnetlib.pace.clustering; - -import eu.dnetlib.pace.AbstractProtoPaceTest; -import eu.dnetlib.pace.clustering.ClusteringCombiner; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.config.Type; -import eu.dnetlib.pace.model.FieldListImpl; -import eu.dnetlib.pace.model.FieldValueImpl; -import eu.dnetlib.pace.model.MapDocument; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.junit.Before; -import org.junit.Test; - -public class ClusteringCombinerTest extends AbstractProtoPaceTest { - - private static final Log log = LogFactory.getLog(ClusteringCombinerTest.class); - - private Config config; - - @Before - public void setUp() { - config = getResultFullConf(); - } - - @Test - public void testCombine() { - String title = "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission"; - MapDocument result = result(config, "A", title, "2013"); - - FieldListImpl fl = new FieldListImpl(); - fl.add(new FieldValueImpl(Type.String, "desc", "lorem ipsum cabalie qwerty")); - - result.getFieldMap().put("desc", fl); - log.info(title); - log.info(ClusteringCombiner.combine(result, config)); - } - -} +//package eu.dnetlib.pace.clustering; +// +//import eu.dnetlib.pace.AbstractProtoPaceTest; +//import eu.dnetlib.pace.clustering.ClusteringCombiner; +//import eu.dnetlib.pace.config.Config; +//import eu.dnetlib.pace.config.Type; +//import eu.dnetlib.pace.model.FieldListImpl; +//import eu.dnetlib.pace.model.FieldValueImpl; +//import eu.dnetlib.pace.model.MapDocument; +//import org.apache.commons.logging.Log; +//import org.apache.commons.logging.LogFactory; +//import org.junit.Before; +//import org.junit.Test; +// +//public class ClusteringCombinerTest extends AbstractProtoPaceTest { +// +// private static final Log log = LogFactory.getLog(ClusteringCombinerTest.class); +// +// private Config config; +// +// @Before +// public void setUp() { +// config = getResultFullConf(); +// } +// +// @Test +// public void testCombine() { +// String title = "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission"; +// MapDocument result = result(config, "A", title, "2013"); +// +// FieldListImpl fl = new FieldListImpl(); +// fl.add(new FieldValueImpl(Type.String, "desc", "lorem ipsum cabalie qwerty")); +// +// result.getFieldMap().put("desc", fl); +// log.info(title); +// log.info(ClusteringCombiner.combine(result, config)); +// } +// +//} diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java index 7a265e5..90177e8 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/distance/DetectorTest.java @@ -1,450 +1,450 @@ -package eu.dnetlib.pace.distance; - -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; -import com.googlecode.protobuf.format.JsonFormat; -import eu.dnetlib.data.proto.OafProtos; -import eu.dnetlib.pace.AbstractProtoPaceTest; -import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.distance.eval.ScoreResult; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.model.ProtoDocumentBuilder; -import org.apache.commons.io.IOUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.junit.Ignore; -import org.junit.Test; - -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -public class DetectorTest extends AbstractProtoPaceTest { - - private static final Log log = LogFactory.getLog(DetectorTest.class); - - @Test - public void testDistanceResultSimple() { - final Config config = getResultSimpleConf(); - - final MapDocument resA = result(config, "A", "Recent results from CDF"); - final MapDocument resB = result(config, "B", "Recent results from CDF"); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - final double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue(d == 1.0); - } - - @Test - public void testDistanceResultSimpleMissingDates() { - final Config config = getResultSimpleConf(); - - final MapDocument resA = result(config, "A", "Recent results from BES"); - final MapDocument resB = result(config, "A", "Recent results from CES"); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - final double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue(d > 0.97); - } - - @Test - public void testDistanceResultInvalidDate() { - final Config config = getResultConf(); - - final MapDocument resA = result(config, "A", "title title title 6BESR", "2013-01-05"); - final MapDocument resB = result(config, "B", "title title title 6BESR", "qwerty"); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - final double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue(d == 1.0); - } - - @Ignore - @Test - public void testDistanceResultMissingOneDate() { - final Config config = getResultConf(); - - final MapDocument resA = result(config, "A", "title title title 6BESR", null); - final MapDocument resB = result(config, "B", "title title title 6CLER", "2012-02"); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue((d > 0.9) && (d < 1.0)); - } - - @Ignore - @Test - public void testDistanceResult() { - final Config config = getResultConf(); - - final MapDocument resA = result(config, "A", "title title title BES", ""); - final MapDocument resB = result(config, "B", "title title title CLEO"); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue((d > 0.9) && (d < 1.0)); - } - - @Ignore - @Test - public void testDistanceResultMissingTwoDate() { - final Config config = getResultConf(); - - final MapDocument resA = result(config, "A", "bellaciao"); - final MapDocument resB = result(config, "B", "bellocioa"); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue((d > 0.9) && (d < 1.0)); - } - - @Ignore - @Test - public void testDistanceOrganizationIgnoreMissing() { - - final Config config = getOrganizationSimpleConf(); - - final MapDocument orgA = organization(config, "A", "CONSIGLIO NAZIONALE DELLE RICERCHE"); - final MapDocument orgB = organization(config, "B", "CONSIGLIO NAZIONALE DELLE RICERCHE", "CNR"); - - final ScoreResult sr = new PaceDocumentDistance().between(orgA, orgB, config); - final double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue(d > 0.99); - } - - @Test - public void testDistanceResultCase1() { - - final Config config = getResultConf(); - - final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003"); - final MapDocument resB = result(config, "B", "Search for the Standard Model Higgs Boson", "2003"); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue((d > 0.9) && (d < 1.0)); - } - - @Test - public void testDistanceResultCaseDoiMatch1() { - final Config config = getResultConf(); - - final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003", "10.1594/PANGAEA.726855"); - final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", "10.1594/PANGAEA.726855"); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue("exact DOIs will produce an exact match", d == 1.0); - } - - @Test - public void testDistanceResultCaseDoiMatch2() { - final Config config = getResultConf(); - - final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1594/PANGAEA.726855"); - final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2005", "10.1594/PANGAEA.726855"); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue("exact DOIs will produce an exact match, regardless of different titles or publication years", d == 1.0); - } - - @Test - public void testDistanceResultCaseDoiMatch3() { - final Config config = getResultConf(); - - final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024"); - final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003"); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue("a missing DOI will casue the comparsion to continue with the following conditions", d == 1.0); - } - - @Test - public void testDistanceResultCaseDoiMatch4() { - final Config config = getResultConf(); - - final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024"); - final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2005"); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue("a missing DOI, comparsion continues with the following conditions, different publication years will drop the score to 0", d == 0.0); - } - - @Test - public void testDistanceResultCaseDoiMatch5() { - - final Config config = getResultConf(); - - final MapDocument resA = result(config, "A", "Search for the Standard Model Higgs Boson", "2003", "10.1016/j.jmb.2010.12.020"); - final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003"); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue("a missing DOI, comparsion continues with the following conditions", (d > 0.9) && (d < 1.0)); - } - - @Test - public void testDistanceResultCaseDoiMatch6() { - final Config config = getResultConf(); - - final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024"); - final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003", "anotherDifferentDOI"); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue("different DOIs will NOT drop the score to 0, then evaluate other fields", d == 1.0); - } - - @Test - public void testDistanceResultCaseDoiMatch7() { - final Config config = getResultConf(); - - final MapDocument resA = result(config, "A", "Adrenal Insufficiency asd asd", "1951", Lists.newArrayList("PMC2037944", "axdsds")); - final MapDocument resB = result(config, "B", "Adrenal Insufficiency", "1951", "PMC2037944"); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue("different DOIs will drop the score to 0, regardless of the other fields", d > 0.9 & d < 1); - } - - // http://dx.doi.org/10.1594/PANGAEA.726855 doi:10.1594/PANGAEA.726855 - - @Test - public void testDistanceResultCaseAuthor1() { - - final Config config = getResultAuthorsConf(); - - final List authorsA = Lists.newArrayList("a", "b", "c", "d"); - final List authorsB = Lists.newArrayList("a", "b", "c"); - final List pid = Lists.newArrayList(); - - final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA); - final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - final double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue(d == 0.0); - } - - @Test - public void testDistanceResultCaseAuthor2() { - - final Config config = getResultAuthorsConf(); - - final List authorsA = Lists.newArrayList("a", "b", "c"); - final List authorsB = Lists.newArrayList("a", "b", "c"); - final List pid = Lists.newArrayList(); - - final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA); - final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - final double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue(d == 1.0); - } - - @Test - public void testDistanceResultCaseAuthor3() { - - final Config config = getResultAuthorsConf(); - - final List authorsA = Lists.newArrayList("Bardi, A.", "Manghi, P.", "Artini, M."); - final List authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele"); - final List pid = Lists.newArrayList(); - - final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA); - final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - assertTrue((d > 0.9) && (d < 1.0)); - } - - @Test - public void testDistanceResultCaseAuthor4() { - - final Config config = getResultAuthorsConf(); - - final List authorsA = Lists.newArrayList("Bardi, Alessia", "Manghi, Paolo", "Artini, Michele", "a"); - final List authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele"); - final List pid = Lists.newArrayList(); - - final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA); - final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - final double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - // assertTrue(d.getScore() == 0.0); - } - - @Test - public void testDistanceResultNoPidsConf() { - - final Config config = getResultFullConf(); - - final MapDocument resA = - result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010"); - - final MapDocument resB = - result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010"); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - final double s = sr.getScore(); - - log.info(sr.toString()); - log.info(String.format(" s ---> %s", s)); - // assertTrue(d.getScore() == 0.0); - } - - @Test - public void testDistanceResultPidsConf() { - - final Config config = getResultFullConf(); - - final List authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva"); - final List authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie"); - - final List pidA = Lists.newArrayList("10.1186/1752-1947-4-299", "a", "b"); - final MapDocument resA = - result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010", - pidA, authorsA); - - final List pidB = Lists.newArrayList("c", "a", "10.1186/1752-1947-4-299", "d"); - final MapDocument resB = - result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010", - pidB, authorsB); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - final double s = sr.getScore(); - log.info(sr.toString()); - log.info(String.format(" s ---> %s", s)); - - // assertTrue(d.getScore() == 0.0); - } - - @Test - public void testDistanceResultFullConf() { - - final Config config = getResultFullConf(); - - final List authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva"); - final List authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie"); - - final MapDocument resA = - result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010", - "10.1186/1752-1947-4-299", authorsA); - - final MapDocument resB = - result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010", - "10.1186/1752-1947-4-299", authorsB); - - final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); - final double d = sr.getScore(); - log.info(String.format(" d ---> %s", d)); - - // assertTrue(d.getScore() == 0.0); - } - - @Ignore - @Test - public void testDistance() throws IOException { - - final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.json")); - - final MapDocument crossref = asMapDocument(conf, "/eu/dnetlib/pace/crossref.json"); - final MapDocument alicante = asMapDocument(conf, "/eu/dnetlib/pace/alicante.json"); - - final ScoreResult result = new PaceDocumentDistance().between(crossref, alicante, conf); - - log.info("score = " + result); - - } - - @Ignore - @Test - public void testDistanceOrgs() throws IOException { - - final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf")); - - final MapDocument orgA = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization1.json")); - final MapDocument orgB = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization2.json")); - - Set keysA = getGroupingKeys(conf, orgA); - Set keysB = getGroupingKeys(conf, orgB); - - assertFalse(String.format("A: %s\nB: %s", keysA, keysB), Sets.intersection(keysA, keysB).isEmpty()); - - log.info("clustering keys A = " + getGroupingKeys(conf, orgA)); - log.info("clustering keys B = " + getGroupingKeys(conf, orgB)); - - final ScoreResult result = new PaceDocumentDistance().between(orgA, orgB, conf); - - log.info("score = " + result); - log.info("distance = " + result.getScore()); - } - - private Set getGroupingKeys(DedupConfig conf, MapDocument doc) { - return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); - } - - private MapDocument asMapDocument(DedupConfig conf, final String json) { - OafProtos.OafEntity.Builder b = OafProtos.OafEntity.newBuilder(); - try { - JsonFormat.merge(json, b); - } catch (JsonFormat.ParseException e) { - throw new IllegalArgumentException(e); - } - return ProtoDocumentBuilder.newInstance(b.getId(), b.build(), conf.getPace().getModel()); - } - - -} +//package eu.dnetlib.pace.distance; +// +//import com.google.common.collect.Lists; +//import com.google.common.collect.Maps; +//import com.google.common.collect.Sets; +//import com.googlecode.protobuf.format.JsonFormat; +//import eu.dnetlib.data.proto.OafProtos; +//import eu.dnetlib.pace.AbstractProtoPaceTest; +//import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; +//import eu.dnetlib.pace.config.Config; +//import eu.dnetlib.pace.config.DedupConfig; +//import eu.dnetlib.pace.distance.eval.ScoreResult; +//import eu.dnetlib.pace.model.MapDocument; +//import eu.dnetlib.pace.model.ProtoDocumentBuilder; +//import org.apache.commons.io.IOUtils; +//import org.apache.commons.logging.Log; +//import org.apache.commons.logging.LogFactory; +//import org.junit.Ignore; +//import org.junit.Test; +// +//import java.io.IOException; +//import java.util.List; +//import java.util.Map; +//import java.util.Set; +//import java.util.stream.Collectors; +// +//import static org.junit.Assert.assertFalse; +//import static org.junit.Assert.assertTrue; +// +//public class DetectorTest extends AbstractProtoPaceTest { +// +// private static final Log log = LogFactory.getLog(DetectorTest.class); +// +// @Test +// public void testDistanceResultSimple() { +// final Config config = getResultSimpleConf(); +// +// final MapDocument resA = result(config, "A", "Recent results from CDF"); +// final MapDocument resB = result(config, "B", "Recent results from CDF"); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// final double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue(d == 1.0); +// } +// +// @Test +// public void testDistanceResultSimpleMissingDates() { +// final Config config = getResultSimpleConf(); +// +// final MapDocument resA = result(config, "A", "Recent results from BES"); +// final MapDocument resB = result(config, "A", "Recent results from CES"); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// final double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue(d > 0.97); +// } +// +// @Test +// public void testDistanceResultInvalidDate() { +// final Config config = getResultConf(); +// +// final MapDocument resA = result(config, "A", "title title title 6BESR", "2013-01-05"); +// final MapDocument resB = result(config, "B", "title title title 6BESR", "qwerty"); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// final double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue(d == 1.0); +// } +// +// @Ignore +// @Test +// public void testDistanceResultMissingOneDate() { +// final Config config = getResultConf(); +// +// final MapDocument resA = result(config, "A", "title title title 6BESR", null); +// final MapDocument resB = result(config, "B", "title title title 6CLER", "2012-02"); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue((d > 0.9) && (d < 1.0)); +// } +// +// @Ignore +// @Test +// public void testDistanceResult() { +// final Config config = getResultConf(); +// +// final MapDocument resA = result(config, "A", "title title title BES", ""); +// final MapDocument resB = result(config, "B", "title title title CLEO"); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue((d > 0.9) && (d < 1.0)); +// } +// +// @Ignore +// @Test +// public void testDistanceResultMissingTwoDate() { +// final Config config = getResultConf(); +// +// final MapDocument resA = result(config, "A", "bellaciao"); +// final MapDocument resB = result(config, "B", "bellocioa"); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue((d > 0.9) && (d < 1.0)); +// } +// +// @Ignore +// @Test +// public void testDistanceOrganizationIgnoreMissing() { +// +// final Config config = getOrganizationSimpleConf(); +// +// final MapDocument orgA = organization(config, "A", "CONSIGLIO NAZIONALE DELLE RICERCHE"); +// final MapDocument orgB = organization(config, "B", "CONSIGLIO NAZIONALE DELLE RICERCHE", "CNR"); +// +// final ScoreResult sr = new PaceDocumentDistance().between(orgA, orgB, config); +// final double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue(d > 0.99); +// } +// +// @Test +// public void testDistanceResultCase1() { +// +// final Config config = getResultConf(); +// +// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003"); +// final MapDocument resB = result(config, "B", "Search for the Standard Model Higgs Boson", "2003"); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue((d > 0.9) && (d < 1.0)); +// } +// +// @Test +// public void testDistanceResultCaseDoiMatch1() { +// final Config config = getResultConf(); +// +// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003", "10.1594/PANGAEA.726855"); +// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", "10.1594/PANGAEA.726855"); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue("exact DOIs will produce an exact match", d == 1.0); +// } +// +// @Test +// public void testDistanceResultCaseDoiMatch2() { +// final Config config = getResultConf(); +// +// final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1594/PANGAEA.726855"); +// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2005", "10.1594/PANGAEA.726855"); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue("exact DOIs will produce an exact match, regardless of different titles or publication years", d == 1.0); +// } +// +// @Test +// public void testDistanceResultCaseDoiMatch3() { +// final Config config = getResultConf(); +// +// final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024"); +// final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003"); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue("a missing DOI will casue the comparsion to continue with the following conditions", d == 1.0); +// } +// +// @Test +// public void testDistanceResultCaseDoiMatch4() { +// final Config config = getResultConf(); +// +// final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024"); +// final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2005"); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue("a missing DOI, comparsion continues with the following conditions, different publication years will drop the score to 0", d == 0.0); +// } +// +// @Test +// public void testDistanceResultCaseDoiMatch5() { +// +// final Config config = getResultConf(); +// +// final MapDocument resA = result(config, "A", "Search for the Standard Model Higgs Boson", "2003", "10.1016/j.jmb.2010.12.020"); +// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003"); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue("a missing DOI, comparsion continues with the following conditions", (d > 0.9) && (d < 1.0)); +// } +// +// @Test +// public void testDistanceResultCaseDoiMatch6() { +// final Config config = getResultConf(); +// +// final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024"); +// final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003", "anotherDifferentDOI"); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue("different DOIs will NOT drop the score to 0, then evaluate other fields", d == 1.0); +// } +// +// @Test +// public void testDistanceResultCaseDoiMatch7() { +// final Config config = getResultConf(); +// +// final MapDocument resA = result(config, "A", "Adrenal Insufficiency asd asd", "1951", Lists.newArrayList("PMC2037944", "axdsds")); +// final MapDocument resB = result(config, "B", "Adrenal Insufficiency", "1951", "PMC2037944"); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue("different DOIs will drop the score to 0, regardless of the other fields", d > 0.9 & d < 1); +// } +// +// // http://dx.doi.org/10.1594/PANGAEA.726855 doi:10.1594/PANGAEA.726855 +// +// @Test +// public void testDistanceResultCaseAuthor1() { +// +// final Config config = getResultAuthorsConf(); +// +// final List authorsA = Lists.newArrayList("a", "b", "c", "d"); +// final List authorsB = Lists.newArrayList("a", "b", "c"); +// final List pid = Lists.newArrayList(); +// +// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA); +// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// final double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue(d == 0.0); +// } +// +// @Test +// public void testDistanceResultCaseAuthor2() { +// +// final Config config = getResultAuthorsConf(); +// +// final List authorsA = Lists.newArrayList("a", "b", "c"); +// final List authorsB = Lists.newArrayList("a", "b", "c"); +// final List pid = Lists.newArrayList(); +// +// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA); +// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// final double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue(d == 1.0); +// } +// +// @Test +// public void testDistanceResultCaseAuthor3() { +// +// final Config config = getResultAuthorsConf(); +// +// final List authorsA = Lists.newArrayList("Bardi, A.", "Manghi, P.", "Artini, M."); +// final List authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele"); +// final List pid = Lists.newArrayList(); +// +// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA); +// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// assertTrue((d > 0.9) && (d < 1.0)); +// } +// +// @Test +// public void testDistanceResultCaseAuthor4() { +// +// final Config config = getResultAuthorsConf(); +// +// final List authorsA = Lists.newArrayList("Bardi, Alessia", "Manghi, Paolo", "Artini, Michele", "a"); +// final List authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele"); +// final List pid = Lists.newArrayList(); +// +// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA); +// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// final double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// // assertTrue(d.getScore() == 0.0); +// } +// +// @Test +// public void testDistanceResultNoPidsConf() { +// +// final Config config = getResultFullConf(); +// +// final MapDocument resA = +// result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010"); +// +// final MapDocument resB = +// result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010"); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// final double s = sr.getScore(); +// +// log.info(sr.toString()); +// log.info(String.format(" s ---> %s", s)); +// // assertTrue(d.getScore() == 0.0); +// } +// +// @Test +// public void testDistanceResultPidsConf() { +// +// final Config config = getResultFullConf(); +// +// final List authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva"); +// final List authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie"); +// +// final List pidA = Lists.newArrayList("10.1186/1752-1947-4-299", "a", "b"); +// final MapDocument resA = +// result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010", +// pidA, authorsA); +// +// final List pidB = Lists.newArrayList("c", "a", "10.1186/1752-1947-4-299", "d"); +// final MapDocument resB = +// result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010", +// pidB, authorsB); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// final double s = sr.getScore(); +// log.info(sr.toString()); +// log.info(String.format(" s ---> %s", s)); +// +// // assertTrue(d.getScore() == 0.0); +// } +// +// @Test +// public void testDistanceResultFullConf() { +// +// final Config config = getResultFullConf(); +// +// final List authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva"); +// final List authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie"); +// +// final MapDocument resA = +// result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010", +// "10.1186/1752-1947-4-299", authorsA); +// +// final MapDocument resB = +// result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010", +// "10.1186/1752-1947-4-299", authorsB); +// +// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config); +// final double d = sr.getScore(); +// log.info(String.format(" d ---> %s", d)); +// +// // assertTrue(d.getScore() == 0.0); +// } +// +// @Ignore +// @Test +// public void testDistance() throws IOException { +// +// final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.json")); +// +// final MapDocument crossref = asMapDocument(conf, "/eu/dnetlib/pace/crossref.json"); +// final MapDocument alicante = asMapDocument(conf, "/eu/dnetlib/pace/alicante.json"); +// +// final ScoreResult result = new PaceDocumentDistance().between(crossref, alicante, conf); +// +// log.info("score = " + result); +// +// } +// +// @Ignore +// @Test +// public void testDistanceOrgs() throws IOException { +// +// final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf")); +// +// final MapDocument orgA = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization1.json")); +// final MapDocument orgB = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization2.json")); +// +// Set keysA = getGroupingKeys(conf, orgA); +// Set keysB = getGroupingKeys(conf, orgB); +// +// assertFalse(String.format("A: %s\nB: %s", keysA, keysB), Sets.intersection(keysA, keysB).isEmpty()); +// +// log.info("clustering keys A = " + getGroupingKeys(conf, orgA)); +// log.info("clustering keys B = " + getGroupingKeys(conf, orgB)); +// +// final ScoreResult result = new PaceDocumentDistance().between(orgA, orgB, conf); +// +// log.info("score = " + result); +// log.info("distance = " + result.getScore()); +// } +// +// private Set getGroupingKeys(DedupConfig conf, MapDocument doc) { +// return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); +// } +// +// private MapDocument asMapDocument(DedupConfig conf, final String json) { +// OafProtos.OafEntity.Builder b = OafProtos.OafEntity.newBuilder(); +// try { +// JsonFormat.merge(json, b); +// } catch (JsonFormat.ParseException e) { +// throw new IllegalArgumentException(e); +// } +// return ProtoDocumentBuilder.newInstance(b.getId(), b.build(), conf.getPace().getModel()); +// } +// +// +//} diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java index 56ddc2c..39e1a91 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/model/ProtoDocumentBuilderTest.java @@ -1,50 +1,50 @@ -package eu.dnetlib.pace.model; - -import com.google.common.collect.Iterables; -import com.google.common.collect.Sets; -import com.google.common.collect.Sets.SetView; -import eu.dnetlib.pace.AbstractProtoPaceTest; -import eu.dnetlib.pace.config.Config; -import eu.dnetlib.pace.distance.DetectorTest; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.model.MapDocumentSerializer; -import eu.dnetlib.pace.model.ProtoDocumentBuilder; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.junit.Test; - -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - -public class ProtoDocumentBuilderTest extends AbstractProtoPaceTest { - - private static final Log log = LogFactory.getLog(ProtoDocumentBuilderTest.class); - - @Test - public void test_serialise1() { - - final String id = "12345"; - - final Config config = getResultFullConf(); - - final MapDocument document = ProtoDocumentBuilder.newInstance(id, getResult(id), config.model()); - - assertFalse(document.fieldNames().isEmpty()); - assertFalse(Iterables.isEmpty(document.fields())); - - log.info("original:\n" + document); - - final String stringDoc = MapDocumentSerializer.toString(document); - - log.info("srialization:\n" + stringDoc); - - final MapDocument decoded = MapDocumentSerializer.decode(stringDoc.getBytes()); - - final SetView diff = Sets.difference(document.fieldNames(), decoded.fieldNames()); - - assertTrue(diff.isEmpty()); - - log.info("decoded:\n" + decoded); - } - -} +//package eu.dnetlib.pace.model; +// +//import com.google.common.collect.Iterables; +//import com.google.common.collect.Sets; +//import com.google.common.collect.Sets.SetView; +//import eu.dnetlib.pace.AbstractProtoPaceTest; +//import eu.dnetlib.pace.config.Config; +//import eu.dnetlib.pace.distance.DetectorTest; +//import eu.dnetlib.pace.model.MapDocument; +//import eu.dnetlib.pace.model.MapDocumentSerializer; +//import eu.dnetlib.pace.model.ProtoDocumentBuilder; +//import org.apache.commons.logging.Log; +//import org.apache.commons.logging.LogFactory; +//import org.junit.Test; +// +//import static org.junit.Assert.assertFalse; +//import static org.junit.Assert.assertTrue; +// +//public class ProtoDocumentBuilderTest extends AbstractProtoPaceTest { +// +// private static final Log log = LogFactory.getLog(ProtoDocumentBuilderTest.class); +// +// @Test +// public void test_serialise1() { +// +// final String id = "12345"; +// +// final Config config = getResultFullConf(); +// +// final MapDocument document = ProtoDocumentBuilder.newInstance(id, getResult(id), config.model()); +// +// assertFalse(document.fieldNames().isEmpty()); +// assertFalse(Iterables.isEmpty(document.fields())); +// +// log.info("original:\n" + document); +// +// final String stringDoc = MapDocumentSerializer.toString(document); +// +// log.info("srialization:\n" + stringDoc); +// +// final MapDocument decoded = MapDocumentSerializer.decode(stringDoc.getBytes()); +// +// final SetView diff = Sets.difference(document.fieldNames(), decoded.fieldNames()); +// +// assertTrue(diff.isEmpty()); +// +// log.info("decoded:\n" + decoded); +// } +// +//} diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml index fc4acf1..1a41a74 100644 --- a/dnet-pace-core/pom.xml +++ b/dnet-pace-core/pom.xml @@ -55,6 +55,12 @@ junit test + + org.reflections + reflections + 0.9.10 + + diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java index 6f29f22..2885994 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java @@ -18,7 +18,13 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i public AbstractClusteringFunction(final Map params) { this.params = params; } - + + public AbstractClusteringFunction(){} + + public void setParams(Map params){ + this.params = params; + } + protected abstract Collection doApply(String s); @Override diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java index 1897e6a..09d2ce0 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java @@ -7,12 +7,17 @@ import java.util.StringTokenizer; import com.google.common.collect.Sets; +@ClusteringClass("acronyms") public class Acronyms extends AbstractClusteringFunction { public Acronyms(Map params) { super(params); } + public Acronyms(){ + super(); + } + @Override protected Collection doApply(String s) { return extractAcronyms(s, param("max"), param("minLen"), param("maxLen")); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java index 4ecedc4..0167d2f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java @@ -22,9 +22,6 @@ public class BlacklistAwareClusteringCombiner extends ClusteringCombiner { private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombiner.class); - - - public static Collection filterAndCombine(final MapDocument a, final Config conf) { final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, conf.blacklists()); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java deleted file mode 100644 index 7257540..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java +++ /dev/null @@ -1,5 +0,0 @@ -package eu.dnetlib.pace.clustering; - -public enum Clustering { - acronyms, ngrams, ngrampairs, sortedngrampairs, suffixprefix, spacetrimmingfieldvalue, immutablefieldvalue, personhash, personclustering, lowercase, urlclustering -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java new file mode 100644 index 0000000..e677671 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringClass.java @@ -0,0 +1,13 @@ +package eu.dnetlib.pace.clustering; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface ClusteringClass { + + public String value(); +} \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java index 4fe1b59..040b928 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java @@ -12,4 +12,5 @@ public interface ClusteringFunction { public Map getParams(); + public void setParams(Map params); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringResolver.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringResolver.java new file mode 100644 index 0000000..06a364c --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringResolver.java @@ -0,0 +1,24 @@ +package eu.dnetlib.pace.clustering; + +import java.io.Serializable; +import java.lang.reflect.InvocationTargetException; +import java.util.Map; +import java.util.stream.Collectors; + +import org.reflections.Reflections; + +public class ClusteringResolver implements Serializable { + private final Map> functionMap; + + public ClusteringResolver() { + + this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ClusteringClass.class).stream() + .filter(ClusteringFunction.class::isAssignableFrom) + .collect(Collectors.toMap(cl -> cl.getAnnotation(ClusteringClass.class).value(), cl -> (Class)cl)); + } + + public ClusteringFunction resolve(String clusteringFunction) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException { + + return functionMap.get(clusteringFunction).newInstance(); + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java index 988476d..2d5b67a 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java @@ -6,12 +6,17 @@ import java.util.Map; import com.google.common.collect.Lists; +@ClusteringClass("immutablefieldvalue") public class ImmutableFieldValue extends AbstractClusteringFunction { public ImmutableFieldValue(final Map params) { super(params); } + public ImmutableFieldValue() { + super(); + } + @Override protected Collection doApply(final String s) { final List res = Lists.newArrayList(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java index 6d00992..50d73cf 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java @@ -9,12 +9,17 @@ import com.google.common.collect.Sets; import eu.dnetlib.pace.model.Field; import org.apache.commons.lang.StringUtils; +@ClusteringClass("lowercase") public class LowercaseClustering extends AbstractClusteringFunction { public LowercaseClustering(final Map params) { super(params); } + public LowercaseClustering(){ + super(); + } + @Override public Collection apply(List fields) { Collection c = Sets.newLinkedHashSet(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java index 3cffa4d..6c96ca2 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java @@ -6,8 +6,13 @@ import java.util.Map; import com.google.common.collect.Lists; +@ClusteringClass("ngrampairs") public class NgramPairs extends Ngrams { + public NgramPairs() { + super(); + } + public NgramPairs(Map params) { super(params); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java index aaba9af..49ce404 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java @@ -5,12 +5,17 @@ import java.util.LinkedHashSet; import java.util.Map; import java.util.StringTokenizer; +@ClusteringClass("ngrams") public class Ngrams extends AbstractClusteringFunction { public Ngrams(Map params) { super(params); } - + + public Ngrams() { + super(); + } + @Override protected Collection doApply(String s) { return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen")); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java index d717077..4230079 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.clustering; +import java.io.Serializable; import java.util.Collection; import java.util.List; import java.util.Map; @@ -18,6 +19,7 @@ import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.gt.Author; import eu.dnetlib.pace.model.gt.GTAuthor; +@ClusteringClass("personclustering") public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction { private Map params; @@ -28,6 +30,10 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin this.params = params; } + public void setParams(Map params){ + this.params = params; + } + @Override public Collection apply(final List fields) { final Set hashes = Sets.newHashSet(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java index 42d9d5b..b0e57e9 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java @@ -8,6 +8,7 @@ import com.google.common.collect.Lists; import eu.dnetlib.pace.model.Person; +@ClusteringClass("personhash") public class PersonHash extends AbstractClusteringFunction { private boolean DEFAULT_AGGRESSIVE = false; @@ -16,6 +17,10 @@ public class PersonHash extends AbstractClusteringFunction { super(params); } + public PersonHash(){ + super(); + } + @Override protected Collection doApply(final String s) { final List res = Lists.newArrayList(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java index f012aac..893abe8 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java @@ -9,6 +9,10 @@ public class RandomClusteringFunction extends AbstractClusteringFunction { super(params); } + public RandomClusteringFunction(){ + super(); + } + @Override protected Collection doApply(String s) { // TODO Auto-generated method stub diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java index 56e6604..9ce12fc 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java @@ -9,12 +9,17 @@ import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Lists; +@ClusteringClass("sortedngrampairs") public class SortedNgramPairs extends NgramPairs { public SortedNgramPairs(Map params) { super(params); } + public SortedNgramPairs(){ + super(); + } + @Override protected Collection doApply(String s) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java index 19a51d4..8e1fdf3 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java @@ -9,12 +9,17 @@ import org.apache.commons.lang.StringUtils; import com.google.common.collect.Lists; +@ClusteringClass("spacetrimmingfieldvalue") public class SpaceTrimmingFieldValue extends AbstractClusteringFunction { public SpaceTrimmingFieldValue(final Map params) { super(params); } + public SpaceTrimmingFieldValue(){ + super(); + } + @Override protected Collection doApply(final String s) { final List res = Lists.newArrayList(); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java index 3ed336a..25520d9 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java @@ -6,12 +6,17 @@ import java.util.Set; import com.google.common.collect.Sets; +@ClusteringClass("suffixprefix") public class SuffixPrefix extends AbstractClusteringFunction { public SuffixPrefix(Map params) { super(params); } + public SuffixPrefix(){ + super(); + } + @Override protected Collection doApply(String s) { return suffixPrefix(s, param("len"), param("max")); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java index 1962814..4c0c33f 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java @@ -11,6 +11,7 @@ import java.util.List; import java.util.Map; import java.util.stream.Collectors; +@ClusteringClass("urlclustering") public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction { protected Map params; @@ -19,6 +20,14 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu this.params = params; } + public UrlClustering() { + super(); + } + + public void setParams(Map params){ + this.params = params; + } + @Override public Collection apply(List fields) { return fields.stream() diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java index bbfac97..adc6825 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java @@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition; import java.util.List; import eu.dnetlib.pace.common.AbstractPaceFunctions; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.distance.eval.ConditionEvalMap; import eu.dnetlib.pace.model.Document; @@ -17,15 +16,25 @@ import eu.dnetlib.pace.model.FieldDef; */ public abstract class AbstractCondition extends AbstractPaceFunctions implements ConditionAlgo { - protected Cond cond; + protected String cond; protected List fields; - public AbstractCondition(final Cond cond, final List fields) { + public AbstractCondition(final String cond, final List fields) { this.cond = cond; this.fields = fields; } + public AbstractCondition(){} + + public void setCond(String cond){ + this.cond = cond; + } + + public void setFields(List fields){ + this.fields = fields; + } + protected abstract ConditionEval verify(FieldDef fd, Field a, Field b); @Override diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java index f9ff2b6..a67567e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java @@ -1,7 +1,6 @@ package eu.dnetlib.pace.condition; import java.util.List; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -11,12 +10,16 @@ import eu.dnetlib.pace.model.FieldDef; * * @author claudio */ +@ConditionClass("alwaystruecondition") public class AlwaysTrueCondition extends AbstractCondition { - public AlwaysTrueCondition(final Cond cond, final List fields) { + public AlwaysTrueCondition(final String cond, final List fields) { super(cond, fields); } + public AlwaysTrueCondition(){ + super(); + } @Override protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { return new ConditionEval(cond, a, b, 1); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java index ceb7c73..1293c7d 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java @@ -1,9 +1,9 @@ package eu.dnetlib.pace.condition; -import java.util.Map; - +import java.util.List; import eu.dnetlib.pace.distance.eval.ConditionEvalMap; import eu.dnetlib.pace.model.Document; +import eu.dnetlib.pace.model.FieldDef; /** * Allows to express general conditions to be satisfied or not between two Documents. @@ -24,4 +24,7 @@ public interface ConditionAlgo { */ public abstract ConditionEvalMap verify(Document a, Document b); + public void setFields(List fields); + public void setCond(String name); + } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionClass.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionClass.java new file mode 100644 index 0000000..155360c --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionClass.java @@ -0,0 +1,13 @@ +package eu.dnetlib.pace.condition; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface ConditionClass { + + public String value(); +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionResolver.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionResolver.java new file mode 100644 index 0000000..58a30dd --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionResolver.java @@ -0,0 +1,22 @@ +package eu.dnetlib.pace.condition; + +import java.io.Serializable; +import java.util.Map; +import java.util.stream.Collectors; + +import org.reflections.Reflections; + +public class ConditionResolver implements Serializable { + private final Map> functionMap; + + public ConditionResolver() { + + this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(ConditionClass.class).stream() + .filter(ConditionAlgo.class::isAssignableFrom) + .collect(Collectors.toMap(cl -> cl.getAnnotation(ConditionClass.class).value(), cl -> (Class)cl)); + } + + public ConditionAlgo resolve(String name) throws IllegalAccessException, InstantiationException { + return functionMap.get(name).newInstance(); + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java index 25b1a01..dfdc5cd 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java @@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition; import java.util.List; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -11,11 +10,12 @@ import eu.dnetlib.pace.model.FieldDef; * * @author claudio */ +@ConditionClass("doiExactMatch") public class DoiExactMatch extends ExactMatchIgnoreCase { public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; - public DoiExactMatch(final Cond cond, final List fields) { + public DoiExactMatch(final String cond, final List fields) { super(cond, fields); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java index 4f0f371..f4ba8de 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java @@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition; import java.util.List; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -13,12 +12,17 @@ import org.apache.commons.lang.StringUtils; * * @author claudio */ +@ConditionClass("exactMatch") public class ExactMatch extends AbstractCondition { - public ExactMatch(final Cond cond, final List fields) { + public ExactMatch(final String cond, final List fields) { super(cond, fields); } + public ExactMatch(){ + super(); + } + @Override protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java index 8baad5b..7741f38 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java @@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition; import java.util.List; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -12,9 +11,10 @@ import eu.dnetlib.pace.model.FieldDef; * * @author claudio */ +@ConditionClass("exactMatchIgnoreCase") public class ExactMatchIgnoreCase extends AbstractCondition { - public ExactMatchIgnoreCase(final Cond cond, final List fields) { + public ExactMatchIgnoreCase(final String cond, final List fields) { super(cond, fields); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java index bc99a4c..f2b3bdb 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java @@ -3,7 +3,6 @@ package eu.dnetlib.pace.condition; import java.util.List; import com.google.common.collect.Iterables; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -13,6 +12,7 @@ import eu.dnetlib.pace.model.FieldDef; * * @author claudio */ +@ConditionClass("mustBeDifferent") public class MustBeDifferent extends AbstractCondition { /** @@ -20,7 +20,7 @@ public class MustBeDifferent extends AbstractCondition { * * @param fields the fields */ - public MustBeDifferent(final Cond cond, final List fields) { + public MustBeDifferent(final String cond, final List fields) { super(cond, fields); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java index a20ab95..53aa2de 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java @@ -6,7 +6,6 @@ import java.util.Set; import java.util.stream.Collectors; import com.google.common.collect.Sets; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -20,11 +19,12 @@ import org.apache.commons.logging.LogFactory; * * @author claudio */ +@ConditionClass("pidMatch") public class PidMatch extends AbstractCondition { private static final Log log = LogFactory.getLog(PidMatch.class); - public PidMatch(final Cond cond, final List fields) { + public PidMatch(final String cond, final List fields) { super(cond, fields); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java index ae6e940..afd0a8e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java @@ -4,7 +4,6 @@ import java.util.List; import com.google.common.collect.Iterables; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -14,6 +13,7 @@ import eu.dnetlib.pace.model.FieldDef; * * @author claudio */ +@ConditionClass("sizeMatch") public class SizeMatch extends AbstractCondition { /** @@ -22,7 +22,7 @@ public class SizeMatch extends AbstractCondition { * @param fields * the fields */ - public SizeMatch(final Cond cond, final List fields) { + public SizeMatch(final String cond, final List fields) { super(cond, fields); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java index 41a617a..4b94a04 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java @@ -2,7 +2,6 @@ package eu.dnetlib.pace.condition; import java.util.List; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; @@ -13,9 +12,10 @@ import eu.dnetlib.pace.model.FieldDef; * @author claudio * */ +@ConditionClass("titleVersionMatch") public class TitleVersionMatch extends AbstractCondition { - public TitleVersionMatch(final Cond cond, final List fields) { + public TitleVersionMatch(final String cond, final List fields) { super(cond, fields); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java index 8971842..54d0ba8 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java @@ -1,8 +1,8 @@ package eu.dnetlib.pace.condition; +import java.time.Year; import java.util.List; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.distance.eval.ConditionEval; import org.apache.commons.lang.StringUtils; @@ -14,14 +14,17 @@ import eu.dnetlib.pace.model.FieldDef; * * @author claudio */ +@ConditionClass("yearMatch") public class YearMatch extends AbstractCondition { private int limit = 4; - public YearMatch(final Cond cond, final List fields) { + public YearMatch(final String cond, final List fields) { super(cond, fields); } + public YearMatch(){} + // @Override // public boolean verify(final Document a, final Document b) { // boolean res = true; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Algo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Algo.java deleted file mode 100644 index cb2e434..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Algo.java +++ /dev/null @@ -1,46 +0,0 @@ -package eu.dnetlib.pace.config; - -/** - * Enumerates the distance Algos. - */ -public enum Algo { - - /** The Jaro winkler. */ - JaroWinkler, - /** The Jaro winkler title. */ - JaroWinklerTitle, - /** The Levenstein. */ - Levenstein, - /** The Levenstein distance for title matching */ - LevensteinTitle, - /** The Level2 jaro winkler. */ - Level2JaroWinkler, - /** The Level2 jaro winkler for title matching */ - Level2JaroWinklerTitle, - /** The Level2 levenstein. */ - Level2Levenstein, - /** The Sub string levenstein. */ - SubStringLevenstein, - /** The Year levenstein. */ - YearLevenstein, - /** The Sorted jaro winkler. */ - SortedJaroWinkler, - /** The Sorted level2 jaro winkler. */ - SortedLevel2JaroWinkler, - /** Compares two urls */ - urlMatcher, - /** Exact match algo. */ - ExactMatch, - /** - * Returns 0 for equal strings, 1 for different strings. - */ - MustBeDifferent, - /** Always return 1.0 as distance. */ - AlwaysMatch, - /** Person distance */ - PersonCoAuthorSurnamesDistance, - PersonCoAnchorsDistance, - PersonDistance, - /** The Null. */ - Null -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Cond.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Cond.java deleted file mode 100644 index b287fdd..0000000 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Cond.java +++ /dev/null @@ -1,28 +0,0 @@ -package eu.dnetlib.pace.config; - -/** - * The Enum Cond. - */ -public enum Cond { - - /** The year match. */ - yearMatch, - /** The title version match. */ - titleVersionMatch, - /** The size match. */ - sizeMatch, - /** - * Returns true if the field values are different - */ - mustBeDifferent, - /** The Exact match. */ - exactMatch, - /** - * The Exact match ignore case. - */ - exactMatchIgnoreCase, - /** The Exact match specialized to recognize DOI values. */ - doiExactMatch, - /** The Exact match that checks if pid type and value are the same */ - pidMatch -} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java index e9d0095..c2749c5 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java @@ -2,6 +2,8 @@ package eu.dnetlib.pace.distance; import eu.dnetlib.pace.model.Field; +import java.util.Map; + /** * Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two * objects. @@ -11,5 +13,9 @@ public interface DistanceAlgo { public abstract double distance(Field a, Field b); public double getWeight(); + public Map getParams(); + + public void setWeight(double w); + public void setParams(Map params); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceClass.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceClass.java new file mode 100644 index 0000000..9479fdb --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceClass.java @@ -0,0 +1,13 @@ +package eu.dnetlib.pace.distance; + +import java.lang.annotation.ElementType; +import java.lang.annotation.Retention; +import java.lang.annotation.RetentionPolicy; +import java.lang.annotation.Target; + +@Retention(RetentionPolicy.RUNTIME) +@Target(ElementType.TYPE) +public @interface DistanceClass { + + public String value(); +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceResolver.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceResolver.java new file mode 100644 index 0000000..0937760 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceResolver.java @@ -0,0 +1,24 @@ +package eu.dnetlib.pace.distance; + +import java.io.Serializable; +import java.lang.reflect.InvocationTargetException; +import java.util.Map; +import java.util.stream.Collectors; + +import org.reflections.Reflections; + +public class DistanceResolver implements Serializable { + private final Map> functionMap; + + public DistanceResolver() { + + this.functionMap = new Reflections("eu.dnetlib").getTypesAnnotatedWith(DistanceClass.class).stream() + .filter(DistanceAlgo.class::isAssignableFrom) + .collect(Collectors.toMap(cl -> cl.getAnnotation(DistanceClass.class).value(), cl -> (Class)cl)); + } + + public DistanceAlgo resolve(String algo) throws IllegalAccessException, InstantiationException { + + return functionMap.get(algo).newInstance(); + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java index 0cbb6f4..467a19c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java @@ -25,7 +25,7 @@ public class DistanceScorer { } public ScoreResult distance(final Document a, final Document b) { - final ScoreResult sr = new ScoreResult(); + final ScoreResult sr = new ScoreResult(); //to keep track of the result of the comparison sr.setStrictConditions(verify(a, b, config.strictConditions())); sr.setConditions(verify(a, b, config.conditions())); diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java index 8329604..785c00b 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java @@ -1,6 +1,8 @@ package eu.dnetlib.pace.distance; +import java.io.Serializable; import java.util.List; +import java.util.Map; import com.wcohen.ss.AbstractStringDistance; @@ -24,6 +26,27 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp /** The weight. */ protected double weight = 0.0; + private Map params; + + protected SecondStringDistanceAlgo(){ + } + + protected SecondStringDistanceAlgo(Map params){ + this.params = params; + } + + public void setWeight(double w){ + this.weight = w; + } + + public Map getParams(){ + return this.params; + } + + public void setParams(Map params){ + this.params = params; + } + /** * Instantiates a new second string distance algo. * @@ -37,6 +60,10 @@ public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions imp this.weight = weight; } + protected SecondStringDistanceAlgo(final AbstractStringDistance ssalgo){ + this.ssalgo = ssalgo; + } + /** * Normalize. * diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java index 9044982..7039f05 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java @@ -1,10 +1,22 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import java.util.Map; + +@DistanceClass("AlwaysMatch") public class AlwaysMatch extends SecondStringDistanceAlgo { + public AlwaysMatch(){ + super(); + } + + public AlwaysMatch(final Map params){ + super(params); + } + public AlwaysMatch(final double weight) { super(weight, new com.wcohen.ss.JaroWinkler()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java index ef95c02..2e714c4 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java @@ -1,10 +1,22 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import java.util.Map; + +@DistanceClass("ExactMatch") public class ExactMatch extends SecondStringDistanceAlgo { + public ExactMatch(){ + super(); + } + + public ExactMatch(Map params){ + super(params); + } + public ExactMatch(final double weight) { super(weight, new com.wcohen.ss.JaroWinkler()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java index 87f6c4e..ea1e079 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java @@ -1,11 +1,23 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import java.util.Map; + //case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) +@DistanceClass("JaroWinkler") public class JaroWinkler extends SecondStringDistanceAlgo { + public JaroWinkler(){ + super(); + } + + public JaroWinkler(Map params){ + super(params); + } + public JaroWinkler(double weight) { super(weight, new com.wcohen.ss.JaroWinkler()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java index 1419a07..b37c88d 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java @@ -1,11 +1,23 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import java.util.Map; + //case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) +@DistanceClass("JaroWinklerTitle") public class JaroWinklerTitle extends SecondStringDistanceAlgo { + public JaroWinklerTitle(){ + super(); + } + + public JaroWinklerTitle(Map params){ + super(params); + } + public JaroWinklerTitle(double weight) { super(weight, new com.wcohen.ss.JaroWinkler()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java index 3ad1cfa..a2afc38 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java @@ -1,8 +1,10 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +@DistanceClass("Level2JaroWinkler") public class Level2JaroWinkler extends SecondStringDistanceAlgo { public Level2JaroWinkler(double w) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java index a1c3472..272e530 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java @@ -1,8 +1,10 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +@DistanceClass("Level2JaroWinklerTitle") public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo { public Level2JaroWinklerTitle(final double w) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java index 7a2b029..1e955bd 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java @@ -1,8 +1,10 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +@DistanceClass("Level2Levenstein") public class Level2Levenstein extends SecondStringDistanceAlgo { public Level2Levenstein(double w) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java index 9dfce83..2e014b6 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java @@ -1,10 +1,16 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +@DistanceClass("Levenstein") public class Levenstein extends SecondStringDistanceAlgo { + public Levenstein(){ + super(new com.wcohen.ss.Levenstein()); + } + public Levenstein(double w) { super(w, new com.wcohen.ss.Levenstein()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java index 281de31..c66f972 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java @@ -1,10 +1,16 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +@DistanceClass("LevensteinTitle") public class LevensteinTitle extends SecondStringDistanceAlgo { + public LevensteinTitle(){ + super(new com.wcohen.ss.Levenstein()); + } + public LevensteinTitle(final double w) { super(w, new com.wcohen.ss.Levenstein()); } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java index 1177ed5..0acb82c 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java @@ -1,8 +1,10 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +@DistanceClass("MustBeDifferent") public class MustBeDifferent extends SecondStringDistanceAlgo { public MustBeDifferent(final double weight) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java index 8070a00..ef798cb 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java @@ -1,12 +1,16 @@ package eu.dnetlib.pace.distance.algo; import eu.dnetlib.pace.distance.DistanceAlgo; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.model.Field; +import java.util.Map; + /** * Not all fields of a document need to partecipate in the distance measure. We model those fields as having a * NullDistanceAlgo. */ +@DistanceClass("Null") public class NullDistanceAlgo implements DistanceAlgo { @Override @@ -19,4 +23,16 @@ public class NullDistanceAlgo implements DistanceAlgo { return 0.0; } + @Override + public void setWeight(double w){ + } + + @Override + public Map getParams() { + return null; + } + + @Override + public void setParams(Map params) { + } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java index d834207..5f71600 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java @@ -1,10 +1,12 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; /** * The Class SortedJaroWinkler. */ +@DistanceClass("SortedJaroWinkler") public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo { /** diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java index 43ac190..493bbef 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java @@ -1,10 +1,12 @@ package eu.dnetlib.pace.distance.algo; import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.DistanceClass; /** * The Class SortedJaroWinkler. */ +@DistanceClass("Sorted2JaroWinkler") public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo { /** diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java index 1fa358b..9fee7df 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.distance.algo; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; import org.apache.commons.lang.StringUtils; @@ -8,14 +9,21 @@ import com.wcohen.ss.AbstractStringDistance; import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.model.Field; +import java.util.Map; + /** * The Class SubStringLevenstein. */ +@DistanceClass("SubStringLevenstein") public class SubStringLevenstein extends SecondStringDistanceAlgo { /** The limit. */ protected int limit; + public SubStringLevenstein() { + super(new com.wcohen.ss.Levenstein()); + } + /** * Instantiates a new sub string levenstein. * @@ -87,4 +95,9 @@ public class SubStringLevenstein extends SecondStringDistanceAlgo { return 1 / Math.pow(Math.abs(d) + 1, 0.1); } + public void setParams(Map params){ + this.limit = params.get("limit").intValue(); //necessary because this class needs also the limit + super.setParams(params); + } + } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java index 46a438e..2aa7ca1 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.distance.algo; +import eu.dnetlib.pace.distance.DistanceClass; import eu.dnetlib.pace.model.Field; import org.apache.commons.lang.StringUtils; @@ -7,15 +8,24 @@ import java.net.MalformedURLException; import java.net.URL; import java.util.Map; +@DistanceClass("urlMatcher") public class UrlMatcher extends Levenstein { private Map params; + public UrlMatcher(){ + super(); + } + public UrlMatcher(double weight, Map params) { super(weight); this.params = params; } + public void setParams(Map params) { + this.params = params; + } + @Override public double distance(Field a, Field b) { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java index 49e526f..d3fcee5 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java @@ -1,6 +1,5 @@ package eu.dnetlib.pace.distance.eval; -import eu.dnetlib.pace.config.Cond; import eu.dnetlib.pace.model.Field; /** @@ -8,7 +7,7 @@ import eu.dnetlib.pace.model.Field; */ public class ConditionEval { - private Cond cond; + private String cond; private Field a; @@ -16,7 +15,7 @@ public class ConditionEval { private int result; - public ConditionEval(final Cond cond, final Field a, final Field b, final int result) { + public ConditionEval(final String cond, final Field a, final Field b, final int result) { this.cond = cond; this.a = a; this.b = b; @@ -47,11 +46,11 @@ public class ConditionEval { this.result = result; } - public Cond getCond() { + public String getCond() { return cond; } - public void setCond(final Cond cond) { + public void setCond(final String cond) { this.cond = cond; } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java index a943d4c..ef3c4da 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java @@ -1,6 +1,5 @@ package eu.dnetlib.pace.distance.eval; -import eu.dnetlib.pace.config.Algo; import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.FieldDef; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java index b5cdad7..61d5c93 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java @@ -2,10 +2,12 @@ package eu.dnetlib.pace.distance.eval; import com.google.gson.GsonBuilder; +import java.io.Serializable; + /** * Created by claudio on 09/03/16. */ -public class ScoreResult { +public class ScoreResult implements Serializable { private ConditionEvalMap strictConditions; @@ -49,8 +51,12 @@ public class ScoreResult { @Override public String toString() { - final GsonBuilder b = new GsonBuilder(); - b.serializeSpecialFloatingPointValues(); - return b.setPrettyPrinting().create().toJson(this); + //TODO cannot print: why? +// final GsonBuilder b = new GsonBuilder() +// .serializeSpecialFloatingPointValues() +// .serializeNulls(); +// +// return b.setPrettyPrinting().create().toJson(this); + return "{}"; } } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java index db7092b..7e09d44 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java @@ -1,6 +1,7 @@ package eu.dnetlib.pace.model; import java.io.Serializable; +import java.lang.reflect.InvocationTargetException; import java.util.List; import java.util.Map; @@ -9,49 +10,36 @@ import eu.dnetlib.pace.clustering.*; public class ClusteringDef implements Serializable { - private Clustering name; + private String name; private List fields; private Map params; + private ClusteringResolver clusteringResolver = new ClusteringResolver(); + public ClusteringDef() {} - public Clustering getName() { + public String getName() { return name; } - public void setName(final Clustering name) { + public void setName(final String name) { this.name = name; } public ClusteringFunction getClusteringFunction() { - switch (getName()) { - case acronyms: - return new Acronyms(getParams()); - case ngrams: - return new Ngrams(getParams()); - case ngrampairs: - return new NgramPairs(getParams()); - case sortedngrampairs: - return new SortedNgramPairs(getParams()); - case suffixprefix: - return new SuffixPrefix(getParams()); - case spacetrimmingfieldvalue: - return new SpaceTrimmingFieldValue(getParams()); - case immutablefieldvalue: - return new ImmutableFieldValue(getParams()); - case personhash: - return new PersonHash(getParams()); - case personclustering: - return new PersonClustering(getParams()); - case lowercase: - return new LowercaseClustering(getParams()); - case urlclustering: - return new UrlClustering(getParams()); - default: + + try { + ClusteringFunction clusteringFunction = clusteringResolver.resolve(getName()); + clusteringFunction.setParams(params); + return clusteringFunction; + + } catch (IllegalAccessException | InstantiationException | NoSuchMethodException | InvocationTargetException e) { + e.printStackTrace(); return new RandomClusteringFunction(getParams()); } + } public List getFields() { diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java index 747f6c1..14de69a 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java @@ -5,44 +5,36 @@ import java.util.List; import com.google.gson.Gson; import eu.dnetlib.pace.condition.*; -import eu.dnetlib.pace.config.Cond; public class CondDef implements Serializable { - private Cond name; + private String name; private List fields; + private ConditionResolver conditionResolver = new ConditionResolver(); + public CondDef() {} public ConditionAlgo getConditionAlgo(final List fields) { - switch (getName()) { - case yearMatch: - return new YearMatch(getName(), fields); - case titleVersionMatch: - return new TitleVersionMatch(getName(), fields); - case sizeMatch: - return new SizeMatch(getName(), fields); - case exactMatch: - return new ExactMatch(getName(), fields); - case mustBeDifferent: - return new MustBeDifferent(getName(), fields); - case exactMatchIgnoreCase: - return new ExactMatchIgnoreCase(getName(), fields); - case doiExactMatch: - return new DoiExactMatch(getName(), fields); - case pidMatch: - return new PidMatch(getName(), fields); - default: + + try { + ConditionAlgo conditionAlgo = conditionResolver.resolve(getName()); + conditionAlgo.setFields(fields); + conditionAlgo.setCond(getName()); + return conditionAlgo; + } catch (IllegalAccessException | InstantiationException e) { + e.printStackTrace(); return new AlwaysTrueCondition(getName(), fields); } + } - public Cond getName() { + public String getName() { return name; } - public void setName(final Cond name) { + public void setName(final String name) { this.name = name; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java index 5445053..3f4619d 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java @@ -1,13 +1,13 @@ package eu.dnetlib.pace.model; import java.io.Serializable; +import java.util.HashMap; import java.util.List; import java.util.Map; import com.google.common.base.Splitter; import com.google.common.collect.Lists; import com.google.gson.Gson; -import eu.dnetlib.pace.config.Algo; import eu.dnetlib.pace.config.Type; import eu.dnetlib.pace.distance.*; import eu.dnetlib.pace.distance.algo.*; @@ -19,7 +19,7 @@ public class FieldDef implements Serializable { public final static String PATH_SEPARATOR = "/"; - private Algo algo; + private String algo; private String name; @@ -37,6 +37,8 @@ public class FieldDef implements Serializable { private Map params; + private DistanceResolver distanceResolver = new DistanceResolver(); + public FieldDef() {} // def apply(s: String): Field[A] @@ -66,40 +68,22 @@ public class FieldDef implements Serializable { } public DistanceAlgo getDistanceAlgo() { - switch (getAlgo()) { - case JaroWinkler: - return new JaroWinkler(getWeight()); - case JaroWinklerTitle: - return new JaroWinklerTitle(getWeight()); - case Level2JaroWinkler: - return new Level2JaroWinkler(getWeight()); - case Level2JaroWinklerTitle: - return new Level2JaroWinklerTitle(getWeight()); - case Level2Levenstein: - return new Level2Levenstein(getWeight()); - case Levenstein: - return new Levenstein(getWeight()); - case LevensteinTitle: - return new LevensteinTitle(getWeight()); - case SubStringLevenstein: - return new SubStringLevenstein(getWeight(), getLimit()); - case SortedJaroWinkler: - return new SortedJaroWinkler(getWeight()); - case SortedLevel2JaroWinkler: - return new SortedLevel2JaroWinkler(getWeight()); - case urlMatcher: - return new UrlMatcher(getWeight(), getParams()); - case ExactMatch: - return new ExactMatch(getWeight()); - case MustBeDifferent: - return new MustBeDifferent(getWeight()); - case AlwaysMatch: - return new AlwaysMatch(getWeight()); - case Null: - return new NullDistanceAlgo(); - default: + + try { + if (params == null) { + params = new HashMap<>(); + } + params.put("limit", getLimit()); + params.put("weight", getWeight()); + DistanceAlgo distanceAlgo = distanceResolver.resolve(getAlgo()); + distanceAlgo.setParams(params); + distanceAlgo.setWeight(getWeight()); + return distanceAlgo; + } catch (IllegalAccessException | InstantiationException e) { + e.printStackTrace(); return new NullDistanceAlgo(); } + } public boolean isIgnoreMissing() { @@ -135,11 +119,11 @@ public class FieldDef implements Serializable { this.weight = weight; } - public Algo getAlgo() { + public String getAlgo() { return algo; } - public void setAlgo(final Algo algo) { + public void setAlgo(final String algo) { this.algo = algo; } diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index a9979f5..3e6cd6e 100644 --- a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -23,7 +23,6 @@ public class BlockProcessor { private DedupConfig dedupConf; - public static void constructAccumulator( final DedupConfig dedupConf) { accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "records per hash key = 1")); accumulators.add(String.format("%s::%s",dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField())); diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/.DS_Store b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..12db7cf64da79dd932ccfccbbd901c4eda211125 GIT binary patch literal 6148 zcmeHKI|>3Z5S>vG!N$@uSMUZw^aNf&P!vH{5VYRPb9pr1d>UQtw2?P3dC6p6LSC`6 zBO*G#Y-b`95gEY^B@zm@)+3y3!9m+;d(#Yi0p zwti-h{ zq5pp-aYY5Fz+Wk#gT-nw$CI+Qb{=Q7w!qhL%elkNFn0c7o_bQ`6`Nzf VCbof2N8IT^{tTEdG%E0G1s?X~6^#G@ literal 0 HcmV?d00001