From 476c3d7b077bd69bb0ca1fdf1f276da75d62d069 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Tue, 2 Oct 2018 10:37:54 +0200 Subject: [PATCH] added d-net pace core module and ignored target folder --- dnet-pace-core/pom.xml | 70 ++++ .../AbstractClusteringFunction.java | 44 +++ .../eu/dnetlib/pace/clustering/Acronyms.java | 47 +++ .../BlacklistAwareClusteringCombiner.java | 59 ++++ .../dnetlib/pace/clustering/Clustering.java | 5 + .../pace/clustering/ClusteringCombiner.java | 29 ++ .../pace/clustering/ClusteringFunction.java | 15 + .../dnetlib/pace/clustering/FieldFilter.java | 48 +++ .../pace/clustering/ImmutableFieldValue.java | 24 ++ .../pace/clustering/LowercaseClustering.java | 34 ++ .../dnetlib/pace/clustering/NGramUtils.java | 20 ++ .../dnetlib/pace/clustering/NgramPairs.java | 33 ++ .../eu/dnetlib/pace/clustering/Ngrams.java | 43 +++ .../pace/clustering/PersonClustering.java | 69 ++++ .../dnetlib/pace/clustering/PersonHash.java | 30 ++ .../clustering/RandomClusteringFunction.java | 18 + .../pace/clustering/SortedNgramPairs.java | 28 ++ .../clustering/SpaceTrimmingFieldValue.java | 27 ++ .../dnetlib/pace/clustering/SuffixPrefix.java | 38 ++ .../pace/clustering/UrlClustering.java | 46 +++ .../pace/common/AbstractPaceFunctions.java | 167 +++++++++ .../pace/condition/AbstractCondition.java | 52 +++ .../pace/condition/AlwaysTrueCondition.java | 25 ++ .../dnetlib/pace/condition/ConditionAlgo.java | 27 ++ .../dnetlib/pace/condition/DoiExactMatch.java | 27 ++ .../eu/dnetlib/pace/condition/ExactMatch.java | 43 +++ .../pace/condition/ExactMatchIgnoreCase.java | 34 ++ .../pace/condition/MustBeDifferent.java | 56 +++ .../eu/dnetlib/pace/condition/PidMatch.java | 52 +++ .../eu/dnetlib/pace/condition/SizeMatch.java | 56 +++ .../pace/condition/TitleVersionMatch.java | 35 ++ .../eu/dnetlib/pace/condition/YearMatch.java | 59 ++++ .../java/eu/dnetlib/pace/config/Algo.java | 46 +++ .../java/eu/dnetlib/pace/config/Cond.java | 28 ++ 
.../java/eu/dnetlib/pace/config/Config.java | 59 ++++ .../eu/dnetlib/pace/config/DedupConfig.java | 131 +++++++ .../eu/dnetlib/pace/config/PaceConfig.java | 104 ++++++ .../java/eu/dnetlib/pace/config/Type.java | 5 + .../java/eu/dnetlib/pace/config/WfConfig.java | 254 ++++++++++++++ .../pace/distance/AbstractDistance.java | 15 + .../distance/ConfigurableDistanceAlgo.java | 26 ++ .../eu/dnetlib/pace/distance/Distance.java | 9 + .../dnetlib/pace/distance/DistanceAlgo.java | 15 + .../dnetlib/pace/distance/DistanceScorer.java | 101 ++++++ .../pace/distance/PaceDocumentDistance.java | 12 + .../distance/SecondStringDistanceAlgo.java | 100 ++++++ .../pace/distance/algo/AlwaysMatch.java | 31 ++ .../pace/distance/algo/ExactMatch.java | 31 ++ .../pace/distance/algo/JaroWinkler.java | 35 ++ .../pace/distance/algo/JaroWinklerTitle.java | 36 ++ .../pace/distance/algo/Level2JaroWinkler.java | 26 ++ .../distance/algo/Level2JaroWinklerTitle.java | 41 +++ .../pace/distance/algo/Level2Levenstein.java | 26 ++ .../pace/distance/algo/Levenstein.java | 26 ++ .../pace/distance/algo/LevensteinDate.java | 25 ++ .../pace/distance/algo/LevensteinTitle.java | 45 +++ .../pace/distance/algo/MustBeDifferent.java | 31 ++ .../pace/distance/algo/NullDistanceAlgo.java | 22 ++ .../pace/distance/algo/SortedJaroWinkler.java | 52 +++ .../algo/SortedLevel2JaroWinkler.java | 52 +++ .../algo/SortedSecondStringDistanceAlgo.java | 43 +++ .../distance/algo/SubStringLevenstein.java | 90 +++++ .../pace/distance/algo/UrlMatcher.java | 48 +++ .../pace/distance/algo/YearLevenstein.java | 37 ++ .../pace/distance/eval/ConditionEval.java | 57 +++ .../pace/distance/eval/ConditionEvalMap.java | 38 ++ .../pace/distance/eval/DistanceEval.java | 57 +++ .../pace/distance/eval/DistanceEvalMap.java | 32 ++ .../pace/distance/eval/ScoreResult.java | 55 +++ .../eu/dnetlib/pace/model/AbstractField.java | 74 ++++ .../eu/dnetlib/pace/model/ClusteringDef.java | 77 +++++ .../java/eu/dnetlib/pace/model/CondDef.java | 61 ++++ 
.../java/eu/dnetlib/pace/model/Document.java | 39 +++ .../java/eu/dnetlib/pace/model/Field.java | 54 +++ .../java/eu/dnetlib/pace/model/FieldDef.java | 163 +++++++++ .../java/eu/dnetlib/pace/model/FieldList.java | 17 + .../eu/dnetlib/pace/model/FieldListImpl.java | 327 ++++++++++++++++++ .../eu/dnetlib/pace/model/FieldValue.java | 23 ++ .../eu/dnetlib/pace/model/FieldValueImpl.java | 126 +++++++ .../eu/dnetlib/pace/model/MapDocument.java | 146 ++++++++ .../pace/model/MapDocumentComparator.java | 50 +++ .../pace/model/MapDocumentSerializer.java | 101 ++++++ .../java/eu/dnetlib/pace/model/Person.java | 155 +++++++++ .../pace/model/PersonComparatorUtils.java | 118 +++++++ .../eu/dnetlib/pace/model/adaptor/Pid.java | 57 +++ .../pace/model/adaptor/PidOafSerialiser.java | 50 +++ .../java/eu/dnetlib/pace/model/gt/Author.java | 129 +++++++ .../eu/dnetlib/pace/model/gt/AuthorSet.java | 37 ++ .../eu/dnetlib/pace/model/gt/Authors.java | 54 +++ .../eu/dnetlib/pace/model/gt/CoAuthor.java | 50 +++ .../eu/dnetlib/pace/model/gt/CoAuthorSet.java | 36 ++ .../pace/model/gt/CoAuthorSetLite.java | 40 +++ .../eu/dnetlib/pace/model/gt/CoAuthors.java | 78 +++++ .../eu/dnetlib/pace/model/gt/GTAuthor.java | 197 +++++++++++ .../pace/model/gt/GTAuthorOafSerialiser.java | 104 ++++++ .../java/eu/dnetlib/pace/model/gt/Group.java | 44 +++ .../dnetlib/pace/model/gt/InvertedAuthor.java | 41 +++ .../java/eu/dnetlib/pace/model/gt/Match.java | 31 ++ .../java/eu/dnetlib/pace/model/gt/Result.java | 72 ++++ .../dnetlib/pace/model/gt/ScoredResult.java | 26 ++ .../eu/dnetlib/pace/model/gt/Subjects.java | 10 + .../eu/dnetlib/pace/model/gt/SubjectsMap.java | 35 ++ .../java/eu/dnetlib/pace/util/Capitalise.java | 15 + .../dnetlib/pace/util/DotAbbreviations.java | 10 + .../eu/dnetlib/pace/config/dedupConfig.st | 23 ++ .../eu/dnetlib/pace/config/name_particles.txt | 7 + .../dnetlib/pace/config/ngram_blacklist.txt | 0 .../eu/dnetlib/pace/config/stopwords_en.txt | 139 ++++++++ 
.../dnetlib/pace/config/title_blacklist.txt | 14 + .../eu/dnetlib/pace/AbstractPaceTest.java | 32 ++ .../clustering/ClusteringFunctionTest.java | 108 ++++++ .../eu/dnetlib/pace/config/ConfigTest.java | 24 ++ .../pace/distance/DistanceAlgoTest.java | 38 ++ .../PersonComparatorUtilsNGramsTest.java | 126 +++++++ .../PersonComparatorUtilsSimilarityTest.java | 89 +++++ .../eu/dnetlib/pace/model/PersonTest.java | 111 ++++++ .../eu/dnetlib/pace/clustering/gt.author.json | 1 + .../dnetlib/pace/config/result.pace.conf.json | 53 +++ .../dnetlib/pace/config/title_blacklist.txt | 15 + 119 files changed, 6628 insertions(+) create mode 100644 dnet-pace-core/pom.xml create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java create mode 100644 
dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Algo.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Cond.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java create mode 100644 
dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/ConfigurableDistanceAlgo.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinDate.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java create mode 100644 
dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/YearLevenstein.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEvalMap.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEvalMap.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java create mode 100644 
dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/Pid.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/PidOafSerialiser.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Author.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/AuthorSet.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Authors.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthor.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthorSet.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthorSetLite.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthors.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/GTAuthor.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/GTAuthorOafSerialiser.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Group.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/InvertedAuthor.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Match.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Result.java create mode 100644 
dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/ScoredResult.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Subjects.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/SubjectsMap.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/util/Capitalise.java create mode 100644 dnet-pace-core/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java create mode 100644 dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/dedupConfig.st create mode 100644 dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/name_particles.txt create mode 100644 dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/ngram_blacklist.txt create mode 100644 dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_en.txt create mode 100644 dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/title_blacklist.txt create mode 100644 dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java create mode 100644 dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java create mode 100644 dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java create mode 100644 dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java create mode 100644 dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsNGramsTest.java create mode 100644 dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsSimilarityTest.java create mode 100644 dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonTest.java create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/clustering/gt.author.json create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json create mode 100644 dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/title_blacklist.txt diff --git a/dnet-pace-core/pom.xml b/dnet-pace-core/pom.xml new file mode 100644 index 000000000..8e0e0f741 --- /dev/null +++ 
b/dnet-pace-core/pom.xml @@ -0,0 +1,70 @@ + + + + eu.dnetlib + dnet45-parent + 1.0.0 + + + 4.0.0 + eu.dnetlib + dnet-pace-core + jar + 2.6.8-SNAPSHOT + + scm:svn:https://svn.driver.research-infrastructures.eu/driver/dnet45/modules/dnet-pace-core/trunk + + + + edu.cmu + secondstring + 1.0.0 + + + com.google.guava + guava + ${google.guava.version} + + + com.google.code.gson + gson + ${google.gson.version} + + + commons-lang + commons-lang + ${commons.lang.version} + + + commons-io + commons-io + ${commons.io.version} + + + commons-collections + commons-collections + ${commons.collections.version} + + + com.googlecode.protobuf-java-format + protobuf-java-format + 1.2 + + + org.antlr + stringtemplate + 3.2 + + + commons-logging + commons-logging + ${commons.logging.version} + + + junit + junit + ${junit.version} + test + + + diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java new file mode 100644 index 000000000..6f29f22ce --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java @@ -0,0 +1,44 @@ +package eu.dnetlib.pace.clustering; + +import java.util.*; +import java.util.function.Function; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.model.Field; +import org.apache.commons.lang.StringUtils; + +public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction { + + protected Map params; + + public AbstractClusteringFunction(final Map params) { + this.params = params; + } + + protected abstract Collection doApply(String s); + + @Override + public Collection apply(List fields) { + return fields.stream().filter(f -> !f.isEmpty()) + .map(Field::stringValue) + .map(this::normalize) + .map(s -> 
filterStopWords(s, stopwords)) + .map(this::doApply) + .map(c -> filterBlacklisted(c, ngramBlacklist)) + .flatMap(c -> c.stream()) + .filter(StringUtils::isNotBlank) + .collect(Collectors.toCollection(HashSet::new)); + } + + public Map getParams() { + return params; + } + + protected Integer param(String name) { + return params.get(name); + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java new file mode 100644 index 000000000..1897e6a87 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java @@ -0,0 +1,47 @@ +package eu.dnetlib.pace.clustering; + +import java.util.Collection; +import java.util.Map; +import java.util.Set; +import java.util.StringTokenizer; + +import com.google.common.collect.Sets; + +public class Acronyms extends AbstractClusteringFunction { + + public Acronyms(Map params) { + super(params); + } + + @Override + protected Collection doApply(String s) { + return extractAcronyms(s, param("max"), param("minLen"), param("maxLen")); + } + + private Set extractAcronyms(final String s, int maxAcronyms, int minLen, int maxLen) { + + final Set acronyms = Sets.newLinkedHashSet(); + + for (int i = 0; i < maxAcronyms; i++) { + + final StringTokenizer st = new StringTokenizer(s); + final StringBuilder sb = new StringBuilder(); + + while (st.hasMoreTokens()) { + final String token = st.nextToken(); + if (sb.length() > maxLen) { + break; + } + if (token.length() > 1 && i < token.length()) { + sb.append(token.charAt(i)); + } + } + String acronym = sb.toString(); + if (acronym.length() > minLen) { + acronyms.add(acronym); + } + } + return acronyms; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java new file mode 100644 index 000000000..b007853a9 --- /dev/null +++ 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/BlacklistAwareClusteringCombiner.java @@ -0,0 +1,59 @@ +package eu.dnetlib.pace.clustering; + +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.Document; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldListImpl; +import eu.dnetlib.pace.model.MapDocument; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class BlacklistAwareClusteringCombiner extends ClusteringCombiner { + + private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombiner.class); + + public static Collection filterAndCombine(final MapDocument a, final Config conf, final Map> blacklists) { + + final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, blacklists); + return combine(filtered, conf); + } + + private MapDocument filter(final MapDocument a, final Map> blacklists) { + final Map filtered = Maps.newHashMap(a.getFieldMap()); + if (blacklists != null) { + for (final Entry e : filtered.entrySet()) { + + final FieldListImpl fl = new FieldListImpl(); + fl.addAll(Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists)))); + filtered.put(e.getKey(), fl); + } + } + return new MapDocument(a.getIdentifier(), filtered); + } + + /** + * Tries to match the fields in the regex blacklist. 
+ * + * @param fieldName + * @param value + * @return true if the field matches, false otherwise + */ + protected boolean regexMatches(final String fieldName, final String value, final Map> blacklists) { + if (blacklists.containsKey(fieldName)) { + for (final String regex : blacklists.get(fieldName)) { + if (value.matches(regex)) return true; + } + } + return false; + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java new file mode 100644 index 000000000..72575409f --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Clustering.java @@ -0,0 +1,5 @@ +package eu.dnetlib.pace.clustering; + +public enum Clustering { + acronyms, ngrams, ngrampairs, sortedngrampairs, suffixprefix, spacetrimmingfieldvalue, immutablefieldvalue, personhash, personclustering, lowercase, urlclustering +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java new file mode 100644 index 000000000..d6887050a --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringCombiner.java @@ -0,0 +1,29 @@ +package eu.dnetlib.pace.clustering; + +import java.util.Collection; +import java.util.List; + +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.model.ClusteringDef; +import eu.dnetlib.pace.model.Document; +import eu.dnetlib.pace.model.Field; + +public class ClusteringCombiner { + + public static Collection combine(final Document a, final Config conf) { + return new ClusteringCombiner().doCombine(a, conf.clusterings()); + } + + private Collection doCombine(final Document a, final List defs) { + final Collection res = Sets.newLinkedHashSet(); + for (final ClusteringDef cd : defs) { + for (final String fieldName : cd.getFields()) { + final Field values = 
a.values(fieldName); + res.addAll(cd.getClusteringFunction().apply((List) values)); + } + } + return res; + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java new file mode 100644 index 000000000..4fe1b596e --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java @@ -0,0 +1,15 @@ +package eu.dnetlib.pace.clustering; + +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import eu.dnetlib.pace.model.Field; + +public interface ClusteringFunction { + + public Collection apply(List fields); + + public Map getParams(); + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java new file mode 100644 index 000000000..7ede4c239 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/FieldFilter.java @@ -0,0 +1,48 @@ +package eu.dnetlib.pace.clustering; + +import java.util.List; +import java.util.Map; + +import com.google.common.base.Predicate; + +import eu.dnetlib.pace.model.Field; +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class FieldFilter implements Predicate { + + private static final Log log = LogFactory.getLog(FieldFilter.class); + + private Map> blacklists; + + private String filedName; + + public FieldFilter(final String fieldName, final Map> blacklists) { + this.filedName = fieldName; + this.blacklists = blacklists; + } + + @Override + public boolean apply(final Field f) { + return !regexMatches(filedName, f.stringValue(), blacklists); + } + + /** + * Tries to match the fields in the regex blacklist. 
+ * + * @param fieldName + * @param value + * @return true if the field matches, false otherwise + */ + protected boolean regexMatches(final String fieldName, final String value, final Map> blacklists) { + if (blacklists.containsKey(fieldName)) { + final Iterable regexes = blacklists.get(fieldName); + for (final String regex : regexes) { + if (StringUtils.isBlank(regex)) return false; + if (value.matches(regex)) return true; + } + } + return false; + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java new file mode 100644 index 000000000..988476ddd --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java @@ -0,0 +1,24 @@ +package eu.dnetlib.pace.clustering; + +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import com.google.common.collect.Lists; + +public class ImmutableFieldValue extends AbstractClusteringFunction { + + public ImmutableFieldValue(final Map params) { + super(params); + } + + @Override + protected Collection doApply(final String s) { + final List res = Lists.newArrayList(); + + res.add(s); + + return res; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java new file mode 100644 index 000000000..6d00992bd --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java @@ -0,0 +1,34 @@ +package eu.dnetlib.pace.clustering; + +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import eu.dnetlib.pace.model.Field; +import org.apache.commons.lang.StringUtils; + +public class LowercaseClustering extends AbstractClusteringFunction { + + public LowercaseClustering(final Map params) { 
+ super(params); + } + + @Override + public Collection apply(List fields) { + Collection c = Sets.newLinkedHashSet(); + for(Field f : fields) { + c.addAll(doApply(f.stringValue())); + } + return c; + } + + @Override + protected Collection doApply(final String s) { + if(StringUtils.isBlank(s)) { + return Lists.newArrayList(); + } + return Lists.newArrayList(s.toLowerCase().trim()); + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java new file mode 100644 index 000000000..aeb790f76 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NGramUtils.java @@ -0,0 +1,20 @@ +package eu.dnetlib.pace.clustering; + +import java.util.Set; + +import org.apache.commons.lang.StringUtils; + +import eu.dnetlib.pace.common.AbstractPaceFunctions; + +public class NGramUtils extends AbstractPaceFunctions { + + private static final int SIZE = 100; + + private static Set stopwords = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); + + public static String cleanupForOrdering(String s) { + NGramUtils utils = new NGramUtils(); + return (utils.filterStopWords(utils.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE)).substring(0, SIZE).replaceAll(" ", ""); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java new file mode 100644 index 000000000..3cffa4d54 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java @@ -0,0 +1,33 @@ +package eu.dnetlib.pace.clustering; + +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import com.google.common.collect.Lists; + +public class NgramPairs extends Ngrams { + + public NgramPairs(Map params) { + super(params); + } + + @Override + protected Collection doApply(String s) { + return 
ngramPairs(Lists.newArrayList(getNgrams(s, param("ngramLen"), param("max") * 2, 1, 2)), param("max")); + } + + protected Collection ngramPairs(final List ngrams, int maxNgrams) { + Collection res = Lists.newArrayList(); + int j = 0; + for (int i = 0; i < ngrams.size() && res.size() < maxNgrams; i++) { + if (++j >= ngrams.size()) { + break; + } + res.add(ngrams.get(i) + ngrams.get(j)); + //System.out.println("-- " + concatNgrams); + } + return res; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java new file mode 100644 index 000000000..aaba9afbf --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java @@ -0,0 +1,43 @@ +package eu.dnetlib.pace.clustering; + +import java.util.Collection; +import java.util.LinkedHashSet; +import java.util.Map; +import java.util.StringTokenizer; + +public class Ngrams extends AbstractClusteringFunction { + + public Ngrams(Map params) { + super(params); + } + + @Override + protected Collection doApply(String s) { + return getNgrams(s, param("ngramLen"), param("max"), param("maxPerToken"), param("minNgramLen")); + } + + protected Collection getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) { + + final Collection ngrams = new LinkedHashSet(); + final StringTokenizer st = new StringTokenizer(s); + + while (st.hasMoreTokens()) { + final String token = st.nextToken(); + if (!token.isEmpty()) { + + for (int i = 0; i < maxPerToken && ngramLen + i <= token.length(); i++) { + String ngram = (token + " ").substring(i, ngramLen + i).trim(); + if (ngrams.size() >= max) { + return ngrams; + } + if (ngram.length() >= minNgramLen) { + ngrams.add(ngram); + } + } + } + } + //System.out.println(ngrams + " n: " + ngrams.size()); + return ngrams; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java new file mode 100644 index 000000000..d71707721 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java @@ -0,0 +1,69 @@ +package eu.dnetlib.pace.clustering; + +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.model.FieldValue; +import org.apache.commons.lang.StringUtils; + +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.gt.Author; +import eu.dnetlib.pace.model.gt.GTAuthor; + +public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction { + + private Map params; + + private static final int MAX_TOKENS = 5; + + public PersonClustering(final Map params) { + this.params = params; + } + + @Override + public Collection apply(final List fields) { + final Set hashes = Sets.newHashSet(); + + for (final Field f : fields) { + + final GTAuthor gta = GTAuthor.fromOafJson(f.stringValue()); + + final Author a = gta.getAuthor(); + if (a.isWellFormed()) { + hashes.add(firstLC(a.getFirstname()) + a.getSecondnames().toLowerCase()); + } else { + for (final String token1 : tokens(a.getFullname())) { + for (final String token2 : tokens(a.getFullname())) { + if (!token1.equals(token2)) { + hashes.add(firstLC(token1) + token2); + } + } + } + } + } + + return hashes; + } + + private String firstLC(final String s) { + return StringUtils.substring(s, 0, 1).toLowerCase(); + } + + private Iterable tokens(final String s) { + return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), MAX_TOKENS); + } + + @Override + public Map getParams() { + return params; + } + +} diff --git 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java new file mode 100644 index 000000000..42d9d5bab --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java @@ -0,0 +1,30 @@ +package eu.dnetlib.pace.clustering; + +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import com.google.common.collect.Lists; + +import eu.dnetlib.pace.model.Person; + +public class PersonHash extends AbstractClusteringFunction { + + private boolean DEFAULT_AGGRESSIVE = false; + + public PersonHash(final Map params) { + super(params); + } + + @Override + protected Collection doApply(final String s) { + final List res = Lists.newArrayList(); + + final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE); + + res.add(new Person(s, aggressive).hash()); + + return res; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java new file mode 100644 index 000000000..f012aacab --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java @@ -0,0 +1,18 @@ +package eu.dnetlib.pace.clustering; + +import java.util.Collection; +import java.util.Map; + +public class RandomClusteringFunction extends AbstractClusteringFunction { + + public RandomClusteringFunction(Map params) { + super(params); + } + + @Override + protected Collection doApply(String s) { + // TODO Auto-generated method stub + return null; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java new file mode 100644 index 000000000..56e660438 --- /dev/null +++ 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java @@ -0,0 +1,28 @@ +package eu.dnetlib.pace.clustering; + +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; + +public class SortedNgramPairs extends NgramPairs { + + public SortedNgramPairs(Map params) { + super(params); + } + + @Override + protected Collection doApply(String s) { + + final List tokens = Lists.newArrayList(Splitter.on(" ").omitEmptyStrings().trimResults().split(s)); + + Collections.sort(tokens); + + return ngramPairs(Lists.newArrayList(getNgrams(Joiner.on(" ").join(tokens), param("ngramLen"), param("max") * 2, 1, 2)), param("max")); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java new file mode 100644 index 000000000..19a51d4ca --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java @@ -0,0 +1,27 @@ +package eu.dnetlib.pace.clustering; + +import java.util.Collection; +import java.util.List; +import java.util.Map; + +import org.apache.commons.lang.RandomStringUtils; +import org.apache.commons.lang.StringUtils; + +import com.google.common.collect.Lists; + +public class SpaceTrimmingFieldValue extends AbstractClusteringFunction { + + public SpaceTrimmingFieldValue(final Map params) { + super(params); + } + + @Override + protected Collection doApply(final String s) { + final List res = Lists.newArrayList(); + + res.add(StringUtils.isBlank(s) ? 
RandomStringUtils.random(getParams().get("randomLength")) : s.toLowerCase().replaceAll("\\s+", "")); + + return res; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java new file mode 100644 index 000000000..3ed336af4 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java @@ -0,0 +1,38 @@ +package eu.dnetlib.pace.clustering; + +import java.util.Collection; +import java.util.Map; +import java.util.Set; + +import com.google.common.collect.Sets; + +public class SuffixPrefix extends AbstractClusteringFunction { + + public SuffixPrefix(Map params) { + super(params); + } + + @Override + protected Collection doApply(String s) { + return suffixPrefix(s, param("len"), param("max")); + } + + private Collection suffixPrefix(String s, int len, int max) { + final Set bigrams = Sets.newLinkedHashSet(); + int i = 0; + while (++i < s.length() && bigrams.size() < max) { + int j = s.indexOf(" ", i); + + int offset = j + len + 1 < s.length() ? 
j + len + 1 : s.length(); + + if (j - len > 0) { + String bigram = s.substring(j - len, offset).replaceAll(" ", "").trim(); + if (bigram.length() >= 4) { + bigrams.add(bigram); + } + } + } + return bigrams; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java new file mode 100644 index 000000000..196281444 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java @@ -0,0 +1,46 @@ +package eu.dnetlib.pace.clustering; + +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.model.Field; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction { + + protected Map params; + + public UrlClustering(final Map params) { + this.params = params; + } + + @Override + public Collection apply(List fields) { + return fields.stream() + .filter(f -> !f.isEmpty()) + .map(Field::stringValue) + .map(this::asUrl) + .map(URL::getHost) + .collect(Collectors.toCollection(HashSet::new)); + } + + @Override + public Map getParams() { + return null; + } + + private URL asUrl(final String value) { + try { + return new URL(value); + } catch (MalformedURLException e) { + // should not happen as checked by pace typing + throw new IllegalStateException("invalid URL: " + value); + } + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java new file mode 100644 index 000000000..9174bed19 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -0,0 +1,167 @@ +package eu.dnetlib.pace.common; + +import java.text.Normalizer; 
+import java.util.Collection; +import java.util.List; +import java.util.Set; +import java.util.StringTokenizer; + +import org.apache.commons.io.IOUtils; +import org.apache.commons.lang.StringUtils; + +import com.google.common.base.Joiner; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.clustering.NGramUtils; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.model.FieldListImpl; + +/** + * Set of common functions + * + * @author claudio + * + */ +public abstract class AbstractPaceFunctions { + + protected static Set stopwords = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); + + protected static Set ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt"); + + private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "; + private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎"; + private static final String aliases_to = "0123456789+-=()n0123456789+-=()"; + + protected final static FieldList EMPTY_FIELD = new FieldListImpl(); + + protected String concat(final List l) { + return Joiner.on(" ").skipNulls().join(l); + } + + protected String cleanup(final String s) { + final String s1 = nfd(s); + final String s2 = fixAliases(s1); + final String s3 = s2.replaceAll("–", " "); + final String s4 = s3.replaceAll("&", " "); + final String s5 = s4.replaceAll(""", " "); + final String s6 = s5.replaceAll("−", " "); + final String s7 = s6.replaceAll("([0-9]+)", " $1 "); + final String s8 = s7.replaceAll("[^\\p{ASCII}]|\\p{Punct}", " "); + final String s9 = s8.replaceAll("\\n", " "); + final String s10 = s9.replaceAll("(?m)\\s+", " "); + final String s11 = s10.trim(); + return s11; + } + + protected String finalCleanup(final String s) { + return s.toLowerCase(); + } + + protected boolean checkNumbers(final String a, final 
String b) { + final String numbersA = getNumbers(a); + final String numbersB = getNumbers(b); + final String romansA = getRomans(a); + final String romansB = getRomans(b); + return !numbersA.equals(numbersB) || !romansA.equals(romansB); + } + + protected String getRomans(final String s) { + final StringBuilder sb = new StringBuilder(); + for (final String t : s.split(" ")) { + sb.append(isRoman(t) ? t : ""); + } + return sb.toString(); + } + + protected boolean isRoman(final String s) { + return s.replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop").equals("qwertyuiop"); + } + + protected String getNumbers(final String s) { + return s.replaceAll("\\D", ""); + } + + protected String fixAliases(final String s) { + final StringBuilder sb = new StringBuilder(); + for (final char ch : Lists.charactersOf(s)) { + final int i = StringUtils.indexOf(aliases_from, ch); + sb.append(i >= 0 ? aliases_to.charAt(i) : ch); + } + return sb.toString(); + } + + protected String removeSymbols(final String s) { + final StringBuilder sb = new StringBuilder(); + + for (final char ch : Lists.charactersOf(s)) { + sb.append(StringUtils.contains(alpha, ch) ? ch : " "); + } + return sb.toString().replaceAll("\\s+", " "); + } + + protected String getFirstValue(final Field values) { + return (values != null) && !Iterables.isEmpty(values) ? 
Iterables.getFirst(values, EMPTY_FIELD).stringValue() : ""; + } + + protected boolean notNull(final String s) { + return s != null; + } + + // /////////////////////// + + protected String normalize(final String s) { + return nfd(s).toLowerCase() + // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings + .replaceAll("(\\W)+", " ") + .replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ") + .replaceAll("(\\p{Punct})+", " ") + .replaceAll("(\\d)+", " ") + .replaceAll("(\\n)+", " ") + .trim(); + } + + private String nfd(final String s) { + return Normalizer.normalize(s, Normalizer.Form.NFD); + } + + protected String filterStopWords(final String s, final Set stopwords) { + final StringTokenizer st = new StringTokenizer(s); + final StringBuilder sb = new StringBuilder(); + while (st.hasMoreTokens()) { + final String token = st.nextToken(); + if (!stopwords.contains(token)) { + sb.append(token); + sb.append(" "); + } + } + return sb.toString().trim(); + } + + protected Collection filterBlacklisted(final Collection set, final Set ngramBlacklist) { + final Set newset = Sets.newLinkedHashSet(); + for (final String s : set) { + if (!ngramBlacklist.contains(s)) { + newset.add(s); + } + } + return newset; + } + + // //////////////////// + + public static Set loadFromClasspath(final String classpath) { + final Set h = Sets.newHashSet(); + try { + for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) { + h.add(s); + } + } catch (final Throwable e) { + return Sets.newHashSet(); + } + return h; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java new file mode 100644 index 000000000..bbfac97b9 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AbstractCondition.java @@ -0,0 +1,52 @@ +package eu.dnetlib.pace.condition; + +import 
java.util.List; +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Cond; +import eu.dnetlib.pace.distance.eval.ConditionEval; +import eu.dnetlib.pace.distance.eval.ConditionEvalMap; +import eu.dnetlib.pace.model.Document; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldDef; + +/** + * Abstract conditions needs a list of field names. + * + * @author claudio + * + */ +public abstract class AbstractCondition extends AbstractPaceFunctions implements ConditionAlgo { + + protected Cond cond; + + protected List fields; + + public AbstractCondition(final Cond cond, final List fields) { + this.cond = cond; + this.fields = fields; + } + + protected abstract ConditionEval verify(FieldDef fd, Field a, Field b); + + @Override + public ConditionEvalMap verify(final Document a, final Document b) { + final ConditionEvalMap res = new ConditionEvalMap(); + for (final FieldDef fd : getFields()) { + + final Field va = a.values(fd.getName()); + final Field vb = b.values(fd.getName()); + + if ((va.isEmpty() || vb.isEmpty()) && fd.isIgnoreMissing()) { + res.put(fd.getName(), new ConditionEval(cond, va, vb, 0)); + } else { + res.put(fd.getName(), verify(fd, va, vb)); + } + } + return res; + } + + public List getFields() { + return fields; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java new file mode 100644 index 000000000..f9ff2b60b --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/AlwaysTrueCondition.java @@ -0,0 +1,25 @@ +package eu.dnetlib.pace.condition; + +import java.util.List; +import eu.dnetlib.pace.config.Cond; +import eu.dnetlib.pace.distance.eval.ConditionEval; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldDef; + +/** + * Default always true condition + * + * @author claudio + */ +public class AlwaysTrueCondition extends 
AbstractCondition { + + public AlwaysTrueCondition(final Cond cond, final List fields) { + super(cond, fields); + } + + @Override + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + return new ConditionEval(cond, a, b, 1); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java new file mode 100644 index 000000000..ceb7c73cc --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ConditionAlgo.java @@ -0,0 +1,27 @@ +package eu.dnetlib.pace.condition; + +import java.util.Map; + +import eu.dnetlib.pace.distance.eval.ConditionEvalMap; +import eu.dnetlib.pace.model.Document; + +/** + * Allows to express general conditions to be satisfied or not between two Documents. + * + * @author claudio + */ +public interface ConditionAlgo { + + /** + * Verify a condition. + * + * @param a + * the Document a + * @param b + * the Document b + * @return 0 when condition cannot be verified (ignoremissing = true). Positive int when the condition is verified. Negative int when + * the condition is not verified. + */ + public abstract ConditionEvalMap verify(Document a, Document b); + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java new file mode 100644 index 000000000..25b1a01cd --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/DoiExactMatch.java @@ -0,0 +1,27 @@ +package eu.dnetlib.pace.condition; + +import java.util.List; + +import eu.dnetlib.pace.config.Cond; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldDef; + +/** + * The Class ExactMatch. 
+ * + * @author claudio + */ +public class DoiExactMatch extends ExactMatchIgnoreCase { + + public final String PREFIX = "(http:\\/\\/dx\\.doi\\.org\\/)|(doi:)"; + + public DoiExactMatch(final Cond cond, final List fields) { + super(cond, fields); + } + + @Override + protected String getValue(final Field f) { + return super.getValue(f).replaceAll(PREFIX, ""); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java new file mode 100644 index 000000000..4f0f37188 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatch.java @@ -0,0 +1,43 @@ +package eu.dnetlib.pace.condition; + +import java.util.List; + +import eu.dnetlib.pace.config.Cond; +import eu.dnetlib.pace.distance.eval.ConditionEval; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldDef; +import org.apache.commons.lang.StringUtils; + +/** + * The Class ExactMatch. + * + * @author claudio + */ +public class ExactMatch extends AbstractCondition { + + public ExactMatch(final Cond cond, final List fields) { + super(cond, fields); + } + + @Override + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + + final String fa = getValue(a); + final String fb = getValue(b); + + int res; + + if (StringUtils.isBlank(fa) && StringUtils.isBlank(fb)) { + res = 0; + } else { + res = fa.equals(fb) ? 
1 : -1; + } + + return new ConditionEval(cond, a, b, res); + } + + protected String getValue(final Field f) { + return getFirstValue(f); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java new file mode 100644 index 000000000..8baad5b24 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/ExactMatchIgnoreCase.java @@ -0,0 +1,34 @@ +package eu.dnetlib.pace.condition; + +import java.util.List; + +import eu.dnetlib.pace.config.Cond; +import eu.dnetlib.pace.distance.eval.ConditionEval; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldDef; + +/** + * The Class ExactMatch. + * + * @author claudio + */ +public class ExactMatchIgnoreCase extends AbstractCondition { + + public ExactMatchIgnoreCase(final Cond cond, final List fields) { + super(cond, fields); + } + + @Override + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + + final String fa = getValue(a); + final String fb = getValue(b); + + return new ConditionEval(cond, a, b, fa.equalsIgnoreCase(fb) ? 1 : -1); + } + + protected String getValue(final Field f) { + return getFirstValue(f); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java new file mode 100644 index 000000000..bc99a4cc5 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/MustBeDifferent.java @@ -0,0 +1,56 @@ +package eu.dnetlib.pace.condition; + +import java.util.List; + +import com.google.common.collect.Iterables; +import eu.dnetlib.pace.config.Cond; +import eu.dnetlib.pace.distance.eval.ConditionEval; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldDef; + +/** + * Returns true if the field values are different. 
+ * + * @author claudio + */ +public class MustBeDifferent extends AbstractCondition { + + /** + * Instantiates a new size match. + * + * @param fields the fields + */ + public MustBeDifferent(final Cond cond, final List fields) { + super(cond, fields); + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.condition.AbstractCondition#verify(eu.dnetlib.pace.model.FieldDef, java.util.List, java.util.List) + */ + @Override + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + + final String fa = getValue(a); + final String fb = getValue(b); + + return new ConditionEval(cond, a, b, fa.equals(fb) ? -1 : 1); + + } + + protected String getValue(final Field f) { + return getFirstValue(f); + } + + /** + * Checks if is empty. + * + * @param a the a + * @return true, if is empty + */ + protected boolean isEmpty(final Iterable a) { + return (a == null) || Iterables.isEmpty(a); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java new file mode 100644 index 000000000..4f9e0423d --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/PidMatch.java @@ -0,0 +1,52 @@ +package eu.dnetlib.pace.condition; + +import java.util.List; + +import eu.dnetlib.pace.config.Cond; +import eu.dnetlib.pace.distance.eval.ConditionEval; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldDef; +import eu.dnetlib.pace.model.FieldList; +import eu.dnetlib.pace.model.adaptor.Pid; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * The Class PidMatch. 
+ * + * @author claudio + */ +public class PidMatch extends AbstractCondition { + + private static final Log log = LogFactory.getLog(PidMatch.class); + + public PidMatch(final Cond cond, final List fields) { + super(cond, fields); + } + + @Override + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + + final List sa = ((FieldList) a).stringList(); + final List sb = ((FieldList) b).stringList(); + + final List pal = Pid.fromOafJson(sa); + final List pbl = Pid.fromOafJson(sb); + + int result = 0; + for(Pid pa : pal) { + final String ta = pa.getType(); + + for(Pid pb : pbl) { + final String tb = pb.getType(); + + if (tb.equalsIgnoreCase(ta)) { + result += pa.getValue().equalsIgnoreCase(pb.getValue()) ? 1 : -1; + } + } + } + + return new ConditionEval(cond, a, b, result); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java new file mode 100644 index 000000000..ae6e94037 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/SizeMatch.java @@ -0,0 +1,56 @@ +package eu.dnetlib.pace.condition; + +import java.util.List; + +import com.google.common.collect.Iterables; + +import eu.dnetlib.pace.config.Cond; +import eu.dnetlib.pace.distance.eval.ConditionEval; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldDef; + +/** + * Returns true if the number of values in the fields is the same. + * + * @author claudio + */ +public class SizeMatch extends AbstractCondition { + + /** + * Instantiates a new size match. 
+ * + * @param fields + * the fields + */ + public SizeMatch(final Cond cond, final List fields) { + super(cond, fields); + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.condition.AbstractCondition#verify(eu.dnetlib.pace.model.FieldDef, java.util.List, java.util.List) + */ + @Override + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + + // if (a.isEmpty() & b.isEmpty()) return 1; + // + // if (a.isEmpty()) return -1; + // if (b.isEmpty()) return -1; + + return new ConditionEval(cond, a, b, Iterables.size(a) == Iterables.size(b) ? 1 : -1); + } + + /** + * Checks if is empty. + * + * @param a + * the a + * @return true, if is empty + */ + protected boolean isEmpty(final Iterable a) { + return (a == null) || Iterables.isEmpty(a); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java new file mode 100644 index 000000000..41a617aa5 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/TitleVersionMatch.java @@ -0,0 +1,35 @@ +package eu.dnetlib.pace.condition; + +import java.util.List; + +import eu.dnetlib.pace.config.Cond; +import eu.dnetlib.pace.distance.eval.ConditionEval; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldDef; + +/** + * Returns true if the titles in the given documents contains the same numbers, false otherwise. + * + * @author claudio + * + */ +public class TitleVersionMatch extends AbstractCondition { + + public TitleVersionMatch(final Cond cond, final List fields) { + super(cond, fields); + } + + @Override + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + final String valueA = getFirstValue(a); + final String valueB = getFirstValue(b); + + return new ConditionEval(cond, a, b, notNull(valueA) && notNull(valueB) && !checkNumbers(valueA, valueB) ? 
1 : -1); + } + + @Override + public String toString() { + return getClass().getSimpleName() + ":" + super.toString(); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java new file mode 100644 index 000000000..89718426c --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/condition/YearMatch.java @@ -0,0 +1,59 @@ +package eu.dnetlib.pace.condition; + +import java.util.List; + +import eu.dnetlib.pace.config.Cond; +import eu.dnetlib.pace.distance.eval.ConditionEval; +import org.apache.commons.lang.StringUtils; + +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldDef; + +/** + * Returns true if the year of the date field in the given documents are the same, false when any of the two is invalid or it's missing. + * + * @author claudio + */ +public class YearMatch extends AbstractCondition { + + private int limit = 4; + + public YearMatch(final Cond cond, final List fields) { + super(cond, fields); + } + + // @Override + // public boolean verify(final Document a, final Document b) { + // boolean res = true; + // for (FieldDef fd : getFields()) { + // + // } + // + // return res; + // } + + @Override + protected ConditionEval verify(final FieldDef fd, final Field a, final Field b) { + final String valueA = getNumbers(getFirstValue(a)); + final String valueB = getNumbers(getFirstValue(b)); + + final boolean lengthMatch = checkLength(valueA) && checkLength(valueB); + final boolean onemissing = valueA.isEmpty() || valueB.isEmpty(); + + return new ConditionEval(cond, a, b, lengthMatch && valueA.equals(valueB) || onemissing ? 1 : -1); + } + + protected boolean checkLength(final String s) { + return s.length() == limit; + } + + protected String getFirstValue(final Field value) { + return (value != null) && !value.isEmpty() ? 
/**
 * Names of the available distance algorithms.
 * <p>
 * Constants are resolved by name and keep their declaration order.
 */
public enum Algo {

    // --- Jaro-Winkler / Levenstein families ---

    /** Jaro-Winkler. */
    JaroWinkler,

    /** Jaro-Winkler, title flavour. */
    JaroWinklerTitle,

    /** Levenstein. */
    Levenstein,

    /** Levenstein distance for title matching. */
    LevensteinTitle,

    /** Level2 Jaro-Winkler. */
    Level2JaroWinkler,

    /** Level2 Jaro-Winkler for title matching. */
    Level2JaroWinklerTitle,

    /** Level2 Levenstein. */
    Level2Levenstein,

    /** Sub-string Levenstein. */
    SubStringLevenstein,

    /** Year Levenstein. */
    YearLevenstein,

    /** Sorted Jaro-Winkler. */
    SortedJaroWinkler,

    /** Sorted Level2 Jaro-Winkler. */
    SortedLevel2JaroWinkler,

    // --- matchers ---

    /** Compares two urls. */
    urlMatcher,

    /** Exact match algo. */
    ExactMatch,

    /** Returns 0 for equal strings, 1 for different strings. */
    MustBeDifferent,

    /** Always return 1.0 as distance. */
    AlwaysMatch,

    // --- person distances ---

    /** Person distance based on co-author surnames. */
    PersonCoAuthorSurnamesDistance,

    /** Person distance based on co-anchors. */
    PersonCoAnchorsDistance,

    /** Person distance. */
    PersonDistance,

    /** The Null algo. */
    Null
}
*/ + titleVersionMatch, + /** The size match. */ + sizeMatch, + /** + * Returns true if the field values are different + */ + mustBeDifferent, + /** The Exact match. */ + exactMatch, + /** + * The Exact match ignore case. + */ + exactMatchIgnoreCase, + /** The Exact match specialized to recognize DOI values. */ + doiExactMatch, + /** The Exact match that checks if pid type and value are the same */ + pidMatch +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java new file mode 100644 index 000000000..7498c23cf --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Config.java @@ -0,0 +1,59 @@ +package eu.dnetlib.pace.config; + +import java.util.List; +import java.util.Map; + +import eu.dnetlib.pace.condition.ConditionAlgo; +import eu.dnetlib.pace.model.ClusteringDef; +import eu.dnetlib.pace.model.FieldDef; + +/** + * Interface for PACE configuration bean. + * + * @author claudio + */ +public interface Config { + + /** + * Field configuration definitions. + * + * @return the list of definitions + */ + public List model(); + + /** + * Field configuration definitions. + * + * @return the list of definitions + */ + public Map modelMap(); + + /** + * Strict Pre-Condition definitions. + * + * @return the list of conditions + */ + public List strictConditions(); + + /** + * Pre-Condition definitions. + * + * @return the list of conditions + */ + public List conditions(); + + /** + * Clusterings. + * + * @return the list + */ + public List clusterings(); + + /** + * Blacklists. 
+ * + * @return the map + */ + public Map> blacklists(); + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java new file mode 100644 index 000000000..5116f3684 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/DedupConfig.java @@ -0,0 +1,131 @@ +package eu.dnetlib.pace.config; + +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; + +import org.antlr.stringtemplate.StringTemplate; +import org.apache.commons.io.IOUtils; + +import com.google.common.collect.Maps; +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; + +import eu.dnetlib.pace.condition.ConditionAlgo; +import eu.dnetlib.pace.model.ClusteringDef; +import eu.dnetlib.pace.model.FieldDef; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class DedupConfig implements Config { + + private static final Log log = LogFactory.getLog(DedupConfig.class); + + private static String CONFIG_TEMPLATE = "dedupConfig.st"; + + private PaceConfig pace; + + private WfConfig wf; + + private static Map defaults = Maps.newHashMap(); + + static { + defaults.put("threshold", "0"); + defaults.put("run", "001"); + defaults.put("entityType", "result"); + defaults.put("orderField", "title"); + defaults.put("queueMaxSize", "2000"); + defaults.put("groupMaxSize", "10"); + defaults.put("slidingWindowSize", "200"); + defaults.put("rootBuilder", "result"); + defaults.put("includeChildren", "true"); + } + + public DedupConfig() {} + + public static DedupConfig load(final String json) { + + final DedupConfig config = new Gson().fromJson(json, DedupConfig.class); + + config.getPace().initModel(); + + return config; + } + + public static DedupConfig loadDefault() throws IOException { + return loadDefault(new HashMap()); + } + + public static DedupConfig loadDefault(final Map params) 
throws IOException { + + final StringTemplate template = new StringTemplate(new DedupConfig().readFromClasspath(CONFIG_TEMPLATE)); + + for (final Entry e : defaults.entrySet()) { + template.setAttribute(e.getKey(), e.getValue()); + } + for (final Entry e : params.entrySet()) { + template.setAttribute(e.getKey(), e.getValue()); + } + + final String json = template.toString(); + return load(json); + } + + private String readFromClasspath(final String resource) throws IOException { + return IOUtils.toString(getClass().getResource(resource)); + } + + public PaceConfig getPace() { + return pace; + } + + public void setPace(final PaceConfig pace) { + this.pace = pace; + } + + public WfConfig getWf() { + return wf; + } + + public void setWf(final WfConfig wf) { + this.wf = wf; + } + + @Override + public String toString() { + return new GsonBuilder().setPrettyPrinting().create().toJson(this); + } + + @Override + public List model() { + return getPace().getModel(); + } + + @Override + public Map modelMap() { + return getPace().getModelMap(); + } + + @Override + public List strictConditions() { + return getPace().getStrictConditionAlgos(); + } + + @Override + public List conditions() { + return getPace().getConditionAlgos(); + } + + @Override + public List clusterings() { + return getPace().getClustering(); + } + + @Override + public Map> blacklists() { + return getPace().getBlacklists(); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java new file mode 100644 index 000000000..ffc67e775 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/PaceConfig.java @@ -0,0 +1,104 @@ +package eu.dnetlib.pace.config; + +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import eu.dnetlib.pace.condition.ConditionAlgo; +import 
eu.dnetlib.pace.model.ClusteringDef; +import eu.dnetlib.pace.model.CondDef; +import eu.dnetlib.pace.model.FieldDef; +import org.apache.commons.collections.CollectionUtils; + +public class PaceConfig { + + private List model; + private List strictConditions; + private List conditions; + private List clustering; + private Map> blacklists; + + private Map modelMap; + + public PaceConfig() {} + + public void initModel() { + modelMap = Maps.newHashMap(); + for(FieldDef fd : getModel()) { + modelMap.put(fd.getName(), fd); + } + } + + public List getModel() { + return model; + } + + public void setModel(final List model) { + this.model = model; + } + + public List getStrictConditions() { + return strictConditions; + } + + public void setStrictConditions(final List strictConditions) { + this.strictConditions = strictConditions; + } + + public List getConditions() { + return conditions; + } + + public List getConditionAlgos() { + return asConditionAlgos(getConditions()); + } + + public List getStrictConditionAlgos() { + return asConditionAlgos(getStrictConditions()); + } + + public void setConditions(final List conditions) { + this.conditions = conditions; + } + + public List getClustering() { + return clustering; + } + + public void setClustering(final List clustering) { + this.clustering = clustering; + } + + public Map> getBlacklists() { + return blacklists; + } + + public void setBlacklists(final Map> blacklists) { + this.blacklists = blacklists; + } + + public Map getModelMap() { + return modelMap; + } + + public void setModelMap(final Map modelMap) { + this.modelMap = modelMap; + } + + // helper + + private List asConditionAlgos(final List defs) { + final List algos = Lists.newArrayList(); + if (CollectionUtils.isEmpty(defs)) return algos; + for (final CondDef cd : defs) { + final List fields = getModel().stream() + .filter(fd -> cd.getFields().contains(fd.getName())) + .collect(Collectors.toList()); + algos.add(cd.getConditionAlgo(fields)); + } + return algos; + } + 
+} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java new file mode 100644 index 000000000..0f1f696ab --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/Type.java @@ -0,0 +1,5 @@ +package eu.dnetlib.pace.config; + +public enum Type { + String, Int, List, JSON, URL +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java new file mode 100644 index 000000000..9e836ebe2 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/config/WfConfig.java @@ -0,0 +1,254 @@ +package eu.dnetlib.pace.config; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; +import com.google.gson.GsonBuilder; +import org.apache.commons.lang.StringUtils; + +public class WfConfig { + + /** + * Entity type. + */ + private String entityType = ""; + + /** + * Sub-Entity type refers to one of the fields declared in the model. See eu.dnetlib.pace.config.PaceConfig.modelMap + */ + private String subEntityType = ""; + + /** + * Sub-Entity value declares a value for subTypes to be considered. + */ + private String subEntityValue = ""; + + /** + * Field name used to sort the values in the reducer phase. + */ + private String orderField = ""; + + /** + * Column Families involved in the relations redirection. + */ + private List rootBuilder = Lists.newArrayList(); + + /** + * Set of datasource namespace prefixes that won't be deduplicated. + */ + private Set skipList = Sets.newHashSet(); + + /** + * Subprefix used to build the root id, allows multiple dedup runs. + */ + private String dedupRun = ""; + + /** + * Similarity threshold. + */ + private double threshold = 0; + + /** The queue max size. */ + private int queueMaxSize = 2000; + + /** The group max size. 
*/ + private int groupMaxSize; + + /** The sliding window size. */ + private int slidingWindowSize; + + /** The configuration id. */ + private String configurationId; + + /** The include children. */ + private boolean includeChildren; + + /** Default maximum number of allowed children. */ + private final static int MAX_CHILDREN = 10; + + /** Maximum number of allowed children. */ + private int maxChildren = MAX_CHILDREN; + + public WfConfig() {} + + /** + * Instantiates a new dedup config. + * + * @param entityType + * the entity type + * @param orderField + * the order field + * @param rootBuilder + * the root builder families + * @param dedupRun + * the dedup run + * @param threshold + * the threshold + * @param skipList + * the skip list + * @param queueMaxSize + * the queue max size + * @param groupMaxSize + * the group max size + * @param slidingWindowSize + * the sliding window size + * @param includeChildren + * allows the children to be included in the representative records or not. + */ + public WfConfig(final String entityType, final String orderField, final List rootBuilder, final String dedupRun, + final double threshold, + final Set skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren) { + super(); + this.entityType = entityType; + this.orderField = orderField; + this.rootBuilder = rootBuilder; + this.dedupRun = cleanupStringNumber(dedupRun); + this.threshold = threshold; + this.skipList = skipList; + this.queueMaxSize = queueMaxSize; + this.groupMaxSize = groupMaxSize; + this.slidingWindowSize = slidingWindowSize; + this.includeChildren = includeChildren; + } + + /** + * Cleanup string number. + * + * @param s + * the s + * @return the string + */ + private String cleanupStringNumber(final String s) { + return s.contains("'") ? 
s.replaceAll("'", "") : s; + } + + public boolean hasSubType() { + return StringUtils.isNotBlank(getSubEntityType()) && StringUtils.isNotBlank(getSubEntityValue()); + } + + public String getEntityType() { + return entityType; + } + + public void setEntityType(final String entityType) { + this.entityType = entityType; + } + + public String getSubEntityType() { + return subEntityType; + } + + public void setSubEntityType(final String subEntityType) { + this.subEntityType = subEntityType; + } + + public String getSubEntityValue() { + return subEntityValue; + } + + public void setSubEntityValue(final String subEntityValue) { + this.subEntityValue = subEntityValue; + } + + public String getOrderField() { + return orderField; + } + + public void setOrderField(final String orderField) { + this.orderField = orderField; + } + + public List getRootBuilder() { + return rootBuilder; + } + + public void setRootBuilder(final List rootBuilder) { + this.rootBuilder = rootBuilder; + } + + public Set getSkipList() { + return skipList != null ? 
skipList : new HashSet(); + } + + public void setSkipList(final Set skipList) { + this.skipList = skipList; + } + + public String getDedupRun() { + return dedupRun; + } + + public void setDedupRun(final String dedupRun) { + this.dedupRun = dedupRun; + } + + public double getThreshold() { + return threshold; + } + + public void setThreshold(final double threshold) { + this.threshold = threshold; + } + + public int getQueueMaxSize() { + return queueMaxSize; + } + + public void setQueueMaxSize(final int queueMaxSize) { + this.queueMaxSize = queueMaxSize; + } + + public int getGroupMaxSize() { + return groupMaxSize; + } + + public void setGroupMaxSize(final int groupMaxSize) { + this.groupMaxSize = groupMaxSize; + } + + public int getSlidingWindowSize() { + return slidingWindowSize; + } + + public void setSlidingWindowSize(final int slidingWindowSize) { + this.slidingWindowSize = slidingWindowSize; + } + + public String getConfigurationId() { + return configurationId; + } + + public void setConfigurationId(final String configurationId) { + this.configurationId = configurationId; + } + + public boolean isIncludeChildren() { + return includeChildren; + } + + public void setIncludeChildren(final boolean includeChildren) { + this.includeChildren = includeChildren; + } + + public int getMaxChildren() { + return maxChildren; + } + + public void setMaxChildren(final int maxChildren) { + this.maxChildren = maxChildren; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#toString() + */ + @Override + public String toString() { + return new GsonBuilder().setPrettyPrinting().create().toJson(this); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java new file mode 100644 index 000000000..f9d189ff6 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/AbstractDistance.java @@ -0,0 +1,15 @@ +package eu.dnetlib.pace.distance; + +import 
eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.distance.eval.ScoreResult; +import eu.dnetlib.pace.model.Document; + +public abstract class AbstractDistance implements Distance { + + protected abstract Document toDocument(A a); + + @Override + public ScoreResult between(final A a, final A b, final Config config) { + return new DistanceScorer(config).distance(toDocument(a), toDocument(b)); + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/ConfigurableDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/ConfigurableDistanceAlgo.java new file mode 100644 index 000000000..b354f0654 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/ConfigurableDistanceAlgo.java @@ -0,0 +1,26 @@ +package eu.dnetlib.pace.distance; + +import java.util.Map; + +import eu.dnetlib.pace.common.AbstractPaceFunctions; + +public abstract class ConfigurableDistanceAlgo extends AbstractPaceFunctions { + + private Map params; + + private double weigth; + + public ConfigurableDistanceAlgo(final Map params, final double weight) { + this.params = params; + this.weigth = weight; + } + + public Map getParams() { + return params; + } + + public double getWeigth() { + return weigth; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java new file mode 100644 index 000000000..93a6e757a --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/Distance.java @@ -0,0 +1,9 @@ +package eu.dnetlib.pace.distance; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.distance.eval.ScoreResult; + +public interface Distance { + + public ScoreResult between(A a, A b, Config config); +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java new file mode 100644 index 000000000..e9d009548 --- /dev/null +++ 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceAlgo.java @@ -0,0 +1,15 @@ +package eu.dnetlib.pace.distance; + +import eu.dnetlib.pace.model.Field; + +/** + * Each field is configured with a distance algo which knows how to compute the distance (0-1) between the fields of two + * objects. + */ +public interface DistanceAlgo { + + public abstract double distance(Field a, Field b); + + public double getWeight(); + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java new file mode 100644 index 000000000..0cbb6f4f6 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/DistanceScorer.java @@ -0,0 +1,101 @@ +package eu.dnetlib.pace.distance; + +import java.util.Collection; +import java.util.List; + +import eu.dnetlib.pace.condition.ConditionAlgo; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.distance.eval.ConditionEvalMap; +import eu.dnetlib.pace.distance.eval.DistanceEval; +import eu.dnetlib.pace.distance.eval.DistanceEvalMap; +import eu.dnetlib.pace.distance.eval.ScoreResult; +import eu.dnetlib.pace.model.Document; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldDef; + +/** + * The distance between two documents is given by the weighted mean of the field distances + */ +public class DistanceScorer { + + private Config config; + + public DistanceScorer(final Config config) { + this.config = config; + } + + public ScoreResult distance(final Document a, final Document b) { + final ScoreResult sr = new ScoreResult(); + + sr.setStrictConditions(verify(a, b, config.strictConditions())); + sr.setConditions(verify(a, b, config.conditions())); + + final DistanceEvalMap dMap = new DistanceEvalMap(sumWeights(config.model())); + + for (final FieldDef fd : config.model()) { + + dMap.updateDistance(fieldDistance(a, b, fd)); + } + sr.setDistances(dMap); + return sr; + } + + private 
ConditionEvalMap verify(final Document a, final Document b, final List conditions) { + final ConditionEvalMap res = new ConditionEvalMap(); + + for (final ConditionAlgo cd : conditions) { + final ConditionEvalMap map = cd.verify(a, b); + res.mergeFrom(map); + + // commented out shortcuts + /* + if (map.anyNegative()) { + return res; + } + */ + + //if (strict && (res < 0)) return -1; + //cond += verify; + } + return res; + } + + private DistanceEval fieldDistance(final Document a, final Document b, final FieldDef fd) { + + final double w = fd.getWeight(); + final Field va = getValue(a, fd); + final Field vb = getValue(b, fd); + + final DistanceEval de = new DistanceEval(fd, va, vb); + if ((w == 0)) return de; // optimization for 0 weight + else { + if (va.isEmpty() || vb.isEmpty()) { + if (fd.isIgnoreMissing()) { + de.setDistance(-1); + } else { + de.setDistance(w); + } + } else { + if (va.getType().equals(vb.getType())) { + de.setDistance(w * fd.getDistanceAlgo().distance(va, vb)); + } else { + throw new IllegalArgumentException(String.format("Types are differents type: %s:%s - %s:%s", va, va.getType(), vb, vb.getType())); + } + } + return de; + } + } + + private Field getValue(final Document d, final FieldDef fd) { + return d.values(fd.getName()); + } + + private double sumWeights(final Collection fields) { + double sum = 0.0; + for (final FieldDef fd : fields) { + sum += fd.getWeight(); + } + return sum; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java new file mode 100644 index 000000000..7651479ee --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/PaceDocumentDistance.java @@ -0,0 +1,12 @@ +package eu.dnetlib.pace.distance; + +import eu.dnetlib.pace.model.Document; + +public class PaceDocumentDistance extends AbstractDistance { + + @Override + protected Document toDocument(Document a) { + return a; + } + 
+} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java new file mode 100644 index 000000000..83296048d --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/SecondStringDistanceAlgo.java @@ -0,0 +1,100 @@ +package eu.dnetlib.pace.distance; + +import java.util.List; + +import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Type; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; + +/** + * For the rest of the fields delegate the distance measure to the second string library. + */ +public abstract class SecondStringDistanceAlgo extends AbstractPaceFunctions implements DistanceAlgo { + + // val aliases = Map(('₁' to '₉') zip ('1' to '9'): _*) ++ Map(('⁴' to '⁹') zip ('4' to '9'): _*) ++ Map('¹' -> '1', '²' -> + // '2', * '³' + // -> '3') + + /** The ssalgo. */ + protected AbstractStringDistance ssalgo; + + /** The weight. */ + protected double weight = 0.0; + + /** + * Instantiates a new second string distance algo. + * + * @param weight + * the weight + * @param ssalgo + * the ssalgo + */ + protected SecondStringDistanceAlgo(final double weight, final AbstractStringDistance ssalgo) { + this.ssalgo = ssalgo; + this.weight = weight; + } + + /** + * Normalize. + * + * @param d + * the d + * @return the double + */ + protected abstract double normalize(double d); + + /** + * Distance. + * + * @param a + * the a + * @param b + * the b + * @return the double + */ + public double distance(final String a, final String b) { + double score = ssalgo.score(a, b); + return normalize(score); + } + + /** + * Distance. 
+ * + * @param a + * the a + * @param b + * the b + * @return the double + */ + protected double distance(final List a, final List b) { + return distance(concat(a), concat(b)); + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.distance.DistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) + */ + @Override + public double distance(final Field a, final Field b) { + if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) return distance(a.stringValue(), b.stringValue()); + if (a.getType().equals(Type.List) && b.getType().equals(Type.List)) return distance(toList(a), toList(b)); + + throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); + } + + /** + * To list. + * + * @param list + * the list + * @return the list + */ + protected List toList(final Field list) { + return ((FieldList) list).stringList(); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java new file mode 100644 index 000000000..904498202 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/AlwaysMatch.java @@ -0,0 +1,31 @@ +package eu.dnetlib.pace.distance.algo; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; + +public class AlwaysMatch extends SecondStringDistanceAlgo { + + public AlwaysMatch(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected AlwaysMatch(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(final String a, final String b) { + return 1.0; + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return d; + } + +} diff --git 
a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java new file mode 100644 index 000000000..ef95c024a --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/ExactMatch.java @@ -0,0 +1,31 @@ +package eu.dnetlib.pace.distance.algo; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; + +public class ExactMatch extends SecondStringDistanceAlgo { + + public ExactMatch(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected ExactMatch(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(final String a, final String b) { + return a.equals(b) ? 1.0 : 0; + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java new file mode 100644 index 000000000..87f6c4e6a --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinkler.java @@ -0,0 +1,35 @@ +package eu.dnetlib.pace.distance.algo; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; + +//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) +public class JaroWinkler extends SecondStringDistanceAlgo { + + public JaroWinkler(double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected JaroWinkler(double weight, AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(String a, String b) { + String ca = cleanup(a); + String cb = cleanup(b); + + return normalize(ssalgo.score(ca, cb)); + } 
+ + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java new file mode 100644 index 000000000..1419a072b --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/JaroWinklerTitle.java @@ -0,0 +1,36 @@ +package eu.dnetlib.pace.distance.algo; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; + +//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler()) +public class JaroWinklerTitle extends SecondStringDistanceAlgo { + + public JaroWinklerTitle(double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected JaroWinklerTitle(double weight, AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(String a, String b) { + String ca = cleanup(a); + String cb = cleanup(b); + + boolean check = checkNumbers(ca, cb); + return check ? 
0.5 : normalize(ssalgo.score(ca, cb)); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java new file mode 100644 index 000000000..3ad1cfaaf --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinkler.java @@ -0,0 +1,26 @@ +package eu.dnetlib.pace.distance.algo; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; + +public class Level2JaroWinkler extends SecondStringDistanceAlgo { + + public Level2JaroWinkler(double w) { + super(w, new com.wcohen.ss.Level2JaroWinkler()); + } + + protected Level2JaroWinkler(double w, AbstractStringDistance ssalgo) { + super(w, ssalgo); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java new file mode 100644 index 000000000..a1c347256 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2JaroWinklerTitle.java @@ -0,0 +1,41 @@ +package eu.dnetlib.pace.distance.algo; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; + +public class Level2JaroWinklerTitle extends SecondStringDistanceAlgo { + + public Level2JaroWinklerTitle(final double w) { + super(w, new com.wcohen.ss.Level2JaroWinkler()); + } + + protected Level2JaroWinklerTitle(final double w, final AbstractStringDistance ssalgo) { + super(w, ssalgo); + } + + @Override + public double distance(final String a, final String b) { + final String ca = 
cleanup(a); + final String cb = cleanup(b); + + final boolean check = checkNumbers(ca, cb); + + if (check) return 0.5; + + final String cca = finalCleanup(ca); + final String ccb = finalCleanup(cb); + + return ssalgo.score(cca, ccb); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java new file mode 100644 index 000000000..7a2b0295f --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Level2Levenstein.java @@ -0,0 +1,26 @@ +package eu.dnetlib.pace.distance.algo; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; + +public class Level2Levenstein extends SecondStringDistanceAlgo { + + public Level2Levenstein(double w) { + super(w, new com.wcohen.ss.Level2Levenstein()); + } + + protected Level2Levenstein(double w, AbstractStringDistance ssalgo) { + super(w, ssalgo); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(double d) { + return 1 / Math.pow(Math.abs(d) + 1, 0.1); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java new file mode 100644 index 000000000..9dfce83e5 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/Levenstein.java @@ -0,0 +1,26 @@ +package eu.dnetlib.pace.distance.algo; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; + +public class Levenstein extends SecondStringDistanceAlgo { + + public Levenstein(double w) { + super(w, new com.wcohen.ss.Levenstein()); + } + + protected Levenstein(double w, AbstractStringDistance 
ssalgo) { + super(w, ssalgo); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(double d) { + return 1 / Math.pow(Math.abs(d) + 1, 0.1); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinDate.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinDate.java new file mode 100644 index 000000000..545295567 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinDate.java @@ -0,0 +1,25 @@ +package eu.dnetlib.pace.distance.algo; + + +public class LevensteinDate extends Levenstein { + + + public LevensteinDate(double w) { + super(w); + } + + + @Override + public double distance(String a, String b) { + + return 1.0; + } + + + + @Override + public double getWeight() { + return super.weight; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java new file mode 100644 index 000000000..281de31c3 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/LevensteinTitle.java @@ -0,0 +1,45 @@ +package eu.dnetlib.pace.distance.algo; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; + +public class LevensteinTitle extends SecondStringDistanceAlgo { + + public LevensteinTitle(final double w) { + super(w, new com.wcohen.ss.Levenstein()); + } + + protected LevensteinTitle(final double w, final AbstractStringDistance ssalgo) { + super(w, ssalgo); + } + + @Override + public double distance(final String a, final String b) { + final String ca = cleanup(a); + final String cb = cleanup(b); + + final boolean check = checkNumbers(ca, cb); + + if (check) return 0.5; + + final String cca = finalCleanup(ca); + final String ccb = finalCleanup(cb); + + return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length()); + } + + 
private double normalize(final double score, final int la, final int lb) { + return 1 - (Math.abs(score) / Math.max(la, lb)); + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return 1 / Math.pow(Math.abs(d) + 1, 0.1); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java new file mode 100644 index 000000000..1177ed528 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/MustBeDifferent.java @@ -0,0 +1,31 @@ +package eu.dnetlib.pace.distance.algo; + +import com.wcohen.ss.AbstractStringDistance; +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; + +public class MustBeDifferent extends SecondStringDistanceAlgo { + + public MustBeDifferent(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + protected MustBeDifferent(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + @Override + public double distance(final String a, final String b) { + return !a.equals(b) ? 1.0 : 0; /* 1.0 when the two values differ, 0 when they are equal */ + } + + @Override + public double getWeight() { + return super.weight; + } + + @Override + protected double normalize(final double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java new file mode 100644 index 000000000..8070a0010 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/NullDistanceAlgo.java @@ -0,0 +1,22 @@ +package eu.dnetlib.pace.distance.algo; + +import eu.dnetlib.pace.distance.DistanceAlgo; +import eu.dnetlib.pace.model.Field; + +/** + * Not all fields of a document need to participate in the distance measure. We model those fields as having a + * NullDistanceAlgo. 
+ */ +public class NullDistanceAlgo implements DistanceAlgo { + + @Override + public double distance(Field a, Field b) { + return 0.0; + } + + @Override + public double getWeight() { + return 0.0; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java new file mode 100644 index 000000000..d83420750 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedJaroWinkler.java @@ -0,0 +1,52 @@ +package eu.dnetlib.pace.distance.algo; + +import com.wcohen.ss.AbstractStringDistance; + +/** + * The Class SortedJaroWinkler. + */ +public class SortedJaroWinkler extends SortedSecondStringDistanceAlgo { + + /** + * Instantiates a new sorted jaro winkler. + * + * @param weight + * the weight + */ + public SortedJaroWinkler(final double weight) { + super(weight, new com.wcohen.ss.JaroWinkler()); + } + + /** + * Instantiates a new sorted jaro winkler. 
+ * + * @param weight + * the weight + * @param ssalgo + * the ssalgo + */ + protected SortedJaroWinkler(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight() + */ + @Override + public double getWeight() { + return super.weight; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double) + */ + @Override + protected double normalize(final double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java new file mode 100644 index 000000000..43ac190e3 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedLevel2JaroWinkler.java @@ -0,0 +1,52 @@ +package eu.dnetlib.pace.distance.algo; + +import com.wcohen.ss.AbstractStringDistance; + +/** + * The Class SortedLevel2JaroWinkler. + */ +public class SortedLevel2JaroWinkler extends SortedSecondStringDistanceAlgo { + + /** + * Instantiates a new sorted level2 jaro winkler. + * + * @param weight + * the weight + */ + public SortedLevel2JaroWinkler(final double weight) { + super(weight, new com.wcohen.ss.Level2JaroWinkler()); + } + + /** + * Instantiates a new sorted level2 jaro winkler. 
+ * + * @param weight + * the weight + * @param ssalgo + * the ssalgo + */ + protected SortedLevel2JaroWinkler(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight() + */ + @Override + public double getWeight() { + return super.weight; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double) + */ + @Override + protected double normalize(final double d) { + return d; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java new file mode 100644 index 000000000..d47fbbacd --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SortedSecondStringDistanceAlgo.java @@ -0,0 +1,43 @@ +package eu.dnetlib.pace.distance.algo; + +import java.util.Collections; +import java.util.List; + +import com.google.common.collect.Lists; +import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldList; + +/** + * For the rest of the fields delegate the distance measure to the second string library. + */ +public abstract class SortedSecondStringDistanceAlgo extends SecondStringDistanceAlgo { + + /** + * Instantiates a new sorted second string distance algo. 
+ * + * @param weight + * the weight + * @param ssalgo + * the ssalgo + */ + protected SortedSecondStringDistanceAlgo(final double weight, final AbstractStringDistance ssalgo) { + super(weight, ssalgo); + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#toList(eu.dnetlib.pace.model.Field) + */ + @Override + protected List toList(final Field list) { + FieldList fl = (FieldList) list; + List values = Lists.newArrayList(fl.stringList()); + Collections.sort(values); + return values; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java new file mode 100644 index 000000000..1fa358b0f --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/SubStringLevenstein.java @@ -0,0 +1,90 @@ +package eu.dnetlib.pace.distance.algo; + +import eu.dnetlib.pace.distance.SecondStringDistanceAlgo; +import org.apache.commons.lang.StringUtils; + +import com.wcohen.ss.AbstractStringDistance; + +import eu.dnetlib.pace.config.Type; +import eu.dnetlib.pace.model.Field; + +/** + * The Class SubStringLevenstein. + */ +public class SubStringLevenstein extends SecondStringDistanceAlgo { + + /** The limit. */ + protected int limit; + + /** + * Instantiates a new sub string levenstein. + * + * @param w + * the w + */ + public SubStringLevenstein(final double w) { + super(w, new com.wcohen.ss.Levenstein()); + } + + /** + * Instantiates a new sub string levenstein. + * + * @param w + * the w + * @param limit + * the limit + */ + public SubStringLevenstein(final double w, final int limit) { + super(w, new com.wcohen.ss.Levenstein()); + this.limit = limit; + } + + /** + * Instantiates a new sub string levenstein. 
+ * + * @param w + * the w + * @param limit + * the limit + * @param ssalgo + * the ssalgo + */ + protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) { + super(w, ssalgo); + this.limit = limit; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#distance(eu.dnetlib.pace.model.Field, eu.dnetlib.pace.model.Field) + */ + @Override + public double distance(final Field a, final Field b) { + if (a.getType().equals(Type.String) && b.getType().equals(Type.String)) + return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit)); + + throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString()); + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.distance.DistanceAlgo#getWeight() + */ + @Override + public double getWeight() { + return super.weight; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.distance.SecondStringDistanceAlgo#normalize(double) + */ + @Override + protected double normalize(final double d) { + return 1 / Math.pow(Math.abs(d) + 1, 0.1); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java new file mode 100644 index 000000000..46a438ebe --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/UrlMatcher.java @@ -0,0 +1,48 @@ +package eu.dnetlib.pace.distance.algo; + +import eu.dnetlib.pace.model.Field; +import org.apache.commons.lang.StringUtils; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Map; + +public class UrlMatcher extends Levenstein { + + private Map params; + + public UrlMatcher(double weight, Map params) { + super(weight); + this.params = params; + } + + @Override + public double distance(Field a, Field b) { + + final URL urlA = asUrl(getFirstValue(a)); + final URL urlB = 
asUrl(getFirstValue(b)); + + if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) { + return 0.0; + } + + Double hostW = params.get("host").doubleValue(); + Double pathW = params.get("path").doubleValue(); + + if (StringUtils.isBlank(urlA.getPath()) || StringUtils.isBlank(urlB.getPath())) { + return hostW * 0.5; + } + + return hostW + pathW * super.distance(urlA.getPath(), urlB.getPath()); + } + + private URL asUrl(final String value) { + try { + return new URL(value); + } catch (MalformedURLException e) { + // should not happen as checked by pace typing; the cause is kept so the failure stays diagnosable + throw new IllegalStateException("invalid URL: " + value, e); + } + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/YearLevenstein.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/YearLevenstein.java new file mode 100644 index 000000000..4e9796c2d --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/algo/YearLevenstein.java @@ -0,0 +1,37 @@ +package eu.dnetlib.pace.distance.algo; + + +public class YearLevenstein extends SubStringLevenstein { + + public YearLevenstein(double w) { + super(w); + } + + public YearLevenstein(double w, int limit) { + super(w, limit); + } + + @Override + public double distance(String a, String b) { + boolean check = checkLength(a) && checkLength(b); + if (check) { + if (a.equals(b)) { + return 1.0; + } else { + return 0.5; + } + } else { + return 1.0; + } + } + + protected boolean checkLength(String s) { + return getNumbers(s).length() == limit; + } + + @Override + public double getWeight() { + return super.weight; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java new file mode 100644 index 000000000..49e526f42 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEval.java @@ -0,0 +1,57 @@ +package eu.dnetlib.pace.distance.eval; + +import 
eu.dnetlib.pace.config.Cond; +import eu.dnetlib.pace.model.Field; + +/** + * Created by claudio on 09/03/16. + */ +public class ConditionEval { + + private Cond cond; + + private Field a; + + private Field b; + + private int result; + + public ConditionEval(final Cond cond, final Field a, final Field b, final int result) { + this.cond = cond; + this.a = a; + this.b = b; + this.result = result; + } + + public Field getA() { + return a; + } + + public void setA(final Field a) { + this.a = a; + } + + public Field getB() { + return b; + } + + public void setB(final Field b) { + this.b = b; + } + + public int getResult() { + return result; + } + + public void setResult(final int result) { + this.result = result; + } + + public Cond getCond() { + return cond; + } + + public void setCond(final Cond cond) { + this.cond = cond; + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEvalMap.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEvalMap.java new file mode 100644 index 000000000..a851596b3 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ConditionEvalMap.java @@ -0,0 +1,38 @@ +package eu.dnetlib.pace.distance.eval; + +import java.util.HashMap; + +import com.google.common.base.Predicate; +import com.google.common.collect.Iterables; + +/** + * Created by claudio on 09/03/16. 
+ */ +public class ConditionEvalMap extends HashMap { + + + public ConditionEvalMap mergeFrom(ConditionEvalMap map) { + putAll(map); + return this; + } + + public boolean anyNegative() { + return values().stream() + .anyMatch(ec -> ec.getResult() < 0); /* true as soon as one stored evaluation has a negative result; false for an empty map */ + } + + public boolean isZero() { + return result() == 0; + } + + public int result() { + int res = 0; + for(ConditionEval ec : values()) { + final int verify = ec.getResult(); + if (verify < 0) return -1; /* a single negative evaluation overrides the sum */ + res += verify; + } + return res; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java new file mode 100644 index 000000000..a943d4cea --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEval.java @@ -0,0 +1,57 @@ +package eu.dnetlib.pace.distance.eval; + +import eu.dnetlib.pace.config.Algo; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldDef; + +/** + * Created by claudio on 09/03/16. 
+ */ +public class DistanceEval { + + private FieldDef fieldDef; + + private Field a; + + private Field b; + + private double distance = 0.0; + + public DistanceEval(final FieldDef fieldDef, final Field a, final Field b) { + this.fieldDef = fieldDef; + this.a = a; + this.b = b; + } + + public Field getA() { + return a; + } + + public void setA(final Field a) { + this.a = a; + } + + public Field getB() { + return b; + } + + public void setB(final Field b) { + this.b = b; + } + + public FieldDef getFieldDef() { + return fieldDef; + } + + public void setFieldDef(final FieldDef fieldDef) { + this.fieldDef = fieldDef; + } + + public double getDistance() { + return distance; + } + + public void setDistance(final double distance) { + this.distance = distance; + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEvalMap.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEvalMap.java new file mode 100644 index 000000000..764e60354 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/DistanceEvalMap.java @@ -0,0 +1,32 @@ +package eu.dnetlib.pace.distance.eval; + +import java.util.HashMap; + +/** + * Created by claudio on 10/03/16. + */ +public class DistanceEvalMap extends HashMap { + + private double sumWeights; + + private double sumDistances = 0.0; + + public DistanceEvalMap(final double sumWeights) { + this.sumWeights = sumWeights; + } + + public void updateDistance(final DistanceEval d) { + + put(d.getFieldDef().getName(), d); /* the evaluation is stored under its field name in either case */ + if (d.getDistance() >= 0) { + sumDistances += d.getDistance(); /* non-negative distances are accumulated */ + } else { + sumWeights -= d.getFieldDef().getWeight(); /* a negative distance removes the field's weight from the divisor used by distance() */ + } + } + + public double distance() { + return sumWeights == 0 ? 
0 : sumDistances / sumWeights; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java new file mode 100644 index 000000000..a61cf6e15 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/distance/eval/ScoreResult.java @@ -0,0 +1,55 @@ +package eu.dnetlib.pace.distance.eval; + +import com.google.gson.GsonBuilder; + +/** + * Created by claudio on 09/03/16. + */ +public class ScoreResult { + + private ConditionEvalMap strictConditions; + + private ConditionEvalMap conditions; + + private DistanceEvalMap distances; + + public double getScore() { + + if (getStrictConditions().result() > 0) return 1.0; + // if (getStrictConditions().result() < 0) return 0.0; + if (getConditions().result() < 0) return 0.0; + + return getDistances().distance(); + } + + + public ConditionEvalMap getStrictConditions() { + return strictConditions; + } + + public void setStrictConditions(final ConditionEvalMap strictConditions) { + this.strictConditions = strictConditions; + } + + public ConditionEvalMap getConditions() { + return conditions; + } + + public void setConditions(final ConditionEvalMap conditions) { + this.conditions = conditions; + } + + public DistanceEvalMap getDistances() { + return distances; + } + + public void setDistances(final DistanceEvalMap distances) { + this.distances = distances; + } + + @Override + public String toString() { + final GsonBuilder b = new GsonBuilder(); + return b.setPrettyPrinting().create().toJson(this); + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java new file mode 100644 index 000000000..b418b75bb --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/AbstractField.java @@ -0,0 +1,74 @@ +package eu.dnetlib.pace.model; + +import eu.dnetlib.pace.config.Type; + +/** + * The Class AbstractField. 
+ */ +public abstract class AbstractField implements Field { + + /** The type. */ + protected Type type = Type.String; + + /** The name. */ + protected String name; + + /** + * Instantiates a new abstract field. + */ + protected AbstractField() {} + + /** + * Instantiates a new abstract field. + * + * @param type + * the type + * @param name + * the name + */ + protected AbstractField(final Type type, final String name) { + this.type = type; + this.name = name; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.model.Field#getName() + */ + @Override + public String getName() { + return name; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.model.Field#getType() + */ + @Override + public Type getType() { + return type; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.model.Field#setName(java.lang.String) + */ + @Override + public void setName(final String name) { + this.name = name; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.model.Field#setType(eu.dnetlib.pace.config.Type) + */ + @Override + public void setType(final Type type) { + this.type = type; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java new file mode 100644 index 000000000..5909788b7 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java @@ -0,0 +1,77 @@ +package eu.dnetlib.pace.model; + +import java.util.List; +import java.util.Map; + +import com.google.gson.Gson; +import eu.dnetlib.pace.clustering.*; + +public class ClusteringDef { + + private Clustering name; + + private List fields; + + private Map params; + + public ClusteringDef() {} + + public Clustering getName() { + return name; + } + + public void setName(final Clustering name) { + this.name = name; + } + + public ClusteringFunction getClusteringFunction() { + switch (getName()) { + case acronyms: + return new Acronyms(getParams()); + case ngrams: + return 
new Ngrams(getParams()); + case ngrampairs: + return new NgramPairs(getParams()); + case sortedngrampairs: + return new SortedNgramPairs(getParams()); + case suffixprefix: + return new SuffixPrefix(getParams()); + case spacetrimmingfieldvalue: + return new SpaceTrimmingFieldValue(getParams()); + case immutablefieldvalue: + return new ImmutableFieldValue(getParams()); + case personhash: + return new PersonHash(getParams()); + case personclustering: + return new PersonClustering(getParams()); + case lowercase: + return new LowercaseClustering(getParams()); + case urlclustering: + return new UrlClustering(getParams()); + default: + return new RandomClusteringFunction(getParams()); + } + } + + public List getFields() { + return fields; + } + + public void setFields(final List fields) { + this.fields = fields; + } + + public Map getParams() { + return params; + } + + public void setParams(final Map params) { + this.params = params; + } + + @Override + public String toString() { + return new Gson().toJson(this); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java new file mode 100644 index 000000000..33f30faff --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/CondDef.java @@ -0,0 +1,61 @@ +package eu.dnetlib.pace.model; + +import java.util.List; + +import com.google.gson.Gson; +import eu.dnetlib.pace.condition.*; +import eu.dnetlib.pace.config.Cond; + +public class CondDef { + + private Cond name; + + private List fields; + + public CondDef() {} + + public ConditionAlgo getConditionAlgo(final List fields) { + switch (getName()) { + case yearMatch: + return new YearMatch(getName(), fields); + case titleVersionMatch: + return new TitleVersionMatch(getName(), fields); + case sizeMatch: + return new SizeMatch(getName(), fields); + case exactMatch: + return new ExactMatch(getName(), fields); + case mustBeDifferent: + return new MustBeDifferent(getName(), 
fields); + case exactMatchIgnoreCase: + return new ExactMatchIgnoreCase(getName(), fields); + case doiExactMatch: + return new DoiExactMatch(getName(), fields); + case pidMatch: + return new PidMatch(getName(), fields); + default: + return new AlwaysTrueCondition(getName(), fields); + } + } + + public Cond getName() { + return name; + } + + public void setName(final Cond name) { + this.name = name; + } + + public List getFields() { + return fields; + } + + public void setFields(final List fields) { + this.fields = fields; + } + + @Override + public String toString() { + return new Gson().toJson(this); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java new file mode 100644 index 000000000..fcacadd6f --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Document.java @@ -0,0 +1,39 @@ +package eu.dnetlib.pace.model; + +import java.util.Set; + +/** + * The Interface Document. Models the common operations available on a Pace Document. + */ +public interface Document { + + /** + * Gets the identifier. + * + * @return the identifier + */ + String getIdentifier(); + + /** + * Fields. + * + * @return an iterable over the fields + */ + Iterable fields(); + + /** + * Values. + * + * @param name + * the field name + * @return the field + */ + Field values(String name); + + /** + * Field names. + * + * @return the set of field names + */ + Set fieldNames(); +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java new file mode 100644 index 000000000..9c97ce38d --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Field.java @@ -0,0 +1,54 @@ +package eu.dnetlib.pace.model; + +import eu.dnetlib.pace.config.Type; + +/** + * The Interface Field. + */ +public interface Field extends Iterable { + + /** + * Gets the name. 
+ * + * @return the name + */ + public String getName(); + + /** + * Sets the name. + * + * @param name + * the new name + */ + public void setName(String name); + + /** + * Gets the type. + * + * @return the type + */ + public Type getType(); + + /** + * Sets the type. + * + * @param type + * the new type + */ + public void setType(Type type); + + /** + * Checks if is empty. + * + * @return true, if is empty + */ + public boolean isEmpty(); + + /** + * String value. + * + * @return the string + */ + public String stringValue(); + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java new file mode 100644 index 000000000..776c20306 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java @@ -0,0 +1,163 @@ +package eu.dnetlib.pace.model; + +import java.util.List; +import java.util.Map; + +import com.google.common.base.Splitter; +import com.google.common.collect.Lists; +import com.google.gson.Gson; +import eu.dnetlib.pace.config.Algo; +import eu.dnetlib.pace.config.Type; +import eu.dnetlib.pace.distance.*; +import eu.dnetlib.pace.distance.algo.*; + +/** + * The schema is composed by field definitions (FieldDef). Each field has a type, a name, and an associated distance algorithm. 
+ */ +public class FieldDef { + + public final static String PATH_SEPARATOR = "/"; + + private Algo algo; + + private String name; + + private String path; + + private boolean ignoreMissing; + + private Type type; + + private boolean overrideMatch; + + private double weight; + + private int limit = -1; + + private Map params; + + public FieldDef() {} + + // def apply(s: String): Field[A] + public Field apply(final Type type, final String s) { + switch (type) { + case Int: + return new FieldValueImpl(type, name, Integer.parseInt(s)); + case String: + return new FieldValueImpl(type, name, s); + case List: + return new FieldListImpl(name, type); + default: + throw new IllegalArgumentException("Casting not implemented for type " + type); + } + } + + public String getName() { + return name; + } + + public String getPath() { + return path; + } + + public List getPathList() { + return Lists.newArrayList(Splitter.on(PATH_SEPARATOR).split(getPath())); + } + + public DistanceAlgo getDistanceAlgo() { + switch (getAlgo()) { + case JaroWinkler: + return new JaroWinkler(getWeight()); + case JaroWinklerTitle: + return new JaroWinklerTitle(getWeight()); + case Level2JaroWinkler: + return new Level2JaroWinkler(getWeight()); + case Level2JaroWinklerTitle: + return new Level2JaroWinklerTitle(getWeight()); + case Level2Levenstein: + return new Level2Levenstein(getWeight()); + case Levenstein: + return new Levenstein(getWeight()); + case LevensteinTitle: + return new LevensteinTitle(getWeight()); + case SubStringLevenstein: + return new SubStringLevenstein(getWeight(), getLimit()); + case YearLevenstein: + return new YearLevenstein(getWeight(), getLimit()); + case SortedJaroWinkler: + return new SortedJaroWinkler(getWeight()); + case SortedLevel2JaroWinkler: + return new SortedLevel2JaroWinkler(getWeight()); + case urlMatcher: + return new UrlMatcher(getWeight(), getParams()); + case ExactMatch: + return new ExactMatch(getWeight()); + case MustBeDifferent: + return new 
MustBeDifferent(getWeight()); + case AlwaysMatch: + return new AlwaysMatch(getWeight()); + case Null: + return new NullDistanceAlgo(); + default: + return new NullDistanceAlgo(); + } + } + + public boolean isIgnoreMissing() { + return ignoreMissing; + } + + public Type getType() { + return type; + } + + public void setType(final Type type) { + this.type = type; + } + + public boolean isOverrideMatch() { + return overrideMatch; + } + + public void setOverrideMatch(final boolean overrideMatch) { + this.overrideMatch = overrideMatch; + } + + @Override + public String toString() { + return new Gson().toJson(this); + } + + public double getWeight() { + return weight; + } + + public void setWeight(final double weight) { + this.weight = weight; + } + + public Algo getAlgo() { + return algo; + } + + public void setAlgo(final Algo algo) { + this.algo = algo; + } + + public int getLimit() { + return limit; + } + + public void setLimit(final int limit) { + this.limit = limit; + } + + public Map getParams() { + return params; + } + + public void setParams(final Map params) { + this.params = params; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java new file mode 100644 index 000000000..3b771fa67 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldList.java @@ -0,0 +1,17 @@ +package eu.dnetlib.pace.model; + +import java.util.List; + +/** + * The Interface FieldList. + */ +public interface FieldList extends List, Field { + + /** + * String list. 
+ * + * @return the list + */ + public List stringList(); + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java new file mode 100644 index 000000000..17d0cae06 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldListImpl.java @@ -0,0 +1,327 @@ +package eu.dnetlib.pace.model; + +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.ListIterator; + +import com.google.common.base.Function; +import com.google.common.base.Joiner; +import com.google.common.base.Predicate; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; + +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import com.google.gson.JsonObject; +import eu.dnetlib.pace.config.Type; + +/** + * The Class FieldListImpl. + */ +public class FieldListImpl extends AbstractField implements FieldList { + + /** The fields. */ + private List fields; + + /** + * Instantiates a new field list impl. + */ + public FieldListImpl() { + fields = Lists.newArrayList(); + } + + /** + * Instantiates a new field list impl. 
+ * + * @param name + * the name + */ + public FieldListImpl(final String name, final Type type) { + super(type, name); + fields = Lists.newArrayList(); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#add(java.lang.Object) + */ + @Override + public boolean add(final Field f) { + return fields.add(f); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#add(int, java.lang.Object) + */ + @Override + public void add(final int i, final Field f) { + fields.add(i, f); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#addAll(java.util.Collection) + */ + @Override + public boolean addAll(final Collection f) { + return fields.addAll(f); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#addAll(int, java.util.Collection) + */ + @Override + public boolean addAll(final int i, final Collection f) { + return fields.addAll(i, f); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#clear() + */ + @Override + public void clear() { + fields.clear(); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#contains(java.lang.Object) + */ + @Override + public boolean contains(final Object o) { + return fields.contains(o); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#containsAll(java.util.Collection) + */ + @Override + public boolean containsAll(final Collection f) { + return fields.containsAll(f); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#get(int) + */ + @Override + public Field get(final int i) { + return fields.get(i); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#indexOf(java.lang.Object) + */ + @Override + public int indexOf(final Object o) { + return fields.indexOf(o); + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.model.Field#isEmpty() + */ + @Override + public boolean isEmpty() { + return Iterables.all(fields, new Predicate() { + + @Override + public boolean apply(final Field f) { + return f.isEmpty(); + } + }); + } + + /* + * (non-Javadoc) + * + * @see java.lang.Iterable#iterator() + */ + @Override 
+ public Iterator iterator() { + return fields.iterator(); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#lastIndexOf(java.lang.Object) + */ + @Override + public int lastIndexOf(final Object o) { + return fields.lastIndexOf(o); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#listIterator() + */ + @Override + public ListIterator listIterator() { + return fields.listIterator(); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#listIterator(int) + */ + @Override + public ListIterator listIterator(final int i) { + return fields.listIterator(i); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#remove(java.lang.Object) + */ + @Override + public boolean remove(final Object o) { + return fields.remove(o); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#remove(int) + */ + @Override + public Field remove(final int i) { + return fields.remove(i); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#removeAll(java.util.Collection) + */ + @Override + public boolean removeAll(final Collection f) { + return fields.removeAll(f); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#retainAll(java.util.Collection) + */ + @Override + public boolean retainAll(final Collection f) { + return fields.retainAll(f); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#set(int, java.lang.Object) + */ + @Override + public Field set(final int i, final Field f) { + return fields.set(i, f); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#size() + */ + @Override + public int size() { + return fields.size(); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#subList(int, int) + */ + @Override + public List subList(final int from, final int to) { + return fields.subList(from, to); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#toArray() + */ + @Override + public Object[] toArray() { + return fields.toArray(); + } + + /* + * (non-Javadoc) + * + * @see java.util.List#toArray(java.lang.Object[]) + */ + @Override + public 
T[] toArray(final T[] t) { + return fields.toArray(t); + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.model.Field#stringValue() + */ + @Override + public String stringValue() { + switch (getType()) { + + case List: + case Int: + case String: + return Joiner.on(" ").join(stringList()); + case JSON: + final String json = new Gson().toJson(stringList()); + return json; + default: + throw new IllegalArgumentException("Unknown type: " + getType().toString()); + } + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.model.FieldList#stringList() + */ + @Override + public List stringList() { + return Lists.newArrayList(Iterables.transform(fields, getValuesTransformer())); + } + + private Function getValuesTransformer() { + return new Function() { + + @Override + public String apply(final Field f) { + return f.stringValue(); + } + }; + } + + @Override + public String toString() { + return stringList().toString(); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java new file mode 100644 index 000000000..861332729 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValue.java @@ -0,0 +1,23 @@ +package eu.dnetlib.pace.model; + +/** + * The Interface FieldValue. + */ +public interface FieldValue extends Field { + + /** + * Gets the value. + * + * @return the value + */ + public Object getValue(); + + /** + * Sets the value. 
+ * + * @param value + * the new value + */ + public void setValue(final Object value); + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java new file mode 100644 index 000000000..ea31ec36e --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/FieldValueImpl.java @@ -0,0 +1,126 @@ +package eu.dnetlib.pace.model; + +import java.net.MalformedURLException; +import java.net.URL; +import java.util.Iterator; +import java.util.List; + +import eu.dnetlib.pace.config.Type; +import org.apache.commons.collections.iterators.SingletonIterator; +import org.apache.commons.lang.StringUtils; + +/** + * The Class FieldValueImpl. + */ +public class FieldValueImpl extends AbstractField implements FieldValue { + + /** The value. */ + private Object value = null; + + /** + * Instantiates a new field value impl. + */ + public FieldValueImpl() {} + + /** + * Instantiates a new field value impl. 
+ * + * @param type + * the type + * @param name + * the name + * @param value + * the value + */ + public FieldValueImpl(final Type type, final String name, final Object value) { + super(type, name); + this.value = value; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.model.Field#isEmpty() + */ + @Override + public boolean isEmpty() { + if (value == null) return false; + + switch (type) { + case String: + case JSON: + return value.toString().isEmpty(); + case List: + List list = (List) value; + return list.isEmpty() || ((FieldValueImpl) list.get(0)).isEmpty(); + case URL: + String str = value.toString(); + return StringUtils.isBlank(str) || !isValidURL(str); + default: + return true; + } + } + + private boolean isValidURL(final String s) { + try { + new URL(s); + return true; + } catch (MalformedURLException e) { + return false; + } + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.model.FieldValue#getValue() + */ + @Override + public Object getValue() { + return value; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.model.FieldValue#setValue(java.lang.Object) + */ + @Override + public void setValue(final Object value) { + this.value = value; + } + + /* + * (non-Javadoc) + * + * @see eu.dnetlib.pace.model.Field#stringValue() + */ + @Override + // @SuppressWarnings("unchecked") + public String stringValue() { + return String.valueOf(getValue()); + // switch (getType()) { + // + // case Int: + // return String.valueOf(getValue()); + // case List: + // return Joiner.on(" ").join((List) getValue()); + // case String: + // return (String) getValue(); + // default: + // throw new IllegalArgumentException("Unknown type: " + getType().toString()); + // } + } + + /* + * (non-Javadoc) + * + * @see java.lang.Iterable#iterator() + */ + @Override + @SuppressWarnings("unchecked") + public Iterator iterator() { + return new SingletonIterator(this); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java
new file mode 100644
index 000000000..74935de56
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocument.java
@@ -0,0 +1,153 @@
+package eu.dnetlib.pace.model;
+
+import java.util.Map;
+import java.util.Set;
+
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+/**
+ * A {@link Document} backed by a map from field name to {@link Field}.
+ */
+public class MapDocument implements Document {
+
+	/** The identifier. */
+	private String identifier;
+
+	/** The field map, keyed by field name. */
+	private Map<String, Field> fieldMap;
+
+	/**
+	 * Instantiates an empty map document with a null identifier.
+	 */
+	public MapDocument() {
+		identifier = null;
+		fieldMap = Maps.newHashMap();
+	}
+
+	/**
+	 * Instantiates a new map document.
+	 *
+	 * @param identifier
+	 *            the identifier
+	 * @param fieldMap
+	 *            the field map, stored by reference (not copied)
+	 */
+	public MapDocument(final String identifier, final Map<String, Field> fieldMap) {
+		this.setIdentifier(identifier);
+		this.fieldMap = fieldMap;
+	}
+
+	/**
+	 * Instantiates a map document from its serialized form.
+	 *
+	 * @param identifier
+	 *            the identifier to fall back on when the decoded document carries none
+	 * @param data
+	 *            the serialized document, decoded with {@link MapDocumentSerializer#decode(byte[])}
+	 * @throws IllegalArgumentException
+	 *             if data does not decode to a document
+	 */
+	public MapDocument(final String identifier, final byte[] data) {
+		final MapDocument doc = MapDocumentSerializer.decode(data);
+		if (doc == null) {
+			throw new IllegalArgumentException("unable to decode document, id: " + identifier);
+		}
+
+		this.fieldMap = doc.fieldMap;
+		// The decoded identifier still wins; the argument, which used to be discarded, is only
+		// the fallback so that a document decoded without an identifier does not end up with null.
+		this.identifier = (doc.identifier != null) ? doc.identifier : identifier;
+	}
+
+	/*
+	 * (non-Javadoc)
+	 *
+	 * @see eu.dnetlib.pace.model.document.Document#fields()
+	 */
+	@Override
+	public Iterable<Field> fields() {
+		return Lists.newArrayList(Iterables.concat(fieldMap.values()));
+	}
+
+	/*
+	 * (non-Javadoc)
+	 *
+	 * @see eu.dnetlib.pace.model.document.Document#values(java.lang.String)
+	 */
+	@Override
+	public Field values(final String name) {
+		return fieldMap.get(name);
+	}
+
+	/*
+	 * (non-Javadoc)
+	 *
+	 * @see eu.dnetlib.pace.model.document.Document#fieldNames()
+	 */
+	@Override
+	public Set<String> fieldNames() {
+		return fieldMap.keySet();
+	}
+
+	/*
+	 * (non-Javadoc)
+	 *
+	 * @see java.lang.Object#toString()
+	 */
+	@Override
+	public String toString() {
+		return MapDocumentSerializer.toString(this);
+		// return String.format("Document(%s)", fieldMap.toString());
+	}
+
+	/**
+	 * To byte array.
+	 *
+	 * @return the byte[]
+	 */
+	public byte[] toByteArray() {
+		return MapDocumentSerializer.toByteArray(this);
+	}
+
+	/*
+	 * (non-Javadoc)
+	 *
+	 * @see eu.dnetlib.pace.model.document.Document#getIdentifier()
+	 */
+	@Override
+	public String getIdentifier() {
+		return identifier;
+	}
+
+	/**
+	 * Sets the identifier.
+	 *
+	 * @param identifier
+	 *            the new identifier
+	 */
+	public void setIdentifier(final String identifier) {
+		this.identifier = identifier;
+	}
+
+	/**
+	 * Gets the field map.
+	 *
+	 * @return the field map
+	 */
+	public Map<String, Field> getFieldMap() {
+		return fieldMap;
+	}
+
+	/**
+	 * Sets the field map.
+	 *
+	 * @param fieldMap
+	 *            the field map
+	 */
+	public void setFieldMap(final Map<String, Field> fieldMap) {
+		this.fieldMap = fieldMap;
+	}
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java
new file mode 100644
index 000000000..7217b2b59
--- /dev/null
+++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentComparator.java
@@ -0,0 +1,59 @@
+package eu.dnetlib.pace.model;
+
+import java.util.Comparator;
+
+import com.google.common.collect.Iterables;
+
+import eu.dnetlib.pace.clustering.NGramUtils;
+
+/**
+ * Orders documents by the first value of a given field, after {@link NGramUtils#cleanupForOrdering}.
+ */
+public class MapDocumentComparator implements Comparator<Document> {
+
+	/** The comparator field. */
+	private String comparatorField;
+
+	private final FieldList emptyField = new FieldListImpl();
+
+	/**
+	 * Instantiates a new map document comparator.
+	 *
+	 * @param comparatorField
+	 *            the name of the field the documents are ordered by
+	 */
+	public MapDocumentComparator(final String comparatorField) {
+		this.comparatorField = comparatorField;
+	}
+
+	/**
+	 * Compares two documents on the comparator field.
+	 *
+	 * @param d1
+	 *            the first document
+	 * @param d2
+	 *            the second document
+	 * @return 0 when either document lacks a usable value for the field, otherwise the result of
+	 *         comparing the cleaned-up string values of the first entry of each field
+	 */
+	@Override
+	public int compare(final Document d1, final Document d2) {
+		final Field f1 = d1.values(comparatorField);
+		final Field f2 = d2.values(comparatorField);
+
+		// values() may hand back null for a field the document does not carry
+		// (MapDocument.values is a plain Map.get), which used to end in a NullPointerException
+		if ((f1 == null) || (f2 == null) || f1.isEmpty() || f2.isEmpty()) return 0;
+
+		final String o1 = Iterables.getFirst(f1, emptyField).stringValue();
+		final String o2 = Iterables.getFirst(f2, emptyField).stringValue();
+
+		if ((o1 == null) || (o2 == null)) return 0;
+
+		final String to1 = NGramUtils.cleanupForOrdering(o1);
+		final String to2 = NGramUtils.cleanupForOrdering(o2);
+
+		return to1.compareTo(to2);
+	}
+
+}
diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java
new file mode 100644
index 000000000..e5b3522df
--- /dev/null
+++
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/MapDocumentSerializer.java @@ -0,0 +1,101 @@ +package eu.dnetlib.pace.model; + +import java.lang.reflect.Type; + +import com.google.gson.GsonBuilder; +import com.google.gson.InstanceCreator; +import com.google.gson.JsonDeserializationContext; +import com.google.gson.JsonDeserializer; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParseException; + +/** + * The Class MapDocumentSerializer. + */ +public class MapDocumentSerializer implements InstanceCreator { + + @Override + public MapDocument createInstance(final Type type) { + return new MapDocument(); + } + + /** + * Decode. + * + * @param s + * the String + * @return the map document + */ + public static MapDocument decode(final String s) { + final GsonBuilder gson = new GsonBuilder(); + + gson.registerTypeAdapter(Field.class, new JsonDeserializer() { + + @Override + public Field deserialize(final JsonElement json, final Type typeOfT, final JsonDeserializationContext context) throws JsonParseException { + final FieldListImpl fl = new FieldListImpl(); + if (json.isJsonObject()) { + + fl.add(handleJsonObject(json.getAsJsonObject())); + + } else if (json.isJsonArray()) { + + for (final JsonElement e : json.getAsJsonArray()) { + if (e.isJsonObject()) { + fl.add(handleJsonObject(e.getAsJsonObject())); + } + } + } + return fl; + } + + private Field handleJsonObject(final JsonObject o) { + final FieldListImpl fl = new FieldListImpl(); + final String name = o.get("name").getAsString(); + final String type = o.get("type").getAsString(); + final String value = o.get("value").getAsString(); + fl.add(new FieldValueImpl(eu.dnetlib.pace.config.Type.valueOf(type), name, value)); + return fl; + } + }); + + return gson.create().fromJson(s, MapDocument.class); + } + + /** + * Decode. 
+ * + * @param bytes + * the bytes + * @return the map document + */ + public static MapDocument decode(final byte[] bytes) { + return decode(new String(bytes)); + } + + /** + * To string. + * + * @param doc + * the doc + * @return the string + */ + public static String toString(final MapDocument doc) { + final GsonBuilder b = new GsonBuilder(); + return b.setPrettyPrinting().create().toJson(doc); + + } + + /** + * To byte array. + * + * @param doc + * the doc + * @return the byte[] + */ + public static byte[] toByteArray(final MapDocument doc) { + return toString(doc).getBytes(); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java new file mode 100644 index 000000000..ec3340672 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/Person.java @@ -0,0 +1,155 @@ +package eu.dnetlib.pace.model; + +import java.nio.charset.Charset; +import java.text.Normalizer; +import java.util.List; +import java.util.Set; + +import com.google.common.base.Joiner; +import com.google.common.base.Splitter; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.hash.Hashing; + +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.util.Capitalise; +import eu.dnetlib.pace.util.DotAbbreviations; + +public class Person { + + private static final String UTF8 = "UTF-8"; + private List name = Lists.newArrayList(); + private List surname = Lists.newArrayList(); + private List fullname = Lists.newArrayList(); + private final String original; + + private static Set particles = null; + + public Person(String s, final boolean aggressive) { + original = s; + s = Normalizer.normalize(s, Normalizer.Form.NFD); + s = s.replaceAll("\\(.+\\)", ""); + s = s.replaceAll("\\[.+\\]", ""); + s = s.replaceAll("\\{.+\\}", ""); + s = s.replaceAll("\\s+-\\s+", "-"); + s = s.replaceAll("[\\p{Punct}&&[^,-]]", " "); + s = 
s.replaceAll("\\d", " "); + s = s.replaceAll("\\n", " "); + s = s.replaceAll("\\.", " "); + s = s.replaceAll("\\s+", " "); + + if (aggressive) { + s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}&&[^,-]]", ""); + // s = s.replaceAll("[\\W&&[^,-]]", ""); + } + + if (s.contains(",")) { + final String[] arr = s.split(","); + if (arr.length == 1) { + fullname = splitTerms(arr[0]); + } else if (arr.length > 1) { + surname = splitTerms(arr[0]); + name = splitTerms(arr[1]); + fullname.addAll(surname); + fullname.addAll(name); + } + } else { + fullname = splitTerms(s); + + int lastInitialPosition = fullname.size(); + boolean hasSurnameInUpperCase = false; + + for (int i = 0; i < fullname.size(); i++) { + final String term = fullname.get(i); + if (term.length() == 1) { + lastInitialPosition = i; + } else if (term.equals(term.toUpperCase())) { + hasSurnameInUpperCase = true; + } + } + + if (lastInitialPosition < (fullname.size() - 1)) { // Case: Michele G. Artini + name = fullname.subList(0, lastInitialPosition + 1); + surname = fullname.subList(lastInitialPosition + 1, fullname.size()); + } else if (hasSurnameInUpperCase) { // Case: Michele ARTINI + for (final String term : fullname) { + if ((term.length() > 1) && term.equals(term.toUpperCase())) { + surname.add(term); + } else { + name.add(term); + } + } + } + } + } + + private List splitTerms(final String s) { + if (particles == null) { + particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt"); + } + + final List list = Lists.newArrayList(); + for (final String part : Splitter.on(" ").omitEmptyStrings().split(s)) { + if (!particles.contains(part.toLowerCase())) { + list.add(part); + } + } + return list; + } + + public List getName() { + return name; + } + + public String getNameString() { + return Joiner.on(" ").join(getName()); + } + + public List getSurname() { + return surname; + } + + public List getFullname() { + return fullname; + } + + public String getOriginal() { + 
return original; + } + + public String hash() { + return Hashing.murmur3_128().hashString(getNormalisedFullname(), Charset.forName(UTF8)).toString(); + } + + public String getNormalisedFirstName() { + return Joiner.on(" ").join(getCapitalFirstnames()); + } + + public String getNormalisedSurname() { + return Joiner.on(" ").join(getCapitalSurname()); + } + + public String getSurnameString() { + return Joiner.on(" ").join(getSurname()); + } + + public String getNormalisedFullname() { + return isAccurate() ? getNormalisedSurname() + ", " + getNormalisedFirstName() : Joiner.on(" ").join(fullname); + } + + public List getCapitalFirstnames() { + return Lists.newArrayList(Iterables.transform(getNameWithAbbreviations(), new Capitalise())); + } + + public List getCapitalSurname() { + return Lists.newArrayList(Iterables.transform(surname, new Capitalise())); + } + + public List getNameWithAbbreviations() { + return Lists.newArrayList(Iterables.transform(name, new DotAbbreviations())); + } + + public boolean isAccurate() { + return ((name != null) && (surname != null) && !name.isEmpty() && !surname.isEmpty()); + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java new file mode 100644 index 000000000..a900a6082 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/PersonComparatorUtils.java @@ -0,0 +1,118 @@ +package eu.dnetlib.pace.model; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Set; + +import com.google.common.collect.Lists; +import com.google.common.collect.Sets; + +public class PersonComparatorUtils { + + private static final int MAX_FULLNAME_LENGTH = 50; + + public static Set getNgramsForPerson(String fullname) { + + Set set = Sets.newHashSet(); + + if (fullname.length() > MAX_FULLNAME_LENGTH) { + return set; + } + + Person p = new Person(fullname, true); + + if 
(p.isAccurate()) { + for (String name : p.getName()) { + for (String surname : p.getSurname()) { + set.add((name.charAt(0) + "_" + surname).toLowerCase()); + } + } + } else { + List list = p.getFullname(); + for (int i = 0; i < list.size(); i++) { + if (list.get(i).length() > 1) { + for (int j = 0; j < list.size(); j++) { + if (i != j) { + set.add((list.get(j).charAt(0) + "_" + list.get(i)).toLowerCase()); + } + } + } + } + } + + return set; + } + + public static boolean areSimilar(String s1, String s2) { + Person p1 = new Person(s1, true); + Person p2 = new Person(s2, true); + + if (p1.isAccurate() && p2.isAccurate()) { + return verifyNames(p1.getName(), p2.getName()) && verifySurnames(p1.getSurname(), p2.getSurname()); + } else { + return verifyFullnames(p1.getFullname(), p2.getFullname()); + } + } + + private static boolean verifyNames(List list1, List list2) { + return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2)) + && verifySimilarity(extractInitials(list1), extractInitials(list2)); + } + + private static boolean verifySurnames(List list1, List list2) { + if (list1.size() != list2.size()) { + return false; + } + for (int i = 0; i < list1.size(); i++) { + if (!list1.get(i).equalsIgnoreCase(list2.get(i))) { + return false; + } + } + return true; + } + + private static boolean verifyFullnames(List list1, List list2) { + Collections.sort(list1); + Collections.sort(list2); + return verifySimilarity(extractExtendedNames(list1), extractExtendedNames(list2)) + && verifySimilarity(extractInitials(list1), extractInitials(list2)); + } + + private static List extractExtendedNames(List list) { + ArrayList res = Lists.newArrayList(); + for (String s : list) { + if (s.length() > 1) { + res.add(s.toLowerCase()); + } + } + return res; + } + + private static List extractInitials(List list) { + ArrayList res = Lists.newArrayList(); + for (String s : list) { + res.add(s.substring(0, 1).toLowerCase()); + } + return res; + } + + private static boolean 
verifySimilarity(List list1, List list2) { + if (list1.size() > list2.size()) { + return verifySimilarity(list2, list1); + } + + // NB: List2 is greater than list1 (or equal) + int pos = -1; + for (String s : list1) { + int curr = list2.indexOf(s); + if (curr > pos) { + list2.set(curr, "*"); // I invalidate the found element, example: "amm - amm" + pos = curr; + } else { + return false; + } + } + return true; + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/Pid.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/Pid.java new file mode 100644 index 000000000..3dd70f7a3 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/Pid.java @@ -0,0 +1,57 @@ +package eu.dnetlib.pace.model.adaptor; + +import java.util.List; + +import com.google.common.base.Function; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.reflect.TypeToken; +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +/** + * Created by claudio on 01/03/16. 
+ */ +public class Pid { + + private static final Log log = LogFactory.getLog(Pid.class); + + private String value; + + private String type; + + public static List fromOafJson(final List json) { + + log.debug(String.format("\nPid: %s", json)); + + final GsonBuilder gb = new GsonBuilder(); + gb.registerTypeAdapter(Pid.class, new PidOafSerialiser()); + final Gson gson = gb.create(); + + return Lists.newArrayList(Iterables.transform(json, new Function() { + @Override + public Pid apply(final String s) { + return gson.fromJson(s, Pid.class); + } + })); + } + + public String getType() { + return type; + } + + public void setType(final String type) { + this.type = type; + } + + public String getValue() { + return value; + } + + public void setValue(final String value) { + this.value = value; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/PidOafSerialiser.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/PidOafSerialiser.java new file mode 100644 index 000000000..8acaee673 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/adaptor/PidOafSerialiser.java @@ -0,0 +1,50 @@ +package eu.dnetlib.pace.model.adaptor; + +import java.lang.reflect.Type; +import java.util.List; + +import com.google.common.collect.Lists; +import com.google.gson.*; +import eu.dnetlib.pace.model.gt.GTAuthor; + +/** + * Created by claudio on 01/03/16. 
+ */ +public class PidOafSerialiser implements JsonDeserializer { + + private static final String VALUE = "value"; + + private static final String QUALIFIER = "qualifier"; + private static final String CLASSID = "classid"; + + @Override + public Pid deserialize(final JsonElement json, final Type typeOfT, final JsonDeserializationContext context) throws JsonParseException { + + final Pid pid = new Pid(); + + pid.setType(getType(json)); + pid.setValue(getValue(json)); + + return pid; + } + + private String getValue(final JsonElement json) { + final JsonObject obj =json.getAsJsonObject(); + return obj.get(VALUE).getAsString(); + + } + + private String getType(final JsonElement json) { + + final JsonObject obj =json.getAsJsonObject(); + + if (!obj.has(QUALIFIER)) + throw new IllegalArgumentException("pid does not contain any type: " + json.toString()); + + final JsonObject qualifier = obj.getAsJsonObject(QUALIFIER); + + final JsonElement classid = qualifier.get(CLASSID); + + return classid.getAsString(); + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Author.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Author.java new file mode 100644 index 000000000..17bd49d84 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Author.java @@ -0,0 +1,129 @@ +package eu.dnetlib.pace.model.gt; + +import java.util.List; +import java.util.Set; + +import org.apache.commons.lang.StringUtils; + +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.Lists; +import com.google.common.collect.Ordering; +import com.google.common.collect.Sets; +import com.google.gson.Gson; + +public class Author implements Comparable { + + private String id; + private String fullname; + private String firstname; + private String secondnames; + + private List matches = Lists.newArrayList(); + private Set coauthors = Sets.newHashSet(); + private SubjectsMap subjectsMap = new SubjectsMap(); + + public Author() { + super(); + 
} + + public Author(final Author a) { + this.id = a.getId(); + this.fullname = a.getFullname(); + this.firstname = a.getFirstname(); + this.secondnames = a.getSecondnames(); + + this.matches = a.getMatches(); + this.coauthors = a.getCoauthors(); + this.subjectsMap = a.getSubjectsMap(); + } + + public boolean hasMatches() { + return (getMatches() != null) && !getMatches().isEmpty(); + } + + public boolean hasCoauthors() { + return (getCoauthors() != null) && !getCoauthors().isEmpty(); + } + + public boolean isWellFormed() { + return StringUtils.isNotBlank(getSecondnames()) && StringUtils.isNotBlank(getFirstname()); + } + + public String getId() { + return id; + } + + public void setId(final String id) { + this.id = id; + } + + public String getFullname() { + return fullname; + } + + public void setFullname(final String fullname) { + this.fullname = fullname; + } + + public String getFirstname() { + return firstname; + } + + public void setFirstname(final String firstname) { + this.firstname = firstname; + } + + public String getSecondnames() { + return secondnames; + } + + public void setSecondnames(final String secondnames) { + this.secondnames = secondnames; + } + + public List getMatches() { + return matches; + } + + public void setMatches(final List matches) { + this.matches = matches; + } + + public Set getCoauthors() { + return coauthors; + } + + public void setCoauthors(final Set coauthors) { + this.coauthors = coauthors; + } + + @Override + public String toString() { + return new Gson().toJson(this); + } + + @Override + public int hashCode() { + return getId().hashCode(); + } + + @Override + public int compareTo(final Author o) { + return ComparisonChain.start() + .compare(this.getId(), o.getId(), Ordering.natural().nullsLast()) + .result(); + } + + @Override + public boolean equals(final Object o) { + return (o instanceof Author) && getId().equals(((Author) o).getId()); + } + + public SubjectsMap getSubjectsMap() { + return subjectsMap; + } + + public void 
setSubjectsMap(final SubjectsMap subjectsMap) { + this.subjectsMap = subjectsMap; + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/AuthorSet.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/AuthorSet.java new file mode 100644 index 000000000..c3f2576be --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/AuthorSet.java @@ -0,0 +1,37 @@ +package eu.dnetlib.pace.model.gt; + +import com.google.gson.Gson; + +public class AuthorSet { + + private String id; + private Authors authors; + + public AuthorSet(final String id, final Authors authors) { + super(); + this.id = id; + this.authors = authors; + } + + public String getId() { + return id; + } + + public void setId(final String id) { + this.id = id; + } + + public Authors getAuthors() { + return authors; + } + + public void setAuthors(final Authors authors) { + this.authors = authors; + } + + @Override + public String toString() { + return new Gson().toJson(this); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Authors.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Authors.java new file mode 100644 index 000000000..e74c43816 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Authors.java @@ -0,0 +1,54 @@ +package eu.dnetlib.pace.model.gt; + +import java.util.Collection; +import java.util.HashSet; + +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.Ordering; +import com.google.common.collect.Sets; +import com.google.gson.Gson; + +public class Authors extends HashSet implements Comparable { + + private static final long serialVersionUID = -6878376220805286142L; + + public Authors() { + super(); + } + + public Authors(final Collection authors) { + super(authors); + } + + public Authors(final Author author) { + super(Sets.newHashSet(author)); + } + + @Override + public int compareTo(final Authors a) { + return ComparisonChain.start() + .compare(this.size(), a.size(), 
Ordering.natural().nullsLast()) + .result(); + } + + @Override + public String toString() { + return new Gson().toJson(this); + } + + @Override + public boolean equals(final Object o) { + final boolean res = o instanceof Authors; + return res && (Sets.intersection(this, (Authors) o).size() == this.size()); + } + + @Override + public int hashCode() { + int res = 0; + for (final Author a : this) { + res += a.hashCode(); + } + return res; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthor.java new file mode 100644 index 000000000..d4ce32de5 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthor.java @@ -0,0 +1,50 @@ +package eu.dnetlib.pace.model.gt; + +import com.google.gson.Gson; +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; + +public class CoAuthor extends Author { + + private static final Log log = LogFactory.getLog(CoAuthor.class); + private String anchorId = null; + + public CoAuthor() { + super(); + } + + public CoAuthor(final Author author) { + super(author); + } + + public boolean hasAnchorId() { + return StringUtils.isNotBlank(getAnchorId()); + } + + public String getAnchorId() { + return anchorId; + } + + public void setAnchorId(final String anchorId) { + this.anchorId = anchorId; + } + + @Override + public String toString() { + return new Gson().toJson(this); + } + + @Override + public int hashCode() { + return getId() != null ? getId().hashCode() : getFullname().hashCode(); + } + + @Override + public boolean equals(final Object o) { + return (o instanceof CoAuthor) && StringUtils.isNotBlank(getId()) ? 
+ getId().equals(((CoAuthor) o).getId()) : + getFullname().equals(((CoAuthor) o).getFullname()); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthorSet.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthorSet.java new file mode 100644 index 000000000..90898f624 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthorSet.java @@ -0,0 +1,36 @@ +package eu.dnetlib.pace.model.gt; + +import com.google.gson.Gson; + +public class CoAuthorSet { + + private Author author; + private Authors coAuthors; + + public CoAuthorSet(final Author author, final Authors coAuthors) { + super(); + this.author = author; + this.coAuthors = coAuthors; + } + + public Author getAuthor() { + return author; + } + + public void setAuthor(final Author author) { + this.author = author; + } + + public Authors getCoAuthors() { + return coAuthors; + } + + public void setCoAuthors(final Authors coAuthors) { + this.coAuthors = coAuthors; + } + + @Override + public String toString() { + return new Gson().toJson(this); + } +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthorSetLite.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthorSetLite.java new file mode 100644 index 000000000..a48e2d8a4 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthorSetLite.java @@ -0,0 +1,40 @@ +package eu.dnetlib.pace.model.gt; + +import java.util.Set; + +import com.google.gson.Gson; + +public class CoAuthorSetLite { + + private String id; + + private Set coAuthors; + + public CoAuthorSetLite(final String id, final Set coAuthors) { + super(); + this.id = id; + this.coAuthors = coAuthors; + } + + public Set getCoAuthors() { + return coAuthors; + } + + public void setCoAuthors(final Set coAuthors) { + this.coAuthors = coAuthors; + } + + public String getId() { + return id; + } + + public void setId(final String id) { + this.id = id; + } + + @Override + public String toString() { + 
return new Gson().toJson(this); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthors.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthors.java new file mode 100644 index 000000000..8e7eca269 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/CoAuthors.java @@ -0,0 +1,78 @@ +package eu.dnetlib.pace.model.gt; + +import java.util.Collection; +import java.util.HashSet; + +import com.google.common.base.Function; +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.Ordering; +import com.google.common.collect.Sets; +import com.google.gson.Gson; + +public class CoAuthors extends HashSet implements Comparable { + + private static final long serialVersionUID = 2525591524516562892L; + + private Function hashFunction; + + private static Function defaultHashFunction = new Function() { + + @Override + public Integer apply(final CoAuthors input) { + int res = 0; + for (final CoAuthor a : input) { + res += a.hashCode(); + } + return res; + + } + }; + + public CoAuthors() { + super(); + } + + public CoAuthors(final Collection coauthors) { + super(coauthors); + } + + public CoAuthors(final CoAuthor coauthor) { + super(Sets.newHashSet(coauthor)); + } + + public Function getHashFunction() { + return hashFunction; + } + + public void setHashFunction(final Function hashFunction) { + this.hashFunction = hashFunction; + } + + @Override + public int compareTo(final CoAuthors a) { + return ComparisonChain.start() + .compare(this.size(), a.size(), Ordering.natural().nullsLast()) + .result(); + } + + @Override + public String toString() { + return new Gson().toJson(this); + } + + @Override + public boolean equals(final Object o) { + final boolean res = o instanceof CoAuthors; + return res && (Sets.intersection(this, (CoAuthors) o).size() == this.size()); + } + + public String hashCodeString() { + return String.valueOf(hashCode()); + } + + @Override + public int hashCode() { + return 
(getHashFunction() != null) ? getHashFunction().apply(this) : defaultHashFunction.apply(this); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/GTAuthor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/GTAuthor.java new file mode 100644 index 000000000..e91edccc1 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/GTAuthor.java @@ -0,0 +1,197 @@ +package eu.dnetlib.pace.model.gt; + +import java.util.Collections; +import java.util.Comparator; +import java.util.List; +import java.util.Map; + +import com.google.common.base.Function; +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.common.collect.Ordering; +import com.google.gson.Gson; +import com.google.gson.GsonBuilder; +import eu.dnetlib.pace.model.adaptor.PidOafSerialiser; + +public class GTAuthor implements Comparable { + + private String id; + private Author author; + private Authors merged; + private CoAuthors coAuthors; + private boolean anchor; + + public GTAuthor() {} + + public GTAuthor(final String id, final Authors merged, final CoAuthors coAuthors, final boolean anchor) { + super(); + + if ((merged == null) || merged.isEmpty()) + throw new IllegalArgumentException("empty merged author set, id: " + id); + + this.author = pickAuthor(merged); + this.id = id; + this.merged = merged; + this.coAuthors = coAuthors; + this.anchor = anchor; + } + + class AuthorFrequency extends Author { + + private Integer frequency = new Integer(1); + + public AuthorFrequency(final Author a) { + super(a); + } + + public void increment() { + setFrequency(getFrequency() + 1); + } + + public Integer getFrequency() { + return frequency; + } + + public void setFrequency(final Integer frequency) { + this.frequency = frequency; + } + } + + private Author pickAuthor(final Authors merged) { + final List freq = getFrequencies(merged); 
+ Collections.sort(freq, Collections.reverseOrder(new Comparator() { + + @Override + public int compare(final AuthorFrequency o1, final AuthorFrequency o2) { + return ComparisonChain.start().compare(o1.getFullname().length(), o2.getFullname().length()).compare(o1.getFrequency(), o2.getFrequency()) + .result(); + } + })); + + return Iterables.getFirst(freq, null); + } + + private List getFrequencies(final Authors merged) { + final Map countMap = Maps.newHashMap(); + for (final Author a : merged) { + final Integer count = countMap.get(a.getFullname()); + if (count == null) { + countMap.put(a.getFullname(), new Integer(1)); + } else { + countMap.put(a.getFullname(), count + 1); + } + } + + return Lists.newArrayList(Iterables.transform(merged, new Function() { + + @Override + public AuthorFrequency apply(final Author a) { + final AuthorFrequency af = new AuthorFrequency(a); + final Integer freq = countMap.get(af.getFullname()); + af.setFrequency(freq); + return af; + } + })); + } + + public String getId() { + return id; + } + + public void setId(final String id) { + this.id = id; + } + + public Author getAuthor() { + return author; + } + + public void setAuthor(final Author author) { + this.author = author; + } + + public boolean hasMerged() { + return (getMerged() != null) && !getMerged().isEmpty(); + } + + public Authors getMerged() { + return merged; + } + + public void setMerged(final Authors merged) { + this.merged = merged; + } + + public boolean hasCoAuthors() { + return (getCoAuthors() != null) && !getCoAuthors().isEmpty(); + } + + public CoAuthors getCoAuthors() { + return coAuthors; + } + + public void setCoAuthors(final CoAuthors coAuthors) { + this.coAuthors = coAuthors; + } + + public boolean isAnchor() { + return anchor; + } + + public void setAnchor(final boolean anchor) { + this.anchor = anchor; + } + + public static GTAuthor fromJson(final String json) { + final Gson gson = new Gson(); + return gson.fromJson(json, GTAuthor.class); + } + + public static 
List fromOafJson(final List json) { + + final GsonBuilder gb = new GsonBuilder(); + gb.registerTypeAdapter(GTAuthor.class, new GTAuthorOafSerialiser()); + final Gson gson = gb.create(); + + return Lists.newArrayList(Iterables.transform(json, new Function() { + @Override + public GTAuthor apply(final String s) { + return gson.fromJson(s, GTAuthor.class); + } + })); + } + + public static GTAuthor fromOafJson(final String json) { + + final GsonBuilder gb = new GsonBuilder(); + gb.registerTypeAdapter(GTAuthor.class, new GTAuthorOafSerialiser()); + final Gson gson = gb.create(); + + return gson.fromJson(json, GTAuthor.class); + } + + @Override + public String toString() { + return new Gson().toJson(this); + } + + @Override + public int hashCode() { + return getId().hashCode(); + } + + @Override + public int compareTo(final GTAuthor o) { + return ComparisonChain.start() + .compare(this.getId(), o.getId(), Ordering.natural().nullsLast()) + .result(); + } + + @Override + public boolean equals(final Object o) { + return (o instanceof GTAuthor) && getId().equals(((GTAuthor) o).getId()); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/GTAuthorOafSerialiser.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/GTAuthorOafSerialiser.java new file mode 100644 index 000000000..cb541b953 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/GTAuthorOafSerialiser.java @@ -0,0 +1,104 @@ +package eu.dnetlib.pace.model.gt; + +import java.lang.reflect.Type; + +import com.google.common.base.Function; +import com.google.common.base.Joiner; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.gson.JsonDeserializationContext; +import com.google.gson.JsonDeserializer; +import com.google.gson.JsonElement; +import com.google.gson.JsonObject; +import com.google.gson.JsonParseException; + +public class GTAuthorOafSerialiser implements JsonDeserializer { + + private static final String 
VALUE = "value"; + private static final String SECONDNAMES = "secondnames"; + private static final String FIRSTNAME = "firstname"; + private static final String FULLNAME = "fullname"; + private static final String ID = "id"; + private static final String MERGEDPERSON = "mergedperson"; + private static final String METADATA = "metadata"; + private static final String ANCHOR_ID = "anchorId"; + private static final String COAUTHOR = "coauthor"; + + @Override + public GTAuthor deserialize(final JsonElement json, final Type typeOfT, final JsonDeserializationContext context) throws JsonParseException { + final GTAuthor gta = new GTAuthor(); + + gta.setAuthor(getAuthor(json)); + gta.setMerged(getMerged(json)); + + gta.setCoAuthors(getCoAuthors(json)); + + return gta; + } + + private CoAuthors getCoAuthors(final JsonElement json) { + final JsonObject obj = json.getAsJsonObject(); + if (!obj.has(COAUTHOR)) return null; + return new CoAuthors(Lists.newArrayList(Iterables.transform(obj.get(COAUTHOR).getAsJsonArray(), + new Function() { + + @Override + public CoAuthor apply(final JsonElement in) { + final CoAuthor a = new CoAuthor(getAuthor(in)); + final JsonObject jsonObject = in.getAsJsonObject(); + if (jsonObject.has(ANCHOR_ID)) { + a.setAnchorId(jsonObject.get(ANCHOR_ID).getAsString()); + } + return a; + } + }))); + } + + private Author getAuthor(final JsonElement json) { + + final Author a = new Author(); + a.setCoauthors(null); + a.setMatches(null); + + final JsonObject jso = json.getAsJsonObject(); + + a.setId(jso.has(ID) ? 
jso.get(ID).getAsString() : null); + + final JsonObject jsonObject = json.getAsJsonObject(); + if (jsonObject.has(METADATA)) { + final JsonObject m = jsonObject.get(METADATA).getAsJsonObject(); + a.setFullname(getValue(m, FULLNAME)); + a.setFirstname(getValue(m, FIRSTNAME)); + a.setSecondnames(getValues(m, SECONDNAMES)); + } + return a; + } + + private Authors getMerged(final JsonElement json) { + final JsonObject obj = json.getAsJsonObject(); + if (!obj.has(MERGEDPERSON)) return null; + return new Authors(Lists.newArrayList(Iterables.transform(obj.get(MERGEDPERSON).getAsJsonArray(), + new Function() { + + @Override + public Author apply(final JsonElement in) { + return getAuthor(in); + } + }))); + } + + private String getValues(final JsonObject m, final String fieldName) { + return m.has(fieldName) ? Joiner.on(" ").join(Iterables.transform(m.get(fieldName).getAsJsonArray(), new Function() { + + @Override + public String apply(final JsonElement in) { + return in.getAsJsonObject().get(VALUE).getAsString(); + } + })) : null; + } + + private String getValue(final JsonObject m, final String fieldName) { + return m.has(fieldName) ? 
m.get(fieldName).getAsJsonObject().get(VALUE).getAsString() : null; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Group.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Group.java new file mode 100644 index 000000000..86d93deb9 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Group.java @@ -0,0 +1,44 @@ +package eu.dnetlib.pace.model.gt; + +import java.util.List; + +import com.google.gson.Gson; + +public class Group { + + private String id; + private int size; + private List results; + + public Group() {} + + public String getId() { + return id; + } + + public void setId(final String id) { + this.id = id; + } + + public int getSize() { + return size; + } + + public void setSize(final int size) { + this.size = size; + } + + public List getResults() { + return results; + } + + public void setResults(final List results) { + this.results = results; + } + + @Override + public String toString() { + return new Gson().toJson(this); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/InvertedAuthor.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/InvertedAuthor.java new file mode 100644 index 000000000..b9fa7f966 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/InvertedAuthor.java @@ -0,0 +1,41 @@ +package eu.dnetlib.pace.model.gt; + +import java.util.Collection; + +import com.google.gson.Gson; + +public class InvertedAuthor { + + private Author author; + private Collection ids; + + public InvertedAuthor() {} + + public InvertedAuthor(final Author author, final Collection ids) { + super(); + this.author = author; + this.ids = ids; + } + + public Author getAuthor() { + return author; + } + + public void setAuthor(final Author author) { + this.author = author; + } + + public Collection getIds() { + return ids; + } + + public void setIds(final Collection ids) { + this.ids = ids; + } + + @Override + public String toString() { + return new 
Gson().toJson(this); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Match.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Match.java new file mode 100644 index 000000000..e919069c7 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Match.java @@ -0,0 +1,31 @@ +package eu.dnetlib.pace.model.gt; + +public class Match extends Author { + + private double score; + + public Match() { + super(); + } + + public static Match from(final Author a) { + final Match m = new Match(); + if (a.isWellFormed()) { + m.setFirstname(a.getFirstname()); + m.setSecondnames(a.getSecondnames()); + } + m.setFullname(a.getFullname()); + m.setId(a.getId()); + + return m; + } + + public double getScore() { + return score; + } + + public void setScore(final double score) { + this.score = score; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Result.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Result.java new file mode 100644 index 000000000..d35c3bb2c --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Result.java @@ -0,0 +1,72 @@ +package eu.dnetlib.pace.model.gt; + +import java.util.List; + +import com.google.common.collect.ComparisonChain; +import com.google.common.collect.Ordering; +import com.google.gson.Gson; + +public class Result implements Comparable { + + private String id; + private String originalId; + private String title; + private List authors; + + private double meanDistance; + + public Result() {} + + public String getId() { + return id; + } + + public void setId(final String id) { + this.id = id; + } + + public String getOriginalId() { + return originalId; + } + + public void setOriginalId(final String originalId) { + this.originalId = originalId; + } + + public String getTitle() { + return title; + } + + public void setTitle(final String title) { + this.title = title; + } + + public List getAuthors() { + return authors; + } + + public void 
setAuthors(final List authors) { + this.authors = authors; + } + + @Override + public String toString() { + return new Gson().toJson(this); + } + + @Override + public int compareTo(final Result o) { + return ComparisonChain.start() + .compare(this.getAuthors().size(), o.getAuthors().size(), Ordering.natural().nullsLast()) + .result(); + } + + public double getMeanDistance() { + return meanDistance; + } + + public void setMeanDistance(final double meanDistance) { + this.meanDistance = meanDistance; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/ScoredResult.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/ScoredResult.java new file mode 100644 index 000000000..5d4526c4b --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/ScoredResult.java @@ -0,0 +1,26 @@ +package eu.dnetlib.pace.model.gt; + +import com.google.gson.Gson; + +public class ScoredResult extends Result { + + private double meanDistance; + + public ScoredResult() { + super(); + } + + public double getMeanDistance() { + return meanDistance; + } + + public void setMeanDistance(final double meanDistance) { + this.meanDistance = meanDistance; + } + + @Override + public String toString() { + return new Gson().toJson(this); + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Subjects.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Subjects.java new file mode 100644 index 000000000..fc2221aef --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/Subjects.java @@ -0,0 +1,10 @@ +package eu.dnetlib.pace.model.gt; + +import java.util.HashMap; + +/** + * Created by claudio on 07/03/16. 
+ */ +public class Subjects extends HashMap { + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/SubjectsMap.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/SubjectsMap.java new file mode 100644 index 000000000..04ba4c6c1 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/model/gt/SubjectsMap.java @@ -0,0 +1,35 @@ +package eu.dnetlib.pace.model.gt; + +import java.util.HashMap; +import java.util.Map.Entry; + +/** + * Created by claudio on 07/03/16. + */ +public class SubjectsMap extends HashMap { + + public SubjectsMap mergeFrom(SubjectsMap sm) { + + for(Entry e : sm.entrySet()) { + if (!this.containsKey(e.getKey())) { + Subjects sub = new Subjects(); + + sub.putAll(e.getValue()); + + this.put(e.getKey(), sub); + } else { + for (Entry es : e.getValue().entrySet()) { + final Subjects subjects = this.get(e.getKey()); + if (subjects.containsKey(es.getKey())) { + subjects.put(es.getKey(), es.getValue() + subjects.get(es.getKey())); + } else { + subjects.put(es.getKey(), new Integer(1)); + } + } + } + } + + return this; + } + +} diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/Capitalise.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/Capitalise.java new file mode 100644 index 000000000..2d2510112 --- /dev/null +++ b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/Capitalise.java @@ -0,0 +1,15 @@ +package eu.dnetlib.pace.util; + +import org.apache.commons.lang.WordUtils; + +import com.google.common.base.Function; + +public class Capitalise implements Function { + + private final char[] DELIM = { ' ', '-' }; + + @Override + public String apply(final String s) { + return WordUtils.capitalize(s.toLowerCase(), DELIM); + } +}; diff --git a/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java new file mode 100644 index 000000000..fdbd6e99d --- /dev/null +++ 
b/dnet-pace-core/src/main/java/eu/dnetlib/pace/util/DotAbbreviations.java @@ -0,0 +1,10 @@ +package eu.dnetlib.pace.util; + +import com.google.common.base.Function; + +public class DotAbbreviations implements Function { + @Override + public String apply(String s) { + return s.length() == 1 ? s + "." : s; + } +}; \ No newline at end of file diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/dedupConfig.st b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/dedupConfig.st new file mode 100644 index 000000000..fdf569377 --- /dev/null +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/dedupConfig.st @@ -0,0 +1,23 @@ +{ + "wf" : { + "configurationId" : "$configurationId$", + "threshold" : "$threshold$", + "run" : "$run$", + "entityType" : "$entityType$", + "orderField" : "$orderField$", + "queueMaxSize" : "$queueMaxSize$", + "groupMaxSize" : "$groupMaxSize$", + "slidingWindowSize" : "$slidingWindowSize$", + "rootBuilder" : [ $rootBuilder:{"$it$"};separator=", "$ ], + "includeChildren" : "$includeChildren$" + }, + "pace" : { + "clustering" : [ + ], + "conditions" : [ + ], + "model" : [ + ], + "blacklists" : { } + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/name_particles.txt b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/name_particles.txt new file mode 100644 index 000000000..dae37c9dc --- /dev/null +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/name_particles.txt @@ -0,0 +1,7 @@ +van +der +de +dell +sig +mr +mrs diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/ngram_blacklist.txt b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/ngram_blacklist.txt new file mode 100644 index 000000000..e69de29bb diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_en.txt b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_en.txt new file mode 100644 index 000000000..9a76d823c --- /dev/null 
+++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/stopwords_en.txt @@ -0,0 +1,139 @@ +a +about +above +after +again +against +all +an +and +any +are +aren +as +at +be +because +been +before +being +below +between +both +but +by +can +cannot +could +couldn +did +didn +do +does +doesn +doing +don +down +during +each +few +for +from +further +had +hadn +has +hasn +have +havent +having +he +hed +her +here +hers +herself +him +himself +his +how +if +in +into +is +isn +it +its +itself +let +more +most +mustn +myself +no +nor +not +of +off +on +once +only +other +ought +our +ours +ourselves +out +over +own +s +same +shan +she +should +shouldn +so +some +such +than +that +the +their +theirs +them +themselves +then +there +these +they +this +those +through +to +too +under +until +up +very +was +wasn +we +were +weren +what +when +where +which +while +who +whom +why +with +won +would +wouldn +you +your +yours +yourself +yourselves \ No newline at end of file diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/title_blacklist.txt b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/title_blacklist.txt new file mode 100644 index 000000000..374bec1f9 --- /dev/null +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/title_blacklist.txt @@ -0,0 +1,14 @@ +^(Corpus Oral Dialectal \(COD\)\.).*$ +^(Kiri Karl Morgensternile).*$ +^(\[Eksliibris Aleksandr).*\]$ +^(Kiri A\. 
de Vignolles).*$ +^(2 kirja Karl Morgensternile).*$ +^(Pirita kloostri idaosa arheoloogilised).*$ +^(Kiri tundmatule).*$ +^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$ +^(Eksliibris Nikolai Birukovile).*$ +^(Eksliibris Nikolai Issakovile).*$ +^(\[Eksliibris Aleksandr).*$ +^(WHP Cruise Summary Information of section).*$ +^(Measurement of the top quark\-pair production cross section with ATLAS in pp collisions at).*$ +^(Measurement of the spin\-dependent structure function).* \ No newline at end of file diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java new file mode 100644 index 000000000..9d3919194 --- /dev/null +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/AbstractPaceTest.java @@ -0,0 +1,32 @@ +package eu.dnetlib.pace; + +import java.io.IOException; +import java.io.StringWriter; + +import org.apache.commons.io.IOUtils; + +import eu.dnetlib.pace.config.Type; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.FieldValueImpl; + +public abstract class AbstractPaceTest { + + protected String readFromClasspath(final String filename) { + final StringWriter sw = new StringWriter(); + try { + IOUtils.copy(getClass().getResourceAsStream(filename), sw); + return sw.toString(); + } catch (final IOException e) { + throw new RuntimeException("cannot load resource from classpath: " + filename); + } + } + + protected Field title(final String s) { + return new FieldValueImpl(Type.String, "title", s); + } + + protected Field person(final String s) { + return new FieldValueImpl(Type.JSON, "person", s); + } + +} diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java new file mode 100644 index 000000000..c8bbda17c --- /dev/null +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -0,0 
+1,108 @@ +package eu.dnetlib.pace.clustering; + +import java.util.Map; + +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import eu.dnetlib.pace.AbstractPaceTest; +import org.junit.Before; +import org.junit.Test; + +public class ClusteringFunctionTest extends AbstractPaceTest { + + private Map params; + + @Before + public void setUp() throws Exception { + params = Maps.newHashMap(); + } + + @Test + public void testNgram() { + params.put("ngramLen", 3); + params.put("max", 8); + params.put("maxPerToken", 2); + params.put("minNgramLen", 1); + + final ClusteringFunction ngram = new Ngrams(params); + + final String s = "Search for the Standard Model Higgs Boson"; + System.out.println(s); + System.out.println(ngram.apply(Lists.newArrayList(title(s)))); + } + + @Test + public void testNgramPairs() { + params.put("ngramLen", 3); + params.put("max", 3); + + final ClusteringFunction np = new NgramPairs(params); + + final String s = "Search for the Standard Model Higgs Boson"; + System.out.println(s); + System.out.println(np.apply(Lists.newArrayList(title(s)))); + } + + @Test + public void testSortedNgramPairs() { + params.put("ngramLen", 3); + params.put("max", 1); + + final ClusteringFunction np = new SortedNgramPairs(params); + + final String s1 = "University of Pisa"; + System.out.println(s1); + System.out.println(np.apply(Lists.newArrayList(title(s1)))); + + final String s2 = "Pisa University"; + System.out.println(s2); + System.out.println(np.apply(Lists.newArrayList(title(s2)))); + } + + @Test + public void testAcronym() { + params.put("max", 4); + params.put("minLen", 1); + params.put("maxLen", 3); + + final ClusteringFunction acro = new Acronyms(params); + + final String s = "Search for the Standard Model Higgs Boson"; + System.out.println(s); + System.out.println(acro.apply(Lists.newArrayList(title(s)))); + } + + @Test + public void testSuffixPrefix() { + params.put("len", 3); + params.put("max", 4); + + final ClusteringFunction sp 
= new SuffixPrefix(params); + + final String s = "Search for the Standard Model Higgs Boson"; + System.out.println(s); + System.out.println(sp.apply(Lists.newArrayList(title(s)))); + } + + @Test + public void testFieldValue() { + + params.put("randomLength", 5); + + final ClusteringFunction sp = new SpaceTrimmingFieldValue(params); + + final String s = "Search for the Standard Model Higgs Boson"; + System.out.println(s); + System.out.println(sp.apply(Lists.newArrayList(title(s)))); + } + + @Test + public void testPersonClustering2() { + final ClusteringFunction cf = new PersonClustering(params); + + final String s = readFromClasspath("gt.author.json"); + System.out.println(s); + System.out.println(cf.apply(Lists.newArrayList(person(s)))); + } + +} diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java new file mode 100644 index 000000000..26ff387c8 --- /dev/null +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/config/ConfigTest.java @@ -0,0 +1,24 @@ +package eu.dnetlib.pace.config; + +import java.io.IOException; + +import eu.dnetlib.pace.AbstractPaceTest; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.model.MapDocumentSerializer; +import org.apache.commons.io.IOUtils; +import org.junit.Test; + +import static org.junit.Assert.assertNotNull; + +public class ConfigTest extends AbstractPaceTest { + + @Test + public void test() throws IOException { + final DedupConfig cfg = DedupConfig.load(readFromClasspath("result.pace.conf.json")); + + assertNotNull(cfg); + + System.out.println(cfg.toString()); + } + +} diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java new file mode 100644 index 000000000..f28251c57 --- /dev/null +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java @@ -0,0 +1,38 @@ +package 
eu.dnetlib.pace.distance; + +import org.junit.Before; +import org.junit.Test; + +import eu.dnetlib.pace.common.AbstractPaceFunctions; + +public class DistanceAlgoTest extends AbstractPaceFunctions { + + private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾."; + + @Before + public void setup() { + System.out.println("****************************************************************"); + System.out.println("Test String : " + TEST_STRING); + } + + @Test + public void testGetNumbers() { + System.out.println("Numbers : " + getNumbers(TEST_STRING)); + } + + @Test + public void testRemoveSymbols() { + System.out.println("Without symbols: " + removeSymbols(TEST_STRING)); + } + + @Test + public void testFixAliases() { + System.out.println("Fixed aliases : " + fixAliases(TEST_STRING)); + } + + @Test + public void testCleanup() { + System.out.println("cleaned up : " + cleanup(TEST_STRING)); + } + +} diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsNGramsTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsNGramsTest.java new file mode 100644 index 000000000..b78866c6c --- /dev/null +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsNGramsTest.java @@ -0,0 +1,126 @@ +package eu.dnetlib.pace.model; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.util.Set; + +import org.junit.Test; + +public class PersonComparatorUtilsNGramsTest { + + @Test + public void testNormaizePerson_1() { + verifyGetNgramsForPerson("Artini Michele", 2, "a_michele", "m_artini"); + } + + @Test + public void testNormaizePerson_2() { + verifyGetNgramsForPerson("Michele Artini", 2, "a_michele", "m_artini"); + } + + @Test + public void testNormaizePerson_3() { + verifyGetNgramsForPerson("Michele ARTINI", 1, "m_artini"); + } + + @Test + public void testNormaizePerson_4() { + verifyGetNgramsForPerson("ARTINI 
Michele", 1, "m_artini"); + } + + @Test + public void testNormaizePerson_5() { + verifyGetNgramsForPerson("Michele G. Artini", 2, "m_artini", "g_artini"); + } + + @Test + public void testNormaizePerson_6() { + verifyGetNgramsForPerson(" Artini, Michele ", 1, "m_artini"); + } + + @Test + public void testNormaizePerson_7() { + verifyGetNgramsForPerson("Artini, Michele (sig.)", 1, "m_artini"); + } + + @Test + public void testNormaizePerson_8() { + verifyGetNgramsForPerson("Artini Michele [sig.] ", 2, "a_michele", "m_artini"); + } + + @Test + public void testNormaizePerson_9() { + verifyGetNgramsForPerson("Artini, M", 1, "m_artini"); + } + + @Test + public void testNormaizePerson_10() { + verifyGetNgramsForPerson("Artini, M.", 1, "m_artini"); + } + + @Test + public void testNormaizePerson_11() { + verifyGetNgramsForPerson("Artini, M. (sig.)", 1, "m_artini"); + } + + @Test + public void testNormaizePerson_12() { + verifyGetNgramsForPerson("Artini, M[sig.] ", 1, "m_artini"); + } + + @Test + public void testNormaizePerson_13() { + verifyGetNgramsForPerson("Artini-SIG, Michele ", 1, "m_artini-sig"); + } + + @Test + public void testNormaizePerson_14() { + verifyGetNgramsForPerson("Artini - SIG, Michele ", 1, "m_artini-sig"); + } + + @Test + public void testNormaizePerson_15() { + verifyGetNgramsForPerson("Artini {sig.}, M", 1, "m_artini"); + } + + @Test + public void testNormaizePerson_16() { + verifyGetNgramsForPerson("Artini, M., sig.", 1, "m_artini"); + } + + @Test + public void testNormaizePerson_17() { + verifyGetNgramsForPerson("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA, BBBBBBBBBBBBBBBBBBBBBBBBBBBBB CCCCCCCCCCCCCCCCCCCC", 0); + } + + @Test + public void testNormaizePerson_18() { + verifyGetNgramsForPerson("Dell'amico, Andrea", 1, "a_amico"); + } + + @Test + public void testNormaizePerson_19() { + verifyGetNgramsForPerson("Smith, Paul van der", 1, "p_smith"); + } + + @Test + public void testNormaizePerson_20() { + verifyGetNgramsForPerson("AAAAAAA, BBBB, CCCC, DDDD, EEEE", 1, 
"b_aaaaaaa"); + } + + @Test + public void testNormaizePerson_21() { + verifyGetNgramsForPerson("Kompetenzzentrum Informelle Bildung (KIB),", 6); + } + + private void verifyGetNgramsForPerson(String name, int expectedSize, String... expectedTokens) { + Set list = PersonComparatorUtils.getNgramsForPerson(name); + System.out.println(list); + assertEquals(expectedSize, list.size()); + for (String s : expectedTokens) { + assertTrue(list.contains(s)); + } + } + +} diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsSimilarityTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsSimilarityTest.java new file mode 100644 index 000000000..20da8db87 --- /dev/null +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonComparatorUtilsSimilarityTest.java @@ -0,0 +1,89 @@ +package eu.dnetlib.pace.model; + +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import org.junit.Test; + +public class PersonComparatorUtilsSimilarityTest { + + @Test + public void testSimilarity_0() { + assertTrue(PersonComparatorUtils.areSimilar("Artini Michele", "Michele Artini")); + } + + @Test + public void testSimilarity_1() { + assertTrue(PersonComparatorUtils.areSimilar("ARTINI Michele", "Artini, Michele")); + } + + @Test + public void testSimilarity_2() { + assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Artini Michele")); + } + + @Test + public void testSimilarity_3() { + assertTrue(PersonComparatorUtils.areSimilar("Artini, M.G.", "Artini, Michele")); + } + + @Test + public void testSimilarity_4() { + assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, M.G.")); + } + + @Test + public void testSimilarity_5() { + assertTrue(PersonComparatorUtils.areSimilar("Artini, M. 
(sig.)", "Artini, Michele")); + } + + @Test + public void testSimilarity_6() { + assertFalse(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, G.")); + } + + @Test + public void testSimilarity_7() { + assertFalse(PersonComparatorUtils.areSimilar("Artini, M.G.", "Artini, M.A.")); + } + + @Test + public void testSimilarity_8() { + assertFalse(PersonComparatorUtils.areSimilar("Artini, M.", "Artini, Giuseppe")); + } + + @Test + public void testSimilarity_9() { + assertFalse(PersonComparatorUtils.areSimilar("Manghi, Paolo", "Artini, Michele")); + } + + @Test + public void testSimilarity_10() { + assertTrue(PersonComparatorUtils.areSimilar("Artini, Michele", "Artini, Michele Giovanni")); + } + + @Test + public void testSimilarity_11() { + assertFalse(PersonComparatorUtils.areSimilar("Artini, M.A.G.", "Artini, M.B.G.")); + } + + @Test + public void testSimilarity_12() { + assertFalse(PersonComparatorUtils.areSimilar("Artini Manghi, M.", "Artini, Michele")); + } + + @Test + public void testSimilarity_13() { + assertTrue(PersonComparatorUtils.areSimilar("Artini Manghi, M.", "Artini Manghi Michele")); + } + + @Test + public void testSimilarity_14() { + assertFalse(PersonComparatorUtils.areSimilar("Artini, Michele", "Michele, Artini")); + } + + @Test + public void testSimilarity_15() { + assertTrue(PersonComparatorUtils.areSimilar("Artini, M.", "Michele ARTINI")); + } +} \ No newline at end of file diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonTest.java new file mode 100644 index 000000000..a457fd8de --- /dev/null +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/model/PersonTest.java @@ -0,0 +1,111 @@ +package eu.dnetlib.pace.model; + +import static org.junit.Assert.assertEquals; + +import java.text.Normalizer; +import java.util.Queue; + +import org.junit.Test; + +import com.google.common.collect.Lists; + +public class PersonTest { + + @Test + public void test_1() { + 
check("Atzori, Claudio", "Atzori, Claudio"); + } + + @Test + public void test_2() { + check("Atzori, Claudio A.", "Atzori, Claudio A."); + } + + @Test + public void test_3() { + check("Claudio ATZORI", "Atzori, Claudio"); + } + + @Test + public void test_4() { + check("ATZORI, Claudio", "Atzori, Claudio"); + } + + @Test + public void test_5() { + check("Claudio Atzori", "Claudio Atzori"); + } + + @Test + public void test_6() { + check(" Manghi , Paolo", "Manghi, Paolo"); + } + + @Test + public void test_7() { + check("ATZORI, CLAUDIO", "Atzori, Claudio"); + } + + @Test + public void test_8() { + check("ATZORI, CLAUDIO A", "Atzori, Claudio A."); + } + + @Test + public void test_9() { + check("Bølviken, B.", "Bølviken, B."); + } + + @Test + public void test_10() { + check("Bñlviken, B.", "B" + Normalizer.normalize("ñ", Normalizer.Form.NFD) + "lviken, B."); + } + + @Test + public void test_11() { + check("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰ ø", "Aaeeiioooouuuu, Aaeeiioooouuuu Ø.", true); + } + + @Test + public void test_12() { + check("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰz ø", Normalizer.normalize("aáeéiíoóöőuúüű AÁEÉIÍOÓÖŐUÚÜŰz ø", Normalizer.Form.NFD), false); + } + + @Test + public void test_13() { + check("Tkačíková, Daniela", Normalizer.normalize("Tkačíková, Daniela", Normalizer.Form.NFD), false); + } + + @Test + public void test_hashes() { + checkHash(" Claudio ATZORI ", "ATZORI Claudio", "Atzori , Claudio", "ATZORI, Claudio"); + } + + private void checkHash(String... 
ss) { + Queue q = Lists.newLinkedList(Lists.newArrayList(ss)); + String h1 = new Person(q.remove(), false).hash(); + while (!q.isEmpty()) { + assertEquals(h1, new Person(q.remove(), false).hash()); + } + } + + private void check(String s, String expectedFullName) { + check(s, expectedFullName, false); + } + + private void check(String s, String expectedFullName, boolean aggressive) { + Person p = new Person(s, aggressive); + + System.out.println("original: " + p.getOriginal()); + System.out.println("accurate: " + p.isAccurate()); + System.out.println("normalised: '" + p.getNormalisedFullname() + "'"); + if (p.isAccurate()) { + System.out.println("name: " + p.getNormalisedFirstName()); + System.out.println("surname: " + p.getNormalisedSurname()); + } + System.out.println("hash: " + p.hash()); + System.out.println(""); + assertEquals(expectedFullName, p.getNormalisedFullname()); + } + +} diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/clustering/gt.author.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/clustering/gt.author.json new file mode 100644 index 000000000..d7fbf2166 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/clustering/gt.author.json @@ -0,0 +1 @@ +{"metadata": {"firstname": {"value": "Margaret R."},"secondnames": [{"value": "Macdonald"}],"fullname": {"value": "Macdonald, Margaret R."}},"coauthor": [{"id": "30|od________88::1d22c2a22d7a1c7082006154ae6dd221","anchorId": "30|dedup_wf_001::7b1cfb3c4ec57d71cf331ba669a8e12c","metadata": {"firstname": {"value": "Maria Teresa"},"secondnames": [{"value": "Catanese"}],"fullname": {"value": "Catanese, Maria Teresa"}}},{"id": "30|od________88::2299c043fcaa751e266c82ec24b5a6cf","anchorId": "30|dedup_wf_001::ce73dc26c95e27d22f88e9ed9948b322","metadata": {"firstname": {"value": "Thomas S."},"secondnames": [{"value": "Oh"}],"fullname": {"value": "Oh, Thomas S."}}},{"id": "30|od_______908::52d670e6298c055c6c9c496aad4f2913","anchorId": 
"30|dedup_wf_001::8e1fafd9778a4cb5569830c299e5b52e","metadata": {"firstname": {"value": "Salman R."},"secondnames": [{"value": "Khetani"}],"fullname": {"value": "Khetani, Salman R."}}},{"id": "30|od________88::1458ae8d3663574e53dcd849ff8aa27d","anchorId": "30|dedup_wf_001::dd9f1dce92f402424de0d7d8afd7ca2d","metadata": {"firstname": {"value": "Sangeeta N."},"secondnames": [{"value": "Bhatia"}],"fullname": {"value": "Bhatia, Sangeeta N."}}},{"id": "30|od________88::837b992599e35b1a9baed833bf9a216e","anchorId": "30|dedup_wf_001::acb87ae171fd37f0ad65bcb728b11064","metadata": {"firstname": {"value": "Andrew J."},"secondnames": [{"value": "Syder"}],"fullname": {"value": "Syder, Andrew J."}}},{"id": "30|od_______908::2299c043fcaa751e266c82ec24b5a6cf","anchorId": "30|dedup_wf_001::ce73dc26c95e27d22f88e9ed9948b322","metadata": {"firstname": {"value": "Thomas S."},"secondnames": [{"value": "Oh"}],"fullname": {"value": "Oh, Thomas S."}}},{"id": "30|od_______908::97e1b5f96f76500dfd9e10ee0de5d380","anchorId": "30|dedup_wf_001::da35eb52feb1b1a789861976342b2570","metadata": {"firstname": {"value": "John W."},"secondnames": [{"value": "Schoggins"}],"fullname": {"value": "Schoggins, John W."}}},{"id": "30|od________88::97e1b5f96f76500dfd9e10ee0de5d380","anchorId": "30|dedup_wf_001::da35eb52feb1b1a789861976342b2570","metadata": {"firstname": {"value": "John W."},"secondnames": [{"value": "Schoggins"}],"fullname": {"value": "Schoggins, John W."}}},{"id": "30|od_______908::5bd4cd7e4065ffd73f39817e2a1bb1ae","anchorId": "30|dedup_wf_001::8ea4c1052c6a7aa1bb2b1097cb3893d2","metadata": {"firstname": {"value": "Lok Man J."},"secondnames": [{"value": "Law"}],"fullname": {"value": "Law, Lok Man J."}}},{"id": "30|od________88::845fd19e1e7201fcd1c492775f04a56b","anchorId": "30|dedup_wf_001::4e971919118e71ea2b2ac840ca319956","metadata": {"firstname": {"value": "Alexander"},"secondnames": [{"value": "Ploss"}],"fullname": {"value": "Ploss, Alexander"}}},{"id": 
"30|od_______908::7b6a37259ff32dba0e7ea884b8446228","anchorId": "30|dedup_wf_001::a600d9103b7947b1c52f823f8e4833cc","metadata": {"firstname": {"value": "Christopher T."},"secondnames": [{"value": "Jones"}],"fullname": {"value": "Jones, Christopher T."}}},{"id": "30|od________88::5bd4cd7e4065ffd73f39817e2a1bb1ae","anchorId": "30|dedup_wf_001::8ea4c1052c6a7aa1bb2b1097cb3893d2","metadata": {"firstname": {"value": "Lok Man J."},"secondnames": [{"value": "Law"}],"fullname": {"value": "Law, Lok Man J."}}},{"id": "30|od_______908::1d22c2a22d7a1c7082006154ae6dd221","anchorId": "30|dedup_wf_001::7b1cfb3c4ec57d71cf331ba669a8e12c","metadata": {"firstname": {"value": "Maria Teresa"},"secondnames": [{"value": "Catanese"}],"fullname": {"value": "Catanese, Maria Teresa"}}},{"id": "30|od________88::52d670e6298c055c6c9c496aad4f2913","anchorId": "30|dedup_wf_001::8e1fafd9778a4cb5569830c299e5b52e","metadata": {"firstname": {"value": "Salman R."},"secondnames": [{"value": "Khetani"}],"fullname": {"value": "Khetani, Salman R."}}},{"id": "30|od_______908::46acd9f206c2559f13b9119f8c5aef4c","anchorId": "30|dedup_wf_001::06a55cf2c97156d48ec49bcaf4bddcaf","metadata": {"firstname": {"value": "Stephen P."},"secondnames": [{"value": "Goff"}],"fullname": {"value": "Goff, Stephen P."}}},{"id": "30|od________88::7b6a37259ff32dba0e7ea884b8446228","anchorId": "30|dedup_wf_001::a600d9103b7947b1c52f823f8e4833cc","metadata": {"firstname": {"value": "Christopher T."},"secondnames": [{"value": "Jones"}],"fullname": {"value": "Jones, Christopher T."}}},{"id": "30|od_______908::1458ae8d3663574e53dcd849ff8aa27d","anchorId": "30|dedup_wf_001::dd9f1dce92f402424de0d7d8afd7ca2d","metadata": {"firstname": {"value": "Sangeeta N."},"secondnames": [{"value": "Bhatia"}],"fullname": {"value": "Bhatia, Sangeeta N."}}},{"id": "30|od_______908::845fd19e1e7201fcd1c492775f04a56b","anchorId": "30|dedup_wf_001::4e971919118e71ea2b2ac840ca319956","metadata": {"firstname": {"value": "Alexander"},"secondnames": [{"value": 
"Ploss"}],"fullname": {"value": "Ploss, Alexander"}}},{"id": "30|od_______908::837b992599e35b1a9baed833bf9a216e","anchorId": "30|dedup_wf_001::acb87ae171fd37f0ad65bcb728b11064","metadata": {"firstname": {"value": "Andrew J."},"secondnames": [{"value": "Syder"}],"fullname": {"value": "Syder, Andrew J."}}}],"mergedperson": [{"id": "30|od_______908::715fc4c41052a5b8ce881b23b826f648","metadata": {"firstname": {"value": "Margaret R."},"secondnames": [{"value": "Macdonald"}],"fullname": {"value": "Macdonald, Margaret R."}}},{"id": "30|od________88::715fc4c41052a5b8ce881b23b826f648","metadata": {"firstname": {"value": "Margaret R."},"secondnames": [{"value": "Macdonald"}],"fullname": {"value": "Macdonald, Margaret R."}}}],"anchor": true} \ No newline at end of file diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json new file mode 100644 index 000000000..7d8fe244b --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/result.pace.conf.json @@ -0,0 +1,53 @@ +{ + "wf" : { + "threshold" : "0.99", + "run" : "001", + "entityType" : "result", + "orderField" : "title", + "queueMaxSize" : "2000", + "groupMaxSize" : "10", + "slidingWindowSize" : "200", + "rootBuilder" : [ "result" ], + "includeChildren" : "true" + }, + "pace" : { + "clustering" : [ + { "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} }, + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } } + ], + "strictConditions" : [ + { "name" : "exactMatch", "fields" : [ "pid" ] } + ], + "conditions" : [ + { "name" : "yearMatch", "fields" : [ "dateofacceptance" ] }, + { "name" : "titleVersionMatch", "fields" : [ "title" ] }, + { "name" : "sizeMatch", "fields" : [ "authors" ] } + ], + "model" : [ + 
{ "name" : "pid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value", "overrideMatch" : "true" }, + { "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }, + { "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } , + { "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" } + ], + "blacklists" : { + "title" : [ + "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", + "^(Kiri Karl Morgensternile).*$", + "^(\\[Eksliibris Aleksandr).*\\]$", + "^(\\[Eksliibris Aleksandr).*$", + "^(Eksliibris Aleksandr).*$", + "^(Kiri A\\. de Vignolles).*$", + "^(2 kirja Karl Morgensternile).*$", + "^(Pirita kloostri idaosa arheoloogilised).*$", + "^(Kiri tundmatule).*$", + "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", + "^(Eksliibris Nikolai Birukovile).*$", + "^(Eksliibris Nikolai Issakovile).*$", + "^(WHP Cruise Summary Information of section).*$", + "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", + "^(Measurement of the spin\\-dependent structure function).*" + ] } + } + +} diff --git a/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/title_blacklist.txt b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/title_blacklist.txt new file mode 100644 index 000000000..f9774db08 --- /dev/null +++ b/dnet-pace-core/src/test/resources/eu/dnetlib/pace/config/title_blacklist.txt @@ -0,0 +1,15 @@ +^(Corpus Oral Dialectal \(COD\)\.).*$ +^(Kiri Karl Morgensternile).*$ +^(\[Eksliibris Aleksandr).*\]$ +^(Kiri A\. 
de Vignolles).*$ +^(2 kirja Karl Morgensternile).*$ +^(Pirita kloostri idaosa arheoloogilised).*$ +^(Kiri tundmatule).*$ +^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$ +^(Eksliibris Nikolai Birukovile).*$ +^(Eksliibris Nikolai Issakovile).*$ +^(\[Eksliibris Aleksandr).*$ +^(WHP Cruise Summary Information of section).*$ +^(Measurement of the top quark\-pair production cross section with ATLAS in pp collisions at).*$ +^(Measurement of the spin\-dependent structure function).* +^(lorem ipsum).* \ No newline at end of file