forked from D-Net/dnet-hadoop
Merge pull request 'Refactor Dedup using Spark Dataframe API, initial support for scala 2.12 and Spark 3.4' (#324) from dedup-with-dataframe-2 into beta
Reviewed-on: D-Net/dnet-hadoop#324
This commit is contained in:
commit
8c63e4a864
|
@ -52,6 +52,8 @@
|
||||||
</execution>
|
</execution>
|
||||||
</executions>
|
</executions>
|
||||||
<configuration>
|
<configuration>
|
||||||
|
<failOnMultipleScalaVersions>true</failOnMultipleScalaVersions>
|
||||||
|
<scalaCompatVersion>${scala.binary.version}</scalaCompatVersion>
|
||||||
<scalaVersion>${scala.version}</scalaVersion>
|
<scalaVersion>${scala.version}</scalaVersion>
|
||||||
</configuration>
|
</configuration>
|
||||||
</plugin>
|
</plugin>
|
||||||
|
@ -81,11 +83,11 @@
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
|
@ -159,7 +161,7 @@
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
<artifactId>dhp-schemas</artifactId>
|
<artifactId>${dhp-schemas.artifact}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
|
|
|
@ -20,7 +20,7 @@
|
||||||
<plugin>
|
<plugin>
|
||||||
<groupId>net.alchim31.maven</groupId>
|
<groupId>net.alchim31.maven</groupId>
|
||||||
<artifactId>scala-maven-plugin</artifactId>
|
<artifactId>scala-maven-plugin</artifactId>
|
||||||
<version>4.0.1</version>
|
<version>${net.alchim31.maven.version}</version>
|
||||||
<executions>
|
<executions>
|
||||||
<execution>
|
<execution>
|
||||||
<id>scala-compile-first</id>
|
<id>scala-compile-first</id>
|
||||||
|
@ -39,8 +39,9 @@
|
||||||
</execution>
|
</execution>
|
||||||
</executions>
|
</executions>
|
||||||
<configuration>
|
<configuration>
|
||||||
|
<failOnMultipleScalaVersions>true</failOnMultipleScalaVersions>
|
||||||
|
<scalaCompatVersion>${scala.binary.version}</scalaCompatVersion>
|
||||||
<scalaVersion>${scala.version}</scalaVersion>
|
<scalaVersion>${scala.version}</scalaVersion>
|
||||||
<addScalacArgs>-target:jvm-1.8</addScalacArgs>
|
|
||||||
</configuration>
|
</configuration>
|
||||||
</plugin>
|
</plugin>
|
||||||
</plugins>
|
</plugins>
|
||||||
|
@ -68,7 +69,6 @@
|
||||||
<groupId>commons-io</groupId>
|
<groupId>commons-io</groupId>
|
||||||
<artifactId>commons-io</artifactId>
|
<artifactId>commons-io</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.antlr</groupId>
|
<groupId>org.antlr</groupId>
|
||||||
<artifactId>stringtemplate</artifactId>
|
<artifactId>stringtemplate</artifactId>
|
||||||
|
@ -89,17 +89,22 @@
|
||||||
<groupId>org.apache.commons</groupId>
|
<groupId>org.apache.commons</groupId>
|
||||||
<artifactId>commons-math3</artifactId>
|
<artifactId>commons-math3</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.jayway.jsonpath</groupId>
|
<groupId>com.jayway.jsonpath</groupId>
|
||||||
<artifactId>json-path</artifactId>
|
<artifactId>json-path</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.ibm.icu</groupId>
|
<groupId>com.ibm.icu</groupId>
|
||||||
<artifactId>icu4j</artifactId>
|
<artifactId>icu4j</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.spark</groupId>
|
||||||
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.spark</groupId>
|
||||||
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||||
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
</project>
|
</project>
|
||||||
|
|
|
@ -11,7 +11,6 @@ import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
|
|
||||||
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
|
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
|
||||||
|
|
||||||
|
@ -24,11 +23,10 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i
|
||||||
protected abstract Collection<String> doApply(Config conf, String s);
|
protected abstract Collection<String> doApply(Config conf, String s);
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(Config conf, List<Field> fields) {
|
public Collection<String> apply(Config conf, List<String> fields) {
|
||||||
return fields
|
return fields
|
||||||
.stream()
|
.stream()
|
||||||
.filter(f -> !f.isEmpty())
|
.filter(f -> !f.isEmpty())
|
||||||
.map(Field::stringValue)
|
|
||||||
.map(this::normalize)
|
.map(this::normalize)
|
||||||
.map(s -> filterAllStopWords(s))
|
.map(s -> filterAllStopWords(s))
|
||||||
.map(s -> doApply(conf, s))
|
.map(s -> doApply(conf, s))
|
||||||
|
|
|
@ -1,60 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.clustering;
|
|
||||||
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Map.Entry;
|
|
||||||
import java.util.regex.Pattern;
|
|
||||||
|
|
||||||
import com.google.common.collect.Maps;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
|
||||||
import eu.dnetlib.pace.model.Document;
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.model.FieldListImpl;
|
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
|
||||||
|
|
||||||
public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {
|
|
||||||
|
|
||||||
public static Collection<String> filterAndCombine(final MapDocument a, final Config conf) {
|
|
||||||
Document filtered = filter(a, conf.blacklists());
|
|
||||||
return combine(filtered, conf);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static MapDocument filter(final MapDocument a, final Map<String, List<Pattern>> blacklists) {
|
|
||||||
if (blacklists == null || blacklists.isEmpty()) {
|
|
||||||
return a;
|
|
||||||
}
|
|
||||||
|
|
||||||
final Map<String, Field> filtered = Maps.newHashMap(a.getFieldMap());
|
|
||||||
|
|
||||||
for (final Entry<String, List<Pattern>> e : blacklists.entrySet()) {
|
|
||||||
Field fields = a.getFieldMap().get(e.getKey());
|
|
||||||
if (fields != null) {
|
|
||||||
final FieldListImpl fl = new FieldListImpl();
|
|
||||||
|
|
||||||
for (Field f : fields) {
|
|
||||||
if (!isBlackListed(f.stringValue(), e.getValue())) {
|
|
||||||
fl.add(f);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
filtered.put(e.getKey(), fl);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return new MapDocument(a.getIdentifier(), filtered);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean isBlackListed(String value, List<Pattern> blacklist) {
|
|
||||||
for (Pattern pattern : blacklist) {
|
|
||||||
if (pattern.matcher(value).matches()) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,64 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.clustering;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
|
||||||
import eu.dnetlib.pace.model.ClusteringDef;
|
|
||||||
import eu.dnetlib.pace.model.Document;
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.model.FieldValueImpl;
|
|
||||||
|
|
||||||
public class ClusteringCombiner {
|
|
||||||
|
|
||||||
private static String SEPARATOR = ":";
|
|
||||||
private static String COLLAPSE_ON = "collapseOn";
|
|
||||||
|
|
||||||
public static Collection<String> combine(final Document a, final Config conf) {
|
|
||||||
final Collection<String> res = Sets.newLinkedHashSet();
|
|
||||||
for (final ClusteringDef cd : conf.clusterings()) {
|
|
||||||
for (final String fieldName : cd.getFields()) {
|
|
||||||
String prefix = getPrefix(cd, fieldName);
|
|
||||||
|
|
||||||
Field values = a.values(fieldName);
|
|
||||||
List<Field> fields = new ArrayList<>();
|
|
||||||
|
|
||||||
if (values instanceof FieldValueImpl) {
|
|
||||||
fields.add(values);
|
|
||||||
} else {
|
|
||||||
fields.addAll((List<Field>) values);
|
|
||||||
}
|
|
||||||
|
|
||||||
res
|
|
||||||
.addAll(
|
|
||||||
cd
|
|
||||||
.clusteringFunction()
|
|
||||||
.apply(conf, fields)
|
|
||||||
.stream()
|
|
||||||
.map(k -> prefix + SEPARATOR + k)
|
|
||||||
.collect(Collectors.toList()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String getPrefix(ClusteringDef cd, String fieldName) {
|
|
||||||
return cd.getName() + SEPARATOR +
|
|
||||||
cd
|
|
||||||
.getParams()
|
|
||||||
.keySet()
|
|
||||||
.stream()
|
|
||||||
.filter(k -> k.contains(COLLAPSE_ON))
|
|
||||||
.findFirst()
|
|
||||||
.map(k -> StringUtils.substringAfter(k, SEPARATOR))
|
|
||||||
.orElse(fieldName);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -6,11 +6,10 @@ import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
|
|
||||||
public interface ClusteringFunction {
|
public interface ClusteringFunction {
|
||||||
|
|
||||||
public Collection<String> apply(Config config, List<Field> fields);
|
public Collection<String> apply(Config config, List<String> fields);
|
||||||
|
|
||||||
public Map<String, Integer> getParams();
|
public Map<String, Integer> getParams();
|
||||||
|
|
||||||
|
|
|
@ -6,9 +6,7 @@ import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
|
|
||||||
@ClusteringClass("keywordsclustering")
|
@ClusteringClass("keywordsclustering")
|
||||||
public class KeywordsClustering extends AbstractClusteringFunction {
|
public class KeywordsClustering extends AbstractClusteringFunction {
|
||||||
|
@ -40,11 +38,10 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(final Config conf, List<Field> fields) {
|
public Collection<String> apply(final Config conf, List<String> fields) {
|
||||||
return fields
|
return fields
|
||||||
.stream()
|
.stream()
|
||||||
.filter(f -> !f.isEmpty())
|
.filter(f -> !f.isEmpty())
|
||||||
.map(Field::stringValue)
|
|
||||||
.map(this::cleanup)
|
.map(this::cleanup)
|
||||||
.map(this::normalize)
|
.map(this::normalize)
|
||||||
.map(s -> filterAllStopWords(s))
|
.map(s -> filterAllStopWords(s))
|
||||||
|
|
|
@ -9,7 +9,6 @@ import org.apache.commons.lang3.StringUtils;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.model.Person;
|
import eu.dnetlib.pace.model.Person;
|
||||||
|
|
||||||
@ClusteringClass("lnfi")
|
@ClusteringClass("lnfi")
|
||||||
|
@ -22,11 +21,10 @@ public class LastNameFirstInitial extends AbstractClusteringFunction {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(Config conf, List<Field> fields) {
|
public Collection<String> apply(Config conf, List<String> fields) {
|
||||||
return fields
|
return fields
|
||||||
.stream()
|
.stream()
|
||||||
.filter(f -> !f.isEmpty())
|
.filter(f -> !f.isEmpty())
|
||||||
.map(Field::stringValue)
|
|
||||||
.map(this::normalize)
|
.map(this::normalize)
|
||||||
.map(s -> doApply(conf, s))
|
.map(s -> doApply(conf, s))
|
||||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||||
|
|
|
@ -11,7 +11,6 @@ import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
|
|
||||||
@ClusteringClass("lowercase")
|
@ClusteringClass("lowercase")
|
||||||
public class LowercaseClustering extends AbstractClusteringFunction {
|
public class LowercaseClustering extends AbstractClusteringFunction {
|
||||||
|
@ -21,10 +20,10 @@ public class LowercaseClustering extends AbstractClusteringFunction {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(Config conf, List<Field> fields) {
|
public Collection<String> apply(Config conf, List<String> fields) {
|
||||||
Collection<String> c = Sets.newLinkedHashSet();
|
Collection<String> c = Sets.newLinkedHashSet();
|
||||||
for (Field f : fields) {
|
for (String f : fields) {
|
||||||
c.addAll(doApply(conf, f.stringValue()));
|
c.addAll(doApply(conf, f));
|
||||||
}
|
}
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,15 +8,15 @@ import org.apache.commons.lang3.StringUtils;
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
|
|
||||||
public class NGramUtils extends AbstractPaceFunctions {
|
public class NGramUtils extends AbstractPaceFunctions {
|
||||||
|
static private final NGramUtils NGRAMUTILS = new NGramUtils();
|
||||||
|
|
||||||
private static final int SIZE = 100;
|
private static final int SIZE = 100;
|
||||||
|
|
||||||
private static Set<String> stopwords = AbstractPaceFunctions
|
private static final Set<String> stopwords = AbstractPaceFunctions
|
||||||
.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
.loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||||
|
|
||||||
public static String cleanupForOrdering(String s) {
|
public static String cleanupForOrdering(String s) {
|
||||||
NGramUtils utils = new NGramUtils();
|
return (NGRAMUTILS.filterStopWords(NGRAMUTILS.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE))
|
||||||
return (utils.filterStopWords(utils.normalize(s), stopwords) + StringUtils.repeat(" ", SIZE))
|
|
||||||
.substring(0, SIZE)
|
.substring(0, SIZE)
|
||||||
.replaceAll(" ", "");
|
.replaceAll(" ", "");
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,7 +2,6 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
@ -14,7 +13,11 @@ import eu.dnetlib.pace.config.Config;
|
||||||
public class NgramPairs extends Ngrams {
|
public class NgramPairs extends Ngrams {
|
||||||
|
|
||||||
public NgramPairs(Map<String, Integer> params) {
|
public NgramPairs(Map<String, Integer> params) {
|
||||||
super(params);
|
super(params, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public NgramPairs(Map<String, Integer> params, boolean sorted) {
|
||||||
|
super(params, sorted);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -8,8 +8,15 @@ import eu.dnetlib.pace.config.Config;
|
||||||
@ClusteringClass("ngrams")
|
@ClusteringClass("ngrams")
|
||||||
public class Ngrams extends AbstractClusteringFunction {
|
public class Ngrams extends AbstractClusteringFunction {
|
||||||
|
|
||||||
|
private final boolean sorted;
|
||||||
|
|
||||||
public Ngrams(Map<String, Integer> params) {
|
public Ngrams(Map<String, Integer> params) {
|
||||||
|
this(params, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public Ngrams(Map<String, Integer> params, boolean sorted) {
|
||||||
super(params);
|
super(params);
|
||||||
|
this.sorted = sorted;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -19,20 +26,21 @@ public class Ngrams extends AbstractClusteringFunction {
|
||||||
|
|
||||||
protected Collection<String> getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) {
|
protected Collection<String> getNgrams(String s, int ngramLen, int max, int maxPerToken, int minNgramLen) {
|
||||||
|
|
||||||
final Collection<String> ngrams = new LinkedHashSet<String>();
|
final Collection<String> ngrams = sorted ? new TreeSet<>() : new LinkedHashSet<String>();
|
||||||
final StringTokenizer st = new StringTokenizer(s);
|
final StringTokenizer st = new StringTokenizer(s);
|
||||||
|
|
||||||
while (st.hasMoreTokens()) {
|
while (st.hasMoreTokens()) {
|
||||||
final String token = st.nextToken();
|
final String token = st.nextToken();
|
||||||
if (!token.isEmpty()) {
|
if (!token.isEmpty()) {
|
||||||
|
|
||||||
for (int i = 0; i < maxPerToken && ngramLen + i <= token.length(); i++) {
|
for (int i = 0; i < maxPerToken && ngramLen + i <= token.length(); i++) {
|
||||||
String ngram = (token + " ").substring(i, ngramLen + i).trim();
|
String ngram = token.substring(i, Math.min(ngramLen + i, token.length())).trim();
|
||||||
if (ngrams.size() >= max) {
|
|
||||||
return ngrams;
|
|
||||||
}
|
|
||||||
if (ngram.length() >= minNgramLen) {
|
if (ngram.length() >= minNgramLen) {
|
||||||
ngrams.add(ngram);
|
ngrams.add(ngram);
|
||||||
|
|
||||||
|
if (ngrams.size() >= max) {
|
||||||
|
return ngrams;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,7 +12,6 @@ import com.google.common.collect.Sets;
|
||||||
|
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.model.Person;
|
import eu.dnetlib.pace.model.Person;
|
||||||
|
|
||||||
@ClusteringClass("personClustering")
|
@ClusteringClass("personClustering")
|
||||||
|
@ -27,19 +26,19 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(final Config conf, final List<Field> fields) {
|
public Collection<String> apply(final Config conf, final List<String> fields) {
|
||||||
final Set<String> hashes = Sets.newHashSet();
|
final Set<String> hashes = Sets.newHashSet();
|
||||||
|
|
||||||
for (final Field f : fields) {
|
for (final String f : fields) {
|
||||||
|
|
||||||
final Person person = new Person(f.stringValue(), false);
|
final Person person = new Person(f, false);
|
||||||
|
|
||||||
if (StringUtils.isNotBlank(person.getNormalisedFirstName())
|
if (StringUtils.isNotBlank(person.getNormalisedFirstName())
|
||||||
&& StringUtils.isNotBlank(person.getNormalisedSurname())) {
|
&& StringUtils.isNotBlank(person.getNormalisedSurname())) {
|
||||||
hashes.add(firstLC(person.getNormalisedFirstName()) + person.getNormalisedSurname().toLowerCase());
|
hashes.add(firstLC(person.getNormalisedFirstName()) + person.getNormalisedSurname().toLowerCase());
|
||||||
} else {
|
} else {
|
||||||
for (final String token1 : tokens(f.stringValue(), MAX_TOKENS)) {
|
for (final String token1 : tokens(f, MAX_TOKENS)) {
|
||||||
for (final String token2 : tokens(f.stringValue(), MAX_TOKENS)) {
|
for (final String token2 : tokens(f, MAX_TOKENS)) {
|
||||||
if (!token1.equals(token2)) {
|
if (!token1.equals(token2)) {
|
||||||
hashes.add(firstLC(token1) + token2);
|
hashes.add(firstLC(token1) + token2);
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,7 +13,7 @@ import eu.dnetlib.pace.config.Config;
|
||||||
public class SortedNgramPairs extends NgramPairs {
|
public class SortedNgramPairs extends NgramPairs {
|
||||||
|
|
||||||
public SortedNgramPairs(Map<String, Integer> params) {
|
public SortedNgramPairs(Map<String, Integer> params) {
|
||||||
super(params);
|
super(params, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -11,7 +11,6 @@ import java.util.stream.Collectors;
|
||||||
|
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
|
|
||||||
@ClusteringClass("urlclustering")
|
@ClusteringClass("urlclustering")
|
||||||
public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
||||||
|
@ -23,12 +22,11 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(final Config conf, List<Field> fields) {
|
public Collection<String> apply(final Config conf, List<String> fields) {
|
||||||
try {
|
try {
|
||||||
return fields
|
return fields
|
||||||
.stream()
|
.stream()
|
||||||
.filter(f -> !f.isEmpty())
|
.filter(f -> !f.isEmpty())
|
||||||
.map(Field::stringValue)
|
|
||||||
.map(this::asUrl)
|
.map(this::asUrl)
|
||||||
.map(URL::getHost)
|
.map(URL::getHost)
|
||||||
.collect(Collectors.toCollection(HashSet::new));
|
.collect(Collectors.toCollection(HashSet::new));
|
||||||
|
|
|
@ -16,13 +16,11 @@ import org.apache.commons.lang3.StringUtils;
|
||||||
import com.google.common.base.Joiner;
|
import com.google.common.base.Joiner;
|
||||||
import com.google.common.base.Splitter;
|
import com.google.common.base.Splitter;
|
||||||
import com.google.common.collect.Iterables;
|
import com.google.common.collect.Iterables;
|
||||||
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
import com.ibm.icu.text.Transliterator;
|
import com.ibm.icu.text.Transliterator;
|
||||||
|
|
||||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.model.FieldList;
|
|
||||||
import eu.dnetlib.pace.model.FieldListImpl;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set of common functions for the framework
|
* Set of common functions for the framework
|
||||||
|
@ -51,28 +49,25 @@ public abstract class AbstractPaceFunctions {
|
||||||
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
||||||
|
|
||||||
// html regex for normalization
|
// html regex for normalization
|
||||||
public final String HTML_REGEX = "<[^>]*>";
|
public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>");
|
||||||
|
|
||||||
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
||||||
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||||
|
|
||||||
// doi prefix for normalization
|
// doi prefix for normalization
|
||||||
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
|
public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)");
|
||||||
|
|
||||||
private Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
|
private static Pattern numberPattern = Pattern.compile("-?\\d+(\\.\\d+)?");
|
||||||
|
|
||||||
private Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
|
private static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
|
||||||
|
|
||||||
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
|
|
||||||
|
|
||||||
protected String concat(final List<String> l) {
|
protected String concat(final List<String> l) {
|
||||||
return Joiner.on(" ").skipNulls().join(l);
|
return Joiner.on(" ").skipNulls().join(l);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String cleanup(final String s) {
|
protected String cleanup(final String s) {
|
||||||
|
final String s1 = HTML_REGEX.matcher(s).replaceAll("");
|
||||||
final String s1 = s.replaceAll(HTML_REGEX, "");
|
|
||||||
final String s2 = unicodeNormalization(s1.toLowerCase());
|
final String s2 = unicodeNormalization(s1.toLowerCase());
|
||||||
final String s3 = nfd(s2);
|
final String s3 = nfd(s2);
|
||||||
final String s4 = fixXML(s3);
|
final String s4 = fixXML(s3);
|
||||||
|
@ -162,11 +157,6 @@ public abstract class AbstractPaceFunctions {
|
||||||
return sb.toString().replaceAll("\\s+", " ");
|
return sb.toString().replaceAll("\\s+", " ");
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getFirstValue(final Field values) {
|
|
||||||
return (values != null) && !Iterables.isEmpty(values) ? Iterables.getFirst(values, EMPTY_FIELD).stringValue()
|
|
||||||
: "";
|
|
||||||
}
|
|
||||||
|
|
||||||
protected boolean notNull(final String s) {
|
protected boolean notNull(final String s) {
|
||||||
return s != null;
|
return s != null;
|
||||||
}
|
}
|
||||||
|
@ -316,7 +306,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
}
|
}
|
||||||
|
|
||||||
public String normalizePid(String pid) {
|
public String normalizePid(String pid) {
|
||||||
return pid.toLowerCase().replaceAll(DOI_PREFIX, "");
|
return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll("");
|
||||||
}
|
}
|
||||||
|
|
||||||
// get the list of keywords into the input string
|
// get the list of keywords into the input string
|
||||||
|
|
|
@ -3,7 +3,7 @@ package eu.dnetlib.pace.config;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.regex.Pattern;
|
import java.util.function.Predicate;
|
||||||
|
|
||||||
import eu.dnetlib.pace.model.ClusteringDef;
|
import eu.dnetlib.pace.model.ClusteringDef;
|
||||||
import eu.dnetlib.pace.model.FieldDef;
|
import eu.dnetlib.pace.model.FieldDef;
|
||||||
|
@ -30,13 +30,6 @@ public interface Config {
|
||||||
*/
|
*/
|
||||||
public Map<String, TreeNodeDef> decisionTree();
|
public Map<String, TreeNodeDef> decisionTree();
|
||||||
|
|
||||||
/**
|
|
||||||
* Field configuration definitions.
|
|
||||||
*
|
|
||||||
* @return the list of definitions
|
|
||||||
*/
|
|
||||||
public Map<String, FieldDef> modelMap();
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Clusterings.
|
* Clusterings.
|
||||||
*
|
*
|
||||||
|
@ -49,7 +42,7 @@ public interface Config {
|
||||||
*
|
*
|
||||||
* @return the map
|
* @return the map
|
||||||
*/
|
*/
|
||||||
public Map<String, List<Pattern>> blacklists();
|
public Map<String, Predicate<String>> blacklists();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Translation map.
|
* Translation map.
|
||||||
|
|
|
@ -4,18 +4,19 @@ package eu.dnetlib.pace.config;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.AbstractMap;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Map.Entry;
|
import java.util.Map.Entry;
|
||||||
|
import java.util.function.Predicate;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
import java.util.regex.PatternSyntaxException;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.antlr.stringtemplate.StringTemplate;
|
import org.antlr.stringtemplate.StringTemplate;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.commons.logging.Log;
|
|
||||||
import org.apache.commons.logging.LogFactory;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.annotation.JsonIgnore;
|
import com.fasterxml.jackson.annotation.JsonIgnore;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
@ -27,9 +28,6 @@ import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
||||||
import eu.dnetlib.pace.util.PaceException;
|
import eu.dnetlib.pace.util.PaceException;
|
||||||
|
|
||||||
public class DedupConfig implements Config, Serializable {
|
public class DedupConfig implements Config, Serializable {
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(DedupConfig.class);
|
|
||||||
|
|
||||||
private static String CONFIG_TEMPLATE = "dedupConfig.st";
|
private static String CONFIG_TEMPLATE = "dedupConfig.st";
|
||||||
|
|
||||||
private PaceConfig pace;
|
private PaceConfig pace;
|
||||||
|
@ -37,7 +35,7 @@ public class DedupConfig implements Config, Serializable {
|
||||||
private WfConfig wf;
|
private WfConfig wf;
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
private Map<String, List<Pattern>> blacklists;
|
private Map<String, Predicate<String>> blacklists;
|
||||||
|
|
||||||
private static Map<String, String> defaults = Maps.newHashMap();
|
private static Map<String, String> defaults = Maps.newHashMap();
|
||||||
|
|
||||||
|
@ -72,19 +70,29 @@ public class DedupConfig implements Config, Serializable {
|
||||||
.getBlacklists()
|
.getBlacklists()
|
||||||
.entrySet()
|
.entrySet()
|
||||||
.stream()
|
.stream()
|
||||||
|
.map(
|
||||||
|
e -> new AbstractMap.SimpleEntry<String, List<Pattern>>(e.getKey(),
|
||||||
|
e
|
||||||
|
.getValue()
|
||||||
|
.stream()
|
||||||
|
.filter(s -> !StringUtils.isBlank(s))
|
||||||
|
.map(Pattern::compile)
|
||||||
|
.collect(Collectors.toList())))
|
||||||
.collect(
|
.collect(
|
||||||
Collectors
|
Collectors
|
||||||
.toMap(
|
.toMap(
|
||||||
e -> e.getKey(),
|
e -> e.getKey(),
|
||||||
e -> e
|
e -> (Predicate<String> & Serializable) s -> e
|
||||||
.getValue()
|
.getValue()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(s -> !StringUtils.isBlank(s))
|
.filter(p -> p.matcher(s).matches())
|
||||||
.map(Pattern::compile)
|
.findFirst()
|
||||||
.collect(Collectors.toList())));
|
.isPresent()))
|
||||||
|
|
||||||
|
;
|
||||||
|
|
||||||
return config;
|
return config;
|
||||||
} catch (IOException e) {
|
} catch (IOException | PatternSyntaxException e) {
|
||||||
throw new PaceException("Error in parsing configuration json", e);
|
throw new PaceException("Error in parsing configuration json", e);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -152,18 +160,13 @@ public class DedupConfig implements Config, Serializable {
|
||||||
return getPace().getModel();
|
return getPace().getModel();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public Map<String, FieldDef> modelMap() {
|
|
||||||
return getPace().getModelMap();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public List<ClusteringDef> clusterings() {
|
public List<ClusteringDef> clusterings() {
|
||||||
return getPace().getClustering();
|
return getPace().getClustering();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Map<String, List<Pattern>> blacklists() {
|
public Map<String, Predicate<String>> blacklists() {
|
||||||
return blacklists;
|
return blacklists;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -28,6 +28,10 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable {
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
private Map<String, String> translationMap;
|
private Map<String, String> translationMap;
|
||||||
|
|
||||||
|
public Map<String, FieldDef> getModelMap() {
|
||||||
|
return modelMap;
|
||||||
|
}
|
||||||
|
|
||||||
@JsonIgnore
|
@JsonIgnore
|
||||||
private Map<String, FieldDef> modelMap;
|
private Map<String, FieldDef> modelMap;
|
||||||
|
|
||||||
|
@ -101,13 +105,4 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable {
|
||||||
public void setSynonyms(Map<String, List<String>> synonyms) {
|
public void setSynonyms(Map<String, List<String>> synonyms) {
|
||||||
this.synonyms = synonyms;
|
this.synonyms = synonyms;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, FieldDef> getModelMap() {
|
|
||||||
return modelMap;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setModelMap(final Map<String, FieldDef> modelMap) {
|
|
||||||
this.modelMap = modelMap;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,72 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.model;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Type;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The Class AbstractField.
|
|
||||||
*/
|
|
||||||
public abstract class AbstractField implements Field {
|
|
||||||
|
|
||||||
/** The type. */
|
|
||||||
protected Type type = Type.String;
|
|
||||||
|
|
||||||
/** The name. */
|
|
||||||
protected String name;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Instantiates a new abstract field.
|
|
||||||
*/
|
|
||||||
protected AbstractField() {
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Instantiates a new abstract field.
|
|
||||||
*
|
|
||||||
* @param type
|
|
||||||
* the type
|
|
||||||
* @param name
|
|
||||||
* the name
|
|
||||||
*/
|
|
||||||
protected AbstractField(final Type type, final String name) {
|
|
||||||
this.type = type;
|
|
||||||
this.name = name;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see eu.dnetlib.pace.model.Field#getName()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public String getName() {
|
|
||||||
return name;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see eu.dnetlib.pace.model.Field#getType()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public Type getType() {
|
|
||||||
return type;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see eu.dnetlib.pace.model.Field#setName(java.lang.String)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public void setName(final String name) {
|
|
||||||
this.name = name;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see eu.dnetlib.pace.model.Field#setType(eu.dnetlib.pace.config.Type)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public void setType(final Type type) {
|
|
||||||
this.type = type;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,40 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.model;
|
|
||||||
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The Interface Document. Models the common operations available on a Pace Document.
|
|
||||||
*/
|
|
||||||
public interface Document {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Gets the identifier.
|
|
||||||
*
|
|
||||||
* @return the identifier
|
|
||||||
*/
|
|
||||||
String getIdentifier();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Fields.
|
|
||||||
*
|
|
||||||
* @return the iterable
|
|
||||||
*/
|
|
||||||
Iterable<Field> fields();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Values.
|
|
||||||
*
|
|
||||||
* @param name
|
|
||||||
* the name
|
|
||||||
* @return the field list
|
|
||||||
*/
|
|
||||||
Field values(String name);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Field names.
|
|
||||||
*
|
|
||||||
* @return the sets the
|
|
||||||
*/
|
|
||||||
Set<String> fieldNames();
|
|
||||||
}
|
|
|
@ -1,57 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.model;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Type;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The Interface Field.
|
|
||||||
*/
|
|
||||||
public interface Field extends Iterable<Field>, Serializable {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Gets the name.
|
|
||||||
*
|
|
||||||
* @return the name
|
|
||||||
*/
|
|
||||||
public String getName();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Sets the name.
|
|
||||||
*
|
|
||||||
* @param name
|
|
||||||
* the new name
|
|
||||||
*/
|
|
||||||
public void setName(String name);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Gets the type.
|
|
||||||
*
|
|
||||||
* @return the type
|
|
||||||
*/
|
|
||||||
public Type getType();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Sets the type.
|
|
||||||
*
|
|
||||||
* @param type
|
|
||||||
* the new type
|
|
||||||
*/
|
|
||||||
public void setType(Type type);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Checks if is empty.
|
|
||||||
*
|
|
||||||
* @return true, if is empty
|
|
||||||
*/
|
|
||||||
public boolean isEmpty();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* String value.
|
|
||||||
*
|
|
||||||
* @return the string
|
|
||||||
*/
|
|
||||||
public String stringValue();
|
|
||||||
|
|
||||||
}
|
|
|
@ -39,20 +39,6 @@ public class FieldDef implements Serializable {
|
||||||
public FieldDef() {
|
public FieldDef() {
|
||||||
}
|
}
|
||||||
|
|
||||||
// def apply(s: String): Field[A]
|
|
||||||
public Field apply(final Type type, final String s) {
|
|
||||||
switch (type) {
|
|
||||||
case Int:
|
|
||||||
return new FieldValueImpl(type, name, Integer.parseInt(s));
|
|
||||||
case String:
|
|
||||||
return new FieldValueImpl(type, name, s);
|
|
||||||
case List:
|
|
||||||
return new FieldListImpl(name, type);
|
|
||||||
default:
|
|
||||||
throw new IllegalArgumentException("Casting not implemented for type " + type);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getName() {
|
public String getName() {
|
||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,25 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.model;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The Interface FieldList.
|
|
||||||
*/
|
|
||||||
public interface FieldList extends List<Field>, Field {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* String list.
|
|
||||||
*
|
|
||||||
* @return the list
|
|
||||||
*/
|
|
||||||
public List<String> stringList();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Double[] Array
|
|
||||||
*
|
|
||||||
* @return the double[] array
|
|
||||||
*/
|
|
||||||
public double[] doubleArray();
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,315 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.model;
|
|
||||||
|
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.ListIterator;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
import com.google.common.base.Function;
|
|
||||||
import com.google.common.base.Joiner;
|
|
||||||
import com.google.common.collect.Iterables;
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Type;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The Class FieldListImpl.
|
|
||||||
*/
|
|
||||||
public class FieldListImpl extends AbstractField implements FieldList {
|
|
||||||
|
|
||||||
/** The fields. */
|
|
||||||
private List<Field> fields;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Instantiates a new field list impl.
|
|
||||||
*/
|
|
||||||
public FieldListImpl() {
|
|
||||||
fields = Lists.newArrayList();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Instantiates a new field list impl.
|
|
||||||
*
|
|
||||||
* @param name
|
|
||||||
* the name
|
|
||||||
*/
|
|
||||||
public FieldListImpl(final String name, final Type type) {
|
|
||||||
super(type, name);
|
|
||||||
fields = Lists.newArrayList();
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#add(java.lang.Object)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean add(final Field f) {
|
|
||||||
return fields.add(f);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#add(int, java.lang.Object)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public void add(final int i, final Field f) {
|
|
||||||
fields.add(i, f);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#addAll(java.util.Collection)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean addAll(final Collection<? extends Field> f) {
|
|
||||||
return fields.addAll(f);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#addAll(int, java.util.Collection)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean addAll(final int i, final Collection<? extends Field> f) {
|
|
||||||
return fields.addAll(i, f);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#clear()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public void clear() {
|
|
||||||
fields.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#contains(java.lang.Object)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean contains(final Object o) {
|
|
||||||
return fields.contains(o);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#containsAll(java.util.Collection)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean containsAll(final Collection<?> f) {
|
|
||||||
return fields.containsAll(f);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#get(int)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public Field get(final int i) {
|
|
||||||
return fields.get(i);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#indexOf(java.lang.Object)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public int indexOf(final Object o) {
|
|
||||||
return fields.indexOf(o);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see eu.dnetlib.pace.model.Field#isEmpty()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean isEmpty() {
|
|
||||||
return Iterables.all(fields, f -> f.isEmpty());
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.lang.Iterable#iterator()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public Iterator<Field> iterator() {
|
|
||||||
return fields.iterator();
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#lastIndexOf(java.lang.Object)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public int lastIndexOf(final Object o) {
|
|
||||||
return fields.lastIndexOf(o);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#listIterator()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public ListIterator<Field> listIterator() {
|
|
||||||
return fields.listIterator();
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#listIterator(int)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public ListIterator<Field> listIterator(final int i) {
|
|
||||||
return fields.listIterator(i);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#remove(java.lang.Object)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean remove(final Object o) {
|
|
||||||
return fields.remove(o);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#remove(int)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public Field remove(final int i) {
|
|
||||||
return fields.remove(i);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#removeAll(java.util.Collection)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean removeAll(final Collection<?> f) {
|
|
||||||
return fields.removeAll(f);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#retainAll(java.util.Collection)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean retainAll(final Collection<?> f) {
|
|
||||||
return fields.retainAll(f);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#set(int, java.lang.Object)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public Field set(final int i, final Field f) {
|
|
||||||
return fields.set(i, f);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#size()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public int size() {
|
|
||||||
return fields.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#subList(int, int)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public List<Field> subList(final int from, final int to) {
|
|
||||||
return fields.subList(from, to);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#toArray()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public Object[] toArray() {
|
|
||||||
return fields.toArray();
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.List#toArray(java.lang.Object[])
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public <T> T[] toArray(final T[] t) {
|
|
||||||
return fields.toArray(t);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see eu.dnetlib.pace.model.Field#stringValue()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public String stringValue() {
|
|
||||||
switch (getType()) {
|
|
||||||
|
|
||||||
case List:
|
|
||||||
case Int:
|
|
||||||
case String:
|
|
||||||
return Joiner.on(" ").join(stringList());
|
|
||||||
case JSON:
|
|
||||||
String json;
|
|
||||||
try {
|
|
||||||
json = new ObjectMapper().writeValueAsString(this);
|
|
||||||
} catch (JsonProcessingException e) {
|
|
||||||
json = null;
|
|
||||||
}
|
|
||||||
return json;
|
|
||||||
default:
|
|
||||||
throw new IllegalArgumentException("Unknown type: " + getType().toString());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see eu.dnetlib.pace.model.FieldList#stringList()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public List<String> stringList() {
|
|
||||||
return Lists.newArrayList(Iterables.transform(fields, getValuesTransformer()));
|
|
||||||
}
|
|
||||||
|
|
||||||
private Function<Field, String> getValuesTransformer() {
|
|
||||||
return new Function<Field, String>() {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String apply(final Field f) {
|
|
||||||
return f.stringValue();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public double[] doubleArray() {
|
|
||||||
return Lists.newArrayList(Iterables.transform(fields, getDouble())).stream().mapToDouble(d -> d).toArray();
|
|
||||||
}
|
|
||||||
|
|
||||||
private Function<Field, Double> getDouble() {
|
|
||||||
|
|
||||||
return new Function<Field, Double>() {
|
|
||||||
@Override
|
|
||||||
public Double apply(final Field f) {
|
|
||||||
return Double.parseDouble(f.stringValue());
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
return stringList().toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,26 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.model;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The Interface FieldValue.
|
|
||||||
*/
|
|
||||||
public interface FieldValue extends Field {
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Gets the value.
|
|
||||||
*
|
|
||||||
* @return the value
|
|
||||||
*/
|
|
||||||
public Object getValue();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Sets the value.
|
|
||||||
*
|
|
||||||
* @param value
|
|
||||||
* the new value
|
|
||||||
*/
|
|
||||||
public void setValue(final Object value);
|
|
||||||
|
|
||||||
public double[] doubleArrayValue();
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,135 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.model;
|
|
||||||
|
|
||||||
import java.net.MalformedURLException;
|
|
||||||
import java.net.URL;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Type;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The Class FieldValueImpl.
|
|
||||||
*/
|
|
||||||
public class FieldValueImpl extends AbstractField implements FieldValue {
|
|
||||||
|
|
||||||
/** The value. */
|
|
||||||
private Object value = null;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Instantiates a new field value impl.
|
|
||||||
*/
|
|
||||||
public FieldValueImpl() {
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Instantiates a new field value impl.
|
|
||||||
*
|
|
||||||
* @param type
|
|
||||||
* the type
|
|
||||||
* @param name
|
|
||||||
* the name
|
|
||||||
* @param value
|
|
||||||
* the value
|
|
||||||
*/
|
|
||||||
public FieldValueImpl(final Type type, final String name, final Object value) {
|
|
||||||
super(type, name);
|
|
||||||
this.value = value;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see eu.dnetlib.pace.model.Field#isEmpty()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public boolean isEmpty() {
|
|
||||||
if (value == null)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
switch (type) {
|
|
||||||
case String:
|
|
||||||
case JSON:
|
|
||||||
return value.toString().isEmpty();
|
|
||||||
case List:
|
|
||||||
try {
|
|
||||||
List<?> list = (List<?>) value;
|
|
||||||
return list.isEmpty() || ((FieldValueImpl) list.get(0)).isEmpty();
|
|
||||||
} catch (Exception e) {
|
|
||||||
throw new RuntimeException(value.toString());
|
|
||||||
}
|
|
||||||
case URL:
|
|
||||||
String str = value.toString();
|
|
||||||
return StringUtils.isBlank(str) || !isValidURL(str);
|
|
||||||
case DoubleArray:
|
|
||||||
return doubleArrayValue().length == 0;
|
|
||||||
default:
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean isValidURL(final String s) {
|
|
||||||
try {
|
|
||||||
new URL(s);
|
|
||||||
return true;
|
|
||||||
} catch (MalformedURLException e) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see eu.dnetlib.pace.model.FieldValue#getValue()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public Object getValue() {
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see eu.dnetlib.pace.model.FieldValue#setValue(java.lang.Object)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public void setValue(final Object value) {
|
|
||||||
this.value = value;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see eu.dnetlib.pace.model.Field#stringValue()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
// @SuppressWarnings("unchecked")
|
|
||||||
public String stringValue() {
|
|
||||||
return String.valueOf(getValue());
|
|
||||||
// switch (getType()) {
|
|
||||||
//
|
|
||||||
// case Int:
|
|
||||||
// return String.valueOf(getValue());
|
|
||||||
// case List:
|
|
||||||
// return Joiner.on(" ").join((List<String>) getValue());
|
|
||||||
// case String:
|
|
||||||
// return (String) getValue();
|
|
||||||
// default:
|
|
||||||
// throw new IllegalArgumentException("Unknown type: " + getType().toString());
|
|
||||||
// }
|
|
||||||
}
|
|
||||||
|
|
||||||
public double[] doubleArrayValue() {
|
|
||||||
return (double[]) getValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.lang.Iterable#iterator()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
@SuppressWarnings("unchecked")
|
|
||||||
public Iterator<Field> iterator() {
|
|
||||||
return Collections.singleton((Field) this).iterator();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,143 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.model;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import com.google.common.collect.Iterables;
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
import com.google.common.collect.Maps;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The Class MapDocument.
|
|
||||||
*/
|
|
||||||
public class MapDocument implements Document, Serializable {
|
|
||||||
|
|
||||||
/** The identifier. */
|
|
||||||
private String identifier;
|
|
||||||
|
|
||||||
/** The field map. */
|
|
||||||
private Map<String, Field> fieldMap;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Instantiates a new map document.
|
|
||||||
*/
|
|
||||||
public MapDocument() {
|
|
||||||
identifier = null;
|
|
||||||
fieldMap = Maps.newHashMap();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Instantiates a new map document.
|
|
||||||
*
|
|
||||||
* @param identifier
|
|
||||||
* the identifier
|
|
||||||
* @param fieldMap
|
|
||||||
* the field map
|
|
||||||
*/
|
|
||||||
public MapDocument(final String identifier, final Map<String, Field> fieldMap) {
|
|
||||||
this.setIdentifier(identifier);
|
|
||||||
this.fieldMap = fieldMap;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Instantiates a new map document.
|
|
||||||
*
|
|
||||||
* @param identifier
|
|
||||||
* the identifier
|
|
||||||
* @param data
|
|
||||||
* the data
|
|
||||||
*/
|
|
||||||
public MapDocument(final String identifier, final byte[] data) {
|
|
||||||
final MapDocument doc = MapDocumentSerializer.decode(data);
|
|
||||||
|
|
||||||
this.fieldMap = doc.fieldMap;
|
|
||||||
this.identifier = doc.identifier;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see eu.dnetlib.pace.model.document.Document#fields()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public Iterable<Field> fields() {
|
|
||||||
return Lists.newArrayList(Iterables.concat(fieldMap.values()));
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see eu.dnetlib.pace.model.document.Document#values(java.lang.String)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public Field values(final String name) {
|
|
||||||
return fieldMap.get(name);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see eu.dnetlib.pace.model.document.Document#fieldNames()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public Set<String> fieldNames() {
|
|
||||||
return fieldMap.keySet();
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.lang.Object#toString()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
return MapDocumentSerializer.toString(this);
|
|
||||||
// return String.format("Document(%s)", fieldMap.toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* To byte array.
|
|
||||||
*
|
|
||||||
* @return the byte[]
|
|
||||||
*/
|
|
||||||
public byte[] toByteArray() {
|
|
||||||
return MapDocumentSerializer.toByteArray(this);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see eu.dnetlib.pace.model.document.Document#getIdentifier()
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public String getIdentifier() {
|
|
||||||
return identifier;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Sets the identifier.
|
|
||||||
*
|
|
||||||
* @param identifier
|
|
||||||
* the new identifier
|
|
||||||
*/
|
|
||||||
public void setIdentifier(final String identifier) {
|
|
||||||
this.identifier = identifier;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Gets the field map.
|
|
||||||
*
|
|
||||||
* @return the field map
|
|
||||||
*/
|
|
||||||
public Map<String, Field> getFieldMap() {
|
|
||||||
return fieldMap;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Sets the field map.
|
|
||||||
*
|
|
||||||
* @param fieldMap
|
|
||||||
* the field map
|
|
||||||
*/
|
|
||||||
public void setFieldMap(final Map<String, Field> fieldMap) {
|
|
||||||
this.fieldMap = fieldMap;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,52 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.model;
|
|
||||||
|
|
||||||
import java.util.Comparator;
|
|
||||||
|
|
||||||
import com.google.common.collect.Iterables;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The Class MapDocumentComparator.
|
|
||||||
*/
|
|
||||||
public class MapDocumentComparator implements Comparator<Document> {
|
|
||||||
|
|
||||||
/** The comparator field. */
|
|
||||||
private String comparatorField;
|
|
||||||
|
|
||||||
private final FieldList emptyField = new FieldListImpl();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Instantiates a new map document comparator.
|
|
||||||
*
|
|
||||||
* @param comparatorField
|
|
||||||
* the comparator field
|
|
||||||
*/
|
|
||||||
public MapDocumentComparator(final String comparatorField) {
|
|
||||||
this.comparatorField = comparatorField;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
* @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
|
|
||||||
*/
|
|
||||||
@Override
|
|
||||||
public int compare(final Document d1, final Document d2) {
|
|
||||||
|
|
||||||
if (d1.values(comparatorField).isEmpty() || d2.values(comparatorField).isEmpty())
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
final String o1 = Iterables.getFirst(d1.values(comparatorField), emptyField).stringValue();
|
|
||||||
final String o2 = Iterables.getFirst(d2.values(comparatorField), emptyField).stringValue();
|
|
||||||
|
|
||||||
if ((o1 == null) || (o2 == null))
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
final String to1 = NGramUtils.cleanupForOrdering(o1);
|
|
||||||
final String to2 = NGramUtils.cleanupForOrdering(o2);
|
|
||||||
|
|
||||||
return to1.compareTo(to2);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,103 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.model;
|
|
||||||
|
|
||||||
import java.lang.reflect.Type;
|
|
||||||
|
|
||||||
import com.google.gson.GsonBuilder;
|
|
||||||
import com.google.gson.InstanceCreator;
|
|
||||||
import com.google.gson.JsonDeserializationContext;
|
|
||||||
import com.google.gson.JsonDeserializer;
|
|
||||||
import com.google.gson.JsonElement;
|
|
||||||
import com.google.gson.JsonObject;
|
|
||||||
import com.google.gson.JsonParseException;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* The Class MapDocumentSerializer.
|
|
||||||
*/
|
|
||||||
public class MapDocumentSerializer implements InstanceCreator<MapDocument> {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public MapDocument createInstance(final Type type) {
|
|
||||||
return new MapDocument();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Decode.
|
|
||||||
*
|
|
||||||
* @param s
|
|
||||||
* the String
|
|
||||||
* @return the map document
|
|
||||||
*/
|
|
||||||
public static MapDocument decode(final String s) {
|
|
||||||
final GsonBuilder gson = new GsonBuilder();
|
|
||||||
|
|
||||||
gson.registerTypeAdapter(Field.class, new JsonDeserializer<Field>() {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public Field deserialize(final JsonElement json, final Type typeOfT,
|
|
||||||
final JsonDeserializationContext context) throws JsonParseException {
|
|
||||||
final FieldListImpl fl = new FieldListImpl();
|
|
||||||
if (json.isJsonObject()) {
|
|
||||||
|
|
||||||
fl.add(handleJsonObject(json.getAsJsonObject()));
|
|
||||||
|
|
||||||
} else if (json.isJsonArray()) {
|
|
||||||
|
|
||||||
for (final JsonElement e : json.getAsJsonArray()) {
|
|
||||||
if (e.isJsonObject()) {
|
|
||||||
fl.add(handleJsonObject(e.getAsJsonObject()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return fl;
|
|
||||||
}
|
|
||||||
|
|
||||||
private Field handleJsonObject(final JsonObject o) {
|
|
||||||
final FieldListImpl fl = new FieldListImpl();
|
|
||||||
final String name = o.get("name").getAsString();
|
|
||||||
final String type = o.get("type").getAsString();
|
|
||||||
final String value = o.get("value").getAsString();
|
|
||||||
fl.add(new FieldValueImpl(eu.dnetlib.pace.config.Type.valueOf(type), name, value));
|
|
||||||
return fl;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
return gson.create().fromJson(s, MapDocument.class);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Decode.
|
|
||||||
*
|
|
||||||
* @param bytes
|
|
||||||
* the bytes
|
|
||||||
* @return the map document
|
|
||||||
*/
|
|
||||||
public static MapDocument decode(final byte[] bytes) {
|
|
||||||
return decode(new String(bytes));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* To string.
|
|
||||||
*
|
|
||||||
* @param doc
|
|
||||||
* the doc
|
|
||||||
* @return the string
|
|
||||||
*/
|
|
||||||
public static String toString(final MapDocument doc) {
|
|
||||||
final GsonBuilder b = new GsonBuilder();
|
|
||||||
return b.setPrettyPrinting().create().toJson(doc);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* To byte array.
|
|
||||||
*
|
|
||||||
* @param doc
|
|
||||||
* the doc
|
|
||||||
* @return the byte[]
|
|
||||||
*/
|
|
||||||
public static byte[] toByteArray(final MapDocument doc) {
|
|
||||||
return toString(doc).getBytes();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -0,0 +1,65 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.model;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
import org.apache.spark.sql.Row;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The Class MapDocumentComparator.
|
||||||
|
*/
|
||||||
|
public class RowDataOrderingComparator implements Comparator<Row> {
|
||||||
|
|
||||||
|
/** The comparator field. */
|
||||||
|
private final int comparatorField;
|
||||||
|
private final int identityFieldPosition;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Instantiates a new map document comparator.
|
||||||
|
*
|
||||||
|
* @param comparatorField
|
||||||
|
* the comparator field
|
||||||
|
*/
|
||||||
|
public RowDataOrderingComparator(final int comparatorField, int identityFieldPosition) {
|
||||||
|
this.comparatorField = comparatorField;
|
||||||
|
this.identityFieldPosition = identityFieldPosition;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* (non-Javadoc)
|
||||||
|
* @see java.util.Comparator#compare(java.lang.Object, java.lang.Object)
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
public int compare(final Row d1, final Row d2) {
|
||||||
|
if (d1 == null)
|
||||||
|
return d2 == null ? 0 : -1;
|
||||||
|
else if (d2 == null) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
final String o1 = d1.getString(comparatorField);
|
||||||
|
final String o2 = d2.getString(comparatorField);
|
||||||
|
|
||||||
|
if (o1 == null)
|
||||||
|
return o2 == null ? 0 : -1;
|
||||||
|
else if (o2 == null) {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
final String to1 = NGramUtils.cleanupForOrdering(o1);
|
||||||
|
final String to2 = NGramUtils.cleanupForOrdering(o2);
|
||||||
|
|
||||||
|
int res = to1.compareTo(to2);
|
||||||
|
if (res == 0) {
|
||||||
|
res = o1.compareTo(o2);
|
||||||
|
if (res == 0) {
|
||||||
|
return d1.getString(identityFieldPosition).compareTo(d2.getString(identityFieldPosition));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,131 @@
|
||||||
|
package eu.dnetlib.pace.model
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.{DedupConfig, Type}
|
||||||
|
import eu.dnetlib.pace.util.{BlockProcessor, SparkReporter}
|
||||||
|
import org.apache.spark.SparkContext
|
||||||
|
import org.apache.spark.sql.catalyst.expressions.Literal
|
||||||
|
import org.apache.spark.sql.expressions._
|
||||||
|
import org.apache.spark.sql.functions.{col, lit, udf}
|
||||||
|
import org.apache.spark.sql.types._
|
||||||
|
import org.apache.spark.sql.{Column, Dataset, Row, functions}
|
||||||
|
|
||||||
|
import java.util.function.Predicate
|
||||||
|
import java.util.stream.Collectors
|
||||||
|
import scala.collection.JavaConversions._
|
||||||
|
import scala.collection.JavaConverters._
|
||||||
|
import scala.collection.mutable
|
||||||
|
case class SparkDeduper(conf: DedupConfig) extends Serializable {
|
||||||
|
|
||||||
|
val model: SparkModel = SparkModel(conf)
|
||||||
|
|
||||||
|
/** Full deduplication pipeline: blacklist filtering, block generation and block processing, in this order. */
val dedup: (Dataset[Row] => Dataset[Row]) = df => {
  // The stages are referenced inside the function body because they are initialised after this val
  val filtered = filterAndCleanup(df)
  val blocks = generateClustersWithCollect(filtered)
  processBlocks(blocks)
}
|
||||||
|
|
||||||
|
|
||||||
|
/** Adds a `<field>_filtered` column for every model field that has a blacklist configured. */
val filterAndCleanup: (Dataset[Row] => Dataset[Row]) = df => {
  conf.getPace.getModel.asScala
    .filter(fdef => conf.blacklists.containsKey(fdef.getName))
    .foldLeft(df) { (current, fdef) =>
      val filteredName = fdef.getName + "_filtered"
      current.withColumn(filteredName, filterColumnUDF(fdef).apply(col(fdef.getName)))
    }
}
|
||||||
|
|
||||||
|
/** Builds the UDF applying the blacklist configured for the given field.
  *
  * For list-like fields (Type.List, Type.JSON) the blacklisted elements are removed; for any other field a
  * blacklisted value is replaced by the empty string. Null values are returned unchanged.
  *
  * @param fdef the definition of the field whose blacklist must be applied
  * @return the UDF to apply to the column holding the field
  * @throws IllegalArgumentException if no blacklist is configured for the field
  */
def filterColumnUDF(fdef: FieldDef): UserDefinedFunction = {
  val blacklist: Predicate[String] = conf.blacklists().get(fdef.getName)

  if (blacklist == null) {
    throw new IllegalArgumentException("Column: " + fdef.getName + " does not have any filter")
  }

  fdef.getType match {
    case Type.List | Type.JSON =>
      // Spark passes ArrayType values to Scala UDFs as Seq (WrappedArray): declaring the
      // parameter as Array[String] fails with a ClassCastException at execution time.
      udf[Seq[String], Seq[String]](values => {
        if (values == null) null
        else values.filter((v: String) => !blacklist.test(v))
      })

    case _ =>
      udf[String, String](v => {
        // null means "no value": leave it alone instead of handing it to the predicate
        if (v != null && blacklist.test(v)) ""
        else v
      })
  }
}
|
||||||
|
|
||||||
|
/** Groups the input rows into blocks, one block per (clustering, key) pair.
  *
  * For each configured clustering the rows are exploded into one row per generated key and numbered within
  * their key; rows whose position exceeds the configured queue max size are dropped, the remaining ones are
  * collected into a `block` column and blocks with a single element are discarded.
  *
  * Fails with IllegalArgumentException when the configuration defines no clustering.
  */
val generateClustersWithCollect: (Dataset[Row] => Dataset[Row]) = df_with_filters => {
  val keyedDatasets: Seq[Dataset[Row]] = conf.clusterings().asScala.zipWithIndex.map { case (cd, idx) =>
    // Read the blacklist-filtered column whenever the field has a blacklist configured
    val inputColumns: Seq[Column] = cd.getFields().asScala.map { fName =>
      if (conf.blacklists.containsKey(fName)) col(fName + "_filtered")
      else col(fName)
    }.toSeq

    df_with_filters
      .withColumn("clustering", lit(cd.getName + "::" + idx))
      // Add 'key' column with the value generated by the given clustering definition
      .withColumn("key", functions.explode(clusterValuesUDF(cd).apply(functions.array(inputColumns: _*))))
      // Add position column having the position of the row within the set of rows having the same key value ordered by the sorting value
      .withColumn(
        "position",
        functions
          .row_number()
          .over(Window.partitionBy("key").orderBy(col(model.orderingFieldName), col(model.identifierFieldName)))
      )
  }.toSeq

  // Without any clustering there is nothing to union: fail with an explicit message
  val df_with_clustering_keys: Dataset[Row] = keyedDatasets
    .reduceOption((left, right) => left.union(right))
    .getOrElse(
      throw new IllegalArgumentException("No clustering definition configured: cannot generate blocks")
    )

  //TODO: analytics

  df_with_clustering_keys
    // filter out rows with position exceeding the maxqueuesize parameter
    .filter(col("position").leq(conf.getWf.getQueueMaxSize))
    .groupBy("clustering", "key")
    .agg(functions.collect_set(functions.struct(model.schema.fieldNames.map(col): _*)).as("block"))
    .filter(functions.size(new Column("block")).gt(1))
}
|
||||||
|
|
||||||
|
/** Builds the UDF that turns an array of field values into the clustering keys generated by `cd`.
  *
  * The clustering function is applied to each value separately (as a single-element list of its string
  * form) and all the resulting keys are concatenated.
  */
def clusterValuesUDF(cd: ClusteringDef) = {
  udf[mutable.WrappedArray[String], mutable.WrappedArray[Any]] { values =>
    values.flatMap { value =>
      val clusteringFunction = cd.clusteringFunction()
      val keys = clusteringFunction.apply(conf, Seq(value.toString).asJava)
      keys.asScala
    }
  }
}
|
||||||
|
|
||||||
|
/** Runs the block processing UDF on every block with at least two rows and emits one row per relation. */
val processBlocks: (Dataset[Row] => Dataset[Row]) = df => {
  val blockColumn = col("block")
  val hasAtLeastTwoRows = functions.size(blockColumn).geq(lit(2))
  val relationsOf = processBlock(df.sqlContext.sparkContext)

  df.filter(hasAtLeastTwoRows)
    .withColumn("relations", relationsOf(blockColumn))
    .select(functions.explode(col("relations")).as("relation"))
}
|
||||||
|
|
||||||
|
/** Builds the (non-deterministic) UDF that processes a single block of rows.
  *
  * The rows of the block are sorted with RowDataOrderingComparator, truncated to the configured queue max
  * size and handed to BlockProcessor; the relations gathered by the reporter are returned as pairs.
  *
  * @param sc the Spark context used to construct the accumulators backing the reporter
  */
def processBlock(implicit sc: SparkContext) = {
  val accumulators = SparkReporter.constructAccumulator(conf, sc)

  udf[Array[(String, String)], mutable.WrappedArray[Row]] { block =>
    val reporter = new SparkReporter(accumulators)

    val rowOrdering = new RowDataOrderingComparator(model.orderingFieldPosition, model.identityFieldPosition)
    val sortedRows = block.asJava
      .stream()
      .sorted(rowOrdering)
      .limit(conf.getWf.getQueueMaxSize)
      .collect(Collectors.toList[Row]())

    val processor = new BlockProcessor(conf, model.identityFieldPosition, model.orderingFieldPosition)
    processor.processSortedRows(sortedRows, reporter)

    reporter.getRelations.asScala.toArray
  }.asNondeterministic()
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,108 @@
|
||||||
|
package eu.dnetlib.pace.model
|
||||||
|
|
||||||
|
import com.jayway.jsonpath.{Configuration, JsonPath}
|
||||||
|
import eu.dnetlib.pace.config.{DedupConfig, Type}
|
||||||
|
import eu.dnetlib.pace.util.MapDocumentUtil
|
||||||
|
import org.apache.spark.sql.catalyst.encoders.RowEncoder
|
||||||
|
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
|
||||||
|
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
|
||||||
|
import org.apache.spark.sql.{Dataset, Row}
|
||||||
|
|
||||||
|
import java.util.regex.Pattern
|
||||||
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
|
case class SparkModel(conf: DedupConfig) {
|
||||||
|
/** Matches values starting (after optional whitespace) with an http, https or ftp scheme. */
private val URL_REGEX: Pattern = Pattern.compile("^\\s*(http|https|ftp)\\://.*")

/** Separator splitting the path of a StringConcat field into its individual JSON paths. */
private val CONCAT_REGEX: Pattern = Pattern.compile("\\|\\|\\|")

/** Name of the implicit column holding the record identifier. */
val identifierFieldName = "identifier"

/** Name of the column used for ordering: the configured order field, or the identifier when it is empty. */
val orderingFieldName = {
  val configuredOrderField = conf.getWf.getOrderField
  if (configuredOrderField.isEmpty) identifierFieldName else configuredOrderField
}
|
||||||
|
|
||||||
|
/** Spark schema of the model: the implicit identifier column followed by one nullable column per model field.
  *
  * List and JSON fields become arrays of strings, DoubleArray fields arrays of doubles, everything else a string.
  */
val schema: StructType = {
  // create an implicit identifier field
  val identifier = new FieldDef()
  identifier.setName(identifierFieldName)
  identifier.setType(Type.String)

  val fields = (Seq(identifier) ++ conf.getPace.getModel.asScala).map { fieldDef =>
    val dataType = fieldDef.getType match {
      case Type.List | Type.JSON => DataTypes.createArrayType(DataTypes.StringType)
      case Type.DoubleArray      => DataTypes.createArrayType(DataTypes.DoubleType)
      case _                     => DataTypes.StringType
    }
    StructField(fieldDef.getName, dataType, true, Metadata.empty)
  }

  StructType(fields)
}
|
||||||
|
|
||||||
|
/** Position of the identifier column within the schema. */
val identityFieldPosition: Int = schema.fieldIndex(identifierFieldName)

/** Position of the ordering column within the schema. */
val orderingFieldPosition: Int = schema.fieldIndex(orderingFieldName)

/** Converts a dataset of JSON records into a dataset of rows conforming to the schema. */
val parseJsonDataset: (Dataset[String] => Dataset[Row]) = jsonRecords => {
  val rowEncoder = RowEncoder(schema)
  jsonRecords.map(json => rowFromJson(json))(rowEncoder)
}
|
||||||
|
|
||||||
|
/** Builds the row for a single JSON record.
  *
  * The identifier is read from the configured id path; every other column is read from the path of the model
  * field with the same name, according to the field type. Columns without a model field keep their initial
  * value.
  *
  * @param json the JSON record
  * @return a row carrying the model schema
  */
def rowFromJson(json: String): Row = {
  val jsonPathConfiguration =
    Configuration.defaultConfiguration.addOptions(com.jayway.jsonpath.Option.SUPPRESS_EXCEPTIONS)
  val documentContext = JsonPath.using(jsonPathConfiguration).parse(json)

  val values = new Array[Any](schema.size)
  values(identityFieldPosition) = MapDocumentUtil.getJPathString(conf.getWf.getIdPath, documentContext)

  for ((fname, index) <- schema.fieldNames.zipWithIndex) {
    val fdef = conf.getPace.getModelMap.get(fname)

    if (fdef != null) {
      values(index) = fdef.getType match {
        case Type.String | Type.Int =>
          val raw = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
          MapDocumentUtil.truncateValue(raw, fdef.getLength)

        case Type.URL =>
          // values that do not look like a URL are replaced by the empty string
          val candidate = MapDocumentUtil.getJPathString(fdef.getPath, documentContext)
          if (URL_REGEX.matcher(candidate).matches) candidate else ""

        case Type.List | Type.JSON =>
          val items = MapDocumentUtil.getJPathList(fdef.getPath, documentContext, fdef.getType)
          MapDocumentUtil.truncateList(items, fdef.getSize).toArray

        case Type.StringConcat =>
          // the path holds several JSON paths: their values are joined with a space
          val joined = CONCAT_REGEX
            .split(fdef.getPath)
            .map(jpath => MapDocumentUtil.getJPathString(jpath, documentContext))
            .mkString(" ")
          MapDocumentUtil.truncateValue(joined, fdef.getLength)

        case Type.DoubleArray =>
          MapDocumentUtil.getJPathArray(fdef.getPath, json)
      }
    }
  }

  new GenericRowWithSchema(values, schema)
}
|
||||||
|
}
|
||||||
|
|
|
@ -6,12 +6,11 @@ import java.util.Map;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("alwaysMatch")
|
@ComparatorClass("alwaysMatch")
|
||||||
public class AlwaysMatch extends AbstractComparator {
|
public class AlwaysMatch<T> extends AbstractComparator<T> {
|
||||||
|
|
||||||
public AlwaysMatch(final Map<String, String> params) {
|
public AlwaysMatch(final Map<String, String> params) {
|
||||||
super(params, new com.wcohen.ss.JaroWinkler());
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
|
@ -26,7 +25,7 @@ public class AlwaysMatch extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final Field a, final Field b, final Config conf) {
|
public double compare(final Object a, final Object b, final Config conf) {
|
||||||
return 1.0;
|
return 1.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,25 +1,19 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.function.Function;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
import com.google.common.collect.Iterables;
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.model.FieldList;
|
|
||||||
import eu.dnetlib.pace.model.Person;
|
import eu.dnetlib.pace.model.Person;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("authorsMatch")
|
@ComparatorClass("authorsMatch")
|
||||||
public class AuthorsMatch extends AbstractComparator {
|
public class AuthorsMatch extends AbstractListComparator {
|
||||||
|
|
||||||
Map<String, String> params;
|
Map<String, String> params;
|
||||||
|
|
||||||
|
@ -49,24 +43,16 @@ public class AuthorsMatch extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final Field a, final Field b, final Config conf) {
|
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||||
|
|
||||||
if (a.isEmpty() || b.isEmpty())
|
if (a.isEmpty() || b.isEmpty())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) b).size() > SIZE_THRESHOLD)
|
if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
|
||||||
return 1.0;
|
return 1.0;
|
||||||
|
|
||||||
List<Person> aList = ((FieldList) a)
|
List<Person> aList = a.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
||||||
.stringList()
|
List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
||||||
.stream()
|
|
||||||
.map(author -> new Person(author, false))
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
List<Person> bList = ((FieldList) b)
|
|
||||||
.stringList()
|
|
||||||
.stream()
|
|
||||||
.map(author -> new Person(author, false))
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
|
|
||||||
common = 0;
|
common = 0;
|
||||||
// compare each element of List1 with each element of List2
|
// compare each element of List1 with each element of List2
|
||||||
|
|
|
@ -5,11 +5,11 @@ import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("cityMatch")
|
@ComparatorClass("cityMatch")
|
||||||
public class CityMatch extends AbstractComparator {
|
public class CityMatch extends AbstractStringComparator {
|
||||||
|
|
||||||
private Map<String, String> params;
|
private Map<String, String> params;
|
||||||
|
|
||||||
|
|
|
@ -1,21 +1,14 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.model.FieldList;
|
|
||||||
import eu.dnetlib.pace.model.FieldValueImpl;
|
|
||||||
import eu.dnetlib.pace.model.Person;
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("cosineSimilarity")
|
@ComparatorClass("cosineSimilarity")
|
||||||
public class CosineSimilarity extends AbstractComparator {
|
public class CosineSimilarity extends AbstractComparator<double[]> {
|
||||||
|
|
||||||
Map<String, String> params;
|
Map<String, String> params;
|
||||||
|
|
||||||
|
@ -24,15 +17,16 @@ public class CosineSimilarity extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final Field a, final Field b, final Config conf) {
|
public double compare(Object a, Object b, Config config) {
|
||||||
|
return compare((double[]) a, (double[]) b, config);
|
||||||
|
}
|
||||||
|
|
||||||
if (a.isEmpty() || b.isEmpty())
|
public double compare(final double[] a, final double[] b, final Config conf) {
|
||||||
|
|
||||||
|
if (a.length == 0 || b.length == 0)
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
double[] aVector = ((FieldValueImpl) a).doubleArrayValue();
|
return cosineSimilarity(a, b);
|
||||||
double[] bVector = ((FieldValueImpl) b).doubleArrayValue();
|
|
||||||
|
|
||||||
return cosineSimilarity(aVector, bVector);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
double cosineSimilarity(double[] a, double[] b) {
|
double cosineSimilarity(double[] a, double[] b) {
|
||||||
|
|
|
@ -3,7 +3,6 @@ package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -21,8 +20,8 @@ public class DoiExactMatch extends ExactMatchIgnoreCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected String getValue(final Field f) {
|
protected String toString(final Object f) {
|
||||||
return super.getValue(f).replaceAll(PREFIX, "");
|
return super.toString(f).replaceAll(PREFIX, "");
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -5,7 +5,6 @@ import java.net.MalformedURLException;
|
||||||
import java.net.URL;
|
import java.net.URL;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("domainExactMatch")
|
@ComparatorClass("domainExactMatch")
|
||||||
|
@ -16,10 +15,10 @@ public class DomainExactMatch extends ExactMatchIgnoreCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected String getValue(final Field f) {
|
protected String toString(final Object f) {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
return asUrl(super.getValue(f)).getHost();
|
return asUrl(super.toString(f)).getHost();
|
||||||
} catch (MalformedURLException e) {
|
} catch (MalformedURLException e) {
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,11 +6,11 @@ import java.util.Map;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("exactMatch")
|
@ComparatorClass("exactMatch")
|
||||||
public class ExactMatch extends AbstractComparator {
|
public class ExactMatch extends AbstractStringComparator {
|
||||||
|
|
||||||
public ExactMatch(Map<String, String> params) {
|
public ExactMatch(Map<String, String> params) {
|
||||||
super(params, new com.wcohen.ss.JaroWinkler());
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
|
|
|
@ -4,30 +4,26 @@ package eu.dnetlib.pace.tree;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("exactMatchIgnoreCase")
|
@ComparatorClass("exactMatchIgnoreCase")
|
||||||
public class ExactMatchIgnoreCase extends AbstractComparator {
|
public class ExactMatchIgnoreCase extends AbstractStringComparator {
|
||||||
|
|
||||||
public ExactMatchIgnoreCase(Map<String, String> params) {
|
public ExactMatchIgnoreCase(Map<String, String> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(Field a, Field b, final Config conf) {
|
public double compare(String a, String b, final Config conf) {
|
||||||
|
|
||||||
final String fa = getValue(a);
|
if (a.isEmpty() || b.isEmpty())
|
||||||
final String fb = getValue(b);
|
|
||||||
|
|
||||||
if (fa.isEmpty() || fb.isEmpty())
|
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
return fa.equalsIgnoreCase(fb) ? 1 : 0;
|
return a.equalsIgnoreCase(b) ? 1 : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getValue(final Field f) {
|
protected String toString(final Object object) {
|
||||||
return getFirstValue(f);
|
return toFirstString(object);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,13 +10,11 @@ import java.util.stream.Collectors;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||||
import eu.dnetlib.pace.model.FieldList;
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("instanceTypeMatch")
|
@ComparatorClass("instanceTypeMatch")
|
||||||
public class InstanceTypeMatch extends AbstractComparator {
|
public class InstanceTypeMatch extends AbstractListComparator {
|
||||||
|
|
||||||
final Map<String, String> translationMap = new HashMap<>();
|
final Map<String, String> translationMap = new HashMap<>();
|
||||||
|
|
||||||
|
@ -42,21 +40,18 @@ public class InstanceTypeMatch extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final Field a, final Field b, final Config conf) {
|
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||||
|
|
||||||
if (a == null || b == null) {
|
if (a == null || b == null) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<String> sa = ((FieldList) a).stringList();
|
if (a.isEmpty() || b.isEmpty()) {
|
||||||
final List<String> sb = ((FieldList) b).stringList();
|
|
||||||
|
|
||||||
if (sa.isEmpty() || sb.isEmpty()) {
|
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
final Set<String> ca = sa.stream().map(this::translate).collect(Collectors.toSet());
|
final Set<String> ca = a.stream().map(this::translate).collect(Collectors.toSet());
|
||||||
final Set<String> cb = sb.stream().map(this::translate).collect(Collectors.toSet());
|
final Set<String> cb = b.stream().map(this::translate).collect(Collectors.toSet());
|
||||||
|
|
||||||
// if at least one is a jolly type, it must produce a match
|
// if at least one is a jolly type, it must produce a match
|
||||||
if (ca.contains("*") || cb.contains("*"))
|
if (ca.contains("*") || cb.contains("*"))
|
||||||
|
|
|
@ -6,12 +6,12 @@ import java.util.Map;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||||
@ComparatorClass("jaroWinkler")
|
@ComparatorClass("jaroWinkler")
|
||||||
public class JaroWinkler extends AbstractComparator {
|
public class JaroWinkler extends AbstractStringComparator {
|
||||||
|
|
||||||
public JaroWinkler(Map<String, String> params) {
|
public JaroWinkler(Map<String, String> params) {
|
||||||
super(params, new com.wcohen.ss.JaroWinkler());
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
|
|
|
@ -7,11 +7,11 @@ import java.util.Set;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("jaroWinklerNormalizedName")
|
@ComparatorClass("jaroWinklerNormalizedName")
|
||||||
public class JaroWinklerNormalizedName extends AbstractComparator {
|
public class JaroWinklerNormalizedName extends AbstractStringComparator {
|
||||||
|
|
||||||
private Map<String, String> params;
|
private Map<String, String> params;
|
||||||
|
|
||||||
|
|
|
@ -6,12 +6,12 @@ import java.util.Map;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
//case class JaroWinkler(w: Double) extends SecondStringDistanceAlgo(w, new com.wcohen.ss.JaroWinkler())
|
||||||
@ComparatorClass("jaroWinklerTitle")
|
@ComparatorClass("jaroWinklerTitle")
|
||||||
public class JaroWinklerTitle extends AbstractComparator {
|
public class JaroWinklerTitle extends AbstractStringComparator {
|
||||||
|
|
||||||
public JaroWinklerTitle(Map<String, String> params) {
|
public JaroWinklerTitle(Map<String, String> params) {
|
||||||
super(params, new com.wcohen.ss.JaroWinkler());
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
|
|
|
@ -10,16 +10,18 @@ import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
import com.jayway.jsonpath.Configuration;
|
||||||
|
import com.jayway.jsonpath.DocumentContext;
|
||||||
|
import com.jayway.jsonpath.JsonPath;
|
||||||
|
import com.jayway.jsonpath.Option;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||||
import eu.dnetlib.pace.model.FieldList;
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
|
|
||||||
@ComparatorClass("jsonListMatch")
|
@ComparatorClass("jsonListMatch")
|
||||||
public class JsonListMatch extends AbstractComparator {
|
public class JsonListMatch extends AbstractListComparator {
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(JsonListMatch.class);
|
private static final Log log = LogFactory.getLog(JsonListMatch.class);
|
||||||
private Map<String, String> params;
|
private Map<String, String> params;
|
||||||
|
@ -34,11 +36,7 @@ public class JsonListMatch extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final Field a, final Field b, final Config conf) {
|
public double compare(final List<String> sa, final List<String> sb, final Config conf) {
|
||||||
|
|
||||||
final List<String> sa = ((FieldList) a).stringList();
|
|
||||||
final List<String> sb = ((FieldList) b).stringList();
|
|
||||||
|
|
||||||
if (sa.isEmpty() || sb.isEmpty()) {
|
if (sa.isEmpty() || sb.isEmpty()) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
@ -65,14 +63,17 @@ public class JsonListMatch extends AbstractComparator {
|
||||||
|
|
||||||
StringBuilder st = new StringBuilder(); // to build the string used for comparisons basing on the jpath into
|
StringBuilder st = new StringBuilder(); // to build the string used for comparisons basing on the jpath into
|
||||||
// parameters
|
// parameters
|
||||||
|
final DocumentContext documentContext = JsonPath
|
||||||
|
.using(Configuration.defaultConfiguration().addOptions(Option.SUPPRESS_EXCEPTIONS))
|
||||||
|
.parse(json);
|
||||||
// for each path in the param list
|
// for each path in the param list
|
||||||
for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
|
for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
|
||||||
String path = params.get(key);
|
String path = params.get(key);
|
||||||
String value = MapDocumentUtil.getJPathString(path, json);
|
String value = MapDocumentUtil.getJPathString(path, documentContext);
|
||||||
if (value == null || value.isEmpty())
|
if (value == null || value.isEmpty())
|
||||||
value = "";
|
value = "";
|
||||||
st.append(value + "::");
|
st.append(value);
|
||||||
|
st.append("::");
|
||||||
}
|
}
|
||||||
|
|
||||||
st.setLength(st.length() - 2);
|
st.setLength(st.length() - 2);
|
||||||
|
|
|
@ -5,11 +5,11 @@ import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("keywordMatch")
|
@ComparatorClass("keywordMatch")
|
||||||
public class KeywordMatch extends AbstractComparator {
|
public class KeywordMatch extends AbstractStringComparator {
|
||||||
|
|
||||||
Map<String, String> params;
|
Map<String, String> params;
|
||||||
|
|
||||||
|
|
|
@ -5,11 +5,11 @@ import java.util.Map;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("level2JaroWinkler")
|
@ComparatorClass("level2JaroWinkler")
|
||||||
public class Level2JaroWinkler extends AbstractComparator {
|
public class Level2JaroWinkler extends AbstractStringComparator {
|
||||||
|
|
||||||
public Level2JaroWinkler(Map<String, String> params) {
|
public Level2JaroWinkler(Map<String, String> params) {
|
||||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||||
|
|
|
@ -6,11 +6,11 @@ import java.util.Map;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("level2JaroWinklerTitle")
|
@ComparatorClass("level2JaroWinklerTitle")
|
||||||
public class Level2JaroWinklerTitle extends AbstractComparator {
|
public class Level2JaroWinklerTitle extends AbstractStringComparator {
|
||||||
|
|
||||||
public Level2JaroWinklerTitle(Map<String, String> params) {
|
public Level2JaroWinklerTitle(Map<String, String> params) {
|
||||||
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
super(params, new com.wcohen.ss.Level2JaroWinkler());
|
||||||
|
|
|
@ -5,11 +5,11 @@ import java.util.Map;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("level2Levenstein")
|
@ComparatorClass("level2Levenstein")
|
||||||
public class Level2Levenstein extends AbstractComparator {
|
public class Level2Levenstein extends AbstractStringComparator {
|
||||||
|
|
||||||
public Level2Levenstein(Map<String, String> params) {
|
public Level2Levenstein(Map<String, String> params) {
|
||||||
super(params, new com.wcohen.ss.Level2Levenstein());
|
super(params, new com.wcohen.ss.Level2Levenstein());
|
||||||
|
|
|
@ -5,11 +5,11 @@ import java.util.Map;
|
||||||
|
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("levenstein")
|
@ComparatorClass("levenstein")
|
||||||
public class Levenstein extends AbstractComparator {
|
public class Levenstein extends AbstractStringComparator {
|
||||||
|
|
||||||
public Levenstein(Map<String, String> params) {
|
public Levenstein(Map<String, String> params) {
|
||||||
super(params, new com.wcohen.ss.Levenstein());
|
super(params, new com.wcohen.ss.Levenstein());
|
||||||
|
|
|
@ -9,11 +9,11 @@ import org.apache.commons.logging.LogFactory;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("levensteinTitle")
|
@ComparatorClass("levensteinTitle")
|
||||||
public class LevensteinTitle extends AbstractComparator {
|
public class LevensteinTitle extends AbstractStringComparator {
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(LevensteinTitle.class);
|
private static final Log log = LogFactory.getLog(LevensteinTitle.class);
|
||||||
|
|
||||||
|
|
|
@ -6,14 +6,14 @@ import java.util.Map;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compares two titles, ignoring version numbers. Suitable for Software entities.
|
* Compares two titles, ignoring version numbers. Suitable for Software entities.
|
||||||
*/
|
*/
|
||||||
@ComparatorClass("levensteinTitleIgnoreVersion")
|
@ComparatorClass("levensteinTitleIgnoreVersion")
|
||||||
public class LevensteinTitleIgnoreVersion extends AbstractComparator {
|
public class LevensteinTitleIgnoreVersion extends AbstractStringComparator {
|
||||||
|
|
||||||
public LevensteinTitleIgnoreVersion(Map<String, String> params) {
|
public LevensteinTitleIgnoreVersion(Map<String, String> params) {
|
||||||
super(params, new com.wcohen.ss.Levenstein());
|
super(params, new com.wcohen.ss.Levenstein());
|
||||||
|
|
|
@ -3,15 +3,10 @@ package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||||
import eu.dnetlib.pace.model.FieldList;
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -20,7 +15,7 @@ import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
* @author miconis
|
* @author miconis
|
||||||
* */
|
* */
|
||||||
@ComparatorClass("listContainsMatch")
|
@ComparatorClass("listContainsMatch")
|
||||||
public class ListContainsMatch extends AbstractComparator {
|
public class ListContainsMatch extends AbstractListComparator {
|
||||||
|
|
||||||
private Map<String, String> params;
|
private Map<String, String> params;
|
||||||
private boolean CASE_SENSITIVE;
|
private boolean CASE_SENSITIVE;
|
||||||
|
@ -38,11 +33,7 @@ public class ListContainsMatch extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final Field a, final Field b, final Config conf) {
|
public double compare(List<String> sa, List<String> sb, Config conf) {
|
||||||
|
|
||||||
List<String> sa = ((FieldList) a).stringList();
|
|
||||||
List<String> sb = ((FieldList) b).stringList();
|
|
||||||
|
|
||||||
if (sa.isEmpty() || sb.isEmpty()) {
|
if (sa.isEmpty() || sb.isEmpty()) {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,11 +6,11 @@ import java.util.Map;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("mustBeDifferent")
|
@ComparatorClass("mustBeDifferent")
|
||||||
public class MustBeDifferent extends AbstractComparator {
|
public class MustBeDifferent extends AbstractStringComparator {
|
||||||
|
|
||||||
public MustBeDifferent(Map<String, String> params) {
|
public MustBeDifferent(Map<String, String> params) {
|
||||||
super(params, new com.wcohen.ss.Levenstein());
|
super(params, new com.wcohen.ss.Levenstein());
|
||||||
|
|
|
@ -4,7 +4,6 @@ package eu.dnetlib.pace.tree;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.tree.support.Comparator;
|
import eu.dnetlib.pace.tree.support.Comparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
|
@ -13,13 +12,13 @@ import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
* NullDistanceAlgo.
|
* NullDistanceAlgo.
|
||||||
*/
|
*/
|
||||||
@ComparatorClass("null")
|
@ComparatorClass("null")
|
||||||
public class NullDistanceAlgo implements Comparator {
|
public class NullDistanceAlgo<T> implements Comparator<T> {
|
||||||
|
|
||||||
public NullDistanceAlgo(Map<String, String> params) {
|
public NullDistanceAlgo(Map<String, String> params) {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(Field a, Field b, Config config) {
|
public double compare(Object a, Object b, Config config) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,11 +4,11 @@ package eu.dnetlib.pace.tree;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("numbersComparator")
|
@ComparatorClass("numbersComparator")
|
||||||
public class NumbersComparator extends AbstractComparator {
|
public class NumbersComparator extends AbstractStringComparator {
|
||||||
|
|
||||||
Map<String, String> params;
|
Map<String, String> params;
|
||||||
|
|
||||||
|
|
|
@ -4,11 +4,11 @@ package eu.dnetlib.pace.tree;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("numbersMatch")
|
@ComparatorClass("numbersMatch")
|
||||||
public class NumbersMatch extends AbstractComparator {
|
public class NumbersMatch extends AbstractStringComparator {
|
||||||
|
|
||||||
public NumbersMatch(Map<String, String> params) {
|
public NumbersMatch(Map<String, String> params) {
|
||||||
super(params);
|
super(params);
|
||||||
|
|
|
@ -4,11 +4,11 @@ package eu.dnetlib.pace.tree;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("romansMatch")
|
@ComparatorClass("romansMatch")
|
||||||
public class RomansMatch extends AbstractComparator {
|
public class RomansMatch extends AbstractStringComparator {
|
||||||
|
|
||||||
public RomansMatch(Map<String, String> params) {
|
public RomansMatch(Map<String, String> params) {
|
||||||
super(params);
|
super(params);
|
||||||
|
|
|
@ -4,11 +4,8 @@ package eu.dnetlib.pace.tree;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import com.google.common.collect.Iterables;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -17,7 +14,7 @@ import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
* @author claudio
|
* @author claudio
|
||||||
*/
|
*/
|
||||||
@ComparatorClass("sizeMatch")
|
@ComparatorClass("sizeMatch")
|
||||||
public class SizeMatch extends AbstractComparator {
|
public class SizeMatch extends AbstractListComparator {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instantiates a new size match.
|
* Instantiates a new size match.
|
||||||
|
@ -30,23 +27,12 @@ public class SizeMatch extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final Field a, final Field b, final Config conf) {
|
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||||
|
|
||||||
if (a.isEmpty() || b.isEmpty())
|
if (a.isEmpty() || b.isEmpty())
|
||||||
return -1;
|
return -1.0;
|
||||||
|
|
||||||
return Iterables.size(a) == Iterables.size(b) ? 1 : 0;
|
return a.size() == b.size() ? 1.0 : 0.0;
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Checks if is empty.
|
|
||||||
*
|
|
||||||
* @param a
|
|
||||||
* the a
|
|
||||||
* @return true, if is empty
|
|
||||||
*/
|
|
||||||
protected boolean isEmpty(final Iterable<?> a) {
|
|
||||||
return (a == null) || Iterables.isEmpty(a);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,7 +4,7 @@ package eu.dnetlib.pace.tree;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -13,7 +13,7 @@ import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
* @author miconis
|
* @author miconis
|
||||||
* */
|
* */
|
||||||
@ComparatorClass("stringContainsMatch")
|
@ComparatorClass("stringContainsMatch")
|
||||||
public class StringContainsMatch extends AbstractComparator {
|
public class StringContainsMatch extends AbstractStringComparator {
|
||||||
|
|
||||||
private Map<String, String> params;
|
private Map<String, String> params;
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
@ -11,13 +12,11 @@ import org.apache.commons.logging.LogFactory;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.tree.support.AbstractListComparator;
|
||||||
import eu.dnetlib.pace.model.FieldList;
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("stringListMatch")
|
@ComparatorClass("stringListMatch")
|
||||||
public class StringListMatch extends AbstractComparator {
|
public class StringListMatch extends AbstractListComparator {
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(StringListMatch.class);
|
private static final Log log = LogFactory.getLog(StringListMatch.class);
|
||||||
private Map<String, String> params;
|
private Map<String, String> params;
|
||||||
|
@ -32,10 +31,10 @@ public class StringListMatch extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final Field a, final Field b, final Config conf) {
|
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||||
|
|
||||||
final Set<String> pa = new HashSet<>(((FieldList) a).stringList());
|
final Set<String> pa = new HashSet<>(a);
|
||||||
final Set<String> pb = new HashSet<>(((FieldList) b).stringList());
|
final Set<String> pb = new HashSet<>(b);
|
||||||
|
|
||||||
if (pa.isEmpty() || pb.isEmpty()) {
|
if (pa.isEmpty() || pb.isEmpty()) {
|
||||||
return -1; // return undefined if one of the two lists is empty
|
return -1; // return undefined if one of the two lists is empty
|
||||||
|
|
|
@ -8,25 +8,24 @@ import org.apache.commons.lang3.StringUtils;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.config.Type;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The Class SubStringLevenstein.
|
* The Class SubStringLevenstein.
|
||||||
*/
|
*/
|
||||||
@ComparatorClass("subStringLevenstein")
|
@ComparatorClass("subStringLevenstein")
|
||||||
public class SubStringLevenstein extends AbstractComparator {
|
public class SubStringLevenstein extends AbstractStringComparator {
|
||||||
|
|
||||||
/** The limit. */
|
/**
|
||||||
|
* The limit.
|
||||||
|
*/
|
||||||
protected int limit;
|
protected int limit;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instantiates a new sub string levenstein.
|
* Instantiates a new sub string levenstein.
|
||||||
*
|
*
|
||||||
* @param w
|
* @param w the w
|
||||||
* the w
|
|
||||||
*/
|
*/
|
||||||
public SubStringLevenstein(final double w) {
|
public SubStringLevenstein(final double w) {
|
||||||
super(w, new com.wcohen.ss.Levenstein());
|
super(w, new com.wcohen.ss.Levenstein());
|
||||||
|
@ -39,11 +38,9 @@ public class SubStringLevenstein extends AbstractComparator {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instantiates a new sub string levenstein.
|
* Instantiates a new sub string levenstein.
|
||||||
*
|
*
|
||||||
* @param w
|
* @param w the w
|
||||||
* the w
|
* @param limit the limit
|
||||||
* @param limit
|
|
||||||
* the limit
|
|
||||||
*/
|
*/
|
||||||
public SubStringLevenstein(final double w, final int limit) {
|
public SubStringLevenstein(final double w, final int limit) {
|
||||||
super(w, new com.wcohen.ss.Levenstein());
|
super(w, new com.wcohen.ss.Levenstein());
|
||||||
|
@ -52,13 +49,10 @@ public class SubStringLevenstein extends AbstractComparator {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instantiates a new sub string levenstein.
|
* Instantiates a new sub string levenstein.
|
||||||
*
|
*
|
||||||
* @param w
|
* @param w the w
|
||||||
* the w
|
* @param limit the limit
|
||||||
* @param limit
|
* @param ssalgo the ssalgo
|
||||||
* the limit
|
|
||||||
* @param ssalgo
|
|
||||||
* the ssalgo
|
|
||||||
*/
|
*/
|
||||||
protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
|
protected SubStringLevenstein(final double w, final int limit, final AbstractStringDistance ssalgo) {
|
||||||
super(w, ssalgo);
|
super(w, ssalgo);
|
||||||
|
@ -71,11 +65,8 @@ public class SubStringLevenstein extends AbstractComparator {
|
||||||
* eu.dnetlib.pace.model.Field)
|
* eu.dnetlib.pace.model.Field)
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public double distance(final Field a, final Field b, final Config conf) {
|
public double distance(final String a, final String b, final Config conf) {
|
||||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
|
return distance(StringUtils.left(a, limit), StringUtils.left(b, limit), conf);
|
||||||
return distance(StringUtils.left(a.stringValue(), limit), StringUtils.left(b.stringValue(), limit), conf);
|
|
||||||
|
|
||||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -1,12 +1,10 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.tree;
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -16,17 +14,14 @@ import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
@ComparatorClass("titleVersionMatch")
|
@ComparatorClass("titleVersionMatch")
|
||||||
public class TitleVersionMatch extends AbstractComparator {
|
public class TitleVersionMatch extends AbstractStringComparator {
|
||||||
|
|
||||||
public TitleVersionMatch(final Map<String, String> params) {
|
public TitleVersionMatch(final Map<String, String> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final Field a, final Field b, final Config conf) {
|
public double compare(final String valueA, final String valueB, final Config conf) {
|
||||||
final String valueA = getFirstValue(a);
|
|
||||||
final String valueB = getFirstValue(b);
|
|
||||||
|
|
||||||
if (valueA.isEmpty() || valueB.isEmpty())
|
if (valueA.isEmpty() || valueB.isEmpty())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
|
@ -38,4 +33,7 @@ public class TitleVersionMatch extends AbstractComparator {
|
||||||
return getClass().getSimpleName() + ":" + super.toString();
|
return getClass().getSimpleName() + ":" + super.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected String toString(final Object object) {
|
||||||
|
return toFirstString(object);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,7 +8,6 @@ import java.util.Map;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
@ComparatorClass("urlMatcher")
|
@ComparatorClass("urlMatcher")
|
||||||
|
@ -31,9 +30,9 @@ public class UrlMatcher extends Levenstein {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double distance(Field a, Field b, final Config conf) {
|
public double distance(String a, String b, final Config conf) {
|
||||||
final URL urlA = asUrl(getFirstValue(a));
|
final URL urlA = asUrl(a);
|
||||||
final URL urlB = asUrl(getFirstValue(b));
|
final URL urlB = asUrl(b);
|
||||||
|
|
||||||
if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) {
|
if (!urlA.getHost().equalsIgnoreCase(urlB.getHost())) {
|
||||||
return 0.0;
|
return 0.0;
|
||||||
|
@ -58,4 +57,7 @@ public class UrlMatcher extends Levenstein {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected String toString(final Object object) {
|
||||||
|
return toFirstString(object);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,8 +6,7 @@ import java.util.Map;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
import eu.dnetlib.pace.tree.support.AbstractComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -16,7 +15,7 @@ import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
* @author claudio
|
* @author claudio
|
||||||
*/
|
*/
|
||||||
@ComparatorClass("yearMatch")
|
@ComparatorClass("yearMatch")
|
||||||
public class YearMatch extends AbstractComparator {
|
public class YearMatch extends AbstractStringComparator {
|
||||||
|
|
||||||
private int limit = 4;
|
private int limit = 4;
|
||||||
|
|
||||||
|
@ -25,7 +24,7 @@ public class YearMatch extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final Field a, final Field b, final Config conf) {
|
public double compare(final String a, final String b, final Config conf) {
|
||||||
final String valueA = getNumbers(getFirstValue(a));
|
final String valueA = getNumbers(getFirstValue(a));
|
||||||
final String valueB = getNumbers(getFirstValue(b));
|
final String valueB = getNumbers(getFirstValue(b));
|
||||||
|
|
||||||
|
@ -42,8 +41,8 @@ public class YearMatch extends AbstractComparator {
|
||||||
return s.length() == limit;
|
return s.length() == limit;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getFirstValue(final Field value) {
|
protected String getFirstValue(final String value) {
|
||||||
return (value != null) && !value.isEmpty() ? StringUtils.left(value.stringValue(), limit) : "";
|
return (value != null) && !value.isEmpty() ? StringUtils.left(value, limit) : "";
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -4,15 +4,14 @@ package eu.dnetlib.pace.tree.support;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
import com.google.common.base.Joiner;
|
||||||
|
import com.google.common.collect.Lists;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.config.Type;
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.model.FieldList;
|
|
||||||
|
|
||||||
public abstract class AbstractComparator extends AbstractPaceFunctions implements Comparator {
|
public abstract class AbstractComparator<T> extends AbstractPaceFunctions implements Comparator<T> {
|
||||||
|
|
||||||
/** The ssalgo. */
|
/** The ssalgo. */
|
||||||
protected AbstractStringDistance ssalgo;
|
protected AbstractStringDistance ssalgo;
|
||||||
|
@ -69,8 +68,8 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement
|
||||||
* the b
|
* the b
|
||||||
* @return the double
|
* @return the double
|
||||||
*/
|
*/
|
||||||
public double distance(final String a, final String b, final Config conf) {
|
|
||||||
|
|
||||||
|
protected double distance(final String a, final String b, final Config conf) {
|
||||||
if (a.isEmpty() || b.isEmpty()) {
|
if (a.isEmpty() || b.isEmpty()) {
|
||||||
return -1; // return -1 if a field is missing
|
return -1; // return -1 if a field is missing
|
||||||
}
|
}
|
||||||
|
@ -78,49 +77,50 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implement
|
||||||
return normalize(score);
|
return normalize(score);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
protected double compare(final String a, final String b, final Config conf) {
|
||||||
* Distance.
|
|
||||||
*
|
|
||||||
* @param a
|
|
||||||
* the a
|
|
||||||
* @param b
|
|
||||||
* the b
|
|
||||||
* @return the double
|
|
||||||
*/
|
|
||||||
protected double distance(final List<String> a, final List<String> b, final Config conf) {
|
|
||||||
return distance(concat(a), concat(b), conf);
|
|
||||||
}
|
|
||||||
|
|
||||||
public double distance(final Field a, final Field b, final Config conf) {
|
|
||||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
|
|
||||||
return distance(a.stringValue(), b.stringValue(), conf);
|
|
||||||
if (a.getType().equals(Type.List) && b.getType().equals(Type.List))
|
|
||||||
return distance(toList(a), toList(b), conf);
|
|
||||||
|
|
||||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public double compare(final Field a, final Field b, final Config conf) {
|
|
||||||
if (a.isEmpty() || b.isEmpty())
|
if (a.isEmpty() || b.isEmpty())
|
||||||
return -1;
|
return -1;
|
||||||
if (a.getType().equals(Type.String) && b.getType().equals(Type.String))
|
return distance(a, b, conf);
|
||||||
return distance(a.stringValue(), b.stringValue(), conf);
|
|
||||||
if (a.getType().equals(Type.List) && b.getType().equals(Type.List))
|
|
||||||
return distance(toList(a), toList(b), conf);
|
|
||||||
|
|
||||||
throw new IllegalArgumentException("invalid types\n- A: " + a.toString() + "\n- B: " + b.toString());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* To list.
|
* Convert the given argument to a List of Strings
|
||||||
*
|
*
|
||||||
* @param list
|
* @param object
|
||||||
* the list
|
* function argument
|
||||||
* @return the list
|
* @return the list
|
||||||
*/
|
*/
|
||||||
protected List<String> toList(final Field list) {
|
protected List<String> toList(final Object object) {
|
||||||
return ((FieldList) list).stringList();
|
if (object instanceof List) {
|
||||||
|
return (List<String>) object;
|
||||||
|
}
|
||||||
|
|
||||||
|
return Lists.newArrayList(object.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Convert the given argument to a String
|
||||||
|
*
|
||||||
|
* @param object
|
||||||
|
* function argument
|
||||||
|
* @return the string
|
||||||
|
*/
|
||||||
|
protected String toString(final Object object) {
|
||||||
|
if (object instanceof List) {
|
||||||
|
List<String> l = (List<String>) object;
|
||||||
|
return Joiner.on(" ").join(l);
|
||||||
|
}
|
||||||
|
|
||||||
|
return object.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected String toFirstString(final Object object) {
|
||||||
|
if (object instanceof List) {
|
||||||
|
List<String> l = (List<String>) object;
|
||||||
|
return l.isEmpty() ? "" : l.get(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
return object.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
public double getWeight() {
|
public double getWeight() {
|
||||||
|
|
|
@ -0,0 +1,39 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.tree.support;
|
||||||
|
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
|
abstract public class AbstractListComparator extends AbstractComparator<List<String>> {
|
||||||
|
protected AbstractListComparator(Map<String, String> params) {
|
||||||
|
super(params);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected AbstractListComparator(Map<String, String> params, AbstractStringDistance ssalgo) {
|
||||||
|
super(params, ssalgo);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected AbstractListComparator(double weight, AbstractStringDistance ssalgo) {
|
||||||
|
super(weight, ssalgo);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected AbstractListComparator(AbstractStringDistance ssalgo) {
|
||||||
|
super(ssalgo);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double compare(Object a, Object b, Config conf) {
|
||||||
|
return compare(toList(a), toList(b), conf);
|
||||||
|
}
|
||||||
|
|
||||||
|
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||||
|
if (a.isEmpty() || b.isEmpty())
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
return distance(concat(a), concat(b), conf);
|
||||||
|
}
|
||||||
|
}
|
|
@ -8,10 +8,7 @@ import java.util.Map;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.wcohen.ss.AbstractStringDistance;
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
public abstract class AbstractSortedComparator extends AbstractListComparator {
|
||||||
import eu.dnetlib.pace.model.FieldList;
|
|
||||||
|
|
||||||
public abstract class AbstractSortedComparator extends AbstractComparator {
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instantiates a new sorted second string compare algo.
|
* Instantiates a new sorted second string compare algo.
|
||||||
|
@ -30,11 +27,14 @@ public abstract class AbstractSortedComparator extends AbstractComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<String> toList(final Field list) {
|
protected List<String> toList(final Object object) {
|
||||||
FieldList fl = (FieldList) list;
|
if (object instanceof List) {
|
||||||
List<String> values = Lists.newArrayList(fl.stringList());
|
List<String> fl = (List<String>) object;
|
||||||
Collections.sort(values);
|
List<String> values = Lists.newArrayList(fl);
|
||||||
return values;
|
Collections.sort(values);
|
||||||
}
|
return values;
|
||||||
|
}
|
||||||
|
|
||||||
|
return Lists.newArrayList(object.toString());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,46 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.tree.support;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import com.wcohen.ss.AbstractStringDistance;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
|
public abstract class AbstractStringComparator extends AbstractComparator<String> {
|
||||||
|
protected AbstractStringComparator(Map<String, String> params) {
|
||||||
|
super(params);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected AbstractStringComparator(Map<String, String> params, AbstractStringDistance ssalgo) {
|
||||||
|
super(params, ssalgo);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected AbstractStringComparator(double weight, AbstractStringDistance ssalgo) {
|
||||||
|
super(weight, ssalgo);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected AbstractStringComparator(AbstractStringDistance ssalgo) {
|
||||||
|
super(ssalgo);
|
||||||
|
}
|
||||||
|
|
||||||
|
public double distance(final String a, final String b, final Config conf) {
|
||||||
|
if (a.isEmpty() || b.isEmpty()) {
|
||||||
|
return -1; // return -1 if a field is missing
|
||||||
|
}
|
||||||
|
double score = ssalgo.score(a, b);
|
||||||
|
return normalize(score);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double compare(Object a, Object b, Config conf) {
|
||||||
|
return compare(toString(a), toString(b), conf);
|
||||||
|
}
|
||||||
|
|
||||||
|
public double compare(final String a, final String b, final Config conf) {
|
||||||
|
if (a.isEmpty() || b.isEmpty())
|
||||||
|
return -1;
|
||||||
|
return distance(a, b, conf);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -2,13 +2,11 @@
|
||||||
package eu.dnetlib.pace.tree.support;
|
package eu.dnetlib.pace.tree.support;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
|
|
||||||
public interface Comparator {
|
public interface Comparator<T> {
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* return : -1 -> can't decide (i.e. missing field) >0 -> similarity degree (depends on the algorithm)
|
* return : -1 -> can't decide (i.e. missing field) >0 -> similarity degree (depends on the algorithm)
|
||||||
*/
|
*/
|
||||||
public double compare(Field a, Field b, Config conf);
|
public double compare(Object a, Object b, Config conf);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,7 +6,6 @@ import java.io.Serializable;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.util.PaceException;
|
import eu.dnetlib.pace.util.PaceException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -17,12 +16,12 @@ public class FieldStats implements Serializable {
|
||||||
private double weight; // weight for the field (to be used in the aggregation)
|
private double weight; // weight for the field (to be used in the aggregation)
|
||||||
private double threshold; // threshold for the field (to be used in some kind of aggregations)
|
private double threshold; // threshold for the field (to be used in some kind of aggregations)
|
||||||
private double result; // the result of the comparison
|
private double result; // the result of the comparison
|
||||||
private Field a;
|
private Object a;
|
||||||
private Field b;
|
private Object b;
|
||||||
|
|
||||||
private boolean countIfUndefined;
|
private boolean countIfUndefined;
|
||||||
|
|
||||||
public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Field a, Field b) {
|
public FieldStats(double weight, double threshold, double result, boolean countIfUndefined, Object a, Object b) {
|
||||||
this.weight = weight;
|
this.weight = weight;
|
||||||
this.threshold = threshold;
|
this.threshold = threshold;
|
||||||
this.result = result;
|
this.result = result;
|
||||||
|
@ -63,19 +62,19 @@ public class FieldStats implements Serializable {
|
||||||
this.countIfUndefined = countIfUndefined;
|
this.countIfUndefined = countIfUndefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Field getA() {
|
public Object getA() {
|
||||||
return a;
|
return a;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setA(Field a) {
|
public void setA(Object a) {
|
||||||
this.a = a;
|
this.a = a;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Field getB() {
|
public Object getB() {
|
||||||
return b;
|
return b;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setB(Field b) {
|
public void setB(Object b) {
|
||||||
this.b = b;
|
this.b = b;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -7,10 +7,19 @@ public enum MatchType {
|
||||||
|
|
||||||
public static MatchType parse(String value) {
|
public static MatchType parse(String value) {
|
||||||
|
|
||||||
try {
|
if (MATCH.name().equals(value)) {
|
||||||
return MatchType.valueOf(value);
|
return MATCH;
|
||||||
} catch (IllegalArgumentException e) {
|
} else if (NO_MATCH.name().equals(value)) {
|
||||||
return MatchType.UNDEFINED; // return UNDEFINED if the enum is not parsable
|
return NO_MATCH;
|
||||||
|
} else {
|
||||||
|
return UNDEFINED;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// try {
|
||||||
|
// return MatchType.valueOf(value);
|
||||||
|
// }
|
||||||
|
// catch (IllegalArgumentException e) {
|
||||||
|
// return MatchType.UNDEFINED; //return UNDEFINED if the enum is not parsable
|
||||||
|
// }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,17 +3,17 @@ package eu.dnetlib.pace.tree.support;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.io.StringWriter;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.spark.sql.Row;
|
||||||
|
import org.apache.spark.sql.types.ArrayType;
|
||||||
|
import org.apache.spark.sql.types.DataType;
|
||||||
|
import org.apache.spark.sql.types.StringType;
|
||||||
|
|
||||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.config.PaceConfig;
|
import eu.dnetlib.pace.config.PaceConfig;
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
|
||||||
import eu.dnetlib.pace.util.PaceException;
|
import eu.dnetlib.pace.util.PaceException;
|
||||||
|
|
||||||
public class TreeNodeDef implements Serializable {
|
public class TreeNodeDef implements Serializable {
|
||||||
|
@ -46,31 +46,27 @@ public class TreeNodeDef implements Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
// function for the evaluation of the node
|
// function for the evaluation of the node
|
||||||
public TreeNodeStats evaluate(MapDocument doc1, MapDocument doc2, Config conf) {
|
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
|
||||||
|
|
||||||
TreeNodeStats stats = new TreeNodeStats();
|
TreeNodeStats stats = new TreeNodeStats();
|
||||||
|
|
||||||
// for each field in the node, it computes the
|
// for each field in the node, it computes the
|
||||||
for (FieldConf fieldConf : fields) {
|
for (FieldConf fieldConf : fields) {
|
||||||
|
|
||||||
double weight = fieldConf.getWeight();
|
double weight = fieldConf.getWeight();
|
||||||
|
|
||||||
double result;
|
double result;
|
||||||
|
|
||||||
|
Object value1 = getJavaValue(doc1, fieldConf.getField());
|
||||||
|
Object value2 = getJavaValue(doc2, fieldConf.getField());
|
||||||
|
|
||||||
// if the param specifies a cross comparison (i.e. compare elements from different fields), compute the
|
// if the param specifies a cross comparison (i.e. compare elements from different fields), compute the
|
||||||
// result for both sides and return the maximum
|
// result for both sides and return the maximum
|
||||||
if (fieldConf.getParams().keySet().stream().anyMatch(k -> k.contains(CROSS_COMPARE))) {
|
String crossField = fieldConf.getParams().get(CROSS_COMPARE);
|
||||||
String crossField = fieldConf.getParams().get(CROSS_COMPARE);
|
if (crossField != null) {
|
||||||
double result1 = comparator(fieldConf)
|
double result1 = comparator(fieldConf).compare(value1, getJavaValue(doc2, crossField), conf);
|
||||||
.compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(crossField), conf);
|
double result2 = comparator(fieldConf).compare(getJavaValue(doc1, crossField), value2, conf);
|
||||||
double result2 = comparator(fieldConf)
|
|
||||||
.compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf);
|
|
||||||
result = Math.max(result1, result2);
|
result = Math.max(result1, result2);
|
||||||
} else {
|
} else {
|
||||||
result = comparator(fieldConf)
|
result = comparator(fieldConf).compare(value1, value2, conf);
|
||||||
.compare(
|
|
||||||
doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()),
|
|
||||||
conf);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
stats
|
stats
|
||||||
|
@ -81,13 +77,27 @@ public class TreeNodeDef implements Serializable {
|
||||||
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "1.0")),
|
Double.parseDouble(fieldConf.getParams().getOrDefault("threshold", "1.0")),
|
||||||
result,
|
result,
|
||||||
fieldConf.isCountIfUndefined(),
|
fieldConf.isCountIfUndefined(),
|
||||||
doc1.getFieldMap().get(fieldConf.getField()),
|
value1,
|
||||||
doc2.getFieldMap().get(fieldConf.getField())));
|
value2));
|
||||||
}
|
}
|
||||||
|
|
||||||
return stats;
|
return stats;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Object getJavaValue(Row row, String name) {
|
||||||
|
int pos = row.fieldIndex(name);
|
||||||
|
if (pos >= 0) {
|
||||||
|
DataType dt = row.schema().fields()[pos].dataType();
|
||||||
|
if (dt instanceof StringType) {
|
||||||
|
return row.getString(pos);
|
||||||
|
} else if (dt instanceof ArrayType) {
|
||||||
|
return row.getList(pos);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
private Comparator comparator(final FieldConf field) {
|
private Comparator comparator(final FieldConf field) {
|
||||||
|
|
||||||
return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams());
|
return PaceConfig.resolver.getComparator(field.getComparator(), field.getParams());
|
||||||
|
|
|
@ -3,9 +3,9 @@ package eu.dnetlib.pace.tree.support;
|
||||||
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.spark.sql.Row;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.Config;
|
import eu.dnetlib.pace.config.Config;
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
|
||||||
import eu.dnetlib.pace.util.PaceException;
|
import eu.dnetlib.pace.util.PaceException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -21,72 +21,72 @@ public class TreeProcessor {
|
||||||
this.config = config;
|
this.config = config;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean compare(final MapDocument a, final MapDocument b) {
|
// row based copies
|
||||||
|
|
||||||
|
public boolean compare(final Row a, final Row b) {
|
||||||
// evaluate the decision tree
|
// evaluate the decision tree
|
||||||
return evaluateTree(a, b).getResult() == MatchType.MATCH;
|
return evaluateTree(a, b).getResult() == MatchType.MATCH;
|
||||||
}
|
}
|
||||||
|
|
||||||
public TreeStats evaluateTree(final MapDocument doc1, final MapDocument doc2) {
|
public TreeStats evaluateTree(final Row doc1, final Row doc2) {
|
||||||
|
|
||||||
TreeStats treeStats = new TreeStats();
|
TreeStats treeStats = new TreeStats();
|
||||||
|
|
||||||
String current = "start";
|
String nextNodeName = "start";
|
||||||
|
|
||||||
while (MatchType.parse(current) == MatchType.UNDEFINED) {
|
do {
|
||||||
|
|
||||||
TreeNodeDef currentNode = config.decisionTree().get(current);
|
TreeNodeDef currentNode = config.decisionTree().get(nextNodeName);
|
||||||
// throw an exception if the node doesn't exist
|
// throw an exception if the node doesn't exist
|
||||||
if (currentNode == null)
|
if (currentNode == null)
|
||||||
throw new PaceException("Missing tree node: " + current);
|
throw new PaceException("Missing tree node: " + nextNodeName);
|
||||||
|
|
||||||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
||||||
treeStats.addNodeStats(current, stats);
|
treeStats.addNodeStats(nextNodeName, stats);
|
||||||
|
|
||||||
// if ignoreUndefined=false the miss is considered as undefined
|
// if ignoreUndefined=false the miss is considered as undefined
|
||||||
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
|
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
|
||||||
current = currentNode.getUndefined();
|
nextNodeName = currentNode.getUndefined();
|
||||||
}
|
}
|
||||||
// if ignoreUndefined=true the miss is ignored and the score computed anyway
|
// if ignoreUndefined=true the miss is ignored and the score computed anyway
|
||||||
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
|
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
|
||||||
current = currentNode.getPositive();
|
nextNodeName = currentNode.getPositive();
|
||||||
} else {
|
} else {
|
||||||
current = currentNode.getNegative();
|
nextNodeName = currentNode.getNegative();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
} while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED);
|
||||||
|
|
||||||
treeStats.setResult(MatchType.parse(current));
|
treeStats.setResult(MatchType.parse(nextNodeName));
|
||||||
return treeStats;
|
return treeStats;
|
||||||
}
|
}
|
||||||
|
|
||||||
public double computeScore(final MapDocument doc1, final MapDocument doc2) {
|
public double computeScore(final Row doc1, final Row doc2) {
|
||||||
String current = "start";
|
String nextNodeName = "start";
|
||||||
double score = 0.0;
|
double score = 0.0;
|
||||||
|
|
||||||
while (MatchType.parse(current) == MatchType.UNDEFINED) {
|
do {
|
||||||
|
|
||||||
TreeNodeDef currentNode = config.decisionTree().get(current);
|
TreeNodeDef currentNode = config.decisionTree().get(nextNodeName);
|
||||||
// throw an exception if the node doesn't exist
|
// throw an exception if the node doesn't exist
|
||||||
if (currentNode == null)
|
if (currentNode == null)
|
||||||
throw new PaceException("The Tree Node doesn't exist: " + current);
|
throw new PaceException("The Tree Node doesn't exist: " + nextNodeName);
|
||||||
|
|
||||||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
||||||
|
|
||||||
score = stats.getFinalScore(currentNode.getAggregation());
|
score = stats.getFinalScore(currentNode.getAggregation());
|
||||||
// if ignoreUndefined=false the miss is considered as undefined
|
// if ignoreUndefined=false the miss is considered as undefined
|
||||||
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
|
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
|
||||||
current = currentNode.getUndefined();
|
nextNodeName = currentNode.getUndefined();
|
||||||
}
|
}
|
||||||
// if ignoreUndefined=true the miss is ignored and the score computed anyway
|
// if ignoreUndefined=true the miss is ignored and the score computed anyway
|
||||||
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
|
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
|
||||||
current = currentNode.getPositive();
|
nextNodeName = currentNode.getPositive();
|
||||||
} else {
|
} else {
|
||||||
current = currentNode.getNegative();
|
nextNodeName = currentNode.getNegative();
|
||||||
}
|
}
|
||||||
|
} while (MatchType.parse(nextNodeName) == MatchType.UNDEFINED);
|
||||||
}
|
|
||||||
|
|
||||||
return score;
|
return score;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,20 +1,19 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.util;
|
package eu.dnetlib.pace.util;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.spark.sql.Row;
|
||||||
|
import org.apache.spark.sql.types.ArrayType;
|
||||||
|
import org.apache.spark.sql.types.DataType;
|
||||||
|
import org.apache.spark.sql.types.StringType;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
import eu.dnetlib.pace.config.WfConfig;
|
import eu.dnetlib.pace.config.WfConfig;
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
|
||||||
import eu.dnetlib.pace.model.MapDocumentComparator;
|
|
||||||
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
||||||
|
|
||||||
public class BlockProcessor {
|
public class BlockProcessor {
|
||||||
|
@ -25,6 +24,9 @@ public class BlockProcessor {
|
||||||
|
|
||||||
private DedupConfig dedupConf;
|
private DedupConfig dedupConf;
|
||||||
|
|
||||||
|
private final int identifierFieldPos;
|
||||||
|
private final int orderFieldPos;
|
||||||
|
|
||||||
public static void constructAccumulator(final DedupConfig dedupConf) {
|
public static void constructAccumulator(final DedupConfig dedupConf) {
|
||||||
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"));
|
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"));
|
||||||
accumulators
|
accumulators
|
||||||
|
@ -47,152 +49,80 @@ public class BlockProcessor {
|
||||||
.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()));
|
.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()));
|
||||||
}
|
}
|
||||||
|
|
||||||
public BlockProcessor(DedupConfig dedupConf) {
|
public BlockProcessor(DedupConfig dedupConf, int identifierFieldPos, int orderFieldPos) {
|
||||||
this.dedupConf = dedupConf;
|
this.dedupConf = dedupConf;
|
||||||
|
this.identifierFieldPos = identifierFieldPos;
|
||||||
|
this.orderFieldPos = orderFieldPos;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context) {
|
public void processSortedRows(final List<Row> documents, final Reporter context) {
|
||||||
if (documents.size() > 1) {
|
if (documents.size() > 1) {
|
||||||
// log.info("reducing key: '" + key + "' records: " + q.size());
|
// log.info("reducing key: '" + key + "' records: " + q.size());
|
||||||
process(prepare(documents), context);
|
processRows(documents, context);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
|
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void process(final String key, final Iterable<MapDocument> documents, final Reporter context) {
|
private void processRows(final List<Row> queue, final Reporter context) {
|
||||||
|
|
||||||
final Queue<MapDocument> q = prepare(documents);
|
for (int pivotPos = 0; pivotPos < queue.size(); pivotPos++) {
|
||||||
|
final Row pivot = queue.get(pivotPos);
|
||||||
|
|
||||||
if (q.size() > 1) {
|
final String idPivot = pivot.getString(identifierFieldPos); // identifier
|
||||||
// log.info("reducing key: '" + key + "' records: " + q.size());
|
final Object fieldsPivot = getJavaValue(pivot, orderFieldPos);
|
||||||
process(simplifyQueue(q, key, context), context);
|
final String fieldPivot = (fieldsPivot == null) ? "" : fieldsPivot.toString();
|
||||||
|
final WfConfig wf = dedupConf.getWf();
|
||||||
} else {
|
|
||||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private Queue<MapDocument> prepare(final Iterable<MapDocument> documents) {
|
|
||||||
final Queue<MapDocument> queue = new PriorityQueue<>(100,
|
|
||||||
new MapDocumentComparator(dedupConf.getWf().getOrderField()));
|
|
||||||
|
|
||||||
final Set<String> seen = new HashSet<String>();
|
|
||||||
final int queueMaxSize = dedupConf.getWf().getQueueMaxSize();
|
|
||||||
|
|
||||||
documents.forEach(doc -> {
|
|
||||||
if (queue.size() <= queueMaxSize) {
|
|
||||||
final String id = doc.getIdentifier();
|
|
||||||
|
|
||||||
if (!seen.contains(id)) {
|
|
||||||
seen.add(id);
|
|
||||||
queue.add(doc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
return queue;
|
|
||||||
}
|
|
||||||
|
|
||||||
private Queue<MapDocument> simplifyQueue(final Queue<MapDocument> queue, final String ngram,
|
|
||||||
final Reporter context) {
|
|
||||||
final Queue<MapDocument> q = new LinkedList<>();
|
|
||||||
|
|
||||||
String fieldRef = "";
|
|
||||||
final List<MapDocument> tempResults = Lists.newArrayList();
|
|
||||||
|
|
||||||
while (!queue.isEmpty()) {
|
|
||||||
final MapDocument result = queue.remove();
|
|
||||||
|
|
||||||
final String orderFieldName = dedupConf.getWf().getOrderField();
|
|
||||||
final Field orderFieldValue = result.values(orderFieldName);
|
|
||||||
if (!orderFieldValue.isEmpty()) {
|
|
||||||
final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue());
|
|
||||||
if (field.equals(fieldRef)) {
|
|
||||||
tempResults.add(result);
|
|
||||||
} else {
|
|
||||||
populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram);
|
|
||||||
tempResults.clear();
|
|
||||||
tempResults.add(result);
|
|
||||||
fieldRef = field;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
context
|
|
||||||
.incrementCounter(
|
|
||||||
dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram);
|
|
||||||
|
|
||||||
return q;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void populateSimplifiedQueue(final Queue<MapDocument> q,
|
|
||||||
final List<MapDocument> tempResults,
|
|
||||||
final Reporter context,
|
|
||||||
final String fieldRef,
|
|
||||||
final String ngram) {
|
|
||||||
WfConfig wf = dedupConf.getWf();
|
|
||||||
if (tempResults.size() < wf.getGroupMaxSize()) {
|
|
||||||
q.addAll(tempResults);
|
|
||||||
} else {
|
|
||||||
context
|
|
||||||
.incrementCounter(
|
|
||||||
wf.getEntityType(),
|
|
||||||
String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()),
|
|
||||||
tempResults.size());
|
|
||||||
// log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void process(final Queue<MapDocument> queue, final Reporter context) {
|
|
||||||
|
|
||||||
while (!queue.isEmpty()) {
|
|
||||||
|
|
||||||
final MapDocument pivot = queue.remove();
|
|
||||||
final String idPivot = pivot.getIdentifier();
|
|
||||||
|
|
||||||
WfConfig wf = dedupConf.getWf();
|
|
||||||
final Field fieldsPivot = pivot.values(wf.getOrderField());
|
|
||||||
final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? "" : fieldsPivot.stringValue();
|
|
||||||
|
|
||||||
if (fieldPivot != null) {
|
if (fieldPivot != null) {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
for (final MapDocument curr : queue) {
|
for (int windowPos = pivotPos + 1; windowPos < queue.size(); windowPos++) {
|
||||||
final String idCurr = curr.getIdentifier();
|
final Row curr = queue.get(windowPos);
|
||||||
|
final String idCurr = curr.getString(identifierFieldPos); // identifier
|
||||||
|
|
||||||
if (mustSkip(idCurr)) {
|
if (mustSkip(idCurr)) {
|
||||||
|
|
||||||
context.incrementCounter(wf.getEntityType(), "skip list", 1);
|
context.incrementCounter(wf.getEntityType(), "skip list", 1);
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i > wf.getSlidingWindowSize()) {
|
if (++i > wf.getSlidingWindowSize()) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
final Field fieldsCurr = curr.values(wf.getOrderField());
|
final Object fieldsCurr = getJavaValue(curr, orderFieldPos);
|
||||||
final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null
|
final String fieldCurr = (fieldsCurr == null) ? null : fieldsCurr.toString();
|
||||||
: fieldsCurr.stringValue();
|
|
||||||
|
|
||||||
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
|
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
|
||||||
|
|
||||||
final TreeProcessor treeProcessor = new TreeProcessor(dedupConf);
|
final TreeProcessor treeProcessor = new TreeProcessor(dedupConf);
|
||||||
|
|
||||||
emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context);
|
emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Object getJavaValue(Row row, int pos) {
|
||||||
|
DataType dt = row.schema().fields()[pos].dataType();
|
||||||
|
if (dt instanceof StringType) {
|
||||||
|
return row.getString(pos);
|
||||||
|
} else if (dt instanceof ArrayType) {
|
||||||
|
return row.getList(pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
||||||
|
|
||||||
if (result) {
|
if (result) {
|
||||||
writeSimilarity(context, idPivot, idCurr);
|
if (idPivot.compareTo(idCurr) <= 0) {
|
||||||
|
writeSimilarity(context, idPivot, idCurr);
|
||||||
|
} else {
|
||||||
|
writeSimilarity(context, idCurr, idPivot);
|
||||||
|
}
|
||||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
|
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
|
||||||
} else {
|
} else {
|
||||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
|
context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
|
||||||
|
@ -211,7 +141,6 @@ public class BlockProcessor {
|
||||||
final String type = dedupConf.getWf().getEntityType();
|
final String type = dedupConf.getWf().getEntityType();
|
||||||
|
|
||||||
context.emit(type, from, to);
|
context.emit(type, from, to);
|
||||||
context.emit(type, to, from);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,276 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.util;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.commons.logging.Log;
|
|
||||||
import org.apache.commons.logging.LogFactory;
|
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
|
||||||
import eu.dnetlib.pace.config.WfConfig;
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
|
||||||
import eu.dnetlib.pace.model.MapDocumentComparator;
|
|
||||||
import eu.dnetlib.pace.tree.*;
|
|
||||||
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
|
||||||
|
|
||||||
public class BlockProcessorForTesting {
|
|
||||||
|
|
||||||
public static final List<String> accumulators = new ArrayList<>();
|
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(eu.dnetlib.pace.util.BlockProcessorForTesting.class);
|
|
||||||
|
|
||||||
private DedupConfig dedupConf;
|
|
||||||
|
|
||||||
public static void constructAccumulator(final DedupConfig dedupConf) {
|
|
||||||
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1"));
|
|
||||||
accumulators
|
|
||||||
.add(
|
|
||||||
String
|
|
||||||
.format(
|
|
||||||
"%s::%s", dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField()));
|
|
||||||
accumulators
|
|
||||||
.add(
|
|
||||||
String
|
|
||||||
.format(
|
|
||||||
"%s::%s", dedupConf.getWf().getEntityType(),
|
|
||||||
String
|
|
||||||
.format(
|
|
||||||
"Skipped records for count(%s) >= %s", dedupConf.getWf().getOrderField(),
|
|
||||||
dedupConf.getWf().getGroupMaxSize())));
|
|
||||||
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list"));
|
|
||||||
accumulators.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)"));
|
|
||||||
accumulators
|
|
||||||
.add(String.format("%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold()));
|
|
||||||
}
|
|
||||||
|
|
||||||
public BlockProcessorForTesting(DedupConfig dedupConf) {
|
|
||||||
this.dedupConf = dedupConf;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void processSortedBlock(final String key, final List<MapDocument> documents, final Reporter context,
|
|
||||||
boolean useTree, boolean noMatch) {
|
|
||||||
if (documents.size() > 1) {
|
|
||||||
// log.info("reducing key: '" + key + "' records: " + q.size());
|
|
||||||
process(prepare(documents), context, useTree, noMatch);
|
|
||||||
|
|
||||||
} else {
|
|
||||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void process(final String key, final Iterable<MapDocument> documents, final Reporter context,
|
|
||||||
boolean useTree, boolean noMatch) {
|
|
||||||
|
|
||||||
final Queue<MapDocument> q = prepare(documents);
|
|
||||||
|
|
||||||
if (q.size() > 1) {
|
|
||||||
// log.info("reducing key: '" + key + "' records: " + q.size());
|
|
||||||
process(simplifyQueue(q, key, context), context, useTree, noMatch);
|
|
||||||
|
|
||||||
} else {
|
|
||||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private Queue<MapDocument> prepare(final Iterable<MapDocument> documents) {
|
|
||||||
final Queue<MapDocument> queue = new PriorityQueue<>(100,
|
|
||||||
new MapDocumentComparator(dedupConf.getWf().getOrderField()));
|
|
||||||
|
|
||||||
final Set<String> seen = new HashSet<String>();
|
|
||||||
final int queueMaxSize = dedupConf.getWf().getQueueMaxSize();
|
|
||||||
|
|
||||||
documents.forEach(doc -> {
|
|
||||||
if (queue.size() <= queueMaxSize) {
|
|
||||||
final String id = doc.getIdentifier();
|
|
||||||
|
|
||||||
if (!seen.contains(id)) {
|
|
||||||
seen.add(id);
|
|
||||||
queue.add(doc);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
return queue;
|
|
||||||
}
|
|
||||||
|
|
||||||
private Queue<MapDocument> simplifyQueue(final Queue<MapDocument> queue, final String ngram,
|
|
||||||
final Reporter context) {
|
|
||||||
final Queue<MapDocument> q = new LinkedList<>();
|
|
||||||
|
|
||||||
String fieldRef = "";
|
|
||||||
final List<MapDocument> tempResults = Lists.newArrayList();
|
|
||||||
|
|
||||||
while (!queue.isEmpty()) {
|
|
||||||
final MapDocument result = queue.remove();
|
|
||||||
|
|
||||||
final String orderFieldName = dedupConf.getWf().getOrderField();
|
|
||||||
final Field orderFieldValue = result.values(orderFieldName);
|
|
||||||
if (!orderFieldValue.isEmpty()) {
|
|
||||||
final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue());
|
|
||||||
if (field.equals(fieldRef)) {
|
|
||||||
tempResults.add(result);
|
|
||||||
} else {
|
|
||||||
populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram);
|
|
||||||
tempResults.clear();
|
|
||||||
tempResults.add(result);
|
|
||||||
fieldRef = field;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
context
|
|
||||||
.incrementCounter(
|
|
||||||
dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram);
|
|
||||||
|
|
||||||
return q;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void populateSimplifiedQueue(final Queue<MapDocument> q,
|
|
||||||
final List<MapDocument> tempResults,
|
|
||||||
final Reporter context,
|
|
||||||
final String fieldRef,
|
|
||||||
final String ngram) {
|
|
||||||
WfConfig wf = dedupConf.getWf();
|
|
||||||
if (tempResults.size() < wf.getGroupMaxSize()) {
|
|
||||||
q.addAll(tempResults);
|
|
||||||
} else {
|
|
||||||
context
|
|
||||||
.incrementCounter(
|
|
||||||
wf.getEntityType(),
|
|
||||||
String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()),
|
|
||||||
tempResults.size());
|
|
||||||
// log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void process(final Queue<MapDocument> queue, final Reporter context, boolean useTree, boolean noMatch) {
|
|
||||||
|
|
||||||
while (!queue.isEmpty()) {
|
|
||||||
|
|
||||||
final MapDocument pivot = queue.remove();
|
|
||||||
final String idPivot = pivot.getIdentifier();
|
|
||||||
|
|
||||||
WfConfig wf = dedupConf.getWf();
|
|
||||||
final Field fieldsPivot = pivot.values(wf.getOrderField());
|
|
||||||
final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? "" : fieldsPivot.stringValue();
|
|
||||||
|
|
||||||
if (fieldPivot != null) {
|
|
||||||
int i = 0;
|
|
||||||
for (final MapDocument curr : queue) {
|
|
||||||
final String idCurr = curr.getIdentifier();
|
|
||||||
|
|
||||||
if (mustSkip(idCurr)) {
|
|
||||||
|
|
||||||
context.incrementCounter(wf.getEntityType(), "skip list", 1);
|
|
||||||
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (i > wf.getSlidingWindowSize()) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
final Field fieldsCurr = curr.values(wf.getOrderField());
|
|
||||||
final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null
|
|
||||||
: fieldsCurr.stringValue();
|
|
||||||
|
|
||||||
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
|
|
||||||
|
|
||||||
// draws no match relations (test purpose)
|
|
||||||
if (noMatch) {
|
|
||||||
emitOutput(!new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
|
|
||||||
} else {
|
|
||||||
// use the decision tree implementation or the "normal" implementation of the similarity
|
|
||||||
// score (valid only for publications)
|
|
||||||
if (useTree)
|
|
||||||
emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
|
|
||||||
else
|
|
||||||
emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
|
|
||||||
}
|
|
||||||
// if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) {
|
|
||||||
// emitOutput(true, idPivot, idCurr, context);
|
|
||||||
// }
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected static boolean compareInstanceType(MapDocument a, MapDocument b, DedupConfig conf) {
|
|
||||||
Map<String, String> params = new HashMap<>();
|
|
||||||
InstanceTypeMatch instanceTypeMatch = new InstanceTypeMatch(params);
|
|
||||||
double compare = instanceTypeMatch
|
|
||||||
.compare(a.getFieldMap().get("instance"), b.getFieldMap().get("instance"), conf);
|
|
||||||
return compare >= 1.0;
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
|
|
||||||
// if the score gives 1, the publications are equivalent
|
|
||||||
Map<String, String> params = new HashMap<>();
|
|
||||||
params.put("jpath_value", "$.value");
|
|
||||||
params.put("jpath_classid", "$.qualifier.classid");
|
|
||||||
params.put("mode", "count");
|
|
||||||
|
|
||||||
double score = 0.0;
|
|
||||||
|
|
||||||
// levenstein title
|
|
||||||
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
|
|
||||||
if (levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) {
|
|
||||||
score += 0.2;
|
|
||||||
}
|
|
||||||
|
|
||||||
// pid
|
|
||||||
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
|
||||||
if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) {
|
|
||||||
score += 0.5;
|
|
||||||
}
|
|
||||||
|
|
||||||
// title version
|
|
||||||
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
|
|
||||||
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
|
|
||||||
if (result1 < 0 || result1 >= 1.0) {
|
|
||||||
score += 0.1;
|
|
||||||
}
|
|
||||||
|
|
||||||
// authors match
|
|
||||||
params.remove("mode");
|
|
||||||
AuthorsMatch authorsMatch = new AuthorsMatch(params);
|
|
||||||
double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
|
|
||||||
if (result2 < 0 || result2 >= 0.6) {
|
|
||||||
score += 0.2;
|
|
||||||
}
|
|
||||||
|
|
||||||
return score >= 0.5;
|
|
||||||
}
|
|
||||||
|
|
||||||
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
|
||||||
|
|
||||||
if (result) {
|
|
||||||
writeSimilarity(context, idPivot, idCurr);
|
|
||||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1);
|
|
||||||
} else {
|
|
||||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean mustSkip(final String idPivot) {
|
|
||||||
return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getNsPrefix(final String id) {
|
|
||||||
return StringUtils.substringBetween(id, "|", "::");
|
|
||||||
}
|
|
||||||
|
|
||||||
private void writeSimilarity(final Reporter context, final String from, final String to) {
|
|
||||||
final String type = dedupConf.getWf().getEntityType();
|
|
||||||
|
|
||||||
context.emit(type, from, to);
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -18,6 +18,7 @@ package eu.dnetlib.pace.util;
|
||||||
* See the License for the specific language governing permissions and
|
* See the License for the specific language governing permissions and
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
import java.net.URLDecoder;
|
import java.net.URLDecoder;
|
||||||
import java.net.URLEncoder;
|
import java.net.URLEncoder;
|
||||||
|
|
|
@ -2,19 +2,20 @@
|
||||||
package eu.dnetlib.pace.util;
|
package eu.dnetlib.pace.util;
|
||||||
|
|
||||||
import java.math.BigDecimal;
|
import java.math.BigDecimal;
|
||||||
import java.util.*;
|
import java.util.ArrayList;
|
||||||
|
import java.util.LinkedHashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
import java.util.function.Predicate;
|
import java.util.function.Predicate;
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.jayway.jsonpath.Configuration;
|
import com.jayway.jsonpath.DocumentContext;
|
||||||
import com.jayway.jsonpath.JsonPath;
|
import com.jayway.jsonpath.JsonPath;
|
||||||
import com.jayway.jsonpath.Option;
|
import com.jayway.jsonpath.spi.cache.Cache;
|
||||||
|
import com.jayway.jsonpath.spi.cache.CacheProvider;
|
||||||
|
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
|
||||||
import eu.dnetlib.pace.config.Type;
|
import eu.dnetlib.pace.config.Type;
|
||||||
import eu.dnetlib.pace.model.*;
|
|
||||||
import net.minidev.json.JSONArray;
|
import net.minidev.json.JSONArray;
|
||||||
|
|
||||||
public class MapDocumentUtil {
|
public class MapDocumentUtil {
|
||||||
|
@ -22,103 +23,20 @@ public class MapDocumentUtil {
|
||||||
public static final String URL_REGEX = "^(http|https|ftp)\\://.*";
|
public static final String URL_REGEX = "^(http|https|ftp)\\://.*";
|
||||||
public static Predicate<String> urlFilter = s -> s.trim().matches(URL_REGEX);
|
public static Predicate<String> urlFilter = s -> s.trim().matches(URL_REGEX);
|
||||||
|
|
||||||
public static MapDocument asMapDocumentWithJPath(DedupConfig conf, final String json) {
|
static {
|
||||||
MapDocument m = new MapDocument();
|
CacheProvider.setCache(new Cache() {
|
||||||
m.setIdentifier(getJPathString(conf.getWf().getIdPath(), json));
|
private final ConcurrentHashMap<String, JsonPath> jsonPathCache = new ConcurrentHashMap();
|
||||||
Map<String, Field> stringField = new HashMap<>();
|
|
||||||
conf.getPace().getModel().forEach(fdef -> {
|
@Override
|
||||||
switch (fdef.getType()) {
|
public JsonPath get(String key) {
|
||||||
case String:
|
return jsonPathCache.get(key);
|
||||||
case Int:
|
}
|
||||||
stringField
|
|
||||||
.put(
|
@Override
|
||||||
fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(),
|
public void put(String key, JsonPath value) {
|
||||||
truncateValue(getJPathString(fdef.getPath(), json), fdef.getLength())));
|
jsonPathCache.put(key, value);
|
||||||
break;
|
|
||||||
case URL:
|
|
||||||
String uv = getJPathString(fdef.getPath(), json);
|
|
||||||
if (!urlFilter.test(uv))
|
|
||||||
uv = "";
|
|
||||||
stringField.put(fdef.getName(), new FieldValueImpl(fdef.getType(), fdef.getName(), uv));
|
|
||||||
break;
|
|
||||||
case List:
|
|
||||||
case JSON:
|
|
||||||
FieldListImpl fi = new FieldListImpl(fdef.getName(), fdef.getType());
|
|
||||||
truncateList(getJPathList(fdef.getPath(), json, fdef.getType()), fdef.getSize())
|
|
||||||
.stream()
|
|
||||||
.map(item -> new FieldValueImpl(Type.String, fdef.getName(), item))
|
|
||||||
.forEach(fi::add);
|
|
||||||
stringField.put(fdef.getName(), fi);
|
|
||||||
break;
|
|
||||||
case DoubleArray:
|
|
||||||
stringField
|
|
||||||
.put(
|
|
||||||
fdef.getName(),
|
|
||||||
new FieldValueImpl(Type.DoubleArray,
|
|
||||||
fdef.getName(),
|
|
||||||
getJPathArray(fdef.getPath(), json)));
|
|
||||||
break;
|
|
||||||
case StringConcat:
|
|
||||||
String[] jpaths = fdef.getPath().split("\\|\\|\\|");
|
|
||||||
stringField
|
|
||||||
.put(
|
|
||||||
fdef.getName(),
|
|
||||||
new FieldValueImpl(Type.String,
|
|
||||||
fdef.getName(),
|
|
||||||
truncateValue(
|
|
||||||
Arrays
|
|
||||||
.stream(jpaths)
|
|
||||||
.map(jpath -> getJPathString(jpath, json))
|
|
||||||
.collect(Collectors.joining(" ")),
|
|
||||||
fdef.getLength())));
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
m.setFieldMap(stringField);
|
|
||||||
return m;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static List<String> getJPathList(String path, String json, Type type) {
|
|
||||||
if (type == Type.List)
|
|
||||||
return JsonPath
|
|
||||||
.using(
|
|
||||||
Configuration
|
|
||||||
.defaultConfiguration()
|
|
||||||
.addOptions(Option.ALWAYS_RETURN_LIST, Option.SUPPRESS_EXCEPTIONS))
|
|
||||||
.parse(json)
|
|
||||||
.read(path);
|
|
||||||
Object jresult;
|
|
||||||
List<String> result = new ArrayList<>();
|
|
||||||
try {
|
|
||||||
jresult = JsonPath.read(json, path);
|
|
||||||
} catch (Throwable e) {
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
if (jresult instanceof JSONArray) {
|
|
||||||
|
|
||||||
((JSONArray) jresult).forEach(it -> {
|
|
||||||
|
|
||||||
try {
|
|
||||||
result.add(new ObjectMapper().writeValueAsString(it));
|
|
||||||
} catch (JsonProcessingException e) {
|
|
||||||
|
|
||||||
}
|
|
||||||
});
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (jresult instanceof LinkedHashMap) {
|
|
||||||
try {
|
|
||||||
result.add(new ObjectMapper().writeValueAsString(jresult));
|
|
||||||
} catch (JsonProcessingException e) {
|
|
||||||
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
if (jresult instanceof String) {
|
|
||||||
result.add((String) jresult);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String getJPathString(final String jsonPath, final String json) {
|
public static String getJPathString(final String jsonPath, final String json) {
|
||||||
|
@ -174,4 +92,54 @@ public class MapDocumentUtil {
|
||||||
return list.subList(0, size);
|
return list.subList(0, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static String getJPathString(final String jsonPath, final DocumentContext json) {
|
||||||
|
try {
|
||||||
|
Object o = json.read(jsonPath);
|
||||||
|
if (o instanceof String)
|
||||||
|
return (String) o;
|
||||||
|
if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
|
||||||
|
return (String) ((JSONArray) o).get(0);
|
||||||
|
return "";
|
||||||
|
} catch (Exception e) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static List<String> getJPathList(String path, DocumentContext json, Type type) {
|
||||||
|
// if (type == Type.List)
|
||||||
|
// return JsonPath.using(Configuration.defaultConfiguration().addOptions(Option.ALWAYS_RETURN_LIST,
|
||||||
|
// Option.SUPPRESS_EXCEPTIONS)).parse(json).read(path);
|
||||||
|
Object jresult;
|
||||||
|
List<String> result = new ArrayList<>();
|
||||||
|
try {
|
||||||
|
jresult = json.read(path);
|
||||||
|
} catch (Throwable e) {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (jresult instanceof JSONArray) {
|
||||||
|
((JSONArray) jresult).forEach(it -> {
|
||||||
|
try {
|
||||||
|
result.add(new ObjectMapper().writeValueAsString(it));
|
||||||
|
} catch (JsonProcessingException e) {
|
||||||
|
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (jresult instanceof LinkedHashMap) {
|
||||||
|
try {
|
||||||
|
result.add(new ObjectMapper().writeValueAsString(jresult));
|
||||||
|
} catch (JsonProcessingException e) {
|
||||||
|
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
if (jresult instanceof String) {
|
||||||
|
result.add((String) jresult);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,85 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.util;
|
||||||
|
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.spark.SparkContext;
|
||||||
|
import org.apache.spark.util.LongAccumulator;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
import scala.Serializable;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
public class SparkReporter implements Serializable, Reporter {
|
||||||
|
|
||||||
|
private final List<Tuple2<String, String>> relations = new ArrayList<>();
|
||||||
|
|
||||||
|
private final Map<String, LongAccumulator> accumulators;
|
||||||
|
|
||||||
|
public SparkReporter(Map<String, LongAccumulator> accumulators) {
|
||||||
|
this.accumulators = accumulators;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void incrementCounter(
|
||||||
|
String counterGroup,
|
||||||
|
String counterName,
|
||||||
|
long delta,
|
||||||
|
Map<String, LongAccumulator> accumulators) {
|
||||||
|
|
||||||
|
final String accumulatorName = String.format("%s::%s", counterGroup, counterName);
|
||||||
|
if (accumulators.containsKey(accumulatorName)) {
|
||||||
|
accumulators.get(accumulatorName).add(delta);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void incrementCounter(String counterGroup, String counterName, long delta) {
|
||||||
|
|
||||||
|
incrementCounter(counterGroup, counterName, delta, accumulators);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void emit(String type, String from, String to) {
|
||||||
|
relations.add(new Tuple2<>(from, to));
|
||||||
|
}
|
||||||
|
|
||||||
|
public List<Tuple2<String, String>> getRelations() {
|
||||||
|
return relations;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Map<String, LongAccumulator> constructAccumulator(
|
||||||
|
final DedupConfig dedupConf, final SparkContext context) {
|
||||||
|
|
||||||
|
Map<String, LongAccumulator> accumulators = new HashMap<>();
|
||||||
|
|
||||||
|
String acc1 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "records per hash key = 1");
|
||||||
|
accumulators.put(acc1, context.longAccumulator(acc1));
|
||||||
|
String acc2 = String
|
||||||
|
.format(
|
||||||
|
"%s::%s",
|
||||||
|
dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField());
|
||||||
|
accumulators.put(acc2, context.longAccumulator(acc2));
|
||||||
|
String acc3 = String
|
||||||
|
.format(
|
||||||
|
"%s::%s",
|
||||||
|
dedupConf.getWf().getEntityType(),
|
||||||
|
String
|
||||||
|
.format(
|
||||||
|
"Skipped records for count(%s) >= %s",
|
||||||
|
dedupConf.getWf().getOrderField(), dedupConf.getWf().getGroupMaxSize()));
|
||||||
|
accumulators.put(acc3, context.longAccumulator(acc3));
|
||||||
|
String acc4 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "skip list");
|
||||||
|
accumulators.put(acc4, context.longAccumulator(acc4));
|
||||||
|
String acc5 = String.format("%s::%s", dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)");
|
||||||
|
accumulators.put(acc5, context.longAccumulator(acc5));
|
||||||
|
String acc6 = String
|
||||||
|
.format(
|
||||||
|
"%s::%s", dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold());
|
||||||
|
accumulators.put(acc6, context.longAccumulator(acc6));
|
||||||
|
|
||||||
|
return accumulators;
|
||||||
|
}
|
||||||
|
}
|
|
@ -3,57 +3,42 @@ package eu.dnetlib.pace;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
import eu.dnetlib.pace.config.Type;
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.model.FieldListImpl;
|
|
||||||
import eu.dnetlib.pace.model.FieldValueImpl;
|
|
||||||
|
|
||||||
public abstract class AbstractPaceTest extends AbstractPaceFunctions {
|
public abstract class AbstractPaceTest extends AbstractPaceFunctions {
|
||||||
|
|
||||||
protected String readFromClasspath(final String filename) {
|
protected String readFromClasspath(final String filename) {
|
||||||
final StringWriter sw = new StringWriter();
|
final StringWriter sw = new StringWriter();
|
||||||
try {
|
try {
|
||||||
IOUtils.copy(getClass().getResourceAsStream(filename), sw, StandardCharsets.UTF_8);
|
IOUtils.copy(getClass().getResourceAsStream(filename), sw);
|
||||||
return sw.toString();
|
return sw.toString();
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
throw new RuntimeException("cannot load resource from classpath: " + filename);
|
throw new RuntimeException("cannot load resource from classpath: " + filename);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Field title(final String s) {
|
protected String title(final String s) {
|
||||||
return new FieldValueImpl(Type.String, "title", s);
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Field person(final String s) {
|
protected String person(final String s) {
|
||||||
return new FieldValueImpl(Type.JSON, "person", s);
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Field url(final String s) {
|
protected String url(final String s) {
|
||||||
return new FieldValueImpl(Type.URL, "url", s);
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Field array(final double[] a) {
|
protected double[] array(final double[] a) {
|
||||||
return new FieldValueImpl(Type.DoubleArray, "array", a);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected Field createFieldList(List<String> strings, String fieldName) {
|
|
||||||
|
|
||||||
List<FieldValueImpl> fieldValueStream = strings
|
|
||||||
.stream()
|
|
||||||
.map(s -> new FieldValueImpl(Type.String, fieldName, s))
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
|
|
||||||
FieldListImpl a = new FieldListImpl();
|
|
||||||
a.addAll(fieldValueStream);
|
|
||||||
|
|
||||||
return a;
|
return a;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected List<String> createFieldList(List<String> strings, String fieldName) {
|
||||||
|
return strings;
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,14 +2,12 @@
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Set;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.junit.jupiter.api.*;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Maps;
|
import com.google.common.collect.Maps;
|
||||||
import com.google.common.collect.Sets;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.AbstractPaceTest;
|
import eu.dnetlib.pace.AbstractPaceTest;
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
|
@ -37,7 +35,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
final String s = "http://www.test.it/path/to/resource";
|
final String s = "http://www.test.it/path/to/resource";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(urlClustering.apply(conf, Lists.newArrayList(url(s))));
|
System.out.println(urlClustering.apply(conf, Lists.newArrayList(s)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -51,7 +49,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
final String s = "Search for the Standard Model Higgs Boson";
|
final String s = "Search for the Standard Model Higgs Boson";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(ngram.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(ngram.apply(conf, Lists.newArrayList(s)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -63,7 +61,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
final String s = "Search for the Standard Model Higgs Boson";
|
final String s = "Search for the Standard Model Higgs Boson";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(np.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(np.apply(conf, Lists.newArrayList(s)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -75,15 +73,15 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
final String s1 = "University of Pisa";
|
final String s1 = "University of Pisa";
|
||||||
System.out.println(s1);
|
System.out.println(s1);
|
||||||
System.out.println(np.apply(conf, Lists.newArrayList(title(s1))));
|
System.out.println(np.apply(conf, Lists.newArrayList(s1)));
|
||||||
|
|
||||||
final String s2 = "Pisa University";
|
final String s2 = "Pisa University";
|
||||||
System.out.println(s2);
|
System.out.println(s2);
|
||||||
System.out.println(np.apply(conf, Lists.newArrayList(title(s2))));
|
System.out.println(np.apply(conf, Lists.newArrayList(s2)));
|
||||||
|
|
||||||
final String s3 = "Parco Tecnologico Agroalimentare Umbria";
|
final String s3 = "Parco Tecnologico Agroalimentare Umbria";
|
||||||
System.out.println(s3);
|
System.out.println(s3);
|
||||||
System.out.println(np.apply(conf, Lists.newArrayList(title(s3))));
|
System.out.println(np.apply(conf, Lists.newArrayList(s3)));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -97,7 +95,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
final String s = "Search for the Standard Model Higgs Boson";
|
final String s = "Search for the Standard Model Higgs Boson";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(acro.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(acro.apply(conf, Lists.newArrayList(s)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -109,12 +107,12 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
final String s = "Search for the Standard Model Higgs Boson";
|
final String s = "Search for the Standard Model Higgs Boson";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(sp.apply(conf, Lists.newArrayList(s)));
|
||||||
|
|
||||||
params.put("len", 3);
|
params.put("len", 3);
|
||||||
params.put("max", 1);
|
params.put("max", 1);
|
||||||
|
|
||||||
System.out.println(sp.apply(conf, Lists.newArrayList(title("Framework for general-purpose deduplication"))));
|
System.out.println(sp.apply(conf, Lists.newArrayList("Framework for general-purpose deduplication")));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -127,7 +125,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
final String s = "Search for the Standard Model Higgs Boson";
|
final String s = "Search for the Standard Model Higgs Boson";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(sp.apply(conf, Lists.newArrayList(s)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -138,31 +136,31 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
String s = "Search for the Standard Model Higgs Boson";
|
String s = "Search for the Standard Model Higgs Boson";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(sp.apply(conf, Lists.newArrayList(s)));
|
||||||
|
|
||||||
s = "A Physical Education Teacher Is Like...: Examining Turkish Students Perceptions of Physical Education Teachers Through Metaphor Analysis";
|
s = "A Physical Education Teacher Is Like...: Examining Turkish Students Perceptions of Physical Education Teachers Through Metaphor Analysis";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(sp.apply(conf, Lists.newArrayList(s)));
|
||||||
|
|
||||||
s = "Structure of a Eukaryotic Nonribosomal Peptide Synthetase Adenylation Domain That Activates a Large Hydroxamate Amino Acid in Siderophore Biosynthesis";
|
s = "Structure of a Eukaryotic Nonribosomal Peptide Synthetase Adenylation Domain That Activates a Large Hydroxamate Amino Acid in Siderophore Biosynthesis";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(sp.apply(conf, Lists.newArrayList(s)));
|
||||||
|
|
||||||
s = "Performance Evaluation";
|
s = "Performance Evaluation";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(sp.apply(conf, Lists.newArrayList(s)));
|
||||||
|
|
||||||
s = "JRC Open Power Plants Database (JRC-PPDB-OPEN)";
|
s = "JRC Open Power Plants Database (JRC-PPDB-OPEN)";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(sp.apply(conf, Lists.newArrayList(s)));
|
||||||
|
|
||||||
s = "JRC Open Power Plants Database";
|
s = "JRC Open Power Plants Database";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(sp.apply(conf, Lists.newArrayList(s)));
|
||||||
|
|
||||||
s = "niivue/niivue: 0.21.1";
|
s = "niivue/niivue: 0.21.1";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(sp.apply(conf, Lists.newArrayList(s)));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -175,7 +173,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
final String s = "Search for the Standard Model Higgs Boson";
|
final String s = "Search for the Standard Model Higgs Boson";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(sp.apply(conf, Lists.newArrayList(s)));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -184,35 +182,35 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
final ClusteringFunction cf = new KeywordsClustering(params);
|
final ClusteringFunction cf = new KeywordsClustering(params);
|
||||||
final String s = "Polytechnic University of Turin";
|
final String s = "Polytechnic University of Turin";
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
|
||||||
|
|
||||||
final String s1 = "POLITECNICO DI TORINO";
|
final String s1 = "POLITECNICO DI TORINO";
|
||||||
System.out.println(s1);
|
System.out.println(s1);
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(s1)));
|
||||||
|
|
||||||
final String s2 = "Universita farmaceutica culturale di milano bergamo";
|
final String s2 = "Universita farmaceutica culturale di milano bergamo";
|
||||||
System.out.println("s2 = " + s2);
|
System.out.println("s2 = " + s2);
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s2))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(s2)));
|
||||||
|
|
||||||
final String s3 = "universita universita milano milano";
|
final String s3 = "universita universita milano milano";
|
||||||
System.out.println("s3 = " + s3);
|
System.out.println("s3 = " + s3);
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s3))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(s3)));
|
||||||
|
|
||||||
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
|
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
|
||||||
System.out.println("s4 = " + s4);
|
System.out.println("s4 = " + s4);
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s4))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(s4)));
|
||||||
|
|
||||||
final String s5 = "İstanbul Ticarət Universiteti";
|
final String s5 = "İstanbul Ticarət Universiteti";
|
||||||
System.out.println("s5 = " + s5);
|
System.out.println("s5 = " + s5);
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s5))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(s5)));
|
||||||
|
|
||||||
final String s6 = "National and Kapodistrian University of Athens";
|
final String s6 = "National and Kapodistrian University of Athens";
|
||||||
System.out.println("s6 = " + s6);
|
System.out.println("s6 = " + s6);
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s6))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(s6)));
|
||||||
|
|
||||||
final String s7 = "Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών";
|
final String s7 = "Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών";
|
||||||
System.out.println("s7 = " + s7);
|
System.out.println("s7 = " + s7);
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s7))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(s7)));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -222,11 +220,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
final ClusteringFunction cf = new PersonClustering(params);
|
final ClusteringFunction cf = new PersonClustering(params);
|
||||||
final String s = "Abd-Alla, Abo-el-nour N.";
|
final String s = "Abd-Alla, Abo-el-nour N.";
|
||||||
System.out.println("s = " + s);
|
System.out.println("s = " + s);
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
|
||||||
|
|
||||||
final String s1 = "Manghi, Paolo";
|
final String s1 = "Manghi, Paolo";
|
||||||
System.out.println("s1 = " + s1);
|
System.out.println("s1 = " + s1);
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(s1)));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -236,11 +234,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
final ClusteringFunction cf = new PersonHash(params);
|
final ClusteringFunction cf = new PersonHash(params);
|
||||||
final String s = "Manghi, Paolo";
|
final String s = "Manghi, Paolo";
|
||||||
System.out.println("s = " + s);
|
System.out.println("s = " + s);
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
|
||||||
|
|
||||||
final String s1 = "Manghi, P.";
|
final String s1 = "Manghi, P.";
|
||||||
System.out.println("s = " + s1);
|
System.out.println("s = " + s1);
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(s1)));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -250,7 +248,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
final ClusteringFunction cf = new LastNameFirstInitial(params);
|
final ClusteringFunction cf = new LastNameFirstInitial(params);
|
||||||
final String s = "LI Yonghong";
|
final String s = "LI Yonghong";
|
||||||
System.out.println("s = " + s);
|
System.out.println("s = " + s);
|
||||||
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
|
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,19 +3,16 @@ package eu.dnetlib.pace.comparators;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
|
|
||||||
import org.junit.jupiter.api.*;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.TestInstance;
|
||||||
|
|
||||||
import eu.dnetlib.pace.AbstractPaceTest;
|
import eu.dnetlib.pace.AbstractPaceTest;
|
||||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
import eu.dnetlib.pace.clustering.NGramUtils;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
import eu.dnetlib.pace.config.Type;
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.model.FieldValueImpl;
|
|
||||||
import eu.dnetlib.pace.tree.*;
|
import eu.dnetlib.pace.tree.*;
|
||||||
|
|
||||||
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
@TestInstance(TestInstance.Lifecycle.PER_CLASS)
|
||||||
|
@ -99,8 +96,8 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
@Test
|
@Test
|
||||||
public void listContainsMatchTest() {
|
public void listContainsMatchTest() {
|
||||||
|
|
||||||
Field a = createFieldList(Arrays.asList("Article", "Publication", "ORP"), "instanceType");
|
List<String> a = createFieldList(Arrays.asList("Article", "Publication", "ORP"), "instanceType");
|
||||||
Field b = createFieldList(Arrays.asList("Publication", "Article", "ORP"), "instanceType");
|
List<String> b = createFieldList(Arrays.asList("Publication", "Article", "ORP"), "instanceType");
|
||||||
|
|
||||||
params.put("string", "Article");
|
params.put("string", "Article");
|
||||||
params.put("bool", "XOR");
|
params.put("bool", "XOR");
|
||||||
|
@ -214,31 +211,32 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
|
|
||||||
final InstanceTypeMatch instanceTypeMatch = new InstanceTypeMatch(params);
|
final InstanceTypeMatch instanceTypeMatch = new InstanceTypeMatch(params);
|
||||||
|
|
||||||
Field a = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType");
|
List<String> a = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType");
|
||||||
Field b = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType");
|
List<String> b = createFieldList(Arrays.asList("Article", "Article", "Article"), "instanceType");
|
||||||
double result = instanceTypeMatch.compare(a, b, conf);
|
double result = instanceTypeMatch.compare(a, b, conf);
|
||||||
|
|
||||||
assertEquals(1.0, result);
|
assertEquals(1.0, result);
|
||||||
|
|
||||||
Field c = createFieldList(
|
List<String> c = createFieldList(
|
||||||
Arrays.asList("Conference object", "Conference object", "Conference object"), "instanceType");
|
Arrays.asList("Conference object", "Conference object", "Conference object"), "instanceType");
|
||||||
result = instanceTypeMatch.compare(c, b, conf);
|
result = instanceTypeMatch.compare(c, b, conf);
|
||||||
|
|
||||||
assertEquals(1.0, result);
|
assertEquals(1.0, result);
|
||||||
|
|
||||||
Field d = createFieldList(Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType");
|
List<String> d = createFieldList(
|
||||||
Field e = createFieldList(
|
Arrays.asList("Master thesis", "Master thesis", "Master thesis"), "instanceType");
|
||||||
|
List<String> e = createFieldList(
|
||||||
Arrays.asList("Bachelor thesis", "Bachelor thesis", "Bachelor thesis"), "instanceType");
|
Arrays.asList("Bachelor thesis", "Bachelor thesis", "Bachelor thesis"), "instanceType");
|
||||||
result = instanceTypeMatch.compare(d, e, conf);
|
result = instanceTypeMatch.compare(d, e, conf);
|
||||||
|
|
||||||
assertEquals(1.0, result);
|
assertEquals(1.0, result);
|
||||||
|
|
||||||
Field g = createFieldList(Arrays.asList("Software Paper", "Software Paper"), "instanceType");
|
List<String> g = createFieldList(Arrays.asList("Software Paper", "Software Paper"), "instanceType");
|
||||||
result = instanceTypeMatch.compare(e, g, conf);
|
result = instanceTypeMatch.compare(e, g, conf);
|
||||||
|
|
||||||
assertEquals(0.0, result);
|
assertEquals(0.0, result);
|
||||||
|
|
||||||
Field h = createFieldList(Arrays.asList("Other literature type", "Article"), "instanceType");
|
List<String> h = createFieldList(Arrays.asList("Other literature type", "Article"), "instanceType");
|
||||||
result = instanceTypeMatch.compare(a, h, conf);
|
result = instanceTypeMatch.compare(a, h, conf);
|
||||||
|
|
||||||
assertEquals(1.0, result);
|
assertEquals(1.0, result);
|
||||||
|
@ -249,15 +247,15 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
|
|
||||||
AuthorsMatch authorsMatch = new AuthorsMatch(params);
|
AuthorsMatch authorsMatch = new AuthorsMatch(params);
|
||||||
|
|
||||||
Field a = createFieldList(
|
List<String> a = createFieldList(
|
||||||
Arrays.asList("La Bruzzo, Sandro", "Atzori, Claudio", "De Bonis, Michele"), "authors");
|
Arrays.asList("La Bruzzo, Sandro", "Atzori, Claudio", "De Bonis, Michele"), "authors");
|
||||||
Field b = createFieldList(Arrays.asList("Atzori, C.", "La Bruzzo, S.", "De Bonis, M."), "authors");
|
List<String> b = createFieldList(Arrays.asList("Atzori, C.", "La Bruzzo, S.", "De Bonis, M."), "authors");
|
||||||
double result = authorsMatch.compare(a, b, conf);
|
double result = authorsMatch.compare(a, b, conf);
|
||||||
|
|
||||||
assertEquals(1.0, result);
|
assertEquals(1.0, result);
|
||||||
|
|
||||||
Field c = createFieldList(Arrays.asList("Manghi, Paolo"), "authors");
|
List<String> c = createFieldList(Arrays.asList("Manghi, Paolo"), "authors");
|
||||||
Field d = createFieldList(Arrays.asList("Manghi, Pasquale"), "authors");
|
List<String> d = createFieldList(Arrays.asList("Manghi, Pasquale"), "authors");
|
||||||
result = authorsMatch.compare(c, d, conf);
|
result = authorsMatch.compare(c, d, conf);
|
||||||
|
|
||||||
assertEquals(0.0, result);
|
assertEquals(0.0, result);
|
||||||
|
@ -268,12 +266,12 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
|
|
||||||
assertEquals(1.0, result);
|
assertEquals(1.0, result);
|
||||||
|
|
||||||
Field e = createFieldList(Arrays.asList("Manghi, Paolo", "Atzori, Claudio"), "authors");
|
List<String> e = createFieldList(Arrays.asList("Manghi, Paolo", "Atzori, Claudio"), "authors");
|
||||||
result = authorsMatch.compare(a, e, conf);
|
result = authorsMatch.compare(a, e, conf);
|
||||||
|
|
||||||
assertEquals(0.25, result);
|
assertEquals(0.25, result);
|
||||||
|
|
||||||
Field f = createFieldList(new ArrayList<>(), "authors");
|
List<String> f = createFieldList(new ArrayList<>(), "authors");
|
||||||
result = authorsMatch.compare(f, f, conf);
|
result = authorsMatch.compare(f, f, conf);
|
||||||
System.out.println("result = " + result);
|
System.out.println("result = " + result);
|
||||||
|
|
||||||
|
@ -284,12 +282,12 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
|
|
||||||
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
JsonListMatch jsonListMatch = new JsonListMatch(params);
|
||||||
|
|
||||||
Field a = createFieldList(
|
List<String> a = createFieldList(
|
||||||
Arrays
|
Arrays
|
||||||
.asList(
|
.asList(
|
||||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"),
|
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":null,\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:actionset\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"doi\",\"classname\":\"Digital Object Identifier\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"10.1111/pbi.12655\"}"),
|
||||||
"authors");
|
"authors");
|
||||||
Field b = createFieldList(
|
List<String> b = createFieldList(
|
||||||
Arrays
|
Arrays
|
||||||
.asList(
|
.asList(
|
||||||
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}",
|
"{\"datainfo\":{\"deletedbyinference\":false,\"inferenceprovenance\":\"\",\"inferred\":false,\"invisible\":false,\"provenanceaction\":{\"classid\":\"sysimport:crosswalk:repository\",\"classname\":\"Harvested\",\"schemeid\":\"dnet:provenanceActions\",\"schemename\":\"dnet:provenanceActions\"},\"trust\":\"0.9\"},\"qualifier\":{\"classid\":\"pmc\",\"classname\":\"PubMed Central ID\",\"schemeid\":\"dnet:pid_types\",\"schemename\":\"dnet:pid_types\"},\"value\":\"PMC5399005\"}",
|
||||||
|
@ -313,8 +311,8 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
public void domainExactMatch() {
|
public void domainExactMatch() {
|
||||||
|
|
||||||
DomainExactMatch domainExactMatch = new DomainExactMatch(params);
|
DomainExactMatch domainExactMatch = new DomainExactMatch(params);
|
||||||
Field a = url("http://www.flowrepository.org");
|
String a = url("http://www.flowrepository.org");
|
||||||
Field b = url("http://flowrepository.org/");
|
String b = url("http://flowrepository.org/");
|
||||||
|
|
||||||
double compare = domainExactMatch.compare(a, b, conf);
|
double compare = domainExactMatch.compare(a, b, conf);
|
||||||
System.out.println("compare = " + compare);
|
System.out.println("compare = " + compare);
|
||||||
|
@ -326,12 +324,12 @@ public class ComparatorTest extends AbstractPaceTest {
|
||||||
|
|
||||||
CosineSimilarity cosineSimilarity = new CosineSimilarity(params);
|
CosineSimilarity cosineSimilarity = new CosineSimilarity(params);
|
||||||
|
|
||||||
Field a = new FieldValueImpl(Type.DoubleArray, "array", new double[] {
|
double[] a = new double[] {
|
||||||
1, 2, 3
|
1, 2, 3
|
||||||
});
|
};
|
||||||
Field b = new FieldValueImpl(Type.DoubleArray, "array", new double[] {
|
double[] b = new double[] {
|
||||||
1, 2, 3
|
1, 2, 3
|
||||||
});
|
};
|
||||||
|
|
||||||
double compare = cosineSimilarity.compare(a, b, conf);
|
double compare = cosineSimilarity.compare(a, b, conf);
|
||||||
|
|
||||||
|
|
|
@ -3,26 +3,14 @@ package eu.dnetlib.pace.config;
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.HashMap;
|
||||||
import java.util.stream.Collectors;
|
import java.util.Map;
|
||||||
|
|
||||||
import org.junit.jupiter.api.*;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import eu.dnetlib.pace.AbstractPaceTest;
|
import eu.dnetlib.pace.AbstractPaceTest;
|
||||||
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
|
|
||||||
import eu.dnetlib.pace.clustering.ClusteringClass;
|
|
||||||
import eu.dnetlib.pace.clustering.ClusteringCombiner;
|
|
||||||
import eu.dnetlib.pace.model.Field;
|
|
||||||
import eu.dnetlib.pace.model.FieldList;
|
|
||||||
import eu.dnetlib.pace.model.FieldValue;
|
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
|
||||||
import eu.dnetlib.pace.tree.JsonListMatch;
|
|
||||||
import eu.dnetlib.pace.tree.support.AggType;
|
|
||||||
import eu.dnetlib.pace.tree.support.FieldConf;
|
|
||||||
import eu.dnetlib.pace.tree.support.TreeNodeDef;
|
|
||||||
import eu.dnetlib.pace.tree.support.TreeNodeStats;
|
|
||||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
|
|
||||||
public class ConfigTest extends AbstractPaceTest {
|
public class ConfigTest extends AbstractPaceTest {
|
||||||
|
@ -82,41 +70,6 @@ public class ConfigTest extends AbstractPaceTest {
|
||||||
assertEquals(0, load.getPace().translationMap().keySet().size());
|
assertEquals(0, load.getPace().translationMap().keySet().size());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
public void asMapDocumentTest1() {
|
|
||||||
|
|
||||||
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
|
|
||||||
|
|
||||||
final String json = readFromClasspath("publication.json");
|
|
||||||
|
|
||||||
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
|
||||||
|
|
||||||
// System.out.println("mapDocument = " + mapDocument.getFieldMap());
|
|
||||||
|
|
||||||
// JsonListMatch jsonListMatch = new JsonListMatch(params);
|
|
||||||
//
|
|
||||||
// jsonListMatch.compare(mapDocument.getFieldMap().get("pid"), mapDocument.getFieldMap().get("pid"), null);
|
|
||||||
|
|
||||||
System.out.println("mapDocument = " + mapDocument.getFieldMap().get("title").stringValue());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void authorAsMapDocument() {
|
|
||||||
|
|
||||||
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("author.fdup.conf.json"));
|
|
||||||
|
|
||||||
final String json = readFromClasspath("author.json");
|
|
||||||
|
|
||||||
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
|
||||||
|
|
||||||
System.out
|
|
||||||
.println(
|
|
||||||
"mapDocument = "
|
|
||||||
+ Arrays.toString(((FieldValue) mapDocument.getFieldMap().get("topics")).doubleArrayValue()));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testJPath() {
|
public void testJPath() {
|
||||||
final String json = readFromClasspath("organization.json");
|
final String json = readFromClasspath("organization.json");
|
||||||
|
@ -126,53 +79,4 @@ public class ConfigTest extends AbstractPaceTest {
|
||||||
System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json));
|
System.out.println("result = " + MapDocumentUtil.getJPathString(jpath, json));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
public void clusteringCombinerTest() {
|
|
||||||
|
|
||||||
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
|
|
||||||
|
|
||||||
final String json = readFromClasspath("publication.json");
|
|
||||||
|
|
||||||
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
|
||||||
|
|
||||||
String[] combine = ClusteringCombiner.combine(mapDocument, dedupConf).toArray(new String[3]);
|
|
||||||
|
|
||||||
assertEquals("test", combine[0].split(":")[1]);
|
|
||||||
assertEquals("title", combine[1].split(":")[1]);
|
|
||||||
assertEquals("doi", combine[2].split(":")[1]);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void filterAndCombineTest() {
|
|
||||||
|
|
||||||
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("pub.prod.conf.json"));
|
|
||||||
|
|
||||||
final String json = readFromClasspath("publication.example.json");
|
|
||||||
|
|
||||||
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
|
||||||
|
|
||||||
Collection<String> strings = BlacklistAwareClusteringCombiner.filterAndCombine(mapDocument, dedupConf);
|
|
||||||
|
|
||||||
for (String s : strings) {
|
|
||||||
System.out.println("s = " + s);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void crossCompareTest() {
|
|
||||||
|
|
||||||
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("organization.cross.compare.conf.json"));
|
|
||||||
|
|
||||||
TreeNodeDef treeNode = dedupConf.decisionTree().get("start");
|
|
||||||
|
|
||||||
final String json = readFromClasspath("organization.json");
|
|
||||||
|
|
||||||
final MapDocument doc = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
|
|
||||||
|
|
||||||
TreeNodeStats nodeStats = treeNode.evaluate(doc, doc, dedupConf);
|
|
||||||
|
|
||||||
assertTrue(nodeStats.getFinalScore(AggType.MAX) > 0.7);
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,9 +6,11 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
import org.junit.jupiter.api.*;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import eu.dnetlib.pace.model.Person;
|
import eu.dnetlib.pace.model.Person;
|
||||||
|
import jdk.nashorn.internal.ir.annotations.Ignore;
|
||||||
|
|
||||||
public class UtilTest {
|
public class UtilTest {
|
||||||
|
|
||||||
|
@ -20,6 +22,7 @@ public class UtilTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@Ignore
|
||||||
public void paceResolverTest() {
|
public void paceResolverTest() {
|
||||||
PaceResolver paceResolver = new PaceResolver();
|
PaceResolver paceResolver = new PaceResolver();
|
||||||
paceResolver.getComparator("keywordMatch", params);
|
paceResolver.getComparator("keywordMatch", params);
|
||||||
|
|
|
@ -11,12 +11,12 @@
|
||||||
<dependencies>
|
<dependencies>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
|
|
|
@ -38,6 +38,8 @@
|
||||||
</execution>
|
</execution>
|
||||||
</executions>
|
</executions>
|
||||||
<configuration>
|
<configuration>
|
||||||
|
<failOnMultipleScalaVersions>true</failOnMultipleScalaVersions>
|
||||||
|
<scalaCompatVersion>${scala.binary.version}</scalaCompatVersion>
|
||||||
<scalaVersion>${scala.version}</scalaVersion>
|
<scalaVersion>${scala.version}</scalaVersion>
|
||||||
</configuration>
|
</configuration>
|
||||||
</plugin>
|
</plugin>
|
||||||
|
@ -54,11 +56,11 @@
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
|
@ -75,6 +77,11 @@
|
||||||
<groupId>dom4j</groupId>
|
<groupId>dom4j</groupId>
|
||||||
<artifactId>dom4j</artifactId>
|
<artifactId>dom4j</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.scala-lang.modules</groupId>
|
||||||
|
<artifactId>scala-xml_${scala.binary.version}</artifactId>
|
||||||
|
<version>${scala-xml.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>xml-apis</groupId>
|
<groupId>xml-apis</groupId>
|
||||||
|
|
|
@ -7,8 +7,8 @@ import java.util.ArrayList;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.commons.lang.reflect.FieldUtils;
|
import org.apache.commons.lang3.reflect.FieldUtils;
|
||||||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
||||||
import org.apache.poi.openxml4j.opc.OPCPackage;
|
import org.apache.poi.openxml4j.opc.OPCPackage;
|
||||||
import org.apache.poi.ss.usermodel.Cell;
|
import org.apache.poi.ss.usermodel.Cell;
|
||||||
|
|
|
@ -1,12 +1,13 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.actionmanager.project.utils;
|
package eu.dnetlib.dhp.actionmanager.project.utils;
|
||||||
|
|
||||||
import java.io.*;
|
import java.io.BufferedWriter;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.OutputStreamWriter;
|
||||||
|
import java.io.Serializable;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.zip.ZipEntry;
|
|
||||||
import java.util.zip.ZipInputStream;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
@ -66,7 +67,7 @@ public class ReadProjects implements Serializable {
|
||||||
|
|
||||||
FSDataInputStream inputStream = fs.open(hdfsreadpath);
|
FSDataInputStream inputStream = fs.open(hdfsreadpath);
|
||||||
|
|
||||||
ArrayList<Project> projects = OBJECT_MAPPER
|
List<Project> projects = OBJECT_MAPPER
|
||||||
.readValue(
|
.readValue(
|
||||||
IOUtils.toString(inputStream, "UTF-8"),
|
IOUtils.toString(inputStream, "UTF-8"),
|
||||||
new TypeReference<List<Project>>() {
|
new TypeReference<List<Project>>() {
|
||||||
|
|
|
@ -6,7 +6,6 @@ import java.io.IOException;
|
||||||
import java.io.OutputStreamWriter;
|
import java.io.OutputStreamWriter;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
@ -23,7 +22,6 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.actionmanager.project.PrepareProjects;
|
import eu.dnetlib.dhp.actionmanager.project.PrepareProjects;
|
||||||
import eu.dnetlib.dhp.actionmanager.project.utils.model.JsonTopic;
|
import eu.dnetlib.dhp.actionmanager.project.utils.model.JsonTopic;
|
||||||
import eu.dnetlib.dhp.actionmanager.project.utils.model.Project;
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -68,7 +66,7 @@ public class ReadTopics implements Serializable {
|
||||||
|
|
||||||
FSDataInputStream inputStream = fs.open(hdfsreadpath);
|
FSDataInputStream inputStream = fs.open(hdfsreadpath);
|
||||||
|
|
||||||
ArrayList<JsonTopic> topics = OBJECT_MAPPER
|
List<JsonTopic> topics = OBJECT_MAPPER
|
||||||
.readValue(
|
.readValue(
|
||||||
IOUtils.toString(inputStream, "UTF-8"),
|
IOUtils.toString(inputStream, "UTF-8"),
|
||||||
new TypeReference<List<JsonTopic>>() {
|
new TypeReference<List<JsonTopic>>() {
|
||||||
|
|
|
@ -9,7 +9,7 @@ import java.util.Iterator;
|
||||||
import java.util.Queue;
|
import java.util.Queue;
|
||||||
import java.util.concurrent.PriorityBlockingQueue;
|
import java.util.concurrent.PriorityBlockingQueue;
|
||||||
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
import org.dom4j.DocumentException;
|
import org.dom4j.DocumentException;
|
||||||
import org.dom4j.DocumentHelper;
|
import org.dom4j.DocumentHelper;
|
||||||
|
|
|
@ -16,11 +16,11 @@
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
|
|
|
@ -18,11 +18,11 @@
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.elasticsearch</groupId>
|
<groupId>org.elasticsearch</groupId>
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
package eu.dnetlib.dhp.broker.oa;
|
package eu.dnetlib.dhp.broker.oa;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.dhp.broker.oa.util;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.spark.sql.Row;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -10,9 +11,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
import eu.dnetlib.pace.model.MapDocument;
|
import eu.dnetlib.pace.model.SparkDeduper;
|
||||||
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
import eu.dnetlib.pace.tree.support.TreeProcessor;
|
||||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
|
||||||
|
|
||||||
public class TrustUtils {
|
public class TrustUtils {
|
||||||
|
|
||||||
|
@ -20,13 +20,18 @@ public class TrustUtils {
|
||||||
|
|
||||||
private static DedupConfig dedupConfig;
|
private static DedupConfig dedupConfig;
|
||||||
|
|
||||||
|
private static SparkDeduper deduper;
|
||||||
|
|
||||||
|
private static final ObjectMapper mapper;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
final ObjectMapper mapper = new ObjectMapper();
|
mapper = new ObjectMapper();
|
||||||
try {
|
try {
|
||||||
dedupConfig = mapper
|
dedupConfig = mapper
|
||||||
.readValue(
|
.readValue(
|
||||||
DedupConfig.class.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json"),
|
DedupConfig.class.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/dedupConfig/dedupConfig.json"),
|
||||||
DedupConfig.class);
|
DedupConfig.class);
|
||||||
|
deduper = new SparkDeduper(dedupConfig);
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
log.error("Error loading dedupConfig, e");
|
log.error("Error loading dedupConfig, e");
|
||||||
}
|
}
|
||||||
|
@ -42,11 +47,8 @@ public class TrustUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
final ObjectMapper objectMapper = new ObjectMapper();
|
final Row doc1 = deduper.model().rowFromJson(mapper.writeValueAsString(r1));
|
||||||
final MapDocument doc1 = MapDocumentUtil
|
final Row doc2 = deduper.model().rowFromJson(mapper.writeValueAsString(r2));
|
||||||
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r1));
|
|
||||||
final MapDocument doc2 = MapDocumentUtil
|
|
||||||
.asMapDocumentWithJPath(dedupConfig, objectMapper.writeValueAsString(r2));
|
|
||||||
|
|
||||||
final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2);
|
final double score = new TreeProcessor(dedupConfig).computeScore(doc1, doc2);
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.broker.oa.util.aggregators.stats;
|
package eu.dnetlib.dhp.broker.oa.util.aggregators.stats;
|
||||||
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.spark.sql.Encoder;
|
import org.apache.spark.sql.Encoder;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.expressions.Aggregator;
|
import org.apache.spark.sql.expressions.Aggregator;
|
||||||
|
|
|
@ -12,6 +12,7 @@ import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
@ -82,8 +83,8 @@ public class SimpleVariableJobTest {
|
||||||
|
|
||||||
final long n = spark
|
final long n = spark
|
||||||
.createDataset(inputList, Encoders.STRING())
|
.createDataset(inputList, Encoders.STRING())
|
||||||
.filter(s -> filter(map.get(s)))
|
.filter((FilterFunction<String>) s -> filter(map.get(s)))
|
||||||
.map((MapFunction<String, String>) s -> s.toLowerCase(), Encoders.STRING())
|
.map((MapFunction<String, String>) String::toLowerCase, Encoders.STRING())
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
System.out.println(n);
|
System.out.println(n);
|
||||||
|
@ -96,8 +97,8 @@ public class SimpleVariableJobTest {
|
||||||
|
|
||||||
final long n = spark
|
final long n = spark
|
||||||
.createDataset(inputList, Encoders.STRING())
|
.createDataset(inputList, Encoders.STRING())
|
||||||
.filter(s -> filter(staticMap.get(s)))
|
.filter((FilterFunction<String>) s -> filter(staticMap.get(s)))
|
||||||
.map((MapFunction<String, String>) s -> s.toLowerCase(), Encoders.STRING())
|
.map((MapFunction<String, String>) String::toLowerCase, Encoders.STRING())
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
System.out.println(n);
|
System.out.println(n);
|
||||||
|
|
|
@ -13,7 +13,7 @@
|
||||||
<plugin>
|
<plugin>
|
||||||
<groupId>net.alchim31.maven</groupId>
|
<groupId>net.alchim31.maven</groupId>
|
||||||
<artifactId>scala-maven-plugin</artifactId>
|
<artifactId>scala-maven-plugin</artifactId>
|
||||||
<version>4.0.1</version>
|
<version>${net.alchim31.maven.version}</version>
|
||||||
<executions>
|
<executions>
|
||||||
<execution>
|
<execution>
|
||||||
<id>scala-compile-first</id>
|
<id>scala-compile-first</id>
|
||||||
|
@ -32,6 +32,8 @@
|
||||||
</execution>
|
</execution>
|
||||||
</executions>
|
</executions>
|
||||||
<configuration>
|
<configuration>
|
||||||
|
<failOnMultipleScalaVersions>true</failOnMultipleScalaVersions>
|
||||||
|
<scalaCompatVersion>${scala.binary.version}</scalaCompatVersion>
|
||||||
<scalaVersion>${scala.version}</scalaVersion>
|
<scalaVersion>${scala.version}</scalaVersion>
|
||||||
</configuration>
|
</configuration>
|
||||||
</plugin>
|
</plugin>
|
||||||
|
@ -53,30 +55,35 @@
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.commons</groupId>
|
||||||
|
<artifactId>commons-lang3</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.scala-lang.modules</groupId>
|
<groupId>org.scala-lang.modules</groupId>
|
||||||
<artifactId>scala-java8-compat_2.11</artifactId>
|
<artifactId>scala-java8-compat_${scala.binary.version}</artifactId>
|
||||||
<version>1.0.2</version>
|
<version>1.0.2</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.scala-lang.modules</groupId>
|
<groupId>org.scala-lang.modules</groupId>
|
||||||
<artifactId>scala-collection-compat_2.11</artifactId>
|
<artifactId>scala-collection-compat_${scala.binary.version}</artifactId>
|
||||||
<version>2.8.0</version>
|
<version>2.11.0</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-core_2.11</artifactId>
|
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.spark</groupId>
|
<groupId>org.apache.spark</groupId>
|
||||||
<artifactId>spark-graphx_2.11</artifactId>
|
<artifactId>spark-graphx_${scala.binary.version}</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
|
|
|
@ -88,9 +88,7 @@ abstract class AbstractSparkAction implements Serializable {
|
||||||
"for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()",
|
"for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()",
|
||||||
configProfileId));
|
configProfileId));
|
||||||
|
|
||||||
DedupConfig dedupConfig = new ObjectMapper().readValue(conf, DedupConfig.class);
|
DedupConfig dedupConfig = DedupConfig.load(conf);
|
||||||
dedupConfig.getPace().initModel();
|
|
||||||
dedupConfig.getPace().initTranslationMap();
|
|
||||||
dedupConfig.getWf().setConfigurationId(actionSetId);
|
dedupConfig.getWf().setConfigurationId(actionSetId);
|
||||||
|
|
||||||
return dedupConfig;
|
return dedupConfig;
|
||||||
|
|
|
@ -5,14 +5,14 @@ import static java.util.Collections.reverseOrder;
|
||||||
import static java.util.Map.Entry.comparingByValue;
|
import static java.util.Map.Entry.comparingByValue;
|
||||||
import static java.util.stream.Collectors.toMap;
|
import static java.util.stream.Collectors.toMap;
|
||||||
|
|
||||||
import static org.apache.commons.lang.StringUtils.endsWith;
|
import static org.apache.commons.lang3.StringUtils.endsWith;
|
||||||
import static org.apache.commons.lang.StringUtils.substringBefore;
|
import static org.apache.commons.lang3.StringUtils.substringBefore;
|
||||||
|
|
||||||
import java.time.Year;
|
import java.time.Year;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||||
|
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue