Compare commits

..

No commits in common. "master" and "dnet-dedup-4.1.9" have entirely different histories.

75 changed files with 2531 additions and 9062 deletions

2
.gitignore vendored
View File

@ -19,5 +19,3 @@
/build
spark-warehouse
/dhp-workflows/dhp-graph-mapper/job-override.properties
test.properties

View File

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-build</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.9</version>
</parent>
<artifactId>dhp-build-assembly-resources</artifactId>

View File

@ -6,11 +6,10 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-build</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.9</version>
</parent>
<artifactId>dhp-build-properties-maven-plugin</artifactId>
<version>4.1.13-SNAPSHOT</version>
<packaging>maven-plugin</packaging>
<description>This module is a maven plugin implementing custom properties substitutions in the build lifecycle</description>
@ -20,19 +19,16 @@
<groupId>org.apache.maven</groupId>
<artifactId>maven-plugin-api</artifactId>
<version>3.6.3</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.maven</groupId>
<artifactId>maven-project</artifactId>
<version>2.2.1</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.maven</groupId>
<artifactId>maven-artifact</artifactId>
<version>2.2.1</version>
<scope>provided</scope>
</dependency>
<dependency>
@ -104,29 +100,6 @@
</configuration>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-plugin-plugin</artifactId>
<version>3.2</version>
<configuration>
<skipErrorNoDescriptorsFound>true</skipErrorNoDescriptorsFound>
</configuration>
<executions>
<execution>
<id>mojo-descriptor</id>
<phase>process-classes</phase>
<goals>
<goal>descriptor</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>

View File

@ -8,8 +8,6 @@ import static org.junit.jupiter.api.Assertions.assertNull;
import org.junit.jupiter.api.*;
import java.nio.file.Paths;
/** @author mhorst, claudio.atzori */
public class GenerateOoziePropertiesMojoTest {
@ -68,7 +66,7 @@ public class GenerateOoziePropertiesMojoTest {
clearSystemProperties();
// given
String workflowSourceDir = Paths.get("eu/dnetlib/dhp/").toString();
String workflowSourceDir = "eu/dnetlib/dhp/";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
// execute
@ -83,14 +81,14 @@ public class GenerateOoziePropertiesMojoTest {
clearSystemProperties();
// given
String workflowSourceDir = Paths.get("eu/dnetlib/dhp/wf/transformers").toString();
String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
// execute
mojo.execute();
// assert
assertEquals(Paths.get("wf/transformers").toString(), System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
}
@Test
@ -98,13 +96,13 @@ public class GenerateOoziePropertiesMojoTest {
clearSystemProperties();
// given
String workflowSourceDir = Paths.get("wf/transformers").toString();
String workflowSourceDir = "wf/transformers";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
// execute
mojo.execute();
// assert
assertEquals(Paths.get("wf/transformers").toString(), System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
}
}

View File

@ -0,0 +1 @@
# Thu Dec 30 13:11:51 CET 2021

View File

@ -5,7 +5,7 @@
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-code-style</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.9</version>
<packaging>jar</packaging>

View File

@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.9</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>dhp-build</artifactId>

View File

@ -1,6 +1,6 @@
useTree = true
entitiesPath = /user/michele.debonis/lda_experiments/authors_pubmed
workingPath = /user/michele.debonis/authors_dedup/gt2_dedup
numPartitions = 1000
dedupConfPath = /user/michele.debonis/lda_experiments/authors.fdup.gt2.conf.json
groundTruthFieldJPath = $.orcid
entitiesPath = /tmp/publications_test_dump
#entitiesPath = /tmp/prod_provision/graph/02_graph_cleaned/publication
workingPath = /user/michele.debonis/new_dedup_test/workingdirtree
dedupConfPath = /user/michele.debonis/new_dedup_test/pubs.tree.conf.json
numPartitions = 8000
useTree = true

View File

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.9</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@ -1,7 +1,7 @@
package eu.dnetlib;
import com.google.common.hash.Hashing;
import eu.dnetlib.graph.JavaGraphProcessor;
import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessorForTesting;
@ -19,6 +19,7 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.graphx.Edge;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
@ -57,13 +58,14 @@ public class Deduper implements Serializable {
.reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize));
}
public static Iterator<Tuple2<String, String>> ccToMergeRel(Tuple2<String, List<String>> cc, DedupConfig dedupConf) {
return cc._2()
public static Iterator<Tuple2<String, String>> ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf) {
return cc
.getDocs()
.stream()
.flatMap(
id -> {
List<Tuple2<String, String>> tmp = new ArrayList<>();
tmp.add(new Tuple2<>(cc._1(), id));
tmp.add(new Tuple2<>(cc.getCcId(), id));
return tmp.stream();
})
.iterator();
@ -136,19 +138,21 @@ public class Deduper implements Serializable {
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
.mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
final JavaRDD<Edge<String>> edgeRdd = spark
final RDD<Edge<String>> edgeRdd = spark
.read()
.load(simRelsPath)
.as(Encoders.bean(Relation.class))
.javaRDD()
.map(Relation::toEdgeRdd);
.map(Relation::toEdgeRdd)
.rdd();
JavaPairRDD<String, List<String>> ccs = JavaGraphProcessor
.findCCs(vertexes, edgeRdd, dedupConf.getWf().getMaxIterations());
JavaRDD<ConnectedComponent> ccs = GraphProcessor
.findCCs(vertexes.rdd(), edgeRdd, maxIterations)
.toJavaRDD();
JavaRDD<Relation> mergeRel = ccs
.filter(cc -> cc._2().size() > 1)
.flatMap(cc -> Deduper.ccToMergeRel(cc, dedupConf))
.filter(k -> k.getDocs().size() > 1)
.flatMap(cc -> ccToMergeRel(cc, dedupConf))
.map(it -> new Relation(it._1(), it._2(), "mergeRel"));
final Dataset<Relation> mergeRels = spark
@ -159,7 +163,7 @@ public class Deduper implements Serializable {
mergeRels.write().mode(SaveMode.Overwrite).parquet(mergeRelsPath);
}
public static void createDedupEntity(DedupConfig dedupConf, String simRelsPath, String mergeRelsPath, String entitiesPath, SparkSession spark, String dedupEntityPath){
public static void createDedupEntity(DedupConfig dedupConf, String mergeRelsPath, String entitiesPath, SparkSession spark, String dedupEntityPath){
JavaPairRDD<String, String> entities = spark
.read()
@ -170,15 +174,7 @@ public class Deduper implements Serializable {
.toJavaRDD()
.mapToPair(t -> t);
// <source_raw_id, relation(source, target)>
JavaPairRDD<String, Relation> simRels = spark
.read()
.load(simRelsPath)
.as(Encoders.bean(Relation.class))
.toJavaRDD()
.mapToPair(r-> new Tuple2<>(r.getSource(), r));
// <raw_id, relation(dedup_id, raw_id)>
// <source, target>: source is the dedup_id, target is the id of the mergedIn
JavaPairRDD<String, Relation> mergeRels = spark
.read()
.load(mergeRelsPath)
@ -191,22 +187,7 @@ public class Deduper implements Serializable {
.groupByKey()
.map(t-> entityMerger(t._1(), t._2().iterator()));
JavaPairRDD<String, Iterable<Relation>> simRelsWithDedupId = simRels
.join(mergeRels)
.mapToPair(x -> new Tuple2<>(x._2()._2().getSource(), x._2()._1()))
.groupByKey();
JavaRDD<ConnectedComponent> groupEntity = mergeRels.join(entities)
.mapToPair(t -> new Tuple2<>(t._2()._1().getSource(), t._2()._2()))
.groupByKey()
.join(simRelsWithDedupId)
.map(x -> new ConnectedComponent(
x._1(),
x._2()._1(),
x._2()._2())
);
groupEntity.saveAsTextFile(dedupEntityPath);
dedupEntities.saveAsTextFile(dedupEntityPath);
}
}

View File

@ -1,56 +0,0 @@
package eu.dnetlib.graph;
import com.clearspring.analytics.util.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ConnectedComponent;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.graphx.*;
import org.apache.spark.rdd.RDD;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;
import java.util.List;
/**
 * Java front-end over GraphX for extracting connected components from a vertex/edge pair of RDDs.
 */
public class JavaGraphProcessor {

    /**
     * Groups the vertex attributes by the connected component their vertex belongs to.
     *
     * @param vertexes      pairs of (vertex id, vertex attribute); the attribute is what ends up in the output lists
     * @param edges         graph edges connecting the vertex ids
     * @param maxIterations iteration bound passed to the GraphX connected-components computation
     * @return pairs of (generated component id, attributes of the vertices in that component); the id is
     *         the prefix {@code dedup______::} followed by the value assigned by {@code zipWithUniqueId}
     */
    public static JavaPairRDD<String, List<String>> findCCs(JavaPairRDD<Object, String> vertexes, JavaRDD<Edge<String>> edges, int maxIterations) {

        final ClassTag<String> stringTag = ClassTag$.MODULE$.apply(String.class);
        final StorageLevel memoryOnly = StorageLevel.MEMORY_ONLY();

        final Graph<String, String> graph =
            Graph.apply(vertexes.rdd(), edges.rdd(), "", memoryOnly, memoryOnly, stringTag, stringTag);

        // <vertexId, componentId> as computed by GraphX
        final GraphOps<String, String> ops = new GraphOps<>(graph, stringTag, stringTag);
        final JavaRDD<Tuple2<Object, Object>> components =
            ops.connectedComponents(maxIterations).vertices().toJavaRDD();

        // re-key every vertex attribute by its component; a vertex without a component keeps its own id as key
        final JavaPairRDD<Object, String> attributeByComponent = vertexes
            .leftOuterJoin(components.mapToPair(pair -> pair))
            .mapToPair(joined -> {
                if (joined._2()._2().isPresent()) {
                    return new Tuple2<>(joined._2()._2(), joined._2()._1());
                }
                return new Tuple2<>(joined._1(), joined._2()._1());
            });

        // <ccId, list(vertex attribute)>
        return attributeByComponent
            .groupByKey()
            .map(group -> Lists.newArrayList(group._2()))
            .zipWithUniqueId()
            .mapToPair(indexed -> new Tuple2<>("dedup______::" + indexed._2().toString(), indexed._1()));
    }

}

View File

@ -19,7 +19,6 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.stream.Collectors;
public abstract class AbstractSparkJob implements Serializable {
@ -60,7 +59,7 @@ public abstract class AbstractSparkJob implements Serializable {
Path path=new Path(filePath);
FileSystem fs = FileSystem.get(new Configuration());
BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(path), StandardCharsets.UTF_8));
BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(path)));
try {
return String.join("", br.lines().collect(Collectors.toList()));
} finally {

View File

@ -1,36 +1,20 @@
package eu.dnetlib.jobs;
import eu.dnetlib.Deduper;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.MapDocumentUtil;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ArgumentApplicationParser;
import eu.dnetlib.support.Block;
import eu.dnetlib.support.ConnectedComponent;
import eu.dnetlib.support.Relation;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.io.IOException;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
public class SparkComputeStatistics extends AbstractSparkJob {
@ -58,42 +42,18 @@ public class SparkComputeStatistics extends AbstractSparkJob {
@Override
public void run() throws IOException {
//https://towardsdatascience.com/7-evaluation-metrics-for-clustering-algorithms-bdc537ff54d2#:~:text=There%20are%20two%20types%20of,to%20all%20unsupervised%20learning%20results)
// read oozie parameters
final String entitiesPath = parser.get("entitiesPath");
final String workingPath = parser.get("workingPath");
final String dedupConfPath = parser.get("dedupConfPath");
final String groundTruthFieldJPath = parser.get("groundTruthFieldJPath");
final int numPartitions = Optional
.ofNullable(parser.get("numPartitions"))
.map(Integer::valueOf)
.orElse(NUM_PARTITIONS);
log.info("entitiesPath: '{}'", entitiesPath);
log.info("workingPath: '{}'", workingPath);
log.info("numPartitions: '{}'", numPartitions);
log.info("dedupConfPath: '{}'", dedupConfPath);
log.info("groundTruthFieldJPath: '{}'", groundTruthFieldJPath);
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
DedupConfig dedupConfig = loadDedupConfig(dedupConfPath);
JavaPairRDD<String, MapDocument> mapDocuments = sc
.textFile(entitiesPath)
.repartition(numPartitions)
.mapToPair(
(PairFunction<String, String, MapDocument>) s -> {
MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, s);
//put in the map the groundTruthField used to compute statistics
d.getFieldMap().put("groundTruth", new FieldValueImpl(Type.String, "groundTruth", MapDocumentUtil.getJPathString(groundTruthFieldJPath, s)));
return new Tuple2<>(d.getIdentifier(), d);
});
JavaRDD<String> entities = mapDocuments.map(d -> d._2().getFieldMap().get("groundTruth").stringValue());
// create blocks
JavaRDD<List<String>> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConfig)
.map(b -> b._2().getDocuments().stream().map(d -> d.getFieldMap().get("groundTruth").stringValue()).collect(Collectors.toList()));
log.info("entitiesPath: '{}'", entitiesPath);
log.info("workingPath: '{}'", workingPath);
log.info("numPartitions: '{}'", numPartitions);
// <source, target>: source is the dedup_id, target is the id of the mergedIn
JavaRDD<Relation> mergerels = spark
@ -108,38 +68,15 @@ public class SparkComputeStatistics extends AbstractSparkJob {
.as(Encoders.bean(Relation.class))
.toJavaRDD();
JavaRDD<List<String>> groups = sc.textFile(workingPath + "/groupentities")
.map(e -> new ObjectMapper().readValue(e, ConnectedComponent.class))
.map(e -> e.getDocs().stream().map(d -> MapDocumentUtil.getJPathString(groundTruthFieldJPath, d)).collect(Collectors.toList()));
long entities_number = entities.count();
long blocks_number = blocks.count();
double blocks_randIndex = randIndex(blocks);
long simrels_number = simrels.count();
long mergerels_number = mergerels.count();
double groups_randIndex = randIndex(groups);
long groups_number = groups.count();
long groundtruth_number = entities.filter(e -> !e.isEmpty()).count();
long correct_groups = groups.filter(x -> x.stream().distinct().count()==1).count();
long wrong_groups = groups_number - correct_groups;
long connected_components = mergerels.groupBy(Relation::getSource).count();
String print =
"Entities : " + entities_number + "\n" +
"Ground Truth : " + groundtruth_number + "\n" +
"Blocks : " + blocks_number + "\n" +
"Blocks RI : " + blocks_randIndex + "\n" +
"SimRels : " + simrels_number + "\n" +
"MergeRels : " + mergerels_number + "\n" +
"Groups : " + groups_number + " (correct: " + correct_groups + ", wrong: " + wrong_groups + ")\n" +
"Groups RI : " + groups_randIndex;
System.out.println(print);
writeStatsFileToHDFS(groundtruth_number, entities_number, blocks_randIndex, groups_randIndex, blocks_number, simrels_number, mergerels_number, groups_number, workingPath + "/stats_file.txt");
writeStatsFileToHDFS(simrels_number, mergerels_number, connected_components, workingPath + "/stats_file");
}
public static void writeStatsFileToHDFS(long groundtruth_number, long entities_number, double blocks_randIndex, double groups_randIndex, long blocks_number, long simrels_number, long mergerels_number, long groups_number, String filePath) throws IOException {
public static void writeStatsFileToHDFS(long simrels_number, long mergerels_number, long connected_components, String filePath) throws IOException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
@ -156,14 +93,9 @@ public class SparkComputeStatistics extends AbstractSparkJob {
}
String print =
"Entities : " + entities_number + "\n" +
"Ground Truth : " + groundtruth_number + "\n" +
"Blocks : " + blocks_number + "\n" +
"Blocks RI : " + blocks_randIndex + "\n" +
"SimRels : " + simrels_number + "\n" +
"MergeRels : " + mergerels_number + "\n" +
"Groups : " + groups_number + "\n" +
"Groups RI : " + groups_randIndex;
"Similarity Relations : " + simrels_number + "\n" +
"Merge Relations : " + mergerels_number + "\n" +
"Connected Components : " + connected_components;
// Create file to write
FSDataOutputStream out = fs.create(outFile);
@ -177,31 +109,5 @@ public class SparkComputeStatistics extends AbstractSparkJob {
e.printStackTrace();
}
}
//TODO find another measure that takes into account all the elements outside of the group too
//RandIndex = number of pairwise correct predictions/total number of possible pairs (in the same cluster) -> bounded between 0 and 1
/**
 * Computes the ratio between the number of element pairs that share the same non-empty label inside a
 * cluster and the total number of element pairs inside a cluster, summed over all clusters.
 *
 * <p>All counts are kept as {@code long}: with {@code int} arithmetic {@code n*(n-1)/2} overflows for a
 * single cluster of a few tens of thousands of elements, and the totals summed across clusters overflow
 * much earlier.
 *
 * @param clusters one list of labels per cluster; empty strings are treated as "no label" and never
 *                 contribute to the numerator, but they still count towards the cluster size
 * @return numerator / denominator as a double; {@code NaN} when no cluster has at least two elements
 *         (0 pairs over 0 pairs). As with any {@code reduce}, an empty RDD makes Spark throw.
 */
public double randIndex(JavaRDD<List<String>> clusters) {

    Tuple2<Long, Long> totals = clusters.map(c -> {
        // count the occurrences of every non-empty label once, then sum the pairs inside each label group
        long num = c.stream()
            .filter(s -> !s.isEmpty())
            .collect(Collectors.groupingBy(s -> s, Collectors.counting()))
            .values()
            .stream()
            .mapToLong(n -> binomialCoefficient(n))
            .sum();

        long den = binomialCoefficient(c.size());

        return new Tuple2<Long, Long>(num, den);
    })
    .reduce((a, b) -> new Tuple2<Long, Long>(a._1() + b._1(), a._2() + b._2()));

    return totals._1().doubleValue() / totals._2();
}

/**
 * Number of unordered pairs that can be formed out of {@code n} elements, i.e. n choose 2.
 *
 * @param n number of elements
 * @return {@code n*(n-1)/2}, computed in 64-bit arithmetic
 */
private static long binomialCoefficient(long n)
{
    return n * (n - 1) / 2;
}
//V-measure = harmonic mean of homogeneity and completeness, homogeneity = each cluster contains only members of a single class, completeness = all members of a given class are assigned to the same cluster
}

View File

@ -7,7 +7,6 @@ import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ArgumentApplicationParser;
import eu.dnetlib.support.ConnectedComponent;
import eu.dnetlib.support.Relation;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
@ -17,32 +16,29 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import scala.Tuple3;
import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import java.util.Optional;
public class SparkCreateGroupEntity extends AbstractSparkJob {
public class SparkCreateDedupEntity extends AbstractSparkJob {
private static final Logger log = LoggerFactory.getLogger(SparkCreateGroupEntity.class);
private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.jobs.SparkCreateDedupEntity.class);
public SparkCreateGroupEntity(ArgumentApplicationParser parser, SparkSession spark) {
public SparkCreateDedupEntity(ArgumentApplicationParser parser, SparkSession spark) {
super(parser, spark);
}
public static void main(String[] args) throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
Utility.readResource("/jobs/parameters/createGroupEntity_parameters.json", SparkCreateGroupEntity.class)
Utility.readResource("/jobs/parameters/createDedupEntity_parameters.json", SparkCreateDedupEntity.class)
);
parser.parseArgument(args);
SparkConf conf = new SparkConf();
new SparkCreateGroupEntity(
new SparkCreateDedupEntity(
parser,
getSparkSession(conf)
).run();
@ -67,7 +63,6 @@ public class SparkCreateGroupEntity extends AbstractSparkJob {
DedupConfig dedupConf = DedupConfig.load(readFileFromHDFS(dedupConfPath));
// <raw_id, json>
JavaPairRDD<String, String> entities = spark
.read()
.textFile(entitiesPath)
@ -77,15 +72,7 @@ public class SparkCreateGroupEntity extends AbstractSparkJob {
.toJavaRDD()
.mapToPair(t -> t);
// <source_raw_id, relation(source, target)>
JavaPairRDD<String, Relation> simRels = spark
.read()
.load(workingPath + "/simrels")
.as(Encoders.bean(Relation.class))
.toJavaRDD()
.mapToPair(r-> new Tuple2<>(r.getSource(), r));
// <raw_id, relation(dedup_id, raw_id)>
// <source, target>: source is the dedup_id, target is the id of the mergedIn
JavaPairRDD<String, Relation> mergeRels = spark
.read()
.load(workingPath + "/mergerels")
@ -93,23 +80,12 @@ public class SparkCreateGroupEntity extends AbstractSparkJob {
.toJavaRDD()
.mapToPair(r -> new Tuple2<>(r.getTarget(), r));
// <dedup_id, simrel>
JavaPairRDD<String, Iterable<Relation>> simRelsWithDedupId = simRels
.join(mergeRels)
.mapToPair(x -> new Tuple2<>(x._2()._2().getSource(), x._2()._1()))
.groupByKey();
JavaRDD<ConnectedComponent> groupEntity = mergeRels.join(entities)
JavaRDD<ConnectedComponent> dedupEntities = mergeRels.join(entities)
.mapToPair(t -> new Tuple2<>(t._2()._1().getSource(), t._2()._2()))
.groupByKey()
.join(simRelsWithDedupId)
.map(x -> new ConnectedComponent(
x._1(),
x._2()._1(),
x._2()._2())
);
.map(t-> Deduper.entityMerger(t._1(), t._2().iterator()));
groupEntity.saveAsTextFile(workingPath + "/groupentities", GzipCodec.class);
dedupEntities.saveAsTextFile(workingPath + "dedupentity");
}

View File

@ -1,7 +1,7 @@
package eu.dnetlib.jobs;
import eu.dnetlib.Deduper;
import eu.dnetlib.graph.JavaGraphProcessor;
import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import eu.dnetlib.pace.utils.Utility;
@ -24,7 +24,6 @@ import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.io.IOException;
import java.util.List;
import java.util.Optional;
import static eu.dnetlib.Deduper.hash;
@ -79,18 +78,20 @@ public class SparkCreateMergeRels extends AbstractSparkJob {
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
.mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
final JavaRDD<Edge<String>> edgeRdd = spark
final RDD<Edge<String>> edgeRdd = spark
.read()
.load(workingPath + "/simrels")
.as(Encoders.bean(Relation.class))
.javaRDD()
.map(Relation::toEdgeRdd);
.map(Relation::toEdgeRdd)
.rdd();
JavaPairRDD<String, List<String>> ccs = JavaGraphProcessor
.findCCs(vertexes, edgeRdd, dedupConf.getWf().getMaxIterations());
JavaRDD<ConnectedComponent> ccs = GraphProcessor
.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations())
.toJavaRDD();
JavaRDD<Relation> mergeRel = ccs
.filter(cc -> cc._2().size() > 1)
.filter(k -> k.getDocs().size() > 1)
.flatMap(cc -> Deduper.ccToMergeRel(cc, dedupConf))
.map(it -> new Relation(it._1(), it._2(), "mergeRel"));

View File

@ -14,7 +14,6 @@ import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

View File

@ -1,7 +1,10 @@
package eu.dnetlib.support;
import java.io.Serializable;
import java.util.*;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
@ -9,7 +12,6 @@ import java.util.stream.StreamSupport;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.model.MapDocument;
import org.codehaus.jackson.annotate.JsonIgnore;
public class Block implements Serializable {
@ -21,11 +23,6 @@ public class Block implements Serializable {
super();
}
/**
 * Creates a block from a key and an already materialized list of documents.
 * The list is stored as-is (no defensive copy is made).
 *
 * @param key       value assigned to this block's key
 * @param documents list assigned to this block's documents
 */
public Block(String key, List<MapDocument> documents) {
this.key = key;
this.documents = documents;
}
public Block(String key, Iterable<MapDocument> documents) {
this.key = key;
this.documents = Lists.newArrayList(documents);

View File

@ -5,35 +5,54 @@ import java.io.Serializable;
import java.util.HashSet;
import java.util.Set;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.utils.Utility;
import org.apache.commons.lang.StringUtils;
import org.codehaus.jackson.annotate.JsonIgnore;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
public class ConnectedComponent implements Serializable {
private HashSet<String> docs;
private String ccId;
private HashSet<Relation> simrels;
public ConnectedComponent() {
}
public ConnectedComponent(String ccId, Set<String> docs, Set<Relation> simrels) {
this.docs = new HashSet<>(docs);
this.ccId = ccId;
this.simrels = new HashSet<>(simrels);
}
public ConnectedComponent(Set<String> docs) {
this.docs = new HashSet<>(docs);
//initialization of id and relations missing
createID();
}
public ConnectedComponent(String ccId, Iterable<String> docs, Iterable<Relation> simrels) {
this.ccId = ccId;
this.docs = Sets.newHashSet(docs);
this.simrels = Sets.newHashSet(simrels);
/**
 * Derives the identifier of this component from its documents.
 *
 * <p>With more than one document the identifier is {@code "dedup::"} followed by the md5 of the value
 * returned by {@link #getMin()}; it is stored in {@code ccId} and returned. Otherwise the first
 * document yielded by the set iterator is returned and {@code ccId} is left untouched.
 *
 * @return the computed component id, or the single document when there is at most one
 */
public String createID() {
    if (docs.size() <= 1) {
        return docs.iterator().next();
    }
    ccId = "dedup::" + Utility.md5(getMin());
    return ccId;
}
/**
 * Scans the documents looking for the lexicographically smallest one.
 *
 * <p>While the running value is blank the current document is appended to it; once it is not blank it
 * is replaced whenever it compares greater than the current document.
 *
 * @return the running minimum after visiting every document, or an empty string when there are none
 */
@JsonIgnore
public String getMin() {
    String smallest = "";
    for (String doc : docs) {
        if (StringUtils.isBlank(smallest)) {
            // appended, not assigned: a blank-but-not-empty running value is kept as a prefix
            smallest = smallest + doc;
        } else if (smallest.compareTo(doc) > 0) {
            smallest = doc;
        }
    }
    return smallest;
}
@Override
@ -61,12 +80,4 @@ public class ConnectedComponent implements Serializable {
/**
 * Sets the identifier of this connected component.
 *
 * @param ccId value assigned to the component id
 */
public void setCcId(String ccId) {
this.ccId = ccId;
}
/**
 * Sets the relations held by this connected component.
 * The set is stored as-is (no copy is made).
 *
 * @param simrels set assigned to the simrels field
 */
public void setSimrels(HashSet<Relation> simrels) {
this.simrels = simrels;
}
/**
 * Returns the relations held by this connected component.
 *
 * @return the internal simrels set itself, not a copy
 */
public HashSet<Relation> getSimrels() {
return simrels;
}
}

View File

@ -16,10 +16,6 @@
<name>dedupConfPath</name>
<description>path for the dedup configuration file</description>
</property>
<property>
<name>groundTruthFieldJPath</name>
<description>jpath of the field to be used as ground truth</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
@ -142,33 +138,6 @@
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
<arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
</spark>
<ok to="CreateGroupEntities"/>
<error to="Kill"/>
</action>
<action name="CreateGroupEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Create Group Entities</name>
<class>eu.dnetlib.jobs.SparkCreateGroupEntity</class>
<jar>dnet-dedup-test-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--entitiesPath</arg><arg>${entitiesPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
<arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
</spark>
<ok to="ComputeStatistics"/>
<error to="Kill"/>
</action>
@ -193,12 +162,36 @@
<arg>--entitiesPath</arg><arg>${entitiesPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
<arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
<arg>--groundTruthFieldJPath</arg><arg>${groundTruthFieldJPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<!--<action name="CreateDedupEntities">-->
<!--<spark xmlns="uri:oozie:spark-action:0.2">-->
<!--<master>yarn</master>-->
<!--<mode>cluster</mode>-->
<!--<name>Create Dedup Entities</name>-->
<!--<class>eu.dnetlib.jobs.SparkCreateDedupEntity</class>-->
<!--<jar>dnet-dedup-test-${projectVersion}.jar</jar>-->
<!--<spark-opts>-->
<!--&#45;&#45;executor-memory=${sparkExecutorMemory}-->
<!--&#45;&#45;executor-cores=${sparkExecutorCores}-->
<!--&#45;&#45;driver-memory=${sparkDriverMemory}-->
<!--&#45;&#45;conf spark.extraListeners=${spark2ExtraListeners}-->
<!--&#45;&#45;conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
<!--&#45;&#45;conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
<!--&#45;&#45;conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
<!--&#45;&#45;conf spark.sql.shuffle.partitions=3840-->
<!--</spark-opts>-->
<!--<arg>&#45;&#45;entitiesPath</arg><arg>${entitiesPath}</arg>-->
<!--<arg>&#45;&#45;workingPath</arg><arg>${workingPath}</arg>-->
<!--<arg>&#45;&#45;numPartitions</arg><arg>${numPartitions}</arg>-->
<!--<arg>&#45;&#45;dedupConfPath</arg><arg>${dedupConfPath}</arg>-->
<!--</spark>-->
<!--<ok to="End"/>-->
<!--<error to="Kill"/>-->
<!--</action>-->
<end name="End"/>
</workflow-app>

View File

@ -16,17 +16,5 @@
"paramLongName": "numPartitions",
"paramDescription": "number of partitions for the similarity relations intermediate phases",
"paramRequired": false
},
{
"paramName": "dc",
"paramLongName": "dedupConfPath",
"paramDescription": "dedup configuration to be used",
"paramRequired": true
},
{
"paramName": "gt",
"paramLongName": "groundTruthFieldJPath",
"paramDescription": "field to be used as groundtruth",
"paramRequired": true
}
]

File diff suppressed because one or more lines are too long

View File

@ -1,134 +0,0 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "author",
"subEntityType": "author",
"subEntityValue": "author",
"orderField": "fullname",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "lnfi", "fields" : [ "fullname" ], "params" : {} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "orcid",
"comparator": "exactMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "orcids",
"ignoreUndefined": "true"
},
"orcids": {
"fields": [
{
"field": "orcids",
"comparator": "stringListMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 3.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "coauthors",
"undefined": "coauthors",
"ignoreUndefined": "true"
},
"coauthors": {
"fields": [
{
"field": "coauthors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "topicsMatch",
"negative": "NO_MATCH",
"undefined": "topicsMatch",
"ignoreUndefined": "true"
},
"topicsMatch": {
"fields": [
{
"field": "topics",
"comparator": "cosineSimilarity",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.5,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
}
},
"model": [
{
"name": "topics",
"type": "DoubleArray",
"path": "$.topics"
},
{
"name": "fullname",
"type": "String",
"path": "$.fullname"
},
{
"name": "orcid",
"type": "String",
"path": "$.orcid"
},
{
"name": "coauthors",
"type": "List",
"path": "$.coAuthors[*].fullname"
},
{
"name": "orcids",
"type": "List",
"path": "$.coAuthors[*].orcid"
}
],
"blacklists": {},
"synonyms": {}
}
}

View File

@ -1,134 +0,0 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "author",
"subEntityType": "author",
"subEntityValue": "author",
"orderField": "fullname",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "lnfi", "fields" : [ "fullname" ], "params" : {} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "orcid",
"comparator": "exactMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "orcids",
"ignoreUndefined": "true"
},
"orcids": {
"fields": [
{
"field": "orcids",
"comparator": "stringListMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 3.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "coauthors",
"undefined": "coauthors",
"ignoreUndefined": "true"
},
"coauthors": {
"fields": [
{
"field": "coauthors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "topicsMatch",
"negative": "NO_MATCH",
"undefined": "topicsMatch",
"ignoreUndefined": "true"
},
"topicsMatch": {
"fields": [
{
"field": "topics",
"comparator": "cosineSimilarity",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
}
},
"model": [
{
"name": "topics",
"type": "DoubleArray",
"path": "$.topics"
},
{
"name": "fullname",
"type": "String",
"path": "$.fullname"
},
{
"name": "orcid",
"type": "String",
"path": "$.orcid"
},
{
"name": "coauthors",
"type": "List",
"path": "$.coAuthors[*].fullname"
},
{
"name": "orcids",
"type": "List",
"path": "$.coAuthors[*].orcid"
}
],
"blacklists": {},
"synonyms": {}
}
}

View File

@ -3,7 +3,7 @@
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "datasource",
"orderField" : "englishname",
"orderField" : "name",
"queueMaxSize" : "2000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
@ -14,9 +14,8 @@
},
"pace" : {
"clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 2, "ngramLen" : "3", "collapseOn:name": "0"} },
{ "name" : "suffixprefix", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 1, "len" : "3", "collapseOn:name": "0" } },
{"name" : "ngrams", "fields" : ["officialname", "englishname"], "params" : {"ngramLen": 4, "max" : 2, "maxPerToken": 2, "minNgramLen": 1, "collapseOn:name": "0"}},
{ "name" : "sortedngrampairs", "fields" : [ "name" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "name" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
],
"decisionTree" : {
@ -40,36 +39,16 @@
"layer2": {
"fields": [
{
"field": "officialname",
"field": "name",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"threshold": 0.9
}
},
{
"field": "englishname",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"threshold": 0.9
}
},
{
"field": "officialname",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"crossCompare": "englishname",
"threshold": 0.9
}
}
],
"threshold": 0.9,
"aggregation": "MAX",
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
@ -77,11 +56,12 @@
}
},
"model" : [
{ "name" : "englishname", "type" : "String", "path" : "$.englishname" },
{ "name" : "officialname", "type" : "String", "path" : "$.officialname" },
{ "name" : "name", "type" : "String", "path" : "$.name" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" }
],
"blacklists" : {},
"blacklists" : {
"legalname" : []
},
"synonyms": {}
}
}

View File

@ -3,9 +3,8 @@
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "organization",
"subEntityValue": "organization",
"orderField" : "legalname",
"queueMaxSize" : "100000",
"queueMaxSize" : "2000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
"idPath":"$.id",
@ -144,10 +143,10 @@
}
},
"model" : [
{ "name" : "country", "type" : "String", "path" : "$.country.classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
{ "name" : "country", "type" : "String", "path" : "$.organization.metadata.country.classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "$.organization.metadata.legalshortname.value"},
{ "name" : "legalname", "type" : "String", "path" : "$.organization.metadata.legalname.value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.organization.metadata.websiteurl.value" },
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"},
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
],
@ -155,7 +154,7 @@
"legalname" : []
},
"synonyms": {
"key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti", "Πανεπιστήμιο", "panepistemio"],
"key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
@ -164,7 +163,7 @@
"key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"],
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό", "eθνικό"],
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
"key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
"key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
"key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],

View File

@ -178,7 +178,6 @@
"params": {
"surname_th": 0.75,
"fullname_th": 0.75,
"size_th": 20,
"mode": "surname"
}
}
@ -216,8 +215,8 @@
},
{
"name": "title",
"type": "StringConcat",
"path": "$.title[?(@.qualifier.classid == 'main title')].value|||$.title[?(@.qualifier.classid == 'subtitle')].value",
"type": "String",
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
"length": 250,
"size": 5
},

View File

@ -51,6 +51,37 @@
],
"decisionTree": {
"start": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
},
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"crossCompare": "alternateid"
}
}
],
"threshold": 0.5,
"aggregation": "MAX",
"positive": "layer1",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"layer1": {
"fields": [
{
"field": "title",
@ -63,8 +94,49 @@
"threshold": 0.9,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "MATCH",
"undefined": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"layer2": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"layer3": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},

View File

@ -6,9 +6,9 @@
"subEntityType": "resulttype",
"subEntityValue": "publication",
"orderField": "title",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"queueMaxSize": "5000",
"groupMaxSize": "2000",
"maxChildren": "1000",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
@ -28,26 +28,9 @@
"idPath": "$.id"
},
"pace": {
"clustering": [
{
"name": "wordsStatsSuffixPrefixChain",
"fields": [
"title"
],
"params": {
"mod": "10"
}
},
{
"name": "lowercase",
"fields": [
"doi",
"altdoi"
],
"params": {
"collapseOn:pid": "0"
}
}
"clustering" : [
{ "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
],
"decisionTree": {
"start": {
@ -59,75 +42,18 @@
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"mode": "count"
"jpath_classid": "$.qualifier.classid"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "instanceTypeCheck",
"undefined": "instanceTypeCheck",
"ignoreUndefined": "false"
},
"instanceTypeCheck": {
"fields": [
{
"field": "instance",
"comparator": "instanceTypeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.5,
"aggregation": "MAX",
"positive": "pidVSaltid",
"negative": "NO_MATCH",
"undefined": "pidVSaltid",
"ignoreUndefined": "true"
},
"pidVSaltid": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"crossCompare": "alternateid",
"mode": "count"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "softCheck",
"negative": "earlyExits",
"undefined": "earlyExits",
"ignoreUndefined": "true"
},
"softCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.9,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"earlyExits": {
"layer2": {
"fields": [
{
"field": "title",
@ -146,12 +72,12 @@
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "strongCheck",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "strongCheck",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"strongCheck": {
"layer3": {
"fields": [
{
"field": "title",
@ -163,30 +89,9 @@
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "surnames",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"surnames": {
"fields": [
{
"field": "authors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"surname_th": 0.75,
"fullname_th": 0.75,
"mode": "full"
}
}
],
"threshold": 0.6,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},
@ -194,29 +99,18 @@
{
"name": "doi",
"type": "String",
"path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "altdoi",
"type": "String",
"path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
"path": "$.pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "pid",
"type": "JSON",
"path": "$.instance[*].pid[*]",
"overrideMatch": "true"
},
{
"name": "alternateid",
"type": "JSON",
"path": "$.instance[*].alternateIdentifier[*]",
"path": "$.pid",
"overrideMatch": "true"
},
{
"name": "title",
"type": "StringConcat",
"path": "$.title[?(@.qualifier.classid == 'main title')].value|||$.title[?(@.qualifier.classid == 'subtitle')].value",
"type": "String",
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
"length": 250,
"size": 5
},
@ -230,11 +124,6 @@
"name": "resulttype",
"type": "String",
"path": "$.resulttype.classid"
},
{
"name": "instance",
"type": "List",
"path": "$.instance[*].instancetype.classname"
}
],
"blacklists": {
@ -465,16 +354,7 @@
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$",
"^Data [mM]anagement [sS]ervices\\.$",
"Research and Advanced Technology for Digital Libraries",
"(?i)^risky business$",
"(?i)^great expectations\\.?$",
"(?i)^what's in a name\\?$",
"(?i)^decisions, decisions\\.?$",
"(?i)^update to our reader, reviewer, and author communities.*",
"(?i)^lest we forget$",
"(?i)^measure for measure$"
"(?i)^.*authors[']? response\\.?$"
]
},
"synonyms": {}

View File

@ -1,381 +0,0 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "result",
"subEntityType": "resulttype",
"subEntityValue": "publication",
"orderField": "title",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "100",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid": "0"} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"mode": "count"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "versionCheck",
"undefined": "versionCheck",
"ignoreUndefined": "true"
},
"versionCheck": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "titleCheck",
"negative": "NO_MATCH",
"undefined": "titleCheck",
"ignoreUndefined": "false"
},
"titleCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.9,
"aggregation": "MAX",
"positive": "authorsCheck",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"authorsCheck": {
"fields": [
{
"field": "authors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.6,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"ignoreUndefined": "false"
}
},
"model": [
{
"name": "doi",
"type": "String",
"path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "altdoi",
"type": "String",
"path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "pid",
"type": "JSON",
"path": "$.instance[*].pid[*]",
"overrideMatch": "true"
},
{
"name": "title",
"type": "String",
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
"length": 250,
"size": 5
},
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname",
"size": 200
},
{
"name": "resulttype",
"type": "String",
"path": "$.resulttype.classid"
}
],
"blacklists": {
"title": [
"(?i)^Data Management Plan",
"^Inside Front Cover$",
"(?i)^Poster presentations$",
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
"^Problems with perinatal pathology\\.?$",
"(?i)^Cases? of Puerperal Convulsions$",
"(?i)^Operative Gyna?ecology$",
"(?i)^Mind the gap\\!?\\:?$",
"^Chronic fatigue syndrome\\.?$",
"^Cartas? ao editor Letters? to the Editor$",
"^Note from the Editor$",
"^Anesthesia Abstract$",
"^Annual report$",
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
"(?i)^Graph and Table of Infectious Diseases?$",
"^Presentation$",
"(?i)^Reviews and Information on Publications$",
"(?i)^PUBLIC HEALTH SERVICES?$",
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
"(?i)^Adrese autora$",
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
"(?i)^Acknowledgement to Referees$",
"(?i)^Behçet's disease\\.?$",
"(?i)^Isolation and identification of restriction endonuclease.*$",
"(?i)^CEREBROVASCULAR DISEASES?.?$",
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
"^Event management$",
"(?i)^Breakfast and Crohn's disease.*\\.?$",
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
"^Gushi hakubutsugaku$",
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
"^Intestinal spirocha?etosis$",
"^Treatment of Rodent Ulcer$",
"(?i)^\\W*Cloud Computing\\W*$",
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
"^Free Communications, Poster Presentations: Session [A-F]$",
"^“The Historical Aspects? of Quackery\\.?”$",
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
"(?i)^Case Report$",
"^Boletín Informativo$",
"(?i)^Glioblastoma Multiforme$",
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
"^Zaměstnanecké výhody$",
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
"(?i)^Carotid body tumours?\\.?$",
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
"^Avant-propos$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
"^Viñetas de Cortázar$",
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
"^Aus der AGMB$",
"^Znanstveno-stručni prilozi$",
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
"^Finanční analýza podniku$",
"^Financial analysis( of business)?$",
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
"^Jikken nihon shūshinsho$",
"(?i)^CORONER('|s)(s|') INQUESTS$",
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
"(?i)^Consultants' contract(s)?$",
"(?i)^Upute autorima$",
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
"^Joshi shin kokubun$",
"^Kōtō shōgaku dokuhon nōson'yō$",
"^Jinjō shōgaku shōka$",
"^Shōgaku shūjichō$",
"^Nihon joshi dokuhon$",
"^Joshi shin dokuhon$",
"^Chūtō kanbun dokuhon$",
"^Wabun dokuhon$",
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
"(?i)^cardiac rehabilitation$",
"(?i)^Analytical summary$",
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
"^Prikazi i osvrti$",
"^Rodinný dům s provozovnou$",
"^Family house with an establishment$",
"^Shinsei chūtō shin kokugun$",
"^Pulmonary alveolar proteinosis(\\.?)$",
"^Shinshū kanbun$",
"^Viñeta(s?) de Rodríguez$",
"(?i)^RUBRIKA UREDNIKA$",
"^A Matching Model of the Academic Publication Market$",
"^Yōgaku kōyō$",
"^Internetový marketing$",
"^Internet marketing$",
"^Chūtō kokugo dokuhon$",
"^Kokugo dokuhon$",
"^Antibiotic Cover for Dental Extraction(s?)$",
"^Strategie podniku$",
"^Strategy of an Enterprise$",
"(?i)^respiratory disease(s?)(\\.?)$",
"^Award(s?) for Gallantry in Civil Defence$",
"^Podniková kultura$",
"^Corporate Culture$",
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
"^Pracovní motivace$",
"^Work Motivation$",
"^Kaitei kōtō jogaku dokuhon$",
"^Konsolidovaná účetní závěrka$",
"^Consolidated Financial Statements$",
"(?i)^intracranial tumour(s?)$",
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
"^The level of motivation process as a leadership$",
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
"(?i)^news and events$",
"(?i)^NOVOSTI I DOGAĐAJI$",
"^Sansū no gakushū$",
"^Posouzení informačního systému firmy a návrh změn$",
"^Information System Assessment and Proposal for ICT Modification$",
"^Stresové zatížení pracovníků ve vybrané profesi$",
"^Stress load in a specific job$",
"^Sunday: Poster Sessions, Pt.*$",
"^Monday: Poster Sessions, Pt.*$",
"^Wednesday: Poster Sessions, Pt.*",
"^Tuesday: Poster Sessions, Pt.*$",
"^Analýza reklamy$",
"^Analysis of advertising$",
"^Shōgaku shūshinsho$",
"^Shōgaku sansū$",
"^Shintei joshi kokubun$",
"^Taishō joshi kokubun dokuhon$",
"^Joshi kokubun$",
"^Účetní uzávěrka a účetní závěrka v ČR$",
"(?i)^The \"?Causes\"? of Cancer$",
"^Normas para la publicación de artículos$",
"^Editor('|s)(s|') [Rr]eply$",
"^Editor(|s)(s|) letter$",
"^Redaktoriaus žodis$",
"^DISCUSSION ON THE PRECEDING PAPER$",
"^Kōtō shōgaku shūshinsho jidōyō$",
"^Shōgaku nihon rekishi$",
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
"^Préface$",
"^Occupational [Hh]ealth [Ss]ervices.$",
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
"^Účetní závěrka ve vybraném podniku.*$",
"^Financial statements in selected company$",
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
"^Pseudomyxoma peritonei$",
"^Kazalo autora$",
"(?i)^uvodna riječ$",
"^Motivace jako způsob vedení lidí$",
"^Motivation as a leadership$",
"^Polyfunkční dům$",
"^Multi\\-funkcional building$",
"^Podnikatelský plán$",
"(?i)^Podnikatelský záměr$",
"(?i)^Business Plan$",
"^Oceňování nemovitostí$",
"^Marketingová komunikace$",
"^Marketing communication$",
"^Sumario Analítico$",
"^Riječ uredništva$",
"^Savjetovanja i priredbe$",
"^Índice$",
"^(Starobosanski nadpisi).*$",
"^Vzdělávání pracovníků v organizaci$",
"^Staff training in organization$",
"^(Life Histories of North American Geometridae).*$",
"^Strategická analýza podniku$",
"^Strategic Analysis of an Enterprise$",
"^Sadržaj$",
"^Upute suradnicima$",
"^Rodinný dům$",
"(?i)^Fami(l)?ly house$",
"^Upute autorima$",
"^Strategic Analysis$",
"^Finanční analýza vybraného podniku$",
"^Finanční analýza$",
"^Riječ urednika$",
"(?i)^Content(s?)$",
"(?i)^Inhalt$",
"^Jinjō shōgaku shūshinsho jidōyō$",
"(?i)^Index$",
"^Chūgaku kokubun kyōkasho$",
"^Retrato de una mujer$",
"^Retrato de un hombre$",
"^Kōtō shōgaku dokuhon$",
"^Shotōka kokugo$",
"^Shōgaku dokuhon$",
"^Jinjō shōgaku kokugo dokuhon$",
"^Shinsei kokugo dokuhon$",
"^Teikoku dokuhon$",
"^Instructions to Authors$",
"^KİTAP TAHLİLİ$",
"^PRZEGLĄD PIŚMIENNICTWA$",
"(?i)^Presentación$",
"^İçindekiler$",
"(?i)^Tabl?e of contents$",
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
"^Editorial( Board)?$",
"(?i)^Editorial \\(English\\)$",
"^Editörden$",
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$",
"^Data [mM]anagement [sS]ervices\\.$",
"Research and Advanced Technology for Digital Libraries",
"Food and Nutrition"
]
},
"synonyms": {}
}
}

View File

@ -1,150 +0,0 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"subEntityType" : "resulttype",
"subEntityValue" : "software",
"orderField" : "title",
"queueMaxSize" : "200",
"groupMaxSize" : "100",
"maxChildren" : "100",
"slidingWindowSize" : "50",
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } },
{ "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid":"0"} },
{ "name" : "ngrams", "fields" : [ "title" ], "params" : {"ngramLen": 3, "max": 4, "maxPerToken":1, "minNgramLen":3}},
{ "name" : "urlclustering", "fields": [ "url" ], "params" : {}}
],
"decisionTree": {
"start": {
"fields": [
{
"field": "doi",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "titleCheck",
"undefined": "titleCheck",
"ignoreUndefined": "false"
},
"titleCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitleIgnoreVersion",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.95,
"aggregation": "AVG",
"positive": "pidCheck",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
},
"pidCheck": {
"fields": [
{
"field": "altdoi",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
},
{
"field": "doi",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {"crossCompare": "altdoi"}
},
{
"field": "url",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1,
"aggregation": "OR",
"positive": "MATCH",
"negative": "authorsCheck",
"undefined": "authorsCheck",
"ignoreUndefined": "false"
},
"authorsCheck": {
"fields": [
{
"field": "authors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"surname_th": 0.70,
"fullname_th": 0.70,
"size_th": 20,
"mode": "surname"
}
}
],
"threshold": 1,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"ignoreUndefined": "false"
}
},
"model" : [
{
"name" : "doi",
"type" : "String",
"path" : "$.instance.pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name" : "altdoi",
"type" : "String",
"path" : "$.instance.alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
},
{
"name" : "title",
"type" : "String",
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
"length" : 250,
"size" : 5
},
{
"name" : "url",
"type" : "String",
"path" : "$.instance.url"
},
{
"name" : "resulttype",
"type" : "String",
"path" : "$.resulttype.classid"
},
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname",
"size": 200
}
],
"blacklists" : {},
"synonyms": {}
}
}

View File

@ -1,4 +0,0 @@
{"websiteurl": "https://fairsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "eosc________::oxford_e-research_centre::oxford_e-research_centre.fairsharing"}
{"websiteurl": "https://FAIRsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "fairsharing_::2521"}
{"websiteurl": "https://fairsharing.org/", "englishname": "formerly: biosharing", "officialname": "FAIRsharing", "id": "re3data_____::r3d100010142"}
{"websiteurl": "https://fairsharing.org/", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "openaire____::fairsharing"}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,32 +0,0 @@
[
{
"paramName": "e",
"paramLongName": "entitiesPath",
"paramDescription": "the input entities",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "path of the working directory",
"paramRequired": true
},
{
"paramName": "np",
"paramLongName": "numPartitions",
"paramDescription": "number of partitions for the similarity relations intermediate phases",
"paramRequired": false
},
{
"paramName": "dc",
"paramLongName": "dedupConfPath",
"paramDescription": "dedup configuration to be used",
"paramRequired": false
},
{
"paramName": "gt",
"paramLongName": "groundTruthFieldJPath",
"paramDescription": "field to be used as groundtruth",
"paramRequired": false
}
]

View File

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.9</version>
<relativePath>../pom.xml</relativePath>
</parent>
@ -67,11 +67,6 @@
<artifactId>json-path</artifactId>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
</dependency>
</dependencies>
</project>

View File

@ -1,59 +1,59 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Document;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.MapDocument;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.Set;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Document;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.MapDocument;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {
public static Collection<String> filterAndCombine(final MapDocument a, final Config conf) {
Document filtered = filter(a, conf.blacklists());
return combine(filtered, conf);
}
private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombiner.class);
private static MapDocument filter(final MapDocument a, final Map<String, List<Pattern>> blacklists) {
if (blacklists == null || blacklists.isEmpty()) {
return a;
}
public static Collection<String> filterAndCombine(final MapDocument a, final Config conf) {
final Map<String, Field> filtered = Maps.newHashMap(a.getFieldMap());
final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, conf.blacklists());
return combine(filtered, conf);
}
for (final Entry<String, List<Pattern>> e : blacklists.entrySet()) {
Field fields = a.getFieldMap().get(e.getKey());
if (fields != null) {
final FieldListImpl fl = new FieldListImpl();
private MapDocument filter(final MapDocument a, final Map<String, List<String>> blacklists) {
final Map<String, Field> filtered = Maps.newHashMap(a.getFieldMap());
if (blacklists != null) {
for (final Entry<String, Field> e : filtered.entrySet()) {
for (Field f : fields) {
if (!isBlackListed(f.stringValue(), e.getValue())) {
fl.add(f);
}
}
filtered.put(e.getKey(), fl);
}
}
return new MapDocument(a.getIdentifier(), filtered);
}
private static boolean isBlackListed(String value, List<Pattern> blacklist) {
for (Pattern pattern : blacklist) {
if (pattern.matcher(value).matches()) {
return true;
}
}
return false;
}
final FieldListImpl fl = new FieldListImpl();
fl.addAll(Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists))));
filtered.put(e.getKey(), fl);
}
}
return new MapDocument(a.getIdentifier(), filtered);
}
/**
* Tries to match the fields in the regex blacklist.
*
* @param fieldName
* @param value
* @return true if the field matches, false otherwise
*/
protected boolean regexMatches(final String fieldName, final String value, final Map<String, Set<String>> blacklists) {
if (blacklists.containsKey(fieldName)) {
for (final String regex : blacklists.get(fieldName)) {
if (value.matches(regex)) return true;
}
}
return false;
}
}

View File

@ -20,6 +20,10 @@ public class ClusteringCombiner {
private static String COLLAPSE_ON= "collapseOn";
public static Collection<String> combine(final Document a, final Config conf) {
return new ClusteringCombiner().doCombine(a, conf);
}
private Collection<String> doCombine(final Document a, final Config conf) {
final Collection<String> res = Sets.newLinkedHashSet();
for (final ClusteringDef cd : conf.clusterings()) {
for (final String fieldName : cd.getFields()) {
@ -47,7 +51,7 @@ public class ClusteringCombiner {
return res;
}
private static String getPrefix(ClusteringDef cd, String fieldName) {
private String getPrefix(ClusteringDef cd, String fieldName) {
return cd.getName()+ SEPARATOR +
cd.getParams().keySet()
.stream()

View File

@ -0,0 +1,48 @@
package eu.dnetlib.pace.clustering;
import java.util.List;
import java.util.Map;
import com.google.common.base.Predicate;
import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
/**
 * Guava {@link Predicate} that accepts a {@link Field} only when its string value does not
 * match any of the blacklist regular expressions registered for a given field name.
 */
public class FieldFilter implements Predicate<Field> {

    private static final Log log = LogFactory.getLog(FieldFilter.class);

    /** Blacklisted regular expressions, keyed by field name. */
    private Map<String, List<String>> blacklists;

    /** Name of the field whose blacklist is consulted by {@link #apply(Field)}. */
    private String fieldName;

    /**
     * @param fieldName  name of the field this filter is applied to
     * @param blacklists regular expressions to reject, keyed by field name
     */
    public FieldFilter(final String fieldName, final Map<String, List<String>> blacklists) {
        this.fieldName = fieldName;
        this.blacklists = blacklists;
    }

    /**
     * @param f the field to test
     * @return true when the field is NOT blacklisted and must therefore be kept
     */
    @Override
    public boolean apply(final Field f) {
        return !regexMatches(fieldName, f.stringValue(), blacklists);
    }

    /**
     * Tries to match the value against the regex blacklist of the given field.
     * Blank regexes are skipped: a blank entry must not disable the patterns listed after it.
     *
     * @param fieldName  name of the field, used as key in the blacklists map
     * @param value      the value to test
     * @param blacklists regular expressions to reject, keyed by field name
     * @return true if the value fully matches at least one regex, false otherwise
     *         (including when there is no blacklist for the field or the value is null)
     */
    protected boolean regexMatches(final String fieldName, final String value, final Map<String, List<String>> blacklists) {
        if (blacklists == null || value == null) {
            return false;
        }
        final List<String> regexes = blacklists.get(fieldName);
        if (regexes == null) {
            return false;
        }
        for (final String regex : regexes) {
            if (StringUtils.isBlank(regex)) {
                // ignore empty entries instead of aborting the whole scan
                continue;
            }
            if (value.matches(regex)) {
                return true;
            }
        }
        return false;
    }
}

View File

@ -41,7 +41,7 @@ public class KeywordsClustering extends AbstractClusteringFunction {
public Collection<String> apply(final Config conf, List<Field> fields) {
return fields.stream().filter(f -> !f.isEmpty())
.map(Field::stringValue)
.map(this::cleanup)
.map(this::cleanup) //TODO can I add this to the AbstractClusteringFunction without overriding the method here?
.map(this::normalize)
.map(s -> filterAllStopWords(s))
.map(s -> doApply(conf, s))

View File

@ -1,77 +0,0 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
import java.util.stream.Collectors;
@ClusteringClass("lnfi")
public class LastNameFirstInitial extends AbstractClusteringFunction{
private boolean DEFAULT_AGGRESSIVE = true;
public LastNameFirstInitial(final Map<String, Integer> params) {
super(params);
}
@Override
public Collection<String> apply(Config conf, List<Field> fields) {
return fields.stream().filter(f -> !f.isEmpty())
.map(Field::stringValue)
.map(this::normalize)
.map(s -> doApply(conf, s))
.map(c -> filterBlacklisted(c, ngramBlacklist))
.flatMap(c -> c.stream())
.filter(StringUtils::isNotBlank)
.collect(Collectors.toCollection(HashSet::new));
}
@Override
protected String normalize(final String s) {
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
.replaceAll("[^ \\w]+", "")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
@Override
protected Collection<String> doApply(final Config conf, final String s) {
final List<String> res = Lists.newArrayList();
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
Person p = new Person(s, aggressive);
if (p.isAccurate()) {
String lastName = p.getNormalisedSurname().toLowerCase();
String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0,1);
res.add(firstInitial.concat(lastName));
}
else { // is not accurate, meaning it has no defined name and surname
List<String> fullname = Arrays.asList(p.getNormalisedFullname().split(" "));
if (fullname.size() == 1) {
res.add(p.getNormalisedFullname().toLowerCase());
}
else if (fullname.size() == 2) {
res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase());
res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase());
}
else {
res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase());
res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase());
}
}
return res;
}
}

View File

@ -9,7 +9,7 @@ import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
@ClusteringClass("personHash")
@ClusteringClass("personhash")
public class PersonHash extends AbstractClusteringFunction {
private boolean DEFAULT_AGGRESSIVE = false;

View File

@ -3,20 +3,24 @@ package eu.dnetlib.pace.common;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.*;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@ -32,7 +36,6 @@ public abstract class AbstractPaceFunctions {
private static Map<String, String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
//list of stopwords in different languages
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
@ -40,9 +43,6 @@ public abstract class AbstractPaceFunctions {
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
//transliterator
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
//blacklist of ngrams: to avoid generic keys
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
@ -68,13 +68,15 @@ public abstract class AbstractPaceFunctions {
protected String cleanup(final String s) {
final String s1 = s.replaceAll(HTML_REGEX, "");
final String s2 = unicodeNormalization(s1.toLowerCase());
final String s3 = nfd(s2);
final String s4 = fixXML(s3);
final String s5 = s4.replaceAll("([0-9]+)", " $1 ");
final String s6 = transliterate(s5);
final String s7 = fixAliases(s6);
final String s00 = s.replaceAll(HTML_REGEX, "");
final String s0 = unicodeNormalization(s00.toLowerCase());
final String s1 = fixAliases(s0);
final String s2 = nfd(s1);
final String s3 = s2.replaceAll("&ndash;", " ");
final String s4 = s3.replaceAll("&amp;", " ");
final String s5 = s4.replaceAll("&quot;", " ");
final String s6 = s5.replaceAll("&minus;", " ");
final String s7 = s6.replaceAll("([0-9]+)", " $1 ");
final String s8 = s7.replaceAll("[^\\p{ASCII}]", "");
final String s9 = s8.replaceAll("[\\p{Punct}]", " ");
final String s10 = s9.replaceAll("\\n", " ");
@ -83,14 +85,6 @@ public abstract class AbstractPaceFunctions {
return s12;
}
protected String fixXML(final String a){
return a.replaceAll("&ndash;", " ")
.replaceAll("&amp;", " ")
.replaceAll("&quot;", " ")
.replaceAll("&minus;", " ");
}
protected boolean checkNumbers(final String a, final String b) {
final String numbersA = getNumbers(a);
final String numbersB = getNumbers(b);
@ -128,31 +122,19 @@ public abstract class AbstractPaceFunctions {
protected static String fixAliases(final String s) {
final StringBuilder sb = new StringBuilder();
s.chars().forEach(ch -> {
for (final char ch : Lists.charactersOf(s)) {
final int i = StringUtils.indexOf(aliases_from, ch);
sb.append(i >= 0 ? aliases_to.charAt(i) : (char)ch);
});
sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
}
return sb.toString();
}
protected static String transliterate(final String s) {
try {
return transliterator.transliterate(s);
}
catch(Exception e) {
return s;
}
}
protected String removeSymbols(final String s) {
final StringBuilder sb = new StringBuilder();
s.chars().forEach(ch -> {
sb.append(StringUtils.contains(alpha, ch) ? (char)ch : ' ');
});
for (final char ch : Lists.charactersOf(s)) {
sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
}
return sb.toString().replaceAll("\\s+", " ");
}
@ -165,7 +147,7 @@ public abstract class AbstractPaceFunctions {
}
protected String normalize(final String s) {
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
return nfd(unicodeNormalization(s))
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
.replaceAll("[^ \\w]+", "")
@ -218,7 +200,6 @@ public abstract class AbstractPaceFunctions {
s = filterStopWords(s, stopwords_fr);
s = filterStopWords(s, stopwords_pt);
s = filterStopWords(s, stopwords_es);
s = filterStopWords(s, stopwords_gr);
return s;
}
@ -234,13 +215,10 @@ public abstract class AbstractPaceFunctions {
}
public static Set<String> loadFromClasspath(final String classpath) {
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
final Set<String> h = Sets.newHashSet();
try {
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
h.add(fixAliases(transliterator.transliterate(s))); //transliteration of the stopwords
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
h.add(s);
}
} catch (final Throwable e) {
return Sets.newHashSet();
@ -249,17 +227,14 @@ public abstract class AbstractPaceFunctions {
}
public static Map<String, String> loadMapFromClasspath(final String classpath) {
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
final Map<String, String> m = new HashMap<>();
try {
for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
//string is like this: code;word1;word2;word3
String[] line = s.split(";");
String value = line[0];
for (int i = 1; i < line.length; i++) {
m.put(fixAliases(transliterator.transliterate(line[i].toLowerCase())), value);
m.put(line[i].toLowerCase(), value);
}
}
} catch (final Throwable e) {
@ -347,7 +322,7 @@ public abstract class AbstractPaceFunctions {
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
final StringWriter sw = new StringWriter();
try {
IOUtils.copy(clazz.getResourceAsStream(filename), sw, StandardCharsets.UTF_8);
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
return sw.toString();
} catch (final IOException e) {
throw new RuntimeException("cannot load resource from classpath: " + filename);

View File

@ -2,7 +2,6 @@ package eu.dnetlib.pace.config;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
@ -48,7 +47,7 @@ public interface Config {
*
* @return the map
*/
public Map<String, List<Pattern>> blacklists();
public Map<String, List<String>> blacklists();
/**

View File

@ -1,6 +1,5 @@
package eu.dnetlib.pace.config;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.model.ClusteringDef;
@ -8,19 +7,15 @@ import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.util.PaceException;
import org.antlr.stringtemplate.StringTemplate;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
@ -36,9 +31,6 @@ public class DedupConfig implements Config, Serializable {
private WfConfig wf;
@JsonIgnore
private Map<String, List<Pattern>> blacklists;
private static Map<String, String> defaults = Maps.newHashMap();
static {
@ -65,12 +57,6 @@ public class DedupConfig implements Config, Serializable {
config = new ObjectMapper().readValue(json, DedupConfig.class);
config.getPace().initModel();
config.getPace().initTranslationMap();
config.blacklists = config.getPace().getBlacklists().entrySet()
.stream()
.collect(Collectors.toMap(e -> e.getKey(),
e ->e.getValue().stream().filter(s -> !StringUtils.isBlank(s)).map(Pattern::compile).collect(Collectors.toList()) ));
return config;
} catch (IOException e) {
throw new PaceException("Error in parsing configuration json", e);
@ -102,7 +88,7 @@ public class DedupConfig implements Config, Serializable {
}
private String readFromClasspath(final String resource) throws IOException {
return IOUtils.toString(getClass().getResource(resource), StandardCharsets.UTF_8);
return IOUtils.toString(getClass().getResource(resource));
}
public PaceConfig getPace() {
@ -151,8 +137,8 @@ public class DedupConfig implements Config, Serializable {
}
@Override
public Map<String, List<Pattern>> blacklists() {
return blacklists;
public Map<String, List<String>> blacklists() {
return getPace().getBlacklists();
}
@Override

View File

@ -3,7 +3,6 @@ package eu.dnetlib.pace.config;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.google.common.collect.Maps;
import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
@ -44,12 +43,10 @@ public class PaceConfig extends AbstractPaceFunctions implements Serializable {
public void initTranslationMap(){
translationMap = Maps.newHashMap();
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
for (String key : synonyms.keySet()) {
for (String term : synonyms.get(key)){
translationMap.put(
fixAliases(transliterator.transliterate(term.toLowerCase())),
normalize(term.toLowerCase()),
key);
}
}

View File

@ -1,5 +1,5 @@
package eu.dnetlib.pace.config;
public enum Type {
String, Int, List, JSON, URL, StringConcat, DoubleArray
String, Int, List, JSON, URL
}

View File

@ -20,6 +20,4 @@ public interface FieldValue extends Field {
*/
public void setValue(final Object value);
public double[] doubleArrayValue();
}

View File

@ -58,10 +58,8 @@ public class FieldValueImpl extends AbstractField implements FieldValue {
throw new RuntimeException(value.toString());
}
case URL:
String str = value.toString();
return StringUtils.isBlank(str) || !isValidURL(str);
case DoubleArray:
return doubleArrayValue().length==0;
String str = value.toString();
return StringUtils.isBlank(str) || !isValidURL(str);
default:
return true;
}
@ -118,10 +116,6 @@ public class FieldValueImpl extends AbstractField implements FieldValue {
// }
}
public double[] doubleArrayValue() {
return (double[])getValue();
}
/*
* (non-Javadoc)
*

View File

@ -43,7 +43,7 @@ public class Person {
// s = s.replaceAll("[\\W&&[^,-]]", "");
}
if (s.contains(",")) { //if the name contains a comma it is easy derivable the name and the surname
if (s.contains(",")) {
final String[] arr = s.split(",");
if (arr.length == 1) {
fullname = splitTerms(arr[0]);

View File

@ -25,8 +25,6 @@ public class AuthorsMatch extends AbstractComparator {
private double NAME_THRESHOLD;
private double FULLNAME_THRESHOLD;
private String MODE; //full or surname
private int SIZE_THRESHOLD;
private String TYPE; //count or percentage
private int common;
public AuthorsMatch(Map<String, String> params){
@ -37,8 +35,6 @@ public class AuthorsMatch extends AbstractComparator {
SURNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("surname_th", "0.95"));
NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
TYPE = params.getOrDefault("type", "percentage");
common = 0;
}
@ -52,9 +48,6 @@ public class AuthorsMatch extends AbstractComparator {
if (a.isEmpty() || b.isEmpty())
return -1;
if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) b).size() > SIZE_THRESHOLD)
return 1.0;
List<Person> aList = ((FieldList) a).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
List<Person> bList = ((FieldList) b).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
@ -67,10 +60,7 @@ public class AuthorsMatch extends AbstractComparator {
//both persons are inaccurate
if (!p1.isAccurate() && !p2.isAccurate()) {
//compare just normalized fullnames
String fullname1 = normalization(p1.getNormalisedFullname().isEmpty()? p1.getOriginal() : p1.getNormalisedFullname());
String fullname2 = normalization(p2.getNormalisedFullname().isEmpty()? p2.getOriginal() : p2.getNormalisedFullname());
if (ssalgo.score(fullname1, fullname2) > FULLNAME_THRESHOLD) {
if (ssalgo.score(normalization(p1.getNormalisedFullname()), normalization(p2.getNormalisedFullname())) > FULLNAME_THRESHOLD) {
common += 1;
break;
}
@ -79,14 +69,10 @@ public class AuthorsMatch extends AbstractComparator {
//one person is inaccurate
if (p1.isAccurate() ^ p2.isAccurate()) {
//prepare data
//data for the accurate person
String name = normalization(p1.isAccurate()? p1.getNormalisedFirstName() : p2.getNormalisedFirstName());
String surname = normalization(p1.isAccurate()? p1.getNormalisedSurname() : p2.getNormalisedSurname());
String name = p1.isAccurate()? normalization(p1.getNormalisedFirstName()) : normalization(p2.getNormalisedFirstName());
String surname = p1.isAccurate()? normalization(p2.getNormalisedSurname()) : normalization(p2.getNormalisedSurname());
//data for the inaccurate person
String fullname = normalization(
p1.isAccurate() ? ((p2.getNormalisedFullname().isEmpty()) ? p2.getOriginal() : p2.getNormalisedFullname()) : (p1.getNormalisedFullname().isEmpty() ? p1.getOriginal() : p1.getNormalisedFullname())
);
String fullname = p1.isAccurate()? normalization(p2.getNormalisedFullname()) : normalization(p1.getNormalisedFullname());
if (fullname.contains(surname)) {
if (MODE.equals("full")) {
@ -125,12 +111,7 @@ public class AuthorsMatch extends AbstractComparator {
//normalization factor to compute the score
int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
if(TYPE.equals("percentage")) {
return (double) common / normFactor;
}
else {
return (double) common;
}
return (double)common / normFactor;
}
public boolean compareSurname(Person p1, Person p2) {

View File

@ -1,53 +0,0 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.Person;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Comparator registered under the name "cosineSimilarity": compares two fields holding
 * double arrays by the cosine of the angle between the two vectors.
 */
@ComparatorClass("cosineSimilarity")
public class CosineSimilarity extends AbstractComparator {

    Map<String, String> params;

    public CosineSimilarity(Map<String,String> params) {
        super(params);
        this.params = params;
    }

    /**
     * @return the cosine similarity of the two vectors, or -1 (undefined) when one of the
     *         fields is empty, the vectors have different lengths, or the similarity is not
     *         a number (e.g. one of the vectors has zero magnitude)
     */
    @Override
    public double compare(final Field a, final Field b, final Config conf) {

        if (a.isEmpty() || b.isEmpty())
            return -1;

        double[] aVector = ((FieldValueImpl) a).doubleArrayValue();
        double[] bVector = ((FieldValueImpl) b).doubleArrayValue();

        // vectors of different dimension are not comparable: report undefined instead of
        // failing with an index error or silently ignoring the trailing components
        if (aVector.length != bVector.length)
            return -1;

        final double similarity = cosineSimilarity(aVector, bVector);

        // 0/0 (zero-magnitude vector) or NaN components yield NaN: map it to undefined
        return Double.isNaN(similarity) ? -1 : similarity;
    }

    /**
     * Plain cosine similarity: dot product divided by the product of the two magnitudes.
     * Iterates over the length of {@code a}; the result is NaN when a magnitude is zero.
     */
    double cosineSimilarity(double[] a, double[] b) {
        double dotProduct = 0;
        double normASum = 0;
        double normBSum = 0;

        for(int i = 0; i < a.length; i ++) {
            dotProduct += a[i] * b[i];
            normASum += a[i] * a[i];
            normBSum += b[i] * b[i];
        }

        // product of the two vector magnitudes
        double magnitudeProduct = Math.sqrt(normASum) * Math.sqrt(normBSum);
        return dotProduct / magnitudeProduct;
    }

}

View File

@ -16,7 +16,6 @@ public class DomainExactMatch extends ExactMatchIgnoreCase {
@Override
protected String getValue(final Field f) {
try {
return asUrl(super.getValue(f)).getHost();
} catch (MalformedURLException e) {

View File

@ -50,9 +50,6 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
cb = removeKeywords(cb, keywords2);
cb = removeKeywords(cb, cities2);
ca = ca.replaceAll("[ ]{2,}", " ");
cb = cb.replaceAll("[ ]{2,}", " ");
if (ca.isEmpty() && cb.isEmpty())
return 1.0;
else

View File

@ -1,34 +0,0 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
/**
 * Comparator registered under the name "numbersComparator": extracts the numbers from the
 * two values and returns the absolute difference between them.
 */
@ComparatorClass("numbersComparator")
public class NumbersComparator extends AbstractComparator {

    Map<String, String> params;

    public NumbersComparator(Map<String, String> params) {
        super(params);
        this.params = params;
    }

    /**
     * @return the absolute difference between the numbers extracted from the two values,
     *         or -1.0 when no number can be extracted from at least one of them
     * @throws NumberFormatException if an extracted string is not a valid integer literal
     */
    @Override
    public double distance(String a, String b, Config conf) {

        //extracts numbers from the field
        String numbers1 = getNumbers(nfd(a));
        String numbers2 = getNumbers(nfd(b));

        if (numbers1.isEmpty() || numbers2.isEmpty())
            return -1.0;

        // arbitrary precision: the extracted digit string may exceed the int range,
        // which made Integer.parseInt fail on long numbers
        final java.math.BigInteger n1 = new java.math.BigInteger(numbers1);
        final java.math.BigInteger n2 = new java.math.BigInteger(numbers2);

        return n1.subtract(n2).abs().doubleValue();
    }
}

View File

@ -42,25 +42,22 @@ public class StringContainsMatch extends AbstractComparator {
STRING = STRING.toLowerCase();
}
if (AGGREGATOR != null) {
switch (AGGREGATOR) {
case "AND":
if (ca.contains(STRING) && cb.contains(STRING))
return 1.0;
break;
case "OR":
if (ca.contains(STRING) || cb.contains(STRING))
return 1.0;
break;
case "XOR":
if (ca.contains(STRING) ^ cb.contains(STRING))
return 1.0;
break;
default:
return 0.0;
}
switch(AGGREGATOR) {
case "AND":
if(ca.contains(STRING) && cb.contains(STRING))
return 1.0;
break;
case "OR":
if(ca.contains(STRING) || cb.contains(STRING))
return 1.0;
break;
case "XOR":
if(ca.contains(STRING) ^ cb.contains(STRING))
return 1.0;
break;
default:
return 0.0;
}
return 0.0;
}
}

View File

@ -19,13 +19,9 @@ public class StringListMatch extends AbstractComparator {
private static final Log log = LogFactory.getLog(StringListMatch.class);
private Map<String, String> params;
final private String TYPE; //percentage or count
public StringListMatch(final Map<String, String> params) {
super(params);
this.params = params;
TYPE = params.getOrDefault("type", "percentage");
}
@Override
@ -35,7 +31,7 @@ public class StringListMatch extends AbstractComparator {
final Set<String> pb = new HashSet<>(((FieldList) b).stringList());
if (pa.isEmpty() || pb.isEmpty()) {
return -1; //return undefined if one of the two lists is empty
return -1; //return undefined if one of the two lists of pids is empty
}
int incommon = Sets.intersection(pa, pb).size();
@ -45,10 +41,7 @@ public class StringListMatch extends AbstractComparator {
return 0.0;
}
if(TYPE.equals("percentage"))
return (double)incommon / (incommon + simDiff);
else
return incommon;
return (double)incommon / (incommon + simDiff);
}
}

View File

@ -1,6 +1,5 @@
package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.PaceConfig;
@ -10,7 +9,6 @@ import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.io.Serializable;
import java.io.StringWriter;
import java.util.List;
public class TreeNodeDef implements Serializable {
@ -59,9 +57,8 @@ public class TreeNodeDef implements Serializable {
double result2 = comparator(fieldConf).compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf);
result = Math.max(result1,result2);
}
else {
else
result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
}
stats.addFieldStats(
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),

View File

@ -161,14 +161,11 @@ public class BlockProcessorForTesting {
}
else {
//use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications)
if (useTree)
if(useTree)
emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
else
emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
}
// if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) {
// emitOutput(true, idPivot, idCurr, context);
// }
}
}
@ -183,45 +180,38 @@ public class BlockProcessorForTesting {
return compare>=1.0;
}
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
//if the score gives 1, the publications are equivalent
Map<String, String> params = new HashMap<>();
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
params.put("mode", "count");
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
double score = 0.0;
double score = 0.0;
//LAYER 1 - comparison of the PIDs json lists
Map<String, String> params = new HashMap<>();
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
JsonListMatch jsonListMatch = new JsonListMatch(params);
double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config);
if (result >= 0.5) //if the result of the comparison is greater than the threshold
score += 10.0; //high score because it should match when the first condition is satisfied
else
score += 0.0;
//levenstein title
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
if(levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) {
score += 0.2;
//LAYER 2 - comparison of the title version and the size of the authors lists
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
SizeMatch sizeMatch = new SizeMatch(params);
double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
if (Math.min(result1, result2) != 0)
score+=0;
else
score-=2;
//LAYER 3 - computation of levenshtein on titles
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
score += Double.isNaN(result3)?0.0:result3;
return score >= 0.99;
}
//pid
JsonListMatch jsonListMatch = new JsonListMatch(params);
if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) {
score += 0.5;
}
//title version
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
if(result1<0 || result1>=1.0) {
score += 0.1;
}
//authors match
params.remove("mode");
AuthorsMatch authorsMatch = new AuthorsMatch(params);
double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
if(result2 <0|| result2>=0.6) {
score += 0.2;
}
return score>=0.5;
}
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
if (result) {
@ -244,5 +234,6 @@ public class BlockProcessorForTesting {
final String type = dedupConf.getWf().getEntityType();
context.emit(type, from, to);
context.emit(type, to, from);
}
}

View File

@ -7,13 +7,14 @@ import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.Option;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.*;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument;
import net.minidev.json.JSONArray;
import java.math.BigDecimal;
import java.util.*;
import java.util.function.Predicate;
import java.util.stream.Collectors;
public class MapDocumentUtil {
@ -44,25 +45,6 @@ public class MapDocumentUtil {
.forEach(fi::add);
stringField.put(fdef.getName(), fi);
break;
case DoubleArray:
stringField.put(
fdef.getName(),
new FieldValueImpl(Type.DoubleArray,
fdef.getName(),
getJPathArray(fdef.getPath(), json))
);
break;
case StringConcat:
String[] jpaths = fdef.getPath().split("\\|\\|\\|");
stringField.put(
fdef.getName(),
new FieldValueImpl(Type.String,
fdef.getName(),
truncateValue(Arrays.stream(jpaths).map(jpath -> getJPathString(jpath, json)).collect(Collectors.joining(" ")),
fdef.getLength())
)
);
break;
}
});
m.setFieldMap(stringField);
@ -121,30 +103,6 @@ public class MapDocumentUtil {
}
}
/**
 * Evaluates {@code jsonPath} against {@code json} and converts the result to a {@code double[]}.
 *
 * <p>A result that already is a {@code double[]} is returned as is; a {@link JSONArray} is
 * converted element by element. Any other result type yields an empty array.
 *
 * @param jsonPath the JsonPath expression to evaluate
 * @param json the JSON document the expression is evaluated against
 * @return the extracted values; an empty array when the evaluation throws, when the result is
 *         neither a {@code double[]} nor a {@link JSONArray}, or when an element is not a
 *         {@link Number}
 */
public static double[] getJPathArray(final String jsonPath, final String json) {
    try {
        Object o = JsonPath.read(json, jsonPath);
        if (o instanceof double[])
            return (double[]) o;
        if (o instanceof JSONArray) {
            Object[] objects = ((JSONArray) o).toArray();
            double[] array = new double[objects.length];
            for (int i = 0; i < objects.length; i++) {
                // Convert through Number instead of casting to double: the cast only succeeds
                // for java.lang.Double, so any other Number (e.g. an Integer or Long element)
                // used to throw ClassCastException and the whole array was discarded.
                // BigDecimal is a Number as well, so it needs no special case.
                array[i] = ((Number) objects[i]).doubleValue();
            }
            return array;
        }
        return new double[0];
    }
    catch (Exception e) {
        // Best effort: an unreadable path or a non-numeric element yields an empty array.
        e.printStackTrace();
        return new double[0];
    }
}
public static String truncateValue(String value, int length) {
if (value == null)

View File

@ -909,6 +909,7 @@ city::2389086;Berberati;BBT;Berberati;Berbérati;Берберати;
city::2389853;Bangui;BGF;Bangi;Bangis;Bangui;Mpan'nkoui;ban ji;bang-gi;bangi;bangwyy;Μπανγκουί;Банги;Бангі;בנגואי;بانگوئی;ባንጊ;バンギ;班基;방기;
city::2255414;Pointe-Noire;PNR;Pointe-Noire;Puehnt-Nuar;Puent Nuaras;puaengteunualeu;Пуэнт-Нуар;푸앵트누아르;
city::2258261;Dolisie;DIS;Dolisi;Dolisie;Dolisje;Dolizi;Dolosie;Loubomo;Lubomo;dolliji;dorishi;duo li xi;dwlysy;Долиси;Лубомо;دولیسی;ドリシー;多利西;돌리지;
city::2259383;Kayes;Jacob;Kai;Kajes;Kaye;Kayes;Kaï;Кайес;
city::2260535;Brazzaville;BZV;Braza;Brazavil;Brazavilis;Brazavilo;Brazzavil';Brazzaville;Maya-Maya;Mprazabil;N'Tamo;beulajabil;brazafyl;brazawyl;brzwwyl;bu la chai wei er;burazavu~iru;Μπραζαβίλ;Браззавиль;ברזוויל;برازافيل;برازاویل;ብራዛቪል;ブラザヴィル;布拉柴维尔;브라자빌;
city::2657896;Zurich;Cirihe;Cirikh;Ciurichas;Cjurikh;Cjurikh khot;Cuerih;Curych;Cürih;Cīrihe;Gorad Cjurykh;Lungsod ng Zuerich;Lungsod ng Zürich;Su-la-sie;Suerix;Syurix;Sürix;Sŭ-là̤-sié;Tsuerix;Tsurique;Tsürix;Turicum;Turitg;ZRH;Zeurich;Zirich;Zirik;Zuerich;Zuerigh;Zuerih;Zuric;Zurich;Zuricu;Zurigh;Zurigo;Zuriko;Zurique;Zurych;Zurìcu;Zyriche;Zyrihu;Zúric;Zúrich;Zürich;Zürigh;Zürih;churihhi;chwilihi;curikku;jhyurika;jurikha;su li shi;su rik;suricc;tsiurikhi;tsyryk;zi'urikha;zwrykh;zyryk;zyurikha;zywrch;zywrh;zywrkh;Ζυρίχη;Горад Цюрых;Цирих;Цюрих;Цюрих хот;Ցյուրիխ;ציריך;زوريخ;زوریخ;زيورخ;زیورخ;زیورچ;سيۇرىخ;څوریخ;ܙܝܘܪܚ;ܬܣܝܪܝܟ;ज़्यूरिख़;झ्युरिक;জুরিখ;ਜ਼ਿਊਰਿਖ;சூரிக்கு;സൂറിച്ച്;ซูริก;ဇူးရစ်ချ်မြို့;ციურიხი;ዙሪክ;チューリッヒ;苏黎世;蘇黎世;취리히;
city::2657970;Winterthur;Eulachstadt;Gorad Vintehrtur;Vintertour;Vintertur;Vintertura;Vinterturas;Vinterturi;Vinterturo;Vintertūra;Vintertūras;Vitudurum;Winterthour;Winterthur;ZLI;binteotueo;fyntrtwr;vu~intato~uru;wen te tu er;wntrtwr;Βίντερτουρ;Винтертур;Вінтертур;Горад Вінтэртур;فينترتور;ونترتور;ونٹرتھر;ვინტერთური;ヴィンタートゥール;温特图尔;빈터투어;
@ -2993,7 +2994,7 @@ city::262036;Glyfada;Aixone;Glifadha;Glifádha;Glyfada;Glyfáda;Γλυφάδα;
city::262135;Galatsi;Galatsi;Galatsion;Galátsi;Galátsion;Γαλάτσι;Γαλάτσιον;
city::263986;Agios Dimitrios;Agios Dimitrios;Ayios Dhimitrios;Brakhami;Brakhámi;Áyios Dhimítrios;Άγιος Δημήτριος;
city::264194;Agia Paraskevi;Agia Paraskeue;Agia Paraskevi;Agía Paraskeví;Ayia Paraskevi;Ayía Paraskeví;Αγία Παρασκευή;
city::264371;Athens;athenon;ATH;Afina;Afini;Afiny;An Aithin;Ateena;Atehny;Aten;Atena;Atenai;Atenas;Atenas - Athena;Atenas - Αθήνα;Αθηνών;Atene;Atenes;Ateni;Ateno;Atenoj;Ateny;Athen;Athena;Athenae;Athenai;Athene;Athenes;Athens;Atheny;Athina;Athinai;Athinia;Athènes;Athén;Athénes;Athény;Athína;Athínai;Atina;Atény;Atēnas;Atėnai;Aþena;Kota Athena;Lungsod ng Athina;Lungsod ng Athína;atene;atene si;ateni;athensa;athyna;atn;etens;xethens;ya dian;Αθήνα;Αθήναι;Αθηνα;Αθηναι;Атина;Атэны;Афины;Афіни;Аѳины;Աթենք;אתונה;آتن;أثينا;ئافېنا;ܐܬܝܢܐ;अथेन्स;ஏதென்ஸ்;เอเธนส์;ათენი;Ἀθῆναι;アテネ;雅典;아테네;아테네 시;
city::264371;Athens;ATH;Afina;Afini;Afiny;An Aithin;Ateena;Atehny;Aten;Atena;Atenai;Atenas;Atenas - Athena;Atenas - Αθήνα;Atene;Atenes;Ateni;Ateno;Atenoj;Ateny;Athen;Athena;Athenae;Athenai;Athene;Athenes;Athens;Atheny;Athina;Athinai;Athinia;Athènes;Athén;Athénes;Athény;Athína;Athínai;Atina;Atény;Atēnas;Atėnai;Aþena;Kota Athena;Lungsod ng Athina;Lungsod ng Athína;atene;atene si;ateni;athensa;athyna;atn;etens;xethens;ya dian;Αθήνα;Αθήναι;Αθηνα;Αθηναι;Атина;Атэны;Афины;Афіни;Аѳины;Աթենք;אתונה;آتن;أثينا;ئافېنا;ܐܬܝܢܐ;अथेन्स;ஏதென்ஸ்;เอเธนส์;ათენი;Ἀθῆναι;アテネ;雅典;아테네;아테네 시;
city::265243;Marousi;Amarousio;Amarousion;Amaroúsion;Marousi;Maroussi;Maroúsi;Αμαρούσιον;Μαρούσι;
city::265488;Acharnes;Acharnae;Acharnai;Acharne;Acharnes;Akharnai;Akharnaí;Menidhi;Menidhion;Menidi;Menidion;Menioi;Menídhi;Menídhion;Meníoi;Αχαρνές;Αχαρναί;Μενίδι;Μενίδιον;
city::265533;Aigaleo;Aegaleo;Aigaleo;Aigáleo;Egaleo;Αιγάλεω;
@ -5209,6 +5210,7 @@ city::2451478;Segou;Segi;Segou;Segu;Segú;Senkou;Ségou;sai gu;segu;sgw;syghw;Σ
city::2453348;Mopti;MZI;Mopti;Moptis;mo pu ti;mobti;moputi;mwbty;mwpty;Μοπτί;Мопти;Мопті;موبتي;موپتی;موپٹی;モプティ;莫普提;몹티;
city::2453662;Markala;Markala;
city::2454268;Koutiala;KTX;Koutiala;Kutiala;ku jia la;Кутиала;庫佳拉;
city::2455518;Kayes;Gorad Kaes;KYS;Kaes;Kagies;Kajes;Kajesas;Kayes;Kayi;ka yi;kai;kays;keseu;kyz;Καγιές;Горад Каес;Каес;Кайес;Каєс;كايس;کایس;کیز;კაესი;カイ;卡伊;케스;
city::2457163;Gao;GAQ;Gao;Nkao;gao;gaw;gayw;jaw;jia ao;ka xo;Γκάο;Гао;Ґао;גאו;جاو;گائو;گاو;กาโอ;ガオ;加奥;가오;
city::2460596;Bamako;BKO;Bamaco - Bamako;Bamakas;Bamako;Bamaku;Bamakó;Bamakɔ;Bammaco;Bammako;Mpamako;ba ma ke;bamako;bamakw;bmqw;Μπαμάκο;Бамако;Բամակո;במקו;باماكو;باماکو;ባማኮ;バマコ;巴馬科;바마코;
city::1285173;Yenangyaung;Yaynangyoung;Yenangyaung;Yenangyoung;
@ -7472,6 +7474,7 @@ city::4177887;West Palm Beach;Litus Palmense Occidentale;Okcidenta Palm Beach;PB
city::4178003;Weston;Uehston;Veston;Weston;vestana;wei si dun;wstwn;wstwn flwryda;Вестон;Уэстон;وستون;وستون، فلوریدا;वेस्टन;韦斯顿;
city::4179320;Albany;ABY;Albany;City of Opportunity;Olbani;albani;albany;albany jarjya;albany jwrjya;ao er ba ni;olbeoni;orubani;Олбани;Олбані;آلبانی، جورجیا;ألباني;البانی، جارجیا;अल्बानी;オールバニ;奧爾巴尼;올버니;
city::4179574;Alpharetta;Al'faretta;Alfareta;Alpharetta;New Prospect Campground;alfarta jwrjya;alfaryta;alfaryta jarjya;alphareta;Алфарета;Альфаретта;آلفارتا، جورجیا;ألفاريتا;الفاریتا، جارجیا;अल्फारेटा;
city::4180386;Athens;AHN;Atens;Atensas;Athens;Athens i Georgia;Athens-Clarke County;Atina;Atuns;Cedar Shoals;aeseonseu;asenzu;athensa;athyna;atn jwrjya;atynz jwrjya;ethensaklarka ka'unti;ya dian;Атенс;Атина;Атънс;אתנס;آتئنز، جورجیا;آتن، جورجیا;أثينا;ایتھنز، جارجیا;अथेन्स;एथेन्सक्लार्क काउन्टी;アセンズ;雅典;애선스;
city::4180439;Atlanta;ATL;Atlant;Atlanta;Atlantae;Atlonta;Canebrake;Gorad Atlanta;Marthasdale;Marthasville;Standing Peachtree;Terminus;White Hall;Whitehall;aeteullaenta;arrlanra nagaram;atalanta;ateullaenta;ateullanta;atlanta;atoranta;atʼlantʼa;etalanta;etlanta;ya te lan da;Ατλάντα;Атлантæ;Атланта;Горад Атланта;Ատլանտա;אטלאנטא;אטלנטה;آتلانتا;أتلانتا;ئەتڵانتا;اٹلانٹا;اٹلانٹا، جارجیا;अटलांटा;अटलान्टा;एट्लान्टा;एत्लान्ता;আটলান্টা;એટલાન્ટા;அட்லான்டா;అట్లాంటా;ಅಟ್ಲಾಂಟಾ;അറ്റ്‌ലാന്റാ നഗരം;แอตแลนตา;ཨ་ཊི་ལཱན་ཊཱ།;အတ္တလန္တာမြို့;ატლანტა;አትላንታ;アトランタ;亚特兰大;亞特蘭大;아틀란타;아틀랜타;애틀랜타;
city::4184530;Brookhaven;Brookhaven;Brookhaven Heights;Nort Atlanta;North Atlanta;brwk hawn jwrjya;brwkhafn;nartha etlanta;Норт Атланта;بروكهافن;بروک هاون، جورجیا;بروک ہیون، جارجیا;नर्थ एट्लान्टा;
city::4188985;Columbus;CSG;Columbus;Kolambus;Kolumbas;Kolumbus;Kulumbus;ge lun bu;klmbws jwrjya;kolambasa;kolleombeoseu;kolumbus;koronbasu;kwlmbs jarjya;kwlwmbws;qwlwmbws;Коламбус;Колумбус;Кълъмбъс;קולומבוס;كولومبوس;کلمبوس، جورجیا;کولمبس، جارجیا;कोलम्बस;コロンバス;哥伦布;콜럼버스;
@ -7950,7 +7953,7 @@ city::5258957;La Crosse;Gateway City;LSE;La Crosse;La Kros;La-Kross;Lac Rosse;La
city::5261457;Madison;Gorad Madysan;MSN;Madison;Madisonas;Madisonia;Madisons;Madisun;Mantison;Medison;Medisona;Mehdison;madisan;madison;madisoni;madyswn;maediseun;mai di xun;maidisana;mdysn wyskansyn;mdyswn;medisana;metican;Μάντισον;Горад Мадысан;Мадисон;Мадисън;Медисон;Медісон;Мэдисон;Մեդիսոն;מדיסון;ماديسون;مدیسن، ویسکانسین;میڈیسن;میڈیسون، وسکونسن;माडिसन्;मॅडिसन;मेडिसन;मैडिसन;மேடிசன்;მადისონი;マディソン;麦迪逊;매디슨;
city::5263045;Milwaukee;Gorad Miluoki;Juneautown;Kilbourntown;MKE;Mahn-a-wau-kee Seepe;Mahn-a-wauk-ee See-pe;Mahn-a-waukee Seepe;Mahn-a-waukie;Mahn-ah-wauk Seepe;Mahnawauk;Man-a-wau-kee;Man-a-wauk-ee;Man-na-wah-kie;Mana'wa;Manawaki;Manawaukee;Manayaukee;Maunahwauke;Mee-lee-waug-ee;Meliki;Melleoki;Melwarik;Meneawkee;Meolaki;Mil-wah-kie;Milgouoki;Milioke;Millewacki;Millicki;Milo-aki;Milouagui;Milouakik;Milowages;Miluoki;Miluokʻi;Milvauchia;Milvoki;Milvokio;Milvokis;Milwacky;Milwahkie;Milwalka;Milwalky;Milwarck;Milwarik;Milwaucki;Milwaukee;Milwaukie;Minewaki;Miniaki;Minnawack;Winnipesaukee;mi er wo ji;mil wxki;mil-woki;mila'oyaki;milavoki;miluokʼi;milvaki;milvakki;milvauki;miruu~oki;mlwaky;mylwaky;mylwaky wyskansyn;mylwwqy;Μιλγουόκι;Горад Мілуокі;Милвоки;Милуоки;Мілуокі;Միլուոքի;מילוואקי;מילווקי;ملواکی;ميلواكي;میلواکی، ویسکانسین;मिलवॉकी;मिल्वौकी;মিলওয়াকি;மில்வாக்கி;మిల్వాకీ;ಮಿಲ್ವಾಕೀ;มิลวอกี;მილუოკი;ミルウォーキー;密尔沃基;密爾沃基;밀워키;
city::5264870;North La Crosse;;
city::5265838;Oshkosh;Algoma;Brooklyn;OSH;Oshkosh;Oskosh;Sawdust City;ashkwsh wyskansyn;ausakosa;awshkwsh;awshkwsh wskwnsn;oshukoshu;Ошкош;אושקוש;أوشكوش;اشکوش، ویسکانسین;اوشکوش، وسکونسن;औशकोश;ოშკოში;オシュコシュ;
city::5265838;Oshkosh;Algoma;Athens;Brooklyn;OSH;Oshkosh;Oskosh;Sawdust City;ashkwsh wyskansyn;ausakosa;awshkwsh;awshkwsh wskwnsn;oshukoshu;Ошкош;אושקוש;أوشكوش;اشکوش، ویسکانسین;اوشکوش، وسکونسن;औशकोश;ოშკოში;オシュコシュ;
city::5268249;Racine;Kipikawi;Port Gilbert;RAC;Racine;Rasijn;Rasin;Rasinas;la xin;leosin;rashin;rasini;rasyn;rasyn wskwnsn;rysyn wyskansyn;Расийн;Расин;Расін;ראסין;راسين;راسین، وسکونسن;ریسین، ویسکانسین;რასინი;ラシーン;拉辛;러신;
city::5278052;Waukesha;Prairieville;UES;Uokesho;Uokisha;Vokesha;Vokisha;Waukesha;u~okisho;wakysha wyskansyn;wawkysha wskwnsn;wkysha;Вокеша;Вокиша;Уокешо;Уокиша;واوکیشا، وسکونسن;واکیشا، ویسکانسین;وكيشا;უოკეშო;ウォキショー;
city::5278420;West Allis;Vest Alis;alys ghrby wyskansyn;wyst alys;Вест Алис;آلیس غربی، ویسکانسین;ويست أليس;ویسٹ الیس، وسکونسن;უესტ-ალისი;

Can't render this file because it is too large.

View File

@ -1,847 +0,0 @@
ένα
έναν
ένας
αι
ακομα
ακομη
ακριβως
αληθεια
αληθινα
αλλα
αλλαχου
αλλες
αλλη
αλλην
αλλης
αλλιως
αλλιωτικα
αλλο
αλλοι
αλλοιως
αλλοιωτικα
αλλον
αλλος
αλλοτε
αλλου
αλλους
αλλων
αμα
αμεσα
αμεσως
αν
ανα
αναμεσα
αναμεταξυ
ανευ
αντι
αντιπερα
αντις
ανω
ανωτερω
αξαφνα
απ
απεναντι
απο
αποψε
από
αρα
αραγε
αργα
αργοτερο
αριστερα
αρκετα
αρχικα
ας
αυριο
αυτα
αυτες
αυτεσ
αυτη
αυτην
αυτης
αυτο
αυτοι
αυτον
αυτος
αυτοσ
αυτου
αυτους
αυτουσ
αυτων
αφοτου
αφου
αἱ
αἳ
αἵ
αὐτόσ
αὐτὸς
αὖ
α∆ιακοπα
βεβαια
βεβαιοτατα
γάρ
γα
γα^
γε
γι
για
γοῦν
γρηγορα
γυρω
γὰρ
δ'
δέ
δή
δαί
δαίσ
δαὶ
δαὶς
δε
δεν
δι
δι'
διά
δια
διὰ
δὲ
δὴ
δ’
εαν
εαυτο
εαυτον
εαυτου
εαυτους
εαυτων
εγκαιρα
εγκαιρως
εγω
ειθε
ειμαι
ειμαστε
ειναι
εις
εισαι
εισαστε
ειστε
ειτε
ειχα
ειχαμε
ειχαν
ειχατε
ειχε
ειχες
ει∆εμη
εκ
εκαστα
εκαστες
εκαστη
εκαστην
εκαστης
εκαστο
εκαστοι
εκαστον
εκαστος
εκαστου
εκαστους
εκαστων
εκει
εκεινα
εκεινες
εκεινεσ
εκεινη
εκεινην
εκεινης
εκεινο
εκεινοι
εκεινον
εκεινος
εκεινοσ
εκεινου
εκεινους
εκεινουσ
εκεινων
εκτος
εμας
εμεις
εμενα
εμπρος
εν
ενα
εναν
ενας
ενος
εντελως
εντος
εντωμεταξυ
ενω
ενός
εξ
εξαφνα
εξης
εξισου
εξω
επ
επί
επανω
επειτα
επει∆η
επι
επισης
επομενως
εσας
εσεις
εσενα
εστω
εσυ
ετερα
ετεραι
ετερας
ετερες
ετερη
ετερης
ετερο
ετεροι
ετερον
ετερος
ετερου
ετερους
ετερων
ετουτα
ετουτες
ετουτη
ετουτην
ετουτης
ετουτο
ετουτοι
ετουτον
ετουτος
ετουτου
ετουτους
ετουτων
ετσι
ευγε
ευθυς
ευτυχως
εφεξης
εχει
εχεις
εχετε
εχθες
εχομε
εχουμε
εχουν
εχτες
εχω
εως
εἰ
εἰμί
εἰμὶ
εἰς
εἰσ
εἴ
εἴμι
εἴτε
ε∆ω
η
ημασταν
ημαστε
ημουν
ησασταν
ησαστε
ησουν
ηταν
ητανε
ητοι
ηττον
η∆η
θα
ι
ιι
ιιι
ισαμε
ισια
ισως
ισωσ
ι∆ια
ι∆ιαν
ι∆ιας
ι∆ιες
ι∆ιο
ι∆ιοι
ι∆ιον
ι∆ιος
ι∆ιου
ι∆ιους
ι∆ιων
ι∆ιως
κ
καί
καίτοι
καθ
καθε
καθεμια
καθεμιας
καθενα
καθενας
καθενος
καθετι
καθολου
καθως
και
κακα
κακως
καλα
καλως
καμια
καμιαν
καμιας
καμποσα
καμποσες
καμποση
καμποσην
καμποσης
καμποσο
καμποσοι
καμποσον
καμποσος
καμποσου
καμποσους
καμποσων
κανεις
κανεν
κανενα
κανεναν
κανενας
κανενος
καποια
καποιαν
καποιας
καποιες
καποιο
καποιοι
καποιον
καποιος
καποιου
καποιους
καποιων
καποτε
καπου
καπως
κατ
κατά
κατα
κατι
κατιτι
κατοπιν
κατω
κατὰ
καὶ
κι
κιολας
κλπ
κοντα
κτλ
κυριως
κἀν
κἂν
λιγακι
λιγο
λιγωτερο
λογω
λοιπα
λοιπον
μέν
μέσα
μή
μήτε
μία
μα
μαζι
μακαρι
μακρυα
μαλιστα
μαλλον
μας
με
μεθ
μεθαυριο
μειον
μελει
μελλεται
μεμιας
μεν
μερικα
μερικες
μερικοι
μερικους
μερικων
μεσα
μετ
μετά
μετα
μεταξυ
μετὰ
μεχρι
μη
μην
μηπως
μητε
μη∆ε
μιά
μια
μιαν
μιας
μολις
μολονοτι
μοναχα
μονες
μονη
μονην
μονης
μονο
μονοι
μονομιας
μονος
μονου
μονους
μονων
μου
μπορει
μπορουν
μπραβο
μπρος
μἐν
μὲν
μὴ
μὴν
να
ναι
νωρις
ξανα
ξαφνικα
ο
οι
ολα
ολες
ολη
ολην
ολης
ολο
ολογυρα
ολοι
ολον
ολονεν
ολος
ολοτελα
ολου
ολους
ολων
ολως
ολως∆ιολου
ομως
ομωσ
οποια
οποιαν
οποιαν∆ηποτε
οποιας
οποιας∆ηποτε
οποια∆ηποτε
οποιες
οποιες∆ηποτε
οποιο
οποιοι
οποιον
οποιον∆ηποτε
οποιος
οποιος∆ηποτε
οποιου
οποιους
οποιους∆ηποτε
οποιου∆ηποτε
οποιο∆ηποτε
οποιων
οποιων∆ηποτε
οποι∆ηποτε
οποτε
οποτε∆ηποτε
οπου
οπου∆ηποτε
οπως
οπωσ
ορισμενα
ορισμενες
ορισμενων
ορισμενως
οσα
οσα∆ηποτε
οσες
οσες∆ηποτε
οση
οσην
οσην∆ηποτε
οσης
οσης∆ηποτε
οση∆ηποτε
οσο
οσοι
οσοι∆ηποτε
οσον
οσον∆ηποτε
οσος
οσος∆ηποτε
οσου
οσους
οσους∆ηποτε
οσου∆ηποτε
οσο∆ηποτε
οσων
οσων∆ηποτε
οταν
οτι
οτι∆ηποτε
οτου
ου
ουτε
ου∆ε
οχι
οἱ
οἳ
οἷς
οὐ
οὐδ
οὐδέ
οὐδείσ
οὐδεὶς
οὐδὲ
οὐδὲν
οὐκ
οὐχ
οὐχὶ
οὓς
οὔτε
οὕτω
οὕτως
οὕτωσ
οὖν
οὗ
οὗτος
οὗτοσ
παλι
παντοτε
παντου
παντως
παρ
παρά
παρα
παρὰ
περί
περα
περι
περιπου
περισσοτερο
περσι
περυσι
περὶ
πια
πιθανον
πιο
πισω
πλαι
πλεον
πλην
ποια
ποιαν
ποιας
ποιες
ποιεσ
ποιο
ποιοι
ποιον
ποιος
ποιοσ
ποιου
ποιους
ποιουσ
ποιων
πολυ
ποσες
ποση
ποσην
ποσης
ποσοι
ποσος
ποσους
ποτε
που
πουθε
πουθενα
ποῦ
πρεπει
πριν
προ
προκειμενου
προκειται
προπερσι
προς
προσ
προτου
προχθες
προχτες
πρωτυτερα
πρόσ
πρὸ
πρὸς
πως
πωσ
σαν
σας
σε
σεις
σημερα
σιγα
σου
στα
στη
στην
στης
στις
στο
στον
στου
στους
στων
συγχρονως
συν
συναμα
συνεπως
συνηθως
συχνα
συχνας
συχνες
συχνη
συχνην
συχνης
συχνο
συχνοι
συχνον
συχνος
συχνου
συχνους
συχνων
συχνως
σχε∆ον
σωστα
σόσ
σύ
σύν
σὸς
σὺ
σὺν
τά
τήν
τί
τίς
τίσ
τα
ταυτα
ταυτες
ταυτη
ταυτην
ταυτης
ταυτο,ταυτον
ταυτος
ταυτου
ταυτων
ταχα
ταχατε
ταῖς
τα∆ε
τε
τελικα
τελικως
τες
τετοια
τετοιαν
τετοιας
τετοιες
τετοιο
τετοιοι
τετοιον
τετοιος
τετοιου
τετοιους
τετοιων
τη
την
της
τησ
τι
τινα
τιποτα
τιποτε
τις
τισ
το
τοί
τοι
τοιοῦτος
τοιοῦτοσ
τον
τος
τοσα
τοσες
τοση
τοσην
τοσης
τοσο
τοσοι
τοσον
τοσος
τοσου
τοσους
τοσων
τοτε
του
τουλαχιστο
τουλαχιστον
τους
τουτα
τουτες
τουτη
τουτην
τουτης
τουτο
τουτοι
τουτοις
τουτον
τουτος
τουτου
τουτους
τουτων
τούσ
τοὺς
τοῖς
τοῦ
τυχον
των
τωρα
τό
τόν
τότε
τὰ
τὰς
τὴν
τὸ
τὸν
τῆς
τῆσ
τῇ
τῶν
τῷ
υπ
υπερ
υπο
υποψη
υποψιν
υπό
υστερα
φετος
χαμηλα
χθες
χτες
χωρις
χωριστα
ψηλα
ω
ωραια
ως
ωσ
ωσαν
ωσοτου
ωσπου
ωστε
ωστοσο
ωχ
ἀλλ'
ἀλλά
ἀλλὰ
ἀλλ’
ἀπ
ἀπό
ἀπὸ
ἀφ
ἂν
ἄλλος
ἄλλοσ
ἄν
ἄρα
ἅμα
ἐάν
ἐγώ
ἐγὼ
ἐκ
ἐμόσ
ἐμὸς
ἐν
ἐξ
ἐπί
ἐπεὶ
ἐπὶ
ἐστι
ἐφ
ἐὰν
ἑαυτοῦ
ἔτι
ἧς
ἵνα
ὃν
ὃς
ὅδε
ὅθεν
ὅπερ
ὅς
ὅσ
ὅστις
ὅστισ
ὅτε
ὅτι
ὑμόσ
ὑπ
ὑπέρ
ὑπό
ὑπὲρ
ὑπὸ
ὡς
ὡσ
ὥς
ὥστε
∆α
∆ε
∆εινα
∆εν
∆εξια
∆ηθεν
∆ηλα∆η
∆ι
∆ια
∆ιαρκως
∆ικα
∆ικο
∆ικοι
∆ικος
∆ικου
∆ικους
∆ιολου
∆ιπλα
∆ιχως

View File

@ -9,7 +9,6 @@ import org.apache.commons.io.IOUtils;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.stream.Collectors;
@ -18,7 +17,7 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions {
protected String readFromClasspath(final String filename) {
final StringWriter sw = new StringWriter();
try {
IOUtils.copy(getClass().getResourceAsStream(filename), sw, StandardCharsets.UTF_8);
IOUtils.copy(getClass().getResourceAsStream(filename), sw);
return sw.toString();
} catch (final IOException e) {
throw new RuntimeException("cannot load resource from classpath: " + filename);
@ -37,10 +36,6 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions {
return new FieldValueImpl(Type.URL, "url", s);
}
/**
 * Wraps a {@code double[]} in a {@link FieldValueImpl} of type {@code Type.DoubleArray}
 * whose field name is {@code "array"}.
 *
 * @param a the values to wrap
 * @return the field holding {@code a}
 */
protected Field array(final double[] a) {
return new FieldValueImpl(Type.DoubleArray, "array", a);
}
protected Field createFieldList(List<String> strings, String fieldName){
List<FieldValueImpl> fieldValueStream = strings.stream().map(s -> new FieldValueImpl(Type.String, fieldName, s)).collect(Collectors.toList());

View File

@ -2,15 +2,12 @@ package eu.dnetlib.pace.clustering;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.DedupConfig;
import org.junit.jupiter.api.*;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
public class ClusteringFunctionTest extends AbstractPaceTest {
@ -103,11 +100,6 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s);
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
params.put("len", 3);
params.put("max", 1);
System.out.println(sp.apply(conf, Lists.newArrayList(title("Framework for general-purpose deduplication"))));
}
@Test
@ -153,10 +145,6 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
System.out.println(s);
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
s = "niivue/niivue: 0.21.1";
System.out.println(s);
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
}
@Test
@ -199,51 +187,5 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
System.out.println("s5 = " + s5);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s5))));
final String s6 = "National and Kapodistrian University of Athens";
System.out.println("s6 = " + s6);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s6))));
final String s7 = "Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών";
System.out.println("s7 = " + s7);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s7))));
}
/**
 * Smoke test for {@link PersonClustering}: prints the result of applying it to two author names.
 * Nothing is asserted; the output is meant to be inspected manually.
 */
@Test
public void testPersonClustering(){
    final ClusteringFunction personClustering = new PersonClustering(params);

    // Name with hyphenated surname and given name plus a trailing initial.
    final String hyphenatedName = "Abd-Alla, Abo-el-nour N.";
    System.out.println("s = " + hyphenatedName);
    System.out.println(personClustering.apply(conf, Lists.newArrayList(title(hyphenatedName))));

    // Plain "surname, name" form.
    final String plainName = "Manghi, Paolo";
    System.out.println("s1 = " + plainName);
    System.out.println(personClustering.apply(conf, Lists.newArrayList(title(plainName))));
}
/**
 * Smoke test for {@link PersonHash}: prints the result of applying it to the full and the
 * abbreviated form of the same author name so the two outputs can be compared by eye.
 * Nothing is asserted.
 */
@Test
public void testPersonHash(){
    final ClusteringFunction cf = new PersonHash(params);

    final String s = "Manghi, Paolo";
    System.out.println("s = " + s);
    System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));

    final String s1 = "Manghi, P.";
    // The label used to read "s = " here as well, which made the two inputs
    // indistinguishable in the output.
    System.out.println("s1 = " + s1);
    System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
}
/**
 * Smoke test for {@link LastNameFirstInitial}: prints the result of applying it to a name
 * written with an upper-case surname first. Nothing is asserted.
 */
@Test
public void testLastNameFirstInitial(){
    final ClusteringFunction lastNameFirstInitial = new LastNameFirstInitial(params);

    final String authorName = "LI Yonghong";
    System.out.println("s = " + authorName);
    System.out.println(lastNameFirstInitial.apply(conf, Lists.newArrayList(title(authorName))));
}
}
}

View File

@ -2,16 +2,13 @@ package eu.dnetlib.pace.comparators;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.tree.*;
import eu.dnetlib.pace.config.DedupConfig;
import org.junit.jupiter.api.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
@ -24,20 +21,15 @@ public class ComparatorTest extends AbstractPaceTest {
@BeforeAll
public void setup() {
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
}
@BeforeEach
public void beforeEachTest() {
params = new HashMap<>();
params.put("weight", "1.0");
params.put("surname_th", "0.99");
params.put("name_th", "0.95");
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
}
@Test
public void testCleanForSorting() {
NGramUtils utils = new NGramUtils();
@ -64,10 +56,7 @@ public class ComparatorTest extends AbstractPaceTest {
//particular cases
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf));
// failing because 'Allen' is a transliterated Greek stopword
// assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
assertEquals(-1.0, cityMatch.distance("Washington (United States)", "United States Military Academy", conf));
assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
}
@Test
@ -81,7 +70,7 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
assertEquals(2.0/3.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
assertEquals(0.5, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf));
assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf));
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
@ -115,7 +104,7 @@ public class ComparatorTest extends AbstractPaceTest {
public void stringContainsMatchTest(){
params.put("string", "openorgs");
params.put("aggregator", "XOR");
params.put("bool", "XOR");
params.put("caseSensitive", "false");
StringContainsMatch stringContainsMatch = new StringContainsMatch(params);
@ -123,7 +112,7 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(0.0, stringContainsMatch.distance("openorgs", "openorgs", conf));
params.put("string", "openorgs");
params.put("aggregator", "AND");
params.put("bool", "AND");
params.put("caseSensitive", "false");
stringContainsMatch = new StringContainsMatch(params);
@ -257,10 +246,6 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(0.25, result);
Field f = createFieldList(new ArrayList<>(), "authors");
result = authorsMatch.compare(f,f, conf);
System.out.println("result = " + result);
}
@Test
@ -282,30 +267,5 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(1.0, result);
}
/**
 * Smoke test for {@link DomainExactMatch}: compares two URLs that differ by the {@code www.}
 * prefix and a trailing slash, and prints the score. Nothing is asserted.
 */
@Test
public void domainExactMatch() {
    final Field withWww = url("http://www.flowrepository.org");
    final Field withoutWww = url("http://flowrepository.org/");

    final DomainExactMatch comparator = new DomainExactMatch(params);
    final double score = comparator.compare(withWww, withoutWww, conf);

    System.out.println("compare = " + score);
}
/**
 * Smoke test for {@link CosineSimilarity}: compares two fields holding the same vector
 * and prints the score. Nothing is asserted.
 */
@Test
public void cosineSimilarity() {
    final CosineSimilarity comparator = new CosineSimilarity(params);

    // array(...) is the inherited helper that builds a Type.DoubleArray field named "array".
    final Field left = array(new double[]{1,2,3});
    final Field right = array(new double[]{1,2,3});

    final double score = comparator.compare(left, right, conf);
    System.out.println("compare = " + score);
}
}

View File

@ -7,7 +7,6 @@ import eu.dnetlib.pace.clustering.ClusteringClass;
import eu.dnetlib.pace.clustering.ClusteringCombiner;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldValue;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.JsonListMatch;
import eu.dnetlib.pace.tree.support.AggType;
@ -21,7 +20,10 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.*;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@ -83,7 +85,7 @@ public class ConfigTest extends AbstractPaceTest {
}
@Test
public void asMapDocumentTest1() {
public void asMapDocumentTest() {
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
@ -101,19 +103,6 @@ public class ConfigTest extends AbstractPaceTest {
}
/**
 * Loads the author dedup configuration and the sample author record from the classpath,
 * maps the record to a {@link MapDocument} and prints the double array stored in its
 * "topics" field. Nothing is asserted.
 */
@Test
public void authorAsMapDocument() {
    final DedupConfig authorConf = DedupConfig.load(readFromClasspath("author.fdup.conf.json"));
    final String authorJson = readFromClasspath("author.json");

    final MapDocument document = MapDocumentUtil.asMapDocumentWithJPath(authorConf, authorJson);

    final FieldValue topics = (FieldValue) document.getFieldMap().get("topics");
    final String renderedTopics = Arrays.toString(topics.doubleArrayValue());
    System.out.println("mapDocument = " + renderedTopics);
}
@Test
public void testJPath() {
final String json = readFromClasspath("organization.json");

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace.util;
import eu.dnetlib.pace.model.Person;
import jdk.nashorn.internal.ir.annotations.Ignore;
import org.junit.jupiter.api.*;
import java.util.HashMap;
@ -17,6 +18,7 @@ public class UtilTest {
}
@Test
@Ignore
public void paceResolverTest() {
PaceResolver paceResolver = new PaceResolver();
paceResolver.getComparator("keywordMatch", params);
@ -28,11 +30,6 @@ public class UtilTest {
assertEquals("kennedy", p.getSurnameString());
assertEquals("j f", p.getNameString());
p = new Person("Guan-Hua Du", false);
System.out.println("surname = " + p.getSurnameString());
System.out.println("name = " + p.getNameString());
}
}

View File

@ -1,134 +0,0 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "author",
"subEntityType": "author",
"subEntityValue": "author",
"orderField": "fullname",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "lnfi", "fields" : [ "fullname" ], "params" : {} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "orcid",
"comparator": "exactMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "orcids",
"ignoreUndefined": "true"
},
"orcids": {
"fields": [
{
"field": "orcids",
"comparator": "stringListMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 3.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "coauthors",
"undefined": "coauthors",
"ignoreUndefined": "true"
},
"coauthors": {
"fields": [
{
"field": "coauthors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "topicsMatch",
"negative": "NO_MATCH",
"undefined": "topicsMatch",
"ignoreUndefined": "true"
},
"topicsMatch": {
"fields": [
{
"field": "topics",
"comparator": "cosineSimilarity",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
}
},
"model": [
{
"name": "topics",
"type": "DoubleArray",
"path": "$.topics"
},
{
"name": "fullname",
"type": "String",
"path": "$.fullname"
},
{
"name": "orcid",
"type": "String",
"path": "$.orcid"
},
{
"name": "coauthors",
"type": "List",
"path": "$.coAuthors[*].fullname"
},
{
"name": "orcids",
"type": "List",
"path": "$.coAuthors[*].orcid"
}
],
"blacklists": {},
"synonyms": {}
}
}

View File

@ -1 +0,0 @@
{"fullname":"Zaragoza, Maria Cleofé","firstname":"Maria Cleofé","lastname":"Zaragoza","coAuthors":[{"fullname":"Cambras, Trinitat","lastname":"Cambras","firstname":"Trinitat","orcid":"0000-0002-9009-4690"},{"fullname":"Castro-Marrero, Jesús","lastname":"Castro-Marrero","firstname":"Jesús","orcid":""},{"fullname":"Díez-Noguera, Antoni","lastname":"Díez-Noguera","firstname":"Antoni","orcid":""},{"fullname":"Alegre, José","lastname":"Alegre","firstname":"José","orcid":"0000-0002-7582-7585"}],"topics":[0.9522090839562252,0.04779091604377485],"orcid":"0000-0002-9797-0219","id":"author::1a10826c83c7f9f0dcebe7df05e37a2a","pubId":"50|pmid________::db7fd19db5a620eafad40cfb97f9690d"}

30
pom.xml
View File

@ -5,7 +5,7 @@
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.9</version>
<packaging>pom</packaging>
@ -22,7 +22,7 @@
<scm>
<developerConnection>scm:git:https://code-repo.d4science.org/D-Net/dnet-dedup.git</developerConnection>
<tag>dnet-dedup-4.0.3</tag>
<tag>dnet-dedup-4.1.9</tag>
</scm>
<modules>
@ -144,7 +144,14 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.0</version>
<version>2.19.1</version>
<dependencies>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>${junit-jupiter.version}</version>
</dependency>
</dependencies>
<configuration>
<redirectTestOutputToFile>false</redirectTestOutputToFile>
</configuration>
@ -254,7 +261,7 @@
<oozie.use.system.libpath>true</oozie.use.system.libpath>
<properties.maven.plugin.version>2.0.1</properties.maven.plugin.version>
<junit-jupiter.version>5.6.1</junit-jupiter.version>
<maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>../dhp-build/dhp-build-assembly-resources/target/dhp-build-assembly-resources-${project.version}.jar</maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>
<maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>../dhp-build/dhp-build-assembly-resources/target/dhp-build-assembly-resources-4.1.8-SNAPSHOT.jar</maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>
</properties>
@ -404,11 +411,20 @@
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>70.1</version>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>3.3.3</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-junit-jupiter</artifactId>
<version>3.3.3</version>
<scope>test</scope>
</dependency>
</dependencies>
</dependencyManagement>