Compare commits

...

14 Commits

Author SHA1 Message Date
Claudio Atzori f04f9dd6c1 Merge pull request 'Precompile blacklists patterns before evaluating clustering criteria' (#1) from optimized-clustering into master
Reviewed-on: #1
2023-06-19 12:43:49 +02:00
Giambattista Bloisi d2d173773e Precompile blacklists patterns before evaluating clustering criteria
Enable Junit 5 tests in maven builds
Make path comparisons platform-independent
Read String resource files assuming they are encoded in UTF-8
Fix a few test conditions
2023-06-16 09:41:11 +02:00
Michele De Bonis 7e2e7dcdcd implementation of the support for authors deduplication: cosinesimilarity comparator and double array json parser 2023-04-17 11:06:27 +02:00
Michele De Bonis b5584f084a minor change in the author match which now can compute count and percentage 2023-04-04 17:10:37 +02:00
Michele De Bonis b4b6a61576 configuration updated for testing 2023-02-02 12:05:06 +01:00
Michele De Bonis 66472ce408 implementation of author dedup configuration and lnfi clustering function 2023-01-31 11:53:10 +01:00
Michele De Bonis 00466512ea implementation of the new software configuration 2022-11-22 17:48:34 +01:00
Michele De Bonis 42cff050e7 minor changes 2022-11-21 14:35:46 +01:00
miconis 5aebe63f22 implementation of new configuration for datasource deduplication 2022-04-26 11:30:40 +02:00
miconis fb2eed9f0e implementation of the java version of the graph processor 2022-04-19 15:29:29 +02:00
miconis 6c47fb0e67 implementation of comparators and clustering function for the author deduplication 2022-04-19 10:18:09 +02:00
miconis 9618e889bd test implementation for the new fdup version 2022-04-13 09:48:56 +02:00
miconis 661818da9e bug fix in test 2022-03-21 14:43:55 +01:00
miconis 66b64937ed [maven-release-plugin] prepare for next development iteration 2022-03-15 15:06:18 +01:00
67 changed files with 8101 additions and 2484 deletions
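The headline optimization named in the merged PR (precompiling blacklist patterns) does not appear in the file diffs shown below; two file diffs are suppressed as too long or too large, and the change presumably lives in one of them. A minimal sketch of the idea, with hypothetical class and method names that are not code from this changeset:

import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

// Hypothetical sketch: compile each blacklist regex once at configuration load
// time, instead of re-compiling it for every candidate pair evaluated by the
// clustering criteria.
public class PrecompiledBlacklist {

    private final List<Pattern> patterns;

    public PrecompiledBlacklist(List<String> regexes) {
        this.patterns = regexes.stream()
                .map(Pattern::compile)              // compiled once, up front
                .collect(Collectors.toList());
    }

    public boolean matches(String fieldValue) {
        // hot path: pattern matching only, no Pattern.compile() calls
        return patterns.stream().anyMatch(p -> p.matcher(fieldValue).matches());
    }
}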

.gitignore vendored
View File

@@ -19,3 +19,5 @@
/build
spark-warehouse
/dhp-workflows/dhp-graph-mapper/job-override.properties
test.properties

View File

@@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-build</artifactId>
<version>4.1.12</version>
<version>4.1.13-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-assembly-resources</artifactId>

View File

@@ -6,10 +6,11 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-build</artifactId>
<version>4.1.12</version>
<version>4.1.13-SNAPSHOT</version>
</parent>
<artifactId>dhp-build-properties-maven-plugin</artifactId>
<version>4.1.13-SNAPSHOT</version>
<packaging>maven-plugin</packaging>
<description>This module is a maven plugin implementing custom properties substitutions in the build lifecycle</description>
@@ -19,16 +20,19 @@
<groupId>org.apache.maven</groupId>
<artifactId>maven-plugin-api</artifactId>
<version>3.6.3</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.maven</groupId>
<artifactId>maven-project</artifactId>
<version>2.2.1</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.maven</groupId>
<artifactId>maven-artifact</artifactId>
<version>2.2.1</version>
<scope>provided</scope>
</dependency>
<dependency>
@@ -100,6 +104,29 @@
</configuration>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-plugin-plugin</artifactId>
<version>3.2</version>
<configuration>
<skipErrorNoDescriptorsFound>true</skipErrorNoDescriptorsFound>
</configuration>
<executions>
<execution>
<id>mojo-descriptor</id>
<phase>process-classes</phase>
<goals>
<goal>descriptor</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>

View File

@@ -8,6 +8,8 @@ import static org.junit.jupiter.api.Assertions.assertNull;
import org.junit.jupiter.api.*;
import java.nio.file.Paths;
/** @author mhorst, claudio.atzori */
public class GenerateOoziePropertiesMojoTest {
@@ -66,7 +68,7 @@ public class GenerateOoziePropertiesMojoTest {
clearSystemProperties();
// given
String workflowSourceDir = "eu/dnetlib/dhp/";
String workflowSourceDir = Paths.get("eu/dnetlib/dhp/").toString();
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
// execute
@@ -81,14 +83,14 @@ public class GenerateOoziePropertiesMojoTest {
clearSystemProperties();
// given
String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers";
String workflowSourceDir = Paths.get("eu/dnetlib/dhp/wf/transformers").toString();
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
// execute
mojo.execute();
// assert
assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
assertEquals(Paths.get("wf/transformers").toString(), System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
}
@Test
@@ -96,13 +98,13 @@ public class GenerateOoziePropertiesMojoTest {
clearSystemProperties();
// given
String workflowSourceDir = "wf/transformers";
String workflowSourceDir = Paths.get("wf/transformers").toString();
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
// execute
mojo.execute();
// assert
assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
assertEquals(Paths.get("wf/transformers").toString(), System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
}
}
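The Paths.get(...).toString() round-trips above normalize the expected values to the platform's path separator, which is what makes these assertions pass on Windows as well as Unix. A self-contained illustration (the demo class is hypothetical, not part of this changeset):

import java.nio.file.Paths;

public class SeparatorDemo {
    public static void main(String[] args) {
        // Prints "wf/transformers" on Linux/macOS and "wf\transformers" on
        // Windows, so expected and actual values agree regardless of OS.
        System.out.println(Paths.get("wf/transformers").toString());
    }
}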

View File

@@ -1,2 +0,0 @@
# Tue Mar 15 14:58:05 CET 2022
projectPropertyKey=projectPropertyValue

View File

@@ -5,7 +5,7 @@
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-code-style</artifactId>
<version>4.1.12</version>
<version>4.1.13-SNAPSHOT</version>
<packaging>jar</packaging>

View File

@@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId>
<version>4.1.12</version>
<version>4.1.13-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>dhp-build</artifactId>

View File

@@ -1,6 +1,6 @@
entitiesPath = /tmp/publications_test_dump
#entitiesPath = /tmp/prod_provision/graph/02_graph_cleaned/publication
workingPath = /user/michele.debonis/new_dedup_test/workingdirtree
dedupConfPath = /user/michele.debonis/new_dedup_test/pubs.tree.conf.json
numPartitions = 8000
useTree = true
useTree = true
entitiesPath = /user/michele.debonis/lda_experiments/authors_pubmed
workingPath = /user/michele.debonis/authors_dedup/gt2_dedup
numPartitions = 1000
dedupConfPath = /user/michele.debonis/lda_experiments/authors.fdup.gt2.conf.json
groundTruthFieldJPath = $.orcid

View File

@@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId>
<version>4.1.12</version>
<version>4.1.13-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -1,7 +1,7 @@
package eu.dnetlib;
import com.google.common.hash.Hashing;
import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.graph.JavaGraphProcessor;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessorForTesting;
@@ -19,7 +19,6 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.graphx.Edge;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
@@ -58,14 +57,13 @@ public class Deduper implements Serializable {
.reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize));
}
public static Iterator<Tuple2<String, String>> ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf) {
return cc
.getDocs()
public static Iterator<Tuple2<String, String>> ccToMergeRel(Tuple2<String, List<String>> cc, DedupConfig dedupConf) {
return cc._2()
.stream()
.flatMap(
id -> {
List<Tuple2<String, String>> tmp = new ArrayList<>();
tmp.add(new Tuple2<>(cc.getCcId(), id));
tmp.add(new Tuple2<>(cc._1(), id));
return tmp.stream();
})
.iterator();
@@ -138,21 +136,19 @@
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
.mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
final RDD<Edge<String>> edgeRdd = spark
final JavaRDD<Edge<String>> edgeRdd = spark
.read()
.load(simRelsPath)
.as(Encoders.bean(Relation.class))
.javaRDD()
.map(Relation::toEdgeRdd)
.rdd();
.map(Relation::toEdgeRdd);
JavaRDD<ConnectedComponent> ccs = GraphProcessor
.findCCs(vertexes.rdd(), edgeRdd, maxIterations)
.toJavaRDD();
JavaPairRDD<String, List<String>> ccs = JavaGraphProcessor
.findCCs(vertexes, edgeRdd, dedupConf.getWf().getMaxIterations());
JavaRDD<Relation> mergeRel = ccs
.filter(k -> k.getDocs().size() > 1)
.flatMap(cc -> ccToMergeRel(cc, dedupConf))
.filter(cc -> cc._2().size() > 1)
.flatMap(cc -> Deduper.ccToMergeRel(cc, dedupConf))
.map(it -> new Relation(it._1(), it._2(), "mergeRel"));
final Dataset<Relation> mergeRels = spark
@@ -163,7 +159,7 @@
mergeRels.write().mode(SaveMode.Overwrite).parquet(mergeRelsPath);
}
public static void createDedupEntity(DedupConfig dedupConf, String mergeRelsPath, String entitiesPath, SparkSession spark, String dedupEntityPath){
public static void createDedupEntity(DedupConfig dedupConf, String simRelsPath, String mergeRelsPath, String entitiesPath, SparkSession spark, String dedupEntityPath){
JavaPairRDD<String, String> entities = spark
.read()
@@ -174,7 +170,15 @@
.toJavaRDD()
.mapToPair(t -> t);
// <source, target>: source is the dedup_id, target is the id of the mergedIn
// <source_raw_id, relation(source, target)>
JavaPairRDD<String, Relation> simRels = spark
.read()
.load(simRelsPath)
.as(Encoders.bean(Relation.class))
.toJavaRDD()
.mapToPair(r-> new Tuple2<>(r.getSource(), r));
// <raw_id, relation(dedup_id, raw_id)>
JavaPairRDD<String, Relation> mergeRels = spark
.read()
.load(mergeRelsPath)
@@ -187,7 +191,22 @@
.groupByKey()
.map(t-> entityMerger(t._1(), t._2().iterator()));
dedupEntities.saveAsTextFile(dedupEntityPath);
JavaPairRDD<String, Iterable<Relation>> simRelsWithDedupId = simRels
.join(mergeRels)
.mapToPair(x -> new Tuple2<>(x._2()._2().getSource(), x._2()._1()))
.groupByKey();
JavaRDD<ConnectedComponent> groupEntity = mergeRels.join(entities)
.mapToPair(t -> new Tuple2<>(t._2()._1().getSource(), t._2()._2()))
.groupByKey()
.join(simRelsWithDedupId)
.map(x -> new ConnectedComponent(
x._1(),
x._2()._1(),
x._2()._2())
);
groupEntity.saveAsTextFile(dedupEntityPath);
}
}

View File

@@ -0,0 +1,56 @@
package eu.dnetlib.graph;
import com.clearspring.analytics.util.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ConnectedComponent;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.graphx.*;
import org.apache.spark.rdd.RDD;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;
import java.util.List;
public class JavaGraphProcessor {
//<ccId, list(json)>
public static JavaPairRDD<String, List<String>> findCCs(JavaPairRDD<Object, String> vertexes, JavaRDD<Edge<String>> edges, int maxIterations) {
ClassTag<String> stringTag = ClassTag$.MODULE$.apply(String.class);
Graph<String, String> graph =
Graph.apply(
vertexes.rdd(),
edges.rdd(),
"",
StorageLevel.MEMORY_ONLY(),
StorageLevel.MEMORY_ONLY(),
stringTag,
stringTag
);
GraphOps<String, String> graphOps = new GraphOps<>(graph, stringTag, stringTag);
JavaRDD<Tuple2<Object, Object>> cc = graphOps.connectedComponents(maxIterations).vertices().toJavaRDD();
JavaPairRDD<Object, String> joinResult = vertexes
.leftOuterJoin(cc.mapToPair(x -> x))
.mapToPair(x -> {
if (!x._2()._2().isPresent()) {
return new Tuple2<>(x._1(), x._2()._1());
} else {
return new Tuple2<>(x._2()._2(), x._2()._1());
}
});
return joinResult
.groupByKey()
.map(x -> Lists.newArrayList(x._2()))
.zipWithUniqueId()
.mapToPair(x -> new Tuple2<>("dedup______::" + x._2().toString(), x._1()));
}
}
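A hypothetical usage sketch for the new helper; the toy ids, JSON payloads, and demo class below are assumptions for illustration, not code from this changeset:

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.graphx.Edge;
import org.apache.spark.sql.SparkSession;

import scala.Tuple2;

public class FindCCsDemo {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").appName("ccs-demo").getOrCreate();
        JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

        // Toy graph: vertices 1 and 2 share a similarity edge, vertex 3 is isolated.
        JavaPairRDD<Object, String> vertexes = sc.parallelizePairs(Arrays.asList(
                new Tuple2<>((Object) 1L, "{\"id\":\"a\"}"),
                new Tuple2<>((Object) 2L, "{\"id\":\"b\"}"),
                new Tuple2<>((Object) 3L, "{\"id\":\"c\"}")));
        JavaRDD<Edge<String>> edges = sc.parallelize(
                Collections.singletonList(new Edge<>(1L, 2L, "simRel")));

        // Expect two components, each keyed "dedup______::<uniqueId>": one with
        // the documents of vertices 1 and 2, one with vertex 3 alone.
        JavaPairRDD<String, List<String>> ccs = JavaGraphProcessor.findCCs(vertexes, edges, 20);
        ccs.collect().forEach(cc -> System.out.println(cc._1() + " -> " + cc._2()));

        spark.stop();
    }
}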

View File

@@ -19,6 +19,7 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.stream.Collectors;
public abstract class AbstractSparkJob implements Serializable {
@@ -59,7 +60,7 @@
Path path=new Path(filePath);
FileSystem fs = FileSystem.get(new Configuration());
BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(path)));
BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(path), StandardCharsets.UTF_8));
try {
return String.join("", br.lines().collect(Collectors.toList()));
} finally {

View File

@@ -1,20 +1,36 @@
package eu.dnetlib.jobs;
import eu.dnetlib.Deduper;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.MapDocumentUtil;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ArgumentApplicationParser;
import eu.dnetlib.support.Block;
import eu.dnetlib.support.ConnectedComponent;
import eu.dnetlib.support.Relation;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.io.IOException;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
public class SparkComputeStatistics extends AbstractSparkJob {
@@ -42,18 +58,42 @@
@Override
public void run() throws IOException {
//https://towardsdatascience.com/7-evaluation-metrics-for-clustering-algorithms-bdc537ff54d2#:~:text=There%20are%20two%20types%20of,to%20all%20unsupervised%20learning%20results)
// read oozie parameters
final String entitiesPath = parser.get("entitiesPath");
final String workingPath = parser.get("workingPath");
final String dedupConfPath = parser.get("dedupConfPath");
final String groundTruthFieldJPath = parser.get("groundTruthFieldJPath");
final int numPartitions = Optional
.ofNullable(parser.get("numPartitions"))
.map(Integer::valueOf)
.orElse(NUM_PARTITIONS);
log.info("entitiesPath: '{}'", entitiesPath);
log.info("workingPath: '{}'", workingPath);
log.info("numPartitions: '{}'", numPartitions);
log.info("entitiesPath: '{}'", entitiesPath);
log.info("workingPath: '{}'", workingPath);
log.info("numPartitions: '{}'", numPartitions);
log.info("dedupConfPath: '{}'", dedupConfPath);
log.info("groundTruthFieldJPath: '{}'", groundTruthFieldJPath);
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
DedupConfig dedupConfig = loadDedupConfig(dedupConfPath);
JavaPairRDD<String, MapDocument> mapDocuments = sc
.textFile(entitiesPath)
.repartition(numPartitions)
.mapToPair(
(PairFunction<String, String, MapDocument>) s -> {
MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, s);
//put in the map the groundTruthField used to compute statistics
d.getFieldMap().put("groundTruth", new FieldValueImpl(Type.String, "groundTruth", MapDocumentUtil.getJPathString(groundTruthFieldJPath, s)));
return new Tuple2<>(d.getIdentifier(), d);
});
JavaRDD<String> entities = mapDocuments.map(d -> d._2().getFieldMap().get("groundTruth").stringValue());
// create blocks
JavaRDD<List<String>> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConfig)
.map(b -> b._2().getDocuments().stream().map(d -> d.getFieldMap().get("groundTruth").stringValue()).collect(Collectors.toList()));
// <source, target>: source is the dedup_id, target is the id of the mergedIn
JavaRDD<Relation> mergerels = spark
@@ -68,15 +108,38 @@
.as(Encoders.bean(Relation.class))
.toJavaRDD();
JavaRDD<List<String>> groups = sc.textFile(workingPath + "/groupentities")
.map(e -> new ObjectMapper().readValue(e, ConnectedComponent.class))
.map(e -> e.getDocs().stream().map(d -> MapDocumentUtil.getJPathString(groundTruthFieldJPath, d)).collect(Collectors.toList()));
long entities_number = entities.count();
long blocks_number = blocks.count();
double blocks_randIndex = randIndex(blocks);
long simrels_number = simrels.count();
long mergerels_number = mergerels.count();
long connected_components = mergerels.groupBy(Relation::getSource).count();
double groups_randIndex = randIndex(groups);
long groups_number = groups.count();
long groundtruth_number = entities.filter(e -> !e.isEmpty()).count();
long correct_groups = groups.filter(x -> x.stream().distinct().count()==1).count();
long wrong_groups = groups_number - correct_groups;
writeStatsFileToHDFS(simrels_number, mergerels_number, connected_components, workingPath + "/stats_file");
String print =
"Entities : " + entities_number + "\n" +
"Ground Truth : " + groundtruth_number + "\n" +
"Blocks : " + blocks_number + "\n" +
"Blocks RI : " + blocks_randIndex + "\n" +
"SimRels : " + simrels_number + "\n" +
"MergeRels : " + mergerels_number + "\n" +
"Groups : " + groups_number + " (correct: " + correct_groups + ", wrong: " + wrong_groups + ")\n" +
"Groups RI : " + groups_randIndex;
System.out.println(print);
writeStatsFileToHDFS(groundtruth_number, entities_number, blocks_randIndex, groups_randIndex, blocks_number, simrels_number, mergerels_number, groups_number, workingPath + "/stats_file.txt");
}
public static void writeStatsFileToHDFS(long simrels_number, long mergerels_number, long connected_components, String filePath) throws IOException {
public static void writeStatsFileToHDFS(long groundtruth_number, long entities_number, double blocks_randIndex, double groups_randIndex, long blocks_number, long simrels_number, long mergerels_number, long groups_number, String filePath) throws IOException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
@@ -93,9 +156,14 @@
}
String print =
"Similarity Relations : " + simrels_number + "\n" +
"Merge Relations : " + mergerels_number + "\n" +
"Connected Components : " + connected_components;
"Entities : " + entities_number + "\n" +
"Ground Truth : " + groundtruth_number + "\n" +
"Blocks : " + blocks_number + "\n" +
"Blocks RI : " + blocks_randIndex + "\n" +
"SimRels : " + simrels_number + "\n" +
"MergeRels : " + mergerels_number + "\n" +
"Groups : " + groups_number + "\n" +
"Groups RI : " + groups_randIndex;
// Create file to write
FSDataOutputStream out = fs.create(outFile);
@@ -109,5 +177,31 @@
e.printStackTrace();
}
}
//TODO find another measure that takes into account all the elements outside of the group too
//RandIndex = number of pairwise correct predictions/total number of possible pairs (in the same cluster) -> bounded between 0 and 1
public double randIndex(JavaRDD<List<String>> clusters) {
Tuple2<Integer, Integer> reduce = clusters.map(c -> {
int num = 0;
for (String id : c.stream().distinct().filter(s -> !s.isEmpty()).collect(Collectors.toList())) {
int n = (int) c.stream().filter(i -> i.equals(id)).count();
num += binomialCoefficient(n);
}
int den = binomialCoefficient(c.size());
return new Tuple2<>(num, den);
})
.reduce((a, b) -> new Tuple2<>(a._1() + b._1(), a._2() + b._2()));
return (double)reduce._1()/ reduce._2();
}
private static int binomialCoefficient(int n)
{
return n*(n-1)/2;
}
//V-measure = harmonic mean of homogeneity and completeness, homogeneity = each cluster contains only members of a single class, completeness = all members of a given class are assigned to the same cluster
}
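As implemented above, the index is pooled across all clusters before the final division:

RI = [sum over clusters c of sum over labels l of C(n_l, 2)] / [sum over clusters c of C(|c|, 2)], with C(n, 2) = n(n-1)/2

Worked example: a block whose ground-truth labels are [x, x, x, y] contributes C(3, 2) = 3 correctly clustered pairs out of C(4, 2) = 6 possible ones; were it the only block, the index would be 3/6 = 0.5.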

View File

@@ -7,6 +7,7 @@ import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ArgumentApplicationParser;
import eu.dnetlib.support.ConnectedComponent;
import eu.dnetlib.support.Relation;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
@@ -16,29 +17,32 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import scala.Tuple3;
import java.io.IOException;
import java.util.Optional;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
public class SparkCreateDedupEntity extends AbstractSparkJob {
public class SparkCreateGroupEntity extends AbstractSparkJob {
private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.jobs.SparkCreateDedupEntity.class);
private static final Logger log = LoggerFactory.getLogger(SparkCreateGroupEntity.class);
public SparkCreateDedupEntity(ArgumentApplicationParser parser, SparkSession spark) {
public SparkCreateGroupEntity(ArgumentApplicationParser parser, SparkSession spark) {
super(parser, spark);
}
public static void main(String[] args) throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
Utility.readResource("/jobs/parameters/createDedupEntity_parameters.json", SparkCreateDedupEntity.class)
Utility.readResource("/jobs/parameters/createGroupEntity_parameters.json", SparkCreateGroupEntity.class)
);
parser.parseArgument(args);
SparkConf conf = new SparkConf();
new SparkCreateDedupEntity(
new SparkCreateGroupEntity(
parser,
getSparkSession(conf)
).run();
@@ -63,6 +67,7 @@
DedupConfig dedupConf = DedupConfig.load(readFileFromHDFS(dedupConfPath));
// <raw_id, json>
JavaPairRDD<String, String> entities = spark
.read()
.textFile(entitiesPath)
@@ -72,7 +77,15 @@
.toJavaRDD()
.mapToPair(t -> t);
// <source, target>: source is the dedup_id, target is the id of the mergedIn
// <source_raw_id, relation(source, target)>
JavaPairRDD<String, Relation> simRels = spark
.read()
.load(workingPath + "/simrels")
.as(Encoders.bean(Relation.class))
.toJavaRDD()
.mapToPair(r-> new Tuple2<>(r.getSource(), r));
// <raw_id, relation(dedup_id, raw_id)>
JavaPairRDD<String, Relation> mergeRels = spark
.read()
.load(workingPath + "/mergerels")
@@ -80,12 +93,23 @@
.toJavaRDD()
.mapToPair(r -> new Tuple2<>(r.getTarget(), r));
JavaRDD<ConnectedComponent> dedupEntities = mergeRels.join(entities)
// <dedup_id, simrel>
JavaPairRDD<String, Iterable<Relation>> simRelsWithDedupId = simRels
.join(mergeRels)
.mapToPair(x -> new Tuple2<>(x._2()._2().getSource(), x._2()._1()))
.groupByKey();
JavaRDD<ConnectedComponent> groupEntity = mergeRels.join(entities)
.mapToPair(t -> new Tuple2<>(t._2()._1().getSource(), t._2()._2()))
.groupByKey()
.map(t-> Deduper.entityMerger(t._1(), t._2().iterator()));
.join(simRelsWithDedupId)
.map(x -> new ConnectedComponent(
x._1(),
x._2()._1(),
x._2()._2())
);
dedupEntities.saveAsTextFile(workingPath + "dedupentity");
groupEntity.saveAsTextFile(workingPath + "/groupentities", GzipCodec.class);
}
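Net effect of the two joins above: mergerels (keyed by their target, i.e. the raw id) are joined with the entity JSONs to group the documents under each dedup_id, while simrels (keyed by their source raw id) pick up the same dedup_id by routing through mergerels; the final join on dedup_id reassembles each group as ConnectedComponent(ccId, docs, simrels) and writes it gzip-compressed.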

View File

@@ -1,7 +1,7 @@
package eu.dnetlib.jobs;
import eu.dnetlib.Deduper;
import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.graph.JavaGraphProcessor;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import eu.dnetlib.pace.utils.Utility;
@@ -24,6 +24,7 @@
import scala.Tuple2;
import java.io.IOException;
import java.util.List;
import java.util.Optional;
import static eu.dnetlib.Deduper.hash;
@@ -78,20 +79,18 @@
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
.mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
final RDD<Edge<String>> edgeRdd = spark
final JavaRDD<Edge<String>> edgeRdd = spark
.read()
.load(workingPath + "/simrels")
.as(Encoders.bean(Relation.class))
.javaRDD()
.map(Relation::toEdgeRdd)
.rdd();
.map(Relation::toEdgeRdd);
JavaRDD<ConnectedComponent> ccs = GraphProcessor
.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations())
.toJavaRDD();
JavaPairRDD<String, List<String>> ccs = JavaGraphProcessor
.findCCs(vertexes, edgeRdd, dedupConf.getWf().getMaxIterations());
JavaRDD<Relation> mergeRel = ccs
.filter(k -> k.getDocs().size() > 1)
.filter(cc -> cc._2().size() > 1)
.flatMap(cc -> Deduper.ccToMergeRel(cc, dedupConf))
.map(it -> new Relation(it._1(), it._2(), "mergeRel"));

View File

@@ -14,6 +14,7 @@ import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

View File

@@ -1,10 +1,7 @@
package eu.dnetlib.support;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
@@ -12,6 +9,7 @@
import com.google.common.collect.Lists;
import eu.dnetlib.pace.model.MapDocument;
import org.codehaus.jackson.annotate.JsonIgnore;
public class Block implements Serializable {
@@ -23,6 +21,11 @@
super();
}
public Block(String key, List<MapDocument> documents) {
this.key = key;
this.documents = documents;
}
public Block(String key, Iterable<MapDocument> documents) {
this.key = key;
this.documents = Lists.newArrayList(documents);

View File

@@ -5,54 +5,35 @@ import java.io.Serializable;
import java.util.HashSet;
import java.util.Set;
import eu.dnetlib.pace.utils.Utility;
import org.apache.commons.lang.StringUtils;
import org.codehaus.jackson.annotate.JsonIgnore;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
public class ConnectedComponent implements Serializable {
private HashSet<String> docs;
private String ccId;
private HashSet<Relation> simrels;
public ConnectedComponent() {
}
public ConnectedComponent(String ccId, Set<String> docs, Set<Relation> simrels) {
this.docs = new HashSet<>(docs);
this.ccId = ccId;
this.simrels = new HashSet<>(simrels);
}
public ConnectedComponent(Set<String> docs) {
this.docs = new HashSet<>(docs);
createID();
//initialization of id and relations missing
}
public String createID() {
if (docs.size() > 1) {
final String s = getMin();
ccId = "dedup::" + Utility.md5(s);
return ccId;
} else {
return docs.iterator().next();
}
}
@JsonIgnore
public String getMin() {
final StringBuilder min = new StringBuilder();
docs
.forEach(
i -> {
if (StringUtils.isBlank(min.toString())) {
min.append(i);
} else {
if (min.toString().compareTo(i) > 0) {
min.setLength(0);
min.append(i);
}
}
});
return min.toString();
public ConnectedComponent(String ccId, Iterable<String> docs, Iterable<Relation> simrels) {
this.ccId = ccId;
this.docs = Sets.newHashSet(docs);
this.simrels = Sets.newHashSet(simrels);
}
@Override
@@ -80,4 +61,12 @@
public void setCcId(String ccId) {
this.ccId = ccId;
}
public void setSimrels(HashSet<Relation> simrels) {
this.simrels = simrels;
}
public HashSet<Relation> getSimrels() {
return simrels;
}
}

View File

@@ -16,6 +16,10 @@
<name>dedupConfPath</name>
<description>path for the dedup configuration file</description>
</property>
<property>
<name>groundTruthFieldJPath</name>
<description>jpath of the field to be used as ground truth</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
@@ -138,6 +142,33 @@
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
<arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
</spark>
<ok to="CreateGroupEntities"/>
<error to="Kill"/>
</action>
<action name="CreateGroupEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Create Group Entities</name>
<class>eu.dnetlib.jobs.SparkCreateGroupEntity</class>
<jar>dnet-dedup-test-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--entitiesPath</arg><arg>${entitiesPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
<arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
</spark>
<ok to="ComputeStatistics"/>
<error to="Kill"/>
</action>
@@ -162,36 +193,12 @@
<arg>--entitiesPath</arg><arg>${entitiesPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
<arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
<arg>--groundTruthFieldJPath</arg><arg>${groundTruthFieldJPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<!--<action name="CreateDedupEntities">-->
<!--<spark xmlns="uri:oozie:spark-action:0.2">-->
<!--<master>yarn</master>-->
<!--<mode>cluster</mode>-->
<!--<name>Create Dedup Entities</name>-->
<!--<class>eu.dnetlib.jobs.SparkCreateDedupEntity</class>-->
<!--<jar>dnet-dedup-test-${projectVersion}.jar</jar>-->
<!--<spark-opts>-->
<!--&#45;&#45;executor-memory=${sparkExecutorMemory}-->
<!--&#45;&#45;executor-cores=${sparkExecutorCores}-->
<!--&#45;&#45;driver-memory=${sparkDriverMemory}-->
<!--&#45;&#45;conf spark.extraListeners=${spark2ExtraListeners}-->
<!--&#45;&#45;conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
<!--&#45;&#45;conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
<!--&#45;&#45;conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
<!--&#45;&#45;conf spark.sql.shuffle.partitions=3840-->
<!--</spark-opts>-->
<!--<arg>&#45;&#45;entitiesPath</arg><arg>${entitiesPath}</arg>-->
<!--<arg>&#45;&#45;workingPath</arg><arg>${workingPath}</arg>-->
<!--<arg>&#45;&#45;numPartitions</arg><arg>${numPartitions}</arg>-->
<!--<arg>&#45;&#45;dedupConfPath</arg><arg>${dedupConfPath}</arg>-->
<!--</spark>-->
<!--<ok to="End"/>-->
<!--<error to="Kill"/>-->
<!--</action>-->
<end name="End"/>
</workflow-app>

View File

@@ -16,5 +16,17 @@
"paramLongName": "numPartitions",
"paramDescription": "number of partitions for the similarity relations intermediate phases",
"paramRequired": false
},
{
"paramName": "dc",
"paramLongName": "dedupConfPath",
"paramDescription": "dedup configuration to be used",
"paramRequired": true
},
{
"paramName": "gt",
"paramLongName": "groundTruthFieldJPath",
"paramDescription": "field to be used as groundtruth",
"paramRequired": true
}
]

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,134 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "author",
"subEntityType": "author",
"subEntityValue": "author",
"orderField": "fullname",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "lnfi", "fields" : [ "fullname" ], "params" : {} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "orcid",
"comparator": "exactMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "orcids",
"ignoreUndefined": "true"
},
"orcids": {
"fields": [
{
"field": "orcids",
"comparator": "stringListMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 3.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "coauthors",
"undefined": "coauthors",
"ignoreUndefined": "true"
},
"coauthors": {
"fields": [
{
"field": "coauthors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "topicsMatch",
"negative": "NO_MATCH",
"undefined": "topicsMatch",
"ignoreUndefined": "true"
},
"topicsMatch": {
"fields": [
{
"field": "topics",
"comparator": "cosineSimilarity",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.5,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
}
},
"model": [
{
"name": "topics",
"type": "DoubleArray",
"path": "$.topics"
},
{
"name": "fullname",
"type": "String",
"path": "$.fullname"
},
{
"name": "orcid",
"type": "String",
"path": "$.orcid"
},
{
"name": "coauthors",
"type": "List",
"path": "$.coAuthors[*].fullname"
},
{
"name": "orcids",
"type": "List",
"path": "$.coAuthors[*].orcid"
}
],
"blacklists": {},
"synonyms": {}
}
}
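Read as a decision tree: two authors match outright on identical orcid values; failing that, three or more shared coauthor ORCIDs (stringListMatch in count mode against a threshold of 3.0) appear to decide a match; otherwise at least one matching coauthor name is required, followed by a cosine similarity of at least 0.5 between the topics vectors. Candidate pairs come from the lnfi clustering function over fullname, which, judging by its name and single field, presumably keys authors on last name plus first initial.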

View File

@@ -0,0 +1,134 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "author",
"subEntityType": "author",
"subEntityValue": "author",
"orderField": "fullname",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "lnfi", "fields" : [ "fullname" ], "params" : {} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "orcid",
"comparator": "exactMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "orcids",
"ignoreUndefined": "true"
},
"orcids": {
"fields": [
{
"field": "orcids",
"comparator": "stringListMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 3.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "coauthors",
"undefined": "coauthors",
"ignoreUndefined": "true"
},
"coauthors": {
"fields": [
{
"field": "coauthors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "topicsMatch",
"negative": "NO_MATCH",
"undefined": "topicsMatch",
"ignoreUndefined": "true"
},
"topicsMatch": {
"fields": [
{
"field": "topics",
"comparator": "cosineSimilarity",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
}
},
"model": [
{
"name": "topics",
"type": "DoubleArray",
"path": "$.topics"
},
{
"name": "fullname",
"type": "String",
"path": "$.fullname"
},
{
"name": "orcid",
"type": "String",
"path": "$.orcid"
},
{
"name": "coauthors",
"type": "List",
"path": "$.coAuthors[*].fullname"
},
{
"name": "orcids",
"type": "List",
"path": "$.coAuthors[*].orcid"
}
],
"blacklists": {},
"synonyms": {}
}
}
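Apart from the topicsMatch threshold (1.0 here versus 0.5 in the previous file), this configuration is identical to the one above, making it the stricter of the two author setups.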

View File

@@ -3,7 +3,7 @@
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "datasource",
"orderField" : "name",
"orderField" : "englishname",
"queueMaxSize" : "2000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
@@ -14,8 +14,9 @@
},
"pace" : {
"clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "name" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "name" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "sortedngrampairs", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 2, "ngramLen" : "3", "collapseOn:name": "0"} },
{ "name" : "suffixprefix", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 1, "len" : "3", "collapseOn:name": "0" } },
{"name" : "ngrams", "fields" : ["officialname", "englishname"], "params" : {"ngramLen": 4, "max" : 2, "maxPerToken": 2, "minNgramLen": 1, "collapseOn:name": "0"}},
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
],
"decisionTree" : {
@@ -39,16 +40,36 @@
"layer2": {
"fields": [
{
"field": "name",
"field": "officialname",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"threshold": 0.9
}
},
{
"field": "englishname",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"threshold": 0.9
}
},
{
"field": "officialname",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"crossCompare": "englishname",
"threshold": 0.9
}
}
],
"threshold": 0.9,
"aggregation": "AVG",
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
@@ -56,12 +77,11 @@
}
},
"model" : [
{ "name" : "name", "type" : "String", "path" : "$.name" },
{ "name" : "englishname", "type" : "String", "path" : "$.englishname" },
{ "name" : "officialname", "type" : "String", "path" : "$.officialname" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" }
],
"blacklists" : {
"legalname" : []
},
"blacklists" : {},
"synonyms": {}
}
}

View File

@@ -51,37 +51,6 @@
],
"decisionTree": {
"start": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
},
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"crossCompare": "alternateid"
}
}
],
"threshold": 0.5,
"aggregation": "MAX",
"positive": "layer1",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"layer1": {
"fields": [
{
"field": "title",
@@ -94,49 +63,8 @@
"threshold": 0.9,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"layer2": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"layer3": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"negative": "MATCH",
"undefined": "MATCH",
"ignoreUndefined": "true"
}
},

View File

@@ -6,9 +6,9 @@
"subEntityType": "resulttype",
"subEntityValue": "publication",
"orderField": "title",
"queueMaxSize": "5000",
"groupMaxSize": "2000",
"maxChildren": "1000",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
@@ -28,9 +28,26 @@
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
"clustering": [
{
"name": "wordsStatsSuffixPrefixChain",
"fields": [
"title"
],
"params": {
"mod": "10"
}
},
{
"name": "lowercase",
"fields": [
"doi",
"altdoi"
],
"params": {
"collapseOn:pid": "0"
}
}
],
"decisionTree": {
"start": {
@@ -42,18 +59,75 @@
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
"jpath_classid": "$.qualifier.classid",
"mode": "count"
}
}
],
"threshold": 0.5,
"aggregation": "AVG",
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
"negative": "instanceTypeCheck",
"undefined": "instanceTypeCheck",
"ignoreUndefined": "false"
},
"instanceTypeCheck": {
"fields": [
{
"field": "instance",
"comparator": "instanceTypeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.5,
"aggregation": "MAX",
"positive": "pidVSaltid",
"negative": "NO_MATCH",
"undefined": "pidVSaltid",
"ignoreUndefined": "true"
},
"layer2": {
"pidVSaltid": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"crossCompare": "alternateid",
"mode": "count"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "softCheck",
"negative": "earlyExits",
"undefined": "earlyExits",
"ignoreUndefined": "true"
},
"softCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.9,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"earlyExits": {
"fields": [
{
"field": "title",
@@ -72,12 +146,12 @@
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "layer3",
"positive": "strongCheck",
"negative": "NO_MATCH",
"undefined": "layer3",
"undefined": "strongCheck",
"ignoreUndefined": "false"
},
"layer3": {
"strongCheck": {
"fields": [
{
"field": "title",
@@ -89,28 +163,60 @@
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "MATCH",
"positive": "surnames",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"surnames": {
"fields": [
{
"field": "authors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"surname_th": 0.75,
"fullname_th": 0.75,
"mode": "full"
}
}
],
"threshold": 0.6,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"ignoreUndefined": "true"
}
},
"model": [
{
"name": "doi",
"type": "String",
"path": "$.pid[?(@.qualifier.classid == 'doi')].value"
"path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "altdoi",
"type": "String",
"path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "pid",
"type": "JSON",
"path": "$.pid",
"path": "$.instance[*].pid[*]",
"overrideMatch": "true"
},
{
"name": "alternateid",
"type": "JSON",
"path": "$.instance[*].alternateIdentifier[*]",
"overrideMatch": "true"
},
{
"name": "title",
"type": "String",
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
"type": "StringConcat",
"path": "$.title[?(@.qualifier.classid == 'main title')].value|||$.title[?(@.qualifier.classid == 'subtitle')].value",
"length": 250,
"size": 5
},
@@ -124,6 +230,11 @@
"name": "resulttype",
"type": "String",
"path": "$.resulttype.classid"
},
{
"name": "instance",
"type": "List",
"path": "$.instance[*].instancetype.classname"
}
],
"blacklists": {
@@ -354,7 +465,16 @@
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$"
"(?i)^.*authors[']? response\\.?$",
"^Data [mM]anagement [sS]ervices\\.$",
"Research and Advanced Technology for Digital Libraries",
"(?i)^risky business$",
"(?i)^great expectations\\.?$",
"(?i)^what's in a name\\?$",
"(?i)^decisions, decisions\\.?$",
"(?i)^update to our reader, reviewer, and author communities.*",
"(?i)^lest we forget$",
"(?i)^measure for measure$"
]
},
"synonyms": {}

View File

@@ -0,0 +1,381 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "result",
"subEntityType": "resulttype",
"subEntityValue": "publication",
"orderField": "title",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "100",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid": "0"} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"mode": "count"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "versionCheck",
"undefined": "versionCheck",
"ignoreUndefined": "true"
},
"versionCheck": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "titleCheck",
"negative": "NO_MATCH",
"undefined": "titleCheck",
"ignoreUndefined": "false"
},
"titleCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.9,
"aggregation": "MAX",
"positive": "authorsCheck",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"authorsCheck": {
"fields": [
{
"field": "authors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.6,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"ignoreUndefined": "false"
}
},
"model": [
{
"name": "doi",
"type": "String",
"path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "altdoi",
"type": "String",
"path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "pid",
"type": "JSON",
"path": "$.instance[*].pid[*]",
"overrideMatch": "true"
},
{
"name": "title",
"type": "String",
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
"length": 250,
"size": 5
},
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname",
"size": 200
},
{
"name": "resulttype",
"type": "String",
"path": "$.resulttype.classid"
}
],
"blacklists": {
"title": [
"(?i)^Data Management Plan",
"^Inside Front Cover$",
"(?i)^Poster presentations$",
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
"^Problems with perinatal pathology\\.?$",
"(?i)^Cases? of Puerperal Convulsions$",
"(?i)^Operative Gyna?ecology$",
"(?i)^Mind the gap\\!?\\:?$",
"^Chronic fatigue syndrome\\.?$",
"^Cartas? ao editor Letters? to the Editor$",
"^Note from the Editor$",
"^Anesthesia Abstract$",
"^Annual report$",
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
"(?i)^Graph and Table of Infectious Diseases?$",
"^Presentation$",
"(?i)^Reviews and Information on Publications$",
"(?i)^PUBLIC HEALTH SERVICES?$",
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
"(?i)^Adrese autora$",
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
"(?i)^Acknowledgement to Referees$",
"(?i)^Behçet's disease\\.?$",
"(?i)^Isolation and identification of restriction endonuclease.*$",
"(?i)^CEREBROVASCULAR DISEASES?.?$",
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
"^Event management$",
"(?i)^Breakfast and Crohn's disease.*\\.?$",
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
"^Gushi hakubutsugaku$",
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
"^Intestinal spirocha?etosis$",
"^Treatment of Rodent Ulcer$",
"(?i)^\\W*Cloud Computing\\W*$",
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
"^Free Communications, Poster Presentations: Session [A-F]$",
"^“The Historical Aspects? of Quackery\\.?”$",
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
"(?i)^Case Report$",
"^Boletín Informativo$",
"(?i)^Glioblastoma Multiforme$",
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
"^Zaměstnanecké výhody$",
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
"(?i)^Carotid body tumours?\\.?$",
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
"^Avant-propos$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
"^Viñetas de Cortázar$",
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
"^Aus der AGMB$",
"^Znanstveno-stručni prilozi$",
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
"^Finanční analýza podniku$",
"^Financial analysis( of business)?$",
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
"^Jikken nihon shūshinsho$",
"(?i)^CORONER('|s)(s|') INQUESTS$",
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
"(?i)^Consultants' contract(s)?$",
"(?i)^Upute autorima$",
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
"^Joshi shin kokubun$",
"^Kōtō shōgaku dokuhon nōson'yō$",
"^Jinjō shōgaku shōka$",
"^Shōgaku shūjichō$",
"^Nihon joshi dokuhon$",
"^Joshi shin dokuhon$",
"^Chūtō kanbun dokuhon$",
"^Wabun dokuhon$",
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
"(?i)^cardiac rehabilitation$",
"(?i)^Analytical summary$",
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
"^Prikazi i osvrti$",
"^Rodinný dům s provozovnou$",
"^Family house with an establishment$",
"^Shinsei chūtō shin kokugun$",
"^Pulmonary alveolar proteinosis(\\.?)$",
"^Shinshū kanbun$",
"^Viñeta(s?) de Rodríguez$",
"(?i)^RUBRIKA UREDNIKA$",
"^A Matching Model of the Academic Publication Market$",
"^Yōgaku kōyō$",
"^Internetový marketing$",
"^Internet marketing$",
"^Chūtō kokugo dokuhon$",
"^Kokugo dokuhon$",
"^Antibiotic Cover for Dental Extraction(s?)$",
"^Strategie podniku$",
"^Strategy of an Enterprise$",
"(?i)^respiratory disease(s?)(\\.?)$",
"^Award(s?) for Gallantry in Civil Defence$",
"^Podniková kultura$",
"^Corporate Culture$",
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
"^Pracovní motivace$",
"^Work Motivation$",
"^Kaitei kōtō jogaku dokuhon$",
"^Konsolidovaná účetní závěrka$",
"^Consolidated Financial Statements$",
"(?i)^intracranial tumour(s?)$",
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
"^The level of motivation process as a leadership$",
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
"(?i)^news and events$",
"(?i)^NOVOSTI I DOGAĐAJI$",
"^Sansū no gakushū$",
"^Posouzení informačního systému firmy a návrh změn$",
"^Information System Assessment and Proposal for ICT Modification$",
"^Stresové zatížení pracovníků ve vybrané profesi$",
"^Stress load in a specific job$",
"^Sunday: Poster Sessions, Pt.*$",
"^Monday: Poster Sessions, Pt.*$",
"^Wednesday: Poster Sessions, Pt.*",
"^Tuesday: Poster Sessions, Pt.*$",
"^Analýza reklamy$",
"^Analysis of advertising$",
"^Shōgaku shūshinsho$",
"^Shōgaku sansū$",
"^Shintei joshi kokubun$",
"^Taishō joshi kokubun dokuhon$",
"^Joshi kokubun$",
"^Účetní uzávěrka a účetní závěrka v ČR$",
"(?i)^The \"?Causes\"? of Cancer$",
"^Normas para la publicación de artículos$",
"^Editor('|s)(s|') [Rr]eply$",
"^Editor(|s)(s|) letter$",
"^Redaktoriaus žodis$",
"^DISCUSSION ON THE PRECEDING PAPER$",
"^Kōtō shōgaku shūshinsho jidōyō$",
"^Shōgaku nihon rekishi$",
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
"^Préface$",
"^Occupational [Hh]ealth [Ss]ervices.$",
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
"^Účetní závěrka ve vybraném podniku.*$",
"^Financial statements in selected company$",
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
"^Pseudomyxoma peritonei$",
"^Kazalo autora$",
"(?i)^uvodna riječ$",
"^Motivace jako způsob vedení lidí$",
"^Motivation as a leadership$",
"^Polyfunkční dům$",
"^Multi\\-funkcional building$",
"^Podnikatelský plán$",
"(?i)^Podnikatelský záměr$",
"(?i)^Business Plan$",
"^Oceňování nemovitostí$",
"^Marketingová komunikace$",
"^Marketing communication$",
"^Sumario Analítico$",
"^Riječ uredništva$",
"^Savjetovanja i priredbe$",
"^Índice$",
"^(Starobosanski nadpisi).*$",
"^Vzdělávání pracovníků v organizaci$",
"^Staff training in organization$",
"^(Life Histories of North American Geometridae).*$",
"^Strategická analýza podniku$",
"^Strategic Analysis of an Enterprise$",
"^Sadržaj$",
"^Upute suradnicima$",
"^Rodinný dům$",
"(?i)^Fami(l)?ly house$",
"^Upute autorima$",
"^Strategic Analysis$",
"^Finanční analýza vybraného podniku$",
"^Finanční analýza$",
"^Riječ urednika$",
"(?i)^Content(s?)$",
"(?i)^Inhalt$",
"^Jinjō shōgaku shūshinsho jidōyō$",
"(?i)^Index$",
"^Chūgaku kokubun kyōkasho$",
"^Retrato de una mujer$",
"^Retrato de un hombre$",
"^Kōtō shōgaku dokuhon$",
"^Shotōka kokugo$",
"^Shōgaku dokuhon$",
"^Jinjō shōgaku kokugo dokuhon$",
"^Shinsei kokugo dokuhon$",
"^Teikoku dokuhon$",
"^Instructions to Authors$",
"^KİTAP TAHLİLİ$",
"^PRZEGLĄD PIŚMIENNICTWA$",
"(?i)^Presentación$",
"^İçindekiler$",
"(?i)^Tabl?e of contents$",
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
"^Editorial( Board)?$",
"(?i)^Editorial \\(English\\)$",
"^Editörden$",
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$",
"^Data [mM]anagement [sS]ervices\\.$",
"Research and Advanced Technology for Digital Libraries",
"Food and Nutrition"
]
},
"synonyms": {}
}
}
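A small illustration (not part of the configuration itself): titles matching any blacklist pattern above are filtered out before clustering keys are computed, so boilerplate titles such as tables of contents never form dedup blocks. The pattern below is taken from the list; the sample titles are invented.

import java.util.regex.Pattern;

public class BlacklistMatchExample {
    public static void main(String[] args) {
        Pattern p = Pattern.compile("(?i)^Tabl?e of contents$");
        System.out.println(p.matcher("Table of Contents").matches());      // true: whole-string, case-insensitive match
        System.out.println(p.matcher("A Table of Contents for X").matches()); // false: anchored pattern, extra text
    }
}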

View File

@ -0,0 +1,150 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"subEntityType" : "resulttype",
"subEntityValue" : "software",
"orderField" : "title",
"queueMaxSize" : "200",
"groupMaxSize" : "100",
"maxChildren" : "100",
"slidingWindowSize" : "50",
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } },
{ "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid":"0"} },
{ "name" : "ngrams", "fields" : [ "title" ], "params" : {"ngramLen": 3, "max": 4, "maxPerToken":1, "minNgramLen":3}},
{ "name" : "urlclustering", "fields": [ "url" ], "params" : {}}
],
"decisionTree": {
"start": {
"fields": [
{
"field": "doi",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "titleCheck",
"undefined": "titleCheck",
"ignoreUndefined": "false"
},
"titleCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitleIgnoreVersion",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.95,
"aggregation": "AVG",
"positive": "pidCheck",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
},
"pidCheck": {
"fields": [
{
"field": "altdoi",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
},
{
"field": "doi",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {"crossCompare": "altdoi"}
},
{
"field": "url",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1,
"aggregation": "OR",
"positive": "MATCH",
"negative": "authorsCheck",
"undefined": "authorsCheck",
"ignoreUndefined": "false"
},
"authorsCheck": {
"fields": [
{
"field": "authors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"surname_th": 0.70,
"fullname_th": 0.70,
"size_th": 20,
"mode": "surname"
}
}
],
"threshold": 1,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"ignoreUndefined": "false"
}
},
"model" : [
{
"name" : "doi",
"type" : "String",
"path" : "$.instance.pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name" : "altdoi",
"type" : "String",
"path" : "$.instance.alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
},
{
"name" : "title",
"type" : "String",
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
"length" : 250,
"size" : 5
},
{
"name" : "url",
"type" : "String",
"path" : "$.instance.url"
},
{
"name" : "resulttype",
"type" : "String",
"path" : "$.resulttype.classid"
},
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname",
"size": 200
}
],
"blacklists" : {},
"synonyms": {}
}
}
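A minimal sketch of how the decision tree above routes a candidate record pair: doi exact match, then title similarity, then alternate pid/url, then authors. This is an illustration of the JSON semantics only, not the dnet-dedup TreeProcessor; the Node type is simplified and the "undefined" transitions are elided.

import java.util.Map;

public class DecisionTreeSketch {

    record Node(double threshold, String positive, String negative) {}

    public static void main(String[] args) {
        Map<String, Node> tree = Map.of(
                "start",        new Node(1.0,  "MATCH",    "titleCheck"),
                "titleCheck",   new Node(0.95, "pidCheck", "NO_MATCH"),
                "pidCheck",     new Node(1.0,  "MATCH",    "authorsCheck"),
                "authorsCheck", new Node(1.0,  "MATCH",    "NO_MATCH"));

        // pretend comparator scores for one pair: no shared doi, titles at 0.97,
        // no shared alternate pid or url, authors fully overlapping
        Map<String, Double> score = Map.of(
                "start", 0.0, "titleCheck", 0.97, "pidCheck", 0.0, "authorsCheck", 1.0);

        String node = "start";
        while (!node.equals("MATCH") && !node.equals("NO_MATCH")) {
            Node n = tree.get(node);
            node = score.get(node) >= n.threshold() ? n.positive() : n.negative();
        }
        System.out.println(node); // MATCH, via start -> titleCheck -> pidCheck -> authorsCheck
    }
}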

File diff suppressed because it is too large

View File

@ -0,0 +1,4 @@
{"websiteurl": "https://fairsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "eosc________::oxford_e-research_centre::oxford_e-research_centre.fairsharing"}
{"websiteurl": "https://FAIRsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "fairsharing_::2521"}
{"websiteurl": "https://fairsharing.org/", "englishname": "formerly: biosharing", "officialname": "FAIRsharing", "id": "re3data_____::r3d100010142"}
{"websiteurl": "https://fairsharing.org/", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "openaire____::fairsharing"}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,32 @@
[
{
"paramName": "e",
"paramLongName": "entitiesPath",
"paramDescription": "the input entities",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "path of the working directory",
"paramRequired": true
},
{
"paramName": "np",
"paramLongName": "numPartitions",
"paramDescription": "number of partitions for the similarity relations intermediate phases",
"paramRequired": false
},
{
"paramName": "dc",
"paramLongName": "dedupConfPath",
"paramDescription": "dedup configuration to be used",
"paramRequired": false
},
{
"paramName": "gt",
"paramLongName": "groundTruthFieldJPath",
"paramDescription": "field to be used as groundtruth",
"paramRequired": false
}
]

View File

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId>
<version>4.1.12</version>
<version>4.1.13-SNAPSHOT</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@ -1,59 +1,59 @@
package eu.dnetlib.pace.clustering;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Document;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.MapDocument;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {
private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombiner.class);
public static Collection<String> filterAndCombine(final MapDocument a, final Config conf) {
Document filtered = filter(a, conf.blacklists());
return combine(filtered, conf);
}
public static Collection<String> filterAndCombine(final MapDocument a, final Config conf) {
private static MapDocument filter(final MapDocument a, final Map<String, List<Pattern>> blacklists) {
if (blacklists == null || blacklists.isEmpty()) {
return a;
}
final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, conf.blacklists());
return combine(filtered, conf);
}
final Map<String, Field> filtered = Maps.newHashMap(a.getFieldMap());
private MapDocument filter(final MapDocument a, final Map<String, List<String>> blacklists) {
final Map<String, Field> filtered = Maps.newHashMap(a.getFieldMap());
if (blacklists != null) {
for (final Entry<String, Field> e : filtered.entrySet()) {
for (final Entry<String, List<Pattern>> e : blacklists.entrySet()) {
Field fields = a.getFieldMap().get(e.getKey());
if (fields != null) {
final FieldListImpl fl = new FieldListImpl();
final FieldListImpl fl = new FieldListImpl();
fl.addAll(Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists))));
filtered.put(e.getKey(), fl);
}
}
return new MapDocument(a.getIdentifier(), filtered);
}
for (Field f : fields) {
if (!isBlackListed(f.stringValue(), e.getValue())) {
fl.add(f);
}
}
filtered.put(e.getKey(), fl);
}
}
return new MapDocument(a.getIdentifier(), filtered);
}
private static boolean isBlackListed(String value, List<Pattern> blacklist) {
for (Pattern pattern : blacklist) {
if (pattern.matcher(value).matches()) {
return true;
}
}
return false;
}
/**
* Tries to match the fields in the regex blacklist.
*
* @param fieldName
* @param value
* @return true if the field matches, false otherwise
*/
protected boolean regexMatches(final String fieldName, final String value, final Map<String, Set<String>> blacklists) {
if (blacklists.containsKey(fieldName)) {
for (final String regex : blacklists.get(fieldName)) {
if (value.matches(regex)) return true;
}
}
return false;
}
}
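A sketch of the optimization in this changeset: blacklist regexes are compiled once into java.util.regex.Pattern instances (as DedupConfig.load now does) and reused for every field value, instead of calling String.matches(regex), which recompiles the expression on each call. The sample blacklist here is invented.

import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

public class PrecompiledBlacklistSketch {
    public static void main(String[] args) {
        Map<String, List<String>> raw = Map.of(
                "title", List.of("(?i)^untitled$", "^Editorial( Board)?$"));

        // compile each regex exactly once, keyed by field name
        Map<String, List<Pattern>> compiled = raw.entrySet().stream()
                .collect(Collectors.toMap(Map.Entry::getKey,
                        e -> e.getValue().stream()
                                .map(Pattern::compile)
                                .collect(Collectors.toList())));

        // reuse the precompiled patterns for every value to be checked
        boolean blacklisted = compiled.get("title").stream()
                .anyMatch(p -> p.matcher("Untitled").matches());
        System.out.println(blacklisted); // true: "(?i)^untitled$" matches
    }
}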

View File

@ -20,10 +20,6 @@ public class ClusteringCombiner {
private static String COLLAPSE_ON= "collapseOn";
public static Collection<String> combine(final Document a, final Config conf) {
return new ClusteringCombiner().doCombine(a, conf);
}
private Collection<String> doCombine(final Document a, final Config conf) {
final Collection<String> res = Sets.newLinkedHashSet();
for (final ClusteringDef cd : conf.clusterings()) {
for (final String fieldName : cd.getFields()) {
@ -51,7 +47,7 @@ public class ClusteringCombiner {
return res;
}
private String getPrefix(ClusteringDef cd, String fieldName) {
private static String getPrefix(ClusteringDef cd, String fieldName) {
return cd.getName()+ SEPARATOR +
cd.getParams().keySet()
.stream()

View File

@ -1,48 +0,0 @@
package eu.dnetlib.pace.clustering;
import java.util.List;
import java.util.Map;
import com.google.common.base.Predicate;
import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class FieldFilter implements Predicate<Field> {
private static final Log log = LogFactory.getLog(FieldFilter.class);
private Map<String, List<String>> blacklists;
private String filedName;
public FieldFilter(final String fieldName, final Map<String, List<String>> blacklists) {
this.filedName = fieldName;
this.blacklists = blacklists;
}
@Override
public boolean apply(final Field f) {
return !regexMatches(filedName, f.stringValue(), blacklists);
}
/**
* Tries to match the fields in the regex blacklist.
*
* @param fieldName
* @param value
* @return true if the field matches, false otherwise
*/
protected boolean regexMatches(final String fieldName, final String value, final Map<String, List<String>> blacklists) {
if (blacklists.containsKey(fieldName)) {
final Iterable<String> regexes = blacklists.get(fieldName);
for (final String regex : regexes) {
if (StringUtils.isBlank(regex)) return false;
if (value.matches(regex)) return true;
}
}
return false;
}
}

View File

@ -0,0 +1,77 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
import java.util.stream.Collectors;
@ClusteringClass("lnfi")
public class LastNameFirstInitial extends AbstractClusteringFunction{
private boolean DEFAULT_AGGRESSIVE = true;
public LastNameFirstInitial(final Map<String, Integer> params) {
super(params);
}
@Override
public Collection<String> apply(Config conf, List<Field> fields) {
return fields.stream().filter(f -> !f.isEmpty())
.map(Field::stringValue)
.map(this::normalize)
.map(s -> doApply(conf, s))
.map(c -> filterBlacklisted(c, ngramBlacklist))
.flatMap(c -> c.stream())
.filter(StringUtils::isNotBlank)
.collect(Collectors.toCollection(HashSet::new));
}
@Override
protected String normalize(final String s) {
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
// do not compact the regexes into a single expression: it would cause a StackOverflowError on large input strings
.replaceAll("[^ \\w]+", "")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
@Override
protected Collection<String> doApply(final Config conf, final String s) {
final List<String> res = Lists.newArrayList();
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
Person p = new Person(s, aggressive);
if (p.isAccurate()) {
String lastName = p.getNormalisedSurname().toLowerCase();
String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0,1);
res.add(firstInitial.concat(lastName));
}
else { // not accurate, i.e. the person has no clearly defined name and surname
List<String> fullname = Arrays.asList(p.getNormalisedFullname().split(" "));
if (fullname.size() == 1) {
res.add(p.getNormalisedFullname().toLowerCase());
}
else if (fullname.size() == 2) {
res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase());
res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase());
}
else {
res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase());
res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase());
}
}
return res;
}
}
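A worked illustration of the keys the "lnfi" function above produces, bypassing the Person parser: a parseable "Surname, Name" string yields a single first-initial-plus-surname key, while an ambiguous two-token fullname yields both orderings (the "LI Yonghong" case mirrors the test added later in this changeset).

import java.util.List;

public class LnfiSketch {
    public static void main(String[] args) {
        // parsed case: surname "manghi", first initial "p" -> "pmanghi"
        System.out.println("p".concat("manghi"));

        // ambiguous case: both token orders are emitted
        List<String> fullname = List.of("LI", "Yonghong");
        System.out.println(fullname.get(0).substring(0, 1).concat(fullname.get(1)).toLowerCase()); // lyonghong
        System.out.println(fullname.get(1).substring(0, 1).concat(fullname.get(0)).toLowerCase()); // yli
    }
}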

View File

@ -9,7 +9,7 @@ import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
@ClusteringClass("personhash")
@ClusteringClass("personHash")
public class PersonHash extends AbstractClusteringFunction {
private boolean DEFAULT_AGGRESSIVE = false;

View File

@ -3,28 +3,23 @@ package eu.dnetlib.pace.common;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.*;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.ibm.icu.text.Transliterator;
/**
* Set of common functions for the framework
@ -133,10 +128,12 @@ public abstract class AbstractPaceFunctions {
protected static String fixAliases(final String s) {
final StringBuilder sb = new StringBuilder();
for (final char ch : Lists.charactersOf(s)) {
s.chars().forEach(ch -> {
final int i = StringUtils.indexOf(aliases_from, ch);
sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
}
sb.append(i >= 0 ? aliases_to.charAt(i) : (char)ch);
});
return sb.toString();
}
@ -152,9 +149,10 @@ public abstract class AbstractPaceFunctions {
protected String removeSymbols(final String s) {
final StringBuilder sb = new StringBuilder();
for (final char ch : Lists.charactersOf(s)) {
sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
}
s.chars().forEach(ch -> {
sb.append(StringUtils.contains(alpha, ch) ? (char)ch : ' ');
});
return sb.toString().replaceAll("\\s+", " ");
}
@ -241,7 +239,7 @@ public abstract class AbstractPaceFunctions {
final Set<String> h = Sets.newHashSet();
try {
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
h.add(fixAliases(transliterator.transliterate(s))); //transliteration of the stopwords
}
} catch (final Throwable e) {
@ -256,7 +254,7 @@ public abstract class AbstractPaceFunctions {
final Map<String, String> m = new HashMap<>();
try {
for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
//string is like this: code;word1;word2;word3
String[] line = s.split(";");
String value = line[0];
@ -349,7 +347,7 @@ public abstract class AbstractPaceFunctions {
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
final StringWriter sw = new StringWriter();
try {
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
IOUtils.copy(clazz.getResourceAsStream(filename), sw, StandardCharsets.UTF_8);
return sw.toString();
} catch (final IOException e) {
throw new RuntimeException("cannot load resource from classpath: " + filename);

View File

@ -2,6 +2,7 @@ package eu.dnetlib.pace.config;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
@ -47,7 +48,7 @@ public interface Config {
*
* @return the map
*/
public Map<String, List<String>> blacklists();
public Map<String, List<Pattern>> blacklists();
/**

View File

@ -1,5 +1,6 @@
package eu.dnetlib.pace.config;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.model.ClusteringDef;
@ -7,15 +8,19 @@ import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.util.PaceException;
import org.antlr.stringtemplate.StringTemplate;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
@ -31,6 +36,9 @@ public class DedupConfig implements Config, Serializable {
private WfConfig wf;
@JsonIgnore
private Map<String, List<Pattern>> blacklists;
private static Map<String, String> defaults = Maps.newHashMap();
static {
@ -57,6 +65,12 @@ public class DedupConfig implements Config, Serializable {
config = new ObjectMapper().readValue(json, DedupConfig.class);
config.getPace().initModel();
config.getPace().initTranslationMap();
config.blacklists = config.getPace().getBlacklists().entrySet()
.stream()
.collect(Collectors.toMap(e -> e.getKey(),
e ->e.getValue().stream().filter(s -> !StringUtils.isBlank(s)).map(Pattern::compile).collect(Collectors.toList()) ));
return config;
} catch (IOException e) {
throw new PaceException("Error in parsing configuration json", e);
@ -88,7 +102,7 @@ public class DedupConfig implements Config, Serializable {
}
private String readFromClasspath(final String resource) throws IOException {
return IOUtils.toString(getClass().getResource(resource));
return IOUtils.toString(getClass().getResource(resource), StandardCharsets.UTF_8);
}
public PaceConfig getPace() {
@ -137,8 +151,8 @@ public class DedupConfig implements Config, Serializable {
}
@Override
public Map<String, List<String>> blacklists() {
return getPace().getBlacklists();
public Map<String, List<Pattern>> blacklists() {
return blacklists;
}
@Override

View File

@ -1,5 +1,5 @@
package eu.dnetlib.pace.config;
public enum Type {
String, Int, List, JSON, URL, StringConcat
String, Int, List, JSON, URL, StringConcat, DoubleArray
}

View File

@ -20,4 +20,6 @@ public interface FieldValue extends Field {
*/
public void setValue(final Object value);
public double[] doubleArrayValue();
}

View File

@ -58,8 +58,10 @@ public class FieldValueImpl extends AbstractField implements FieldValue {
throw new RuntimeException(value.toString());
}
case URL:
String str = value.toString();
return StringUtils.isBlank(str) || !isValidURL(str);
String str = value.toString();
return StringUtils.isBlank(str) || !isValidURL(str);
case DoubleArray:
return doubleArrayValue().length==0;
default:
return true;
}
@ -116,6 +118,10 @@ public class FieldValueImpl extends AbstractField implements FieldValue {
// }
}
public double[] doubleArrayValue() {
return (double[])getValue();
}
/*
* (non-Javadoc)
*

View File

@ -43,7 +43,7 @@ public class Person {
// s = s.replaceAll("[\\W&&[^,-]]", "");
}
if (s.contains(",")) {
if (s.contains(",")) { //if the name contains a comma it is easy derivable the name and the surname
final String[] arr = s.split(",");
if (arr.length == 1) {
fullname = splitTerms(arr[0]);

View File

@ -26,6 +26,7 @@ public class AuthorsMatch extends AbstractComparator {
private double FULLNAME_THRESHOLD;
private String MODE; //full or surname
private int SIZE_THRESHOLD;
private String TYPE; //count or percentage
private int common;
public AuthorsMatch(Map<String, String> params){
@ -37,6 +38,7 @@ public class AuthorsMatch extends AbstractComparator {
NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
TYPE = params.getOrDefault("type", "percentage");
common = 0;
}
@ -50,7 +52,7 @@ public class AuthorsMatch extends AbstractComparator {
if (a.isEmpty() || b.isEmpty())
return -1;
if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) a).size() > SIZE_THRESHOLD)
if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) b).size() > SIZE_THRESHOLD)
return 1.0;
List<Person> aList = ((FieldList) a).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
@ -123,7 +125,12 @@ public class AuthorsMatch extends AbstractComparator {
//normalization factor to compute the score
int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
return (double)common / normFactor;
if(TYPE.equals("percentage")) {
return (double) common / normFactor;
}
else {
return (double) common;
}
}
public boolean compareSurname(Person p1, Person p2) {

View File

@ -0,0 +1,53 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.Person;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@ComparatorClass("cosineSimilarity")
public class CosineSimilarity extends AbstractComparator {
Map<String, String> params;
public CosineSimilarity(Map<String,String> params) {
super(params);
}
@Override
public double compare(final Field a, final Field b, final Config conf) {
if (a.isEmpty() || b.isEmpty())
return -1;
double[] aVector = ((FieldValueImpl) a).doubleArrayValue();
double[] bVector = ((FieldValueImpl) b).doubleArrayValue();
return cosineSimilarity(aVector, bVector);
}
double cosineSimilarity(double[] a, double[] b) {
double dotProduct = 0;
double normASum = 0;
double normBSum = 0;
for(int i = 0; i < a.length; i ++) {
dotProduct += a[i] * b[i];
normASum += a[i] * a[i];
normBSum += b[i] * b[i];
}
double normProduct = Math.sqrt(normASum) * Math.sqrt(normBSum); // product of the two vector norms
return dotProduct / normProduct;
}
}
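A worked example for the comparator above, mirroring the cosineSimilarity test added later in this changeset: identical vectors score 1.0, since dot = 1*1 + 2*2 + 3*3 = 14 and sqrt(14) * sqrt(14) = 14, so 14 / 14 = 1.0. Orthogonal vectors score 0.0; note the score is undefined for all-zero vectors, where the denominator vanishes.

public class CosineExample {
    public static void main(String[] args) {
        double[] a = {1, 2, 3}, b = {1, 2, 3};
        double dot = 0, na = 0, nb = 0;
        for (int i = 0; i < a.length; i++) {
            dot += a[i] * b[i];
            na += a[i] * a[i];
            nb += b[i] * b[i];
        }
        System.out.println(dot / (Math.sqrt(na) * Math.sqrt(nb))); // 1.0
    }
}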

View File

@ -16,6 +16,7 @@ public class DomainExactMatch extends ExactMatchIgnoreCase {
@Override
protected String getValue(final Field f) {
try {
return asUrl(super.getValue(f)).getHost();
} catch (MalformedURLException e) {

View File

@ -0,0 +1,34 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("numbersComparator")
public class NumbersComparator extends AbstractComparator {
Map<String, String> params;
public NumbersComparator(Map<String, String> params) {
super(params);
this.params = params;
}
@Override
public double distance(String a, String b, Config conf) {
//extracts numbers from the field
String numbers1 = getNumbers(nfd(a));
String numbers2 = getNumbers(nfd(b));
if (numbers1.isEmpty() || numbers2.isEmpty())
return -1.0;
int n1 = Integer.parseInt(numbers1);
int n2 = Integer.parseInt(numbers2);
return Math.abs(n1 - n2);
}
}
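A hypothetical usage of the comparator above; getNumbers is approximated here with a plain digit filter. Note the result is a distance (|n1 - n2|), not a similarity in [0,1]: smaller means closer, which matters when wiring it into a decision tree threshold.

public class NumbersDistanceExample {
    public static void main(String[] args) {
        String a = "Proceedings, Part 2001";
        String b = "Proceedings, Part 1998";
        int n1 = Integer.parseInt(a.replaceAll("\\D", "")); // 2001
        int n2 = Integer.parseInt(b.replaceAll("\\D", "")); // 1998
        System.out.println(Math.abs(n1 - n2)); // 3
    }
}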

View File

@ -42,22 +42,25 @@ public class StringContainsMatch extends AbstractComparator {
STRING = STRING.toLowerCase();
}
switch(AGGREGATOR) {
case "AND":
if(ca.contains(STRING) && cb.contains(STRING))
return 1.0;
break;
case "OR":
if(ca.contains(STRING) || cb.contains(STRING))
return 1.0;
break;
case "XOR":
if(ca.contains(STRING) ^ cb.contains(STRING))
return 1.0;
break;
default:
return 0.0;
if (AGGREGATOR != null) {
switch (AGGREGATOR) {
case "AND":
if (ca.contains(STRING) && cb.contains(STRING))
return 1.0;
break;
case "OR":
if (ca.contains(STRING) || cb.contains(STRING))
return 1.0;
break;
case "XOR":
if (ca.contains(STRING) ^ cb.contains(STRING))
return 1.0;
break;
default:
return 0.0;
}
}
return 0.0;
}
}
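A worked example of the XOR aggregator above (the test below also renames the parameter from "bool" to "aggregator"): with string = "openorgs", exactly one side containing the keyword scores 1.0, while both or neither score 0.0. The sample identifiers are invented.

public class ContainsXorExample {
    public static void main(String[] args) {
        String keyword = "openorgs", ca = "openorgs::mock", cb = "grid::mock";
        System.out.println(ca.contains(keyword) ^ cb.contains(keyword) ? 1.0 : 0.0); // 1.0: one side only
        System.out.println(ca.contains(keyword) ^ ca.contains(keyword) ? 1.0 : 0.0); // 0.0: both sides
    }
}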

View File

@ -19,9 +19,13 @@ public class StringListMatch extends AbstractComparator {
private static final Log log = LogFactory.getLog(StringListMatch.class);
private Map<String, String> params;
final private String TYPE; //percentage or count
public StringListMatch(final Map<String, String> params) {
super(params);
this.params = params;
TYPE = params.getOrDefault("type", "percentage");
}
@Override
@ -31,7 +35,7 @@ public class StringListMatch extends AbstractComparator {
final Set<String> pb = new HashSet<>(((FieldList) b).stringList());
if (pa.isEmpty() || pb.isEmpty()) {
return -1; //return undefined if one of the two lists of pids is empty
return -1; //return undefined if one of the two lists is empty
}
int incommon = Sets.intersection(pa, pb).size();
@ -41,7 +45,10 @@ public class StringListMatch extends AbstractComparator {
return 0.0;
}
return (double)incommon / (incommon + simDiff);
if(TYPE.equals("percentage"))
return (double)incommon / (incommon + simDiff);
else
return incommon;
}
}
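A worked example of the new "type" parameter (the same count/percentage idea applies to the analogous parameter added to AuthorsMatch above): for pa = {A,B,C} and pb = {B,C,D}, the intersection has 2 elements and the symmetric difference 2, so "percentage" yields 2 / (2 + 2) = 0.5 while "count" yields 2.0, to be compared against an absolute threshold.

import java.util.HashSet;
import java.util.Set;

public class StringListMatchExample {
    public static void main(String[] args) {
        Set<String> pa = Set.of("A", "B", "C"), pb = Set.of("B", "C", "D");
        Set<String> common = new HashSet<>(pa);
        common.retainAll(pb);
        int incommon = common.size();                       // 2
        int simDiff = pa.size() + pb.size() - 2 * incommon; // 2, the symmetric difference
        System.out.println((double) incommon / (incommon + simDiff)); // 0.5 (percentage)
        System.out.println((double) incommon);                        // 2.0 (count)
    }
}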

View File

@ -1,5 +1,6 @@
package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.PaceConfig;
@ -9,6 +10,7 @@ import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.io.Serializable;
import java.io.StringWriter;
import java.util.List;
public class TreeNodeDef implements Serializable {
@ -57,8 +59,9 @@ public class TreeNodeDef implements Serializable {
double result2 = comparator(fieldConf).compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf);
result = Math.max(result1,result2);
}
else
else {
result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
}
stats.addFieldStats(
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),

View File

@ -161,11 +161,14 @@ public class BlockProcessorForTesting {
}
else {
//use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications)
if(useTree)
if (useTree)
emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
else
emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
}
// if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) {
// emitOutput(true, idPivot, idCurr, context);
// }
}
}
@ -180,38 +183,45 @@ public class BlockProcessorForTesting {
return compare>=1.0;
}
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
//if the score gives 1, the publications are equivalent
Map<String, String> params = new HashMap<>();
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
params.put("mode", "count");
double score = 0.0;
//LAYER 1 - comparison of the PIDs json lists
Map<String, String> params = new HashMap<>();
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
JsonListMatch jsonListMatch = new JsonListMatch(params);
double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config);
if (result >= 0.5) //if the result of the comparison is greater than the threshold
score += 10.0; //high score because it should match when the first condition is satisfied
else
score += 0.0;
double score = 0.0;
//LAYER 2 - comparison of the title version and the size of the authors lists
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
SizeMatch sizeMatch = new SizeMatch(params);
double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
if (Math.min(result1, result2) != 0)
score+=0;
else
score-=2;
//LAYER 3 - computation of levenshtein on titles
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
score += Double.isNaN(result3)?0.0:result3;
return score >= 0.99;
//levenstein title
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
if(levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) {
score += 0.2;
}
//pid
JsonListMatch jsonListMatch = new JsonListMatch(params);
if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) {
score += 0.5;
}
//title version
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
if(result1<0 || result1>=1.0) {
score += 0.1;
}
//authors match
params.remove("mode");
AuthorsMatch authorsMatch = new AuthorsMatch(params);
double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
if(result2 <0|| result2>=0.6) {
score += 0.2;
}
return score>=0.5;
}
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
if (result) {
@ -234,6 +244,5 @@ public class BlockProcessorForTesting {
final String type = dedupConf.getWf().getEntityType();
context.emit(type, from, to);
context.emit(type, to, from);
}
}

View File

@ -7,12 +7,10 @@ import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.Option;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.*;
import net.minidev.json.JSONArray;
import java.math.BigDecimal;
import java.util.*;
import java.util.function.Predicate;
import java.util.stream.Collectors;
@ -46,6 +44,14 @@ public class MapDocumentUtil {
.forEach(fi::add);
stringField.put(fdef.getName(), fi);
break;
case DoubleArray:
stringField.put(
fdef.getName(),
new FieldValueImpl(Type.DoubleArray,
fdef.getName(),
getJPathArray(fdef.getPath(), json))
);
break;
case StringConcat:
String[] jpaths = fdef.getPath().split("\\|\\|\\|");
stringField.put(
@ -115,6 +121,30 @@ public class MapDocumentUtil {
}
}
public static double[] getJPathArray(final String jsonPath, final String json) {
try {
Object o = JsonPath.read(json, jsonPath);
if (o instanceof double[])
return (double[]) o;
if (o instanceof JSONArray) {
Object[] objects = ((JSONArray) o).toArray();
double[] array = new double[objects.length];
for (int i = 0; i < objects.length; i++) {
if (objects[i] instanceof BigDecimal)
array[i] = ((BigDecimal)objects[i]).doubleValue();
else
array[i] = (double) objects[i];
}
return array;
}
return new double[0];
}
catch (Exception e) {
e.printStackTrace();
return new double[0];
}
}
public static String truncateValue(String value, int length) {
if (value == null)
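A hedged usage sketch for getJPathArray above (assuming the module is on the classpath): a JSON array of numbers is returned as a double[]; JsonPath yields BigDecimal elements, which the method converts element-wise, and a missing path or parse failure yields an empty array.

import eu.dnetlib.pace.util.MapDocumentUtil;

public class JPathArrayExample {
    public static void main(String[] args) {
        String json = "{\"topics\": [0.9522, 0.0478]}";
        double[] topics = MapDocumentUtil.getJPathArray("$.topics", json);
        System.out.println(topics.length + " " + topics[0]); // 2 0.9522
    }
}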

View File

@ -9,6 +9,7 @@ import org.apache.commons.io.IOUtils;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.stream.Collectors;
@ -17,7 +18,7 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions {
protected String readFromClasspath(final String filename) {
final StringWriter sw = new StringWriter();
try {
IOUtils.copy(getClass().getResourceAsStream(filename), sw);
IOUtils.copy(getClass().getResourceAsStream(filename), sw, StandardCharsets.UTF_8);
return sw.toString();
} catch (final IOException e) {
throw new RuntimeException("cannot load resource from classpath: " + filename);
@ -36,6 +37,10 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions {
return new FieldValueImpl(Type.URL, "url", s);
}
protected Field array(final double[] a) {
return new FieldValueImpl(Type.DoubleArray, "array", a);
}
protected Field createFieldList(List<String> strings, String fieldName){
List<FieldValueImpl> fieldValueStream = strings.stream().map(s -> new FieldValueImpl(Type.String, fieldName, s)).collect(Collectors.toList());

View File

@ -103,6 +103,11 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s);
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
params.put("len", 3);
params.put("max", 1);
System.out.println(sp.apply(conf, Lists.newArrayList(title("Framework for general-purpose deduplication"))));
}
@Test
@ -148,6 +153,10 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
System.out.println(s);
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
s = "niivue/niivue: 0.21.1";
System.out.println(s);
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
}
@Test
@ -200,4 +209,41 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
}
}
@Test
public void testPersonClustering(){
final ClusteringFunction cf = new PersonClustering(params);
final String s = "Abd-Alla, Abo-el-nour N.";
System.out.println("s = " + s);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
final String s1 = "Manghi, Paolo";
System.out.println("s1 = " + s1);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
}
@Test
public void testPersonHash(){
final ClusteringFunction cf = new PersonHash(params);
final String s = "Manghi, Paolo";
System.out.println("s = " + s);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
final String s1 = "Manghi, P.";
System.out.println("s = " + s1);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
}
@Test
public void testLastNameFirstInitial(){
final ClusteringFunction cf = new LastNameFirstInitial(params);
final String s = "LI Yonghong";
System.out.println("s = " + s);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
}
}

View File

@ -2,13 +2,16 @@ package eu.dnetlib.pace.comparators;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.tree.*;
import eu.dnetlib.pace.config.DedupConfig;
import org.junit.jupiter.api.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
@ -21,15 +24,20 @@ public class ComparatorTest extends AbstractPaceTest {
@BeforeAll
public void setup() {
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
}
@BeforeEach
public void beforeEachTest() {
params = new HashMap<>();
params.put("weight", "1.0");
params.put("surname_th", "0.99");
params.put("name_th", "0.95");
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
}
@Test
public void testCleanForSorting() {
NGramUtils utils = new NGramUtils();
@ -56,7 +64,10 @@ public class ComparatorTest extends AbstractPaceTest {
//particular cases
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf));
assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
// failing because 'Allen' is a transliterated Greek stopword
// assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
assertEquals(-1.0, cityMatch.distance("Washington (United States)", "United States Military Academy", conf));
}
@Test
@ -70,7 +81,7 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
assertEquals(0.5, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
assertEquals(2.0/3.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf));
assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf));
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
@ -104,7 +115,7 @@ public class ComparatorTest extends AbstractPaceTest {
public void stringContainsMatchTest(){
params.put("string", "openorgs");
params.put("bool", "XOR");
params.put("aggregator", "XOR");
params.put("caseSensitive", "false");
StringContainsMatch stringContainsMatch = new StringContainsMatch(params);
@ -112,7 +123,7 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(0.0, stringContainsMatch.distance("openorgs", "openorgs", conf));
params.put("string", "openorgs");
params.put("bool", "AND");
params.put("aggregator", "AND");
params.put("caseSensitive", "false");
stringContainsMatch = new StringContainsMatch(params);
@ -246,6 +257,10 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(0.25, result);
Field f = createFieldList(new ArrayList<>(), "authors");
result = authorsMatch.compare(f,f, conf);
System.out.println("result = " + result);
}
@Test
@ -267,5 +282,30 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(1.0, result);
}
@Test
public void domainExactMatch() {
DomainExactMatch domainExactMatch = new DomainExactMatch(params);
Field a = url("http://www.flowrepository.org");
Field b = url("http://flowrepository.org/");
double compare = domainExactMatch.compare(a, b, conf);
System.out.println("compare = " + compare);
}
@Test
public void cosineSimilarity() {
CosineSimilarity cosineSimilarity = new CosineSimilarity(params);
Field a = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1,2,3});
Field b = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1,2,3});
double compare = cosineSimilarity.compare(a, b, conf);
System.out.println("compare = " + compare);
}
}

View File

@ -7,6 +7,7 @@ import eu.dnetlib.pace.clustering.ClusteringClass;
import eu.dnetlib.pace.clustering.ClusteringCombiner;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldValue;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.JsonListMatch;
import eu.dnetlib.pace.tree.support.AggType;
@ -20,10 +21,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.*;
import java.util.stream.Collectors;
@ -85,7 +83,7 @@ public class ConfigTest extends AbstractPaceTest {
}
@Test
public void asMapDocumentTest() {
public void asMapDocumentTest1() {
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
@ -103,6 +101,19 @@ public class ConfigTest extends AbstractPaceTest {
}
@Test
public void authorAsMapDocument() {
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("author.fdup.conf.json"));
final String json = readFromClasspath("author.json");
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
System.out.println("mapDocument = " + Arrays.toString(((FieldValue) mapDocument.getFieldMap().get("topics")).doubleArrayValue()));
}
@Test
public void testJPath() {
final String json = readFromClasspath("organization.json");

View File

@ -1,7 +1,6 @@
package eu.dnetlib.pace.util;
import eu.dnetlib.pace.model.Person;
import jdk.nashorn.internal.ir.annotations.Ignore;
import org.junit.jupiter.api.*;
import java.util.HashMap;
@ -18,7 +17,6 @@ public class UtilTest {
}
@Test
@Ignore
public void paceResolverTest() {
PaceResolver paceResolver = new PaceResolver();
paceResolver.getComparator("keywordMatch", params);
@ -30,6 +28,11 @@ public class UtilTest {
assertEquals("kennedy", p.getSurnameString());
assertEquals("j f", p.getNameString());
p = new Person("Guan-Hua Du", false);
System.out.println("surname = " + p.getSurnameString());
System.out.println("name = " + p.getNameString());
}
}

View File

@ -0,0 +1,134 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "author",
"subEntityType": "author",
"subEntityValue": "author",
"orderField": "fullname",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "lnfi", "fields" : [ "fullname" ], "params" : {} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "orcid",
"comparator": "exactMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "orcids",
"ignoreUndefined": "true"
},
"orcids": {
"fields": [
{
"field": "orcids",
"comparator": "stringListMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 3.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "coauthors",
"undefined": "coauthors",
"ignoreUndefined": "true"
},
"coauthors": {
"fields": [
{
"field": "coauthors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "topicsMatch",
"negative": "NO_MATCH",
"undefined": "topicsMatch",
"ignoreUndefined": "true"
},
"topicsMatch": {
"fields": [
{
"field": "topics",
"comparator": "cosineSimilarity",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
}
},
"model": [
{
"name": "topics",
"type": "DoubleArray",
"path": "$.topics"
},
{
"name": "fullname",
"type": "String",
"path": "$.fullname"
},
{
"name": "orcid",
"type": "String",
"path": "$.orcid"
},
{
"name": "coauthors",
"type": "List",
"path": "$.coAuthors[*].fullname"
},
{
"name": "orcids",
"type": "List",
"path": "$.coAuthors[*].orcid"
}
],
"blacklists": {},
"synonyms": {}
}
}

View File

@ -0,0 +1 @@
{"fullname":"Zaragoza, Maria Cleofé","firstname":"Maria Cleofé","lastname":"Zaragoza","coAuthors":[{"fullname":"Cambras, Trinitat","lastname":"Cambras","firstname":"Trinitat","orcid":"0000-0002-9009-4690"},{"fullname":"Castro-Marrero, Jesús","lastname":"Castro-Marrero","firstname":"Jesús","orcid":""},{"fullname":"Díez-Noguera, Antoni","lastname":"Díez-Noguera","firstname":"Antoni","orcid":""},{"fullname":"Alegre, José","lastname":"Alegre","firstname":"José","orcid":"0000-0002-7582-7585"}],"topics":[0.9522090839562252,0.04779091604377485],"orcid":"0000-0002-9797-0219","id":"author::1a10826c83c7f9f0dcebe7df05e37a2a","pubId":"50|pmid________::db7fd19db5a620eafad40cfb97f9690d"}

30
pom.xml
View File

@ -5,7 +5,7 @@
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId>
<version>4.1.12</version>
<version>4.1.13-SNAPSHOT</version>
<packaging>pom</packaging>
@ -22,7 +22,7 @@
<scm>
<developerConnection>scm:git:https://code-repo.d4science.org/D-Net/dnet-dedup.git</developerConnection>
<tag>dnet-dedup-4.1.12</tag>
<tag>dnet-dedup-4.0.3</tag>
</scm>
<modules>
@ -144,14 +144,7 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.19.1</version>
<dependencies>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>${junit-jupiter.version}</version>
</dependency>
</dependencies>
<version>2.22.0</version>
<configuration>
<redirectTestOutputToFile>false</redirectTestOutputToFile>
</configuration>
@ -261,7 +254,7 @@
<oozie.use.system.libpath>true</oozie.use.system.libpath>
<properties.maven.plugin.version>2.0.1</properties.maven.plugin.version>
<junit-jupiter.version>5.6.1</junit-jupiter.version>
<maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>../dhp-build/dhp-build-assembly-resources/target/dhp-build-assembly-resources-4.1.8-SNAPSHOT.jar</maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>
<maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>../dhp-build/dhp-build-assembly-resources/target/dhp-build-assembly-resources-${project.version}.jar</maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>
</properties>
@ -410,27 +403,12 @@
<version>2.4.0</version>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>3.3.3</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-junit-jupiter</artifactId>
<version>3.3.3</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>70.1</version>
</dependency>
</dependencies>
</dependencyManagement>