Compare commits


No commits in common. "master" and "dnet-dedup-4.1.12" have entirely different histories.

67 changed files with 2484 additions and 8101 deletions

.gitignore vendored
View File

@@ -19,5 +19,3 @@
/build
spark-warehouse
/dhp-workflows/dhp-graph-mapper/job-override.properties
test.properties

View File

@@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-build</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.12</version>
</parent>
<artifactId>dhp-build-assembly-resources</artifactId>

View File

@@ -6,11 +6,10 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-build</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.12</version>
</parent>
<artifactId>dhp-build-properties-maven-plugin</artifactId>
<version>4.1.13-SNAPSHOT</version>
<packaging>maven-plugin</packaging>
<description>This module is a maven plugin implementing custom properties substitutions in the build lifecycle</description>
@@ -20,19 +19,16 @@
<groupId>org.apache.maven</groupId>
<artifactId>maven-plugin-api</artifactId>
<version>3.6.3</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.maven</groupId>
<artifactId>maven-project</artifactId>
<version>2.2.1</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.maven</groupId>
<artifactId>maven-artifact</artifactId>
<version>2.2.1</version>
<scope>provided</scope>
</dependency>
<dependency>
@@ -104,29 +100,6 @@
</configuration>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-plugin-plugin</artifactId>
<version>3.2</version>
<configuration>
<skipErrorNoDescriptorsFound>true</skipErrorNoDescriptorsFound>
</configuration>
<executions>
<execution>
<id>mojo-descriptor</id>
<phase>process-classes</phase>
<goals>
<goal>descriptor</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</pluginManagement>
</build>
</project>

View File

@@ -8,8 +8,6 @@ import static org.junit.jupiter.api.Assertions.assertNull;
import org.junit.jupiter.api.*;
import java.nio.file.Paths;
/** @author mhorst, claudio.atzori */
public class GenerateOoziePropertiesMojoTest {
@@ -68,7 +66,7 @@ public class GenerateOoziePropertiesMojoTest {
clearSystemProperties();
// given
String workflowSourceDir = Paths.get("eu/dnetlib/dhp/").toString();
String workflowSourceDir = "eu/dnetlib/dhp/";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
// execute
@@ -83,14 +81,14 @@ public class GenerateOoziePropertiesMojoTest {
clearSystemProperties();
// given
String workflowSourceDir = Paths.get("eu/dnetlib/dhp/wf/transformers").toString();
String workflowSourceDir = "eu/dnetlib/dhp/wf/transformers";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
// execute
mojo.execute();
// assert
assertEquals(Paths.get("wf/transformers").toString(), System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
}
@Test
@@ -98,13 +96,13 @@ public class GenerateOoziePropertiesMojoTest {
clearSystemProperties();
// given
String workflowSourceDir = Paths.get("wf/transformers").toString();
String workflowSourceDir = "wf/transformers";
System.setProperty(PROPERTY_NAME_WF_SOURCE_DIR, workflowSourceDir);
// execute
mojo.execute();
// assert
assertEquals(Paths.get("wf/transformers").toString(), System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
assertEquals("wf/transformers", System.getProperty(PROPERTY_NAME_SANDBOX_NAME));
}
}
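
For reference, a minimal sketch of the sandbox-name derivation these tests pin down, assuming the mojo simply strips the known parent prefix (the helper name is hypothetical; the mojo's internals are not part of this diff):

// Hypothetical helper mirroring what the assertions above expect:
// drop the leading "eu/dnetlib/dhp/" from the workflow source dir, if present.
static String sandboxName(String workflowSourceDir) {
    String prefix = "eu/dnetlib/dhp/";
    return workflowSourceDir.startsWith(prefix)
            ? workflowSourceDir.substring(prefix.length())
            : workflowSourceDir;
}
// sandboxName("eu/dnetlib/dhp/wf/transformers") -> "wf/transformers"
// sandboxName("wf/transformers")                -> "wf/transformers"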

View File

@@ -0,0 +1,2 @@
# Tue Mar 15 14:58:05 CET 2022
projectPropertyKey=projectPropertyValue

View File

@@ -5,7 +5,7 @@
<groupId>eu.dnetlib</groupId>
<artifactId>dhp-code-style</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.12</version>
<packaging>jar</packaging>

View File

@@ -4,7 +4,7 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.12</version>
<relativePath>../pom.xml</relativePath>
</parent>
<artifactId>dhp-build</artifactId>

View File

@@ -1,6 +1,6 @@
useTree = true
entitiesPath = /user/michele.debonis/lda_experiments/authors_pubmed
workingPath = /user/michele.debonis/authors_dedup/gt2_dedup
numPartitions = 1000
dedupConfPath = /user/michele.debonis/lda_experiments/authors.fdup.gt2.conf.json
groundTruthFieldJPath = $.orcid
entitiesPath = /tmp/publications_test_dump
#entitiesPath = /tmp/prod_provision/graph/02_graph_cleaned/publication
workingPath = /user/michele.debonis/new_dedup_test/workingdirtree
dedupConfPath = /user/michele.debonis/new_dedup_test/pubs.tree.conf.json
numPartitions = 8000
useTree = true

View File

@@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.12</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@@ -1,7 +1,7 @@
package eu.dnetlib;
import com.google.common.hash.Hashing;
import eu.dnetlib.graph.JavaGraphProcessor;
import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessorForTesting;
@@ -19,6 +19,7 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.graphx.Edge;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
@@ -57,13 +58,14 @@ public class Deduper implements Serializable {
.reduceByKey((b1, b2) -> Block.from(b1, b2, of, maxQueueSize));
}
public static Iterator<Tuple2<String, String>> ccToMergeRel(Tuple2<String, List<String>> cc, DedupConfig dedupConf) {
return cc._2()
public static Iterator<Tuple2<String, String>> ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf) {
return cc
.getDocs()
.stream()
.flatMap(
id -> {
List<Tuple2<String, String>> tmp = new ArrayList<>();
tmp.add(new Tuple2<>(cc._1(), id));
tmp.add(new Tuple2<>(cc.getCcId(), id));
return tmp.stream();
})
.iterator();
@@ -136,19 +138,21 @@ public class Deduper implements Serializable {
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
.mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
final JavaRDD<Edge<String>> edgeRdd = spark
final RDD<Edge<String>> edgeRdd = spark
.read()
.load(simRelsPath)
.as(Encoders.bean(Relation.class))
.javaRDD()
.map(Relation::toEdgeRdd);
.map(Relation::toEdgeRdd)
.rdd();
JavaPairRDD<String, List<String>> ccs = JavaGraphProcessor
.findCCs(vertexes, edgeRdd, dedupConf.getWf().getMaxIterations());
JavaRDD<ConnectedComponent> ccs = GraphProcessor
.findCCs(vertexes.rdd(), edgeRdd, maxIterations)
.toJavaRDD();
JavaRDD<Relation> mergeRel = ccs
.filter(cc -> cc._2().size() > 1)
.flatMap(cc -> Deduper.ccToMergeRel(cc, dedupConf))
.filter(k -> k.getDocs().size() > 1)
.flatMap(cc -> ccToMergeRel(cc, dedupConf))
.map(it -> new Relation(it._1(), it._2(), "mergeRel"));
final Dataset<Relation> mergeRels = spark
@@ -159,7 +163,7 @@ public class Deduper implements Serializable {
mergeRels.write().mode(SaveMode.Overwrite).parquet(mergeRelsPath);
}
public static void createDedupEntity(DedupConfig dedupConf, String simRelsPath, String mergeRelsPath, String entitiesPath, SparkSession spark, String dedupEntityPath){
public static void createDedupEntity(DedupConfig dedupConf, String mergeRelsPath, String entitiesPath, SparkSession spark, String dedupEntityPath){
JavaPairRDD<String, String> entities = spark
.read()
@@ -170,15 +174,7 @@
.toJavaRDD()
.mapToPair(t -> t);
// <source_raw_id, relation(source, target)>
JavaPairRDD<String, Relation> simRels = spark
.read()
.load(simRelsPath)
.as(Encoders.bean(Relation.class))
.toJavaRDD()
.mapToPair(r-> new Tuple2<>(r.getSource(), r));
// <raw_id, relation(dedup_id, raw_id)>
// <source, target>: source is the dedup_id, target is the id of the mergedIn
JavaPairRDD<String, Relation> mergeRels = spark
.read()
.load(mergeRelsPath)
@@ -191,22 +187,7 @@
.groupByKey()
.map(t-> entityMerger(t._1(), t._2().iterator()));
JavaPairRDD<String, Iterable<Relation>> simRelsWithDedupId = simRels
.join(mergeRels)
.mapToPair(x -> new Tuple2<>(x._2()._2().getSource(), x._2()._1()))
.groupByKey();
JavaRDD<ConnectedComponent> groupEntity = mergeRels.join(entities)
.mapToPair(t -> new Tuple2<>(t._2()._1().getSource(), t._2()._2()))
.groupByKey()
.join(simRelsWithDedupId)
.map(x -> new ConnectedComponent(
x._1(),
x._2()._1(),
x._2()._2())
);
groupEntity.saveAsTextFile(dedupEntityPath);
dedupEntities.saveAsTextFile(dedupEntityPath);
}
}
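
A minimal sketch of the reworked ccToMergeRel contract, assuming a DedupConfig is already loaded and Guava's Sets is imported (the ids and ccId below are invented):

// Illustrative only: exercises the new ConnectedComponent-based signature.
static void demoCcToMergeRel(DedupConfig dedupConf) {
    ConnectedComponent cc = new ConnectedComponent(Sets.newHashSet("id1", "id2"));
    cc.setCcId("dedup::x");
    // emits ("dedup::x", "id1") and ("dedup::x", "id2"):
    // one merge pair per member document of the component
    Deduper.ccToMergeRel(cc, dedupConf)
            .forEachRemaining(t -> System.out.println(t._1() + " -> " + t._2()));
}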

View File

@@ -1,56 +0,0 @@
package eu.dnetlib.graph;
import com.clearspring.analytics.util.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ConnectedComponent;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.graphx.*;
import org.apache.spark.rdd.RDD;
import org.apache.spark.storage.StorageLevel;
import scala.Tuple2;
import scala.reflect.ClassTag;
import scala.reflect.ClassTag$;
import java.util.List;
public class JavaGraphProcessor {
//<ccId, list(json)>
public static JavaPairRDD<String, List<String>> findCCs(JavaPairRDD<Object, String> vertexes, JavaRDD<Edge<String>> edges, int maxIterations) {
ClassTag<String> stringTag = ClassTag$.MODULE$.apply(String.class);
Graph<String, String> graph =
Graph.apply(
vertexes.rdd(),
edges.rdd(),
"",
StorageLevel.MEMORY_ONLY(),
StorageLevel.MEMORY_ONLY(),
stringTag,
stringTag
);
GraphOps<String, String> graphOps = new GraphOps<>(graph, stringTag, stringTag);
JavaRDD<Tuple2<Object, Object>> cc = graphOps.connectedComponents(maxIterations).vertices().toJavaRDD();
JavaPairRDD<Object, String> joinResult = vertexes
.leftOuterJoin(cc.mapToPair(x -> x))
.mapToPair(x -> {
if (!x._2()._2().isPresent()) {
return new Tuple2<>(x._1(), x._2()._1());
} else {
return new Tuple2<>(x._2()._2(), x._2()._1());
}
});
return joinResult
.groupByKey()
.map(x -> Lists.newArrayList(x._2()))
.zipWithUniqueId()
.mapToPair(x -> new Tuple2<>("dedup______::" + x._2().toString(), x._1()));
}
}
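
How the removed JavaGraphProcessor was driven by its call sites, as a minimal sketch assuming an existing JavaSparkContext sc (the two-vertex graph is invented for illustration):

// Illustrative only: a two-node graph collapses into one connected component.
JavaPairRDD<Object, String> vertexes = sc.parallelizePairs(Arrays.asList(
        new Tuple2<>((Object) 1L, "docA"),
        new Tuple2<>((Object) 2L, "docB")));
JavaRDD<Edge<String>> edges = sc.parallelize(
        Collections.singletonList(new Edge<>(1L, 2L, "simRel")));
// yields a single <"dedup______::<uniqueId>", ["docA", "docB"]> pair
JavaPairRDD<String, List<String>> ccs = JavaGraphProcessor.findCCs(vertexes, edges, 20);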

View File

@@ -19,7 +19,6 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.stream.Collectors;
public abstract class AbstractSparkJob implements Serializable {
@@ -60,7 +59,7 @@ public abstract class AbstractSparkJob implements Serializable {
Path path=new Path(filePath);
FileSystem fs = FileSystem.get(new Configuration());
BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(path), StandardCharsets.UTF_8));
BufferedReader br=new BufferedReader(new InputStreamReader(fs.open(path)));
try {
return String.join("", br.lines().collect(Collectors.toList()));
} finally {

View File

@@ -1,36 +1,20 @@
package eu.dnetlib.jobs;
import eu.dnetlib.Deduper;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.MapDocumentUtil;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ArgumentApplicationParser;
import eu.dnetlib.support.Block;
import eu.dnetlib.support.ConnectedComponent;
import eu.dnetlib.support.Relation;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.codehaus.jackson.map.ObjectMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.io.IOException;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
public class SparkComputeStatistics extends AbstractSparkJob {
@@ -58,42 +42,18 @@ public class SparkComputeStatistics extends AbstractSparkJob {
@Override
public void run() throws IOException {
//https://towardsdatascience.com/7-evaluation-metrics-for-clustering-algorithms-bdc537ff54d2#:~:text=There%20are%20two%20types%20of,to%20all%20unsupervised%20learning%20results)
// read oozie parameters
final String entitiesPath = parser.get("entitiesPath");
final String workingPath = parser.get("workingPath");
final String dedupConfPath = parser.get("dedupConfPath");
final String groundTruthFieldJPath = parser.get("groundTruthFieldJPath");
final int numPartitions = Optional
.ofNullable(parser.get("numPartitions"))
.map(Integer::valueOf)
.orElse(NUM_PARTITIONS);
log.info("entitiesPath: '{}'", entitiesPath);
log.info("workingPath: '{}'", workingPath);
log.info("numPartitions: '{}'", numPartitions);
log.info("dedupConfPath: '{}'", dedupConfPath);
log.info("groundTruthFieldJPath: '{}'", groundTruthFieldJPath);
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
DedupConfig dedupConfig = loadDedupConfig(dedupConfPath);
JavaPairRDD<String, MapDocument> mapDocuments = sc
.textFile(entitiesPath)
.repartition(numPartitions)
.mapToPair(
(PairFunction<String, String, MapDocument>) s -> {
MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConfig, s);
//put the groundTruthField used to compute statistics into the map
d.getFieldMap().put("groundTruth", new FieldValueImpl(Type.String, "groundTruth", MapDocumentUtil.getJPathString(groundTruthFieldJPath, s)));
return new Tuple2<>(d.getIdentifier(), d);
});
JavaRDD<String> entities = mapDocuments.map(d -> d._2().getFieldMap().get("groundTruth").stringValue());
// create blocks
JavaRDD<List<String>> blocks = Deduper.createSortedBlocks(mapDocuments, dedupConfig)
.map(b -> b._2().getDocuments().stream().map(d -> d.getFieldMap().get("groundTruth").stringValue()).collect(Collectors.toList()));
log.info("entitiesPath: '{}'", entitiesPath);
log.info("workingPath: '{}'", workingPath);
log.info("numPartitions: '{}'", numPartitions);
// <source, target>: source is the dedup_id, target is the id of the mergedIn
JavaRDD<Relation> mergerels = spark
@@ -108,38 +68,15 @@ public class SparkComputeStatistics extends AbstractSparkJob {
.as(Encoders.bean(Relation.class))
.toJavaRDD();
JavaRDD<List<String>> groups = sc.textFile(workingPath + "/groupentities")
.map(e -> new ObjectMapper().readValue(e, ConnectedComponent.class))
.map(e -> e.getDocs().stream().map(d -> MapDocumentUtil.getJPathString(groundTruthFieldJPath, d)).collect(Collectors.toList()));
long entities_number = entities.count();
long blocks_number = blocks.count();
double blocks_randIndex = randIndex(blocks);
long simrels_number = simrels.count();
long mergerels_number = mergerels.count();
double groups_randIndex = randIndex(groups);
long groups_number = groups.count();
long groundtruth_number = entities.filter(e -> !e.isEmpty()).count();
long correct_groups = groups.filter(x -> x.stream().distinct().count()==1).count();
long wrong_groups = groups_number - correct_groups;
long connected_components = mergerels.groupBy(Relation::getSource).count();
String print =
"Entities : " + entities_number + "\n" +
"Ground Truth : " + groundtruth_number + "\n" +
"Blocks : " + blocks_number + "\n" +
"Blocks RI : " + blocks_randIndex + "\n" +
"SimRels : " + simrels_number + "\n" +
"MergeRels : " + mergerels_number + "\n" +
"Groups : " + groups_number + " (correct: " + correct_groups + ", wrong: " + wrong_groups + ")\n" +
"Groups RI : " + groups_randIndex;
System.out.println(print);
writeStatsFileToHDFS(groundtruth_number, entities_number, blocks_randIndex, groups_randIndex, blocks_number, simrels_number, mergerels_number, groups_number, workingPath + "/stats_file.txt");
writeStatsFileToHDFS(simrels_number, mergerels_number, connected_components, workingPath + "/stats_file");
}
public static void writeStatsFileToHDFS(long groundtruth_number, long entities_number, double blocks_randIndex, double groups_randIndex, long blocks_number, long simrels_number, long mergerels_number, long groups_number, String filePath) throws IOException {
public static void writeStatsFileToHDFS(long simrels_number, long mergerels_number, long connected_components, String filePath) throws IOException {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
@@ -156,14 +93,9 @@ public class SparkComputeStatistics extends AbstractSparkJob {
}
String print =
"Entities : " + entities_number + "\n" +
"Ground Truth : " + groundtruth_number + "\n" +
"Blocks : " + blocks_number + "\n" +
"Blocks RI : " + blocks_randIndex + "\n" +
"SimRels : " + simrels_number + "\n" +
"MergeRels : " + mergerels_number + "\n" +
"Groups : " + groups_number + "\n" +
"Groups RI : " + groups_randIndex;
"Similarity Relations : " + simrels_number + "\n" +
"Merge Relations : " + mergerels_number + "\n" +
"Connected Components : " + connected_components;
// Create file to write
FSDataOutputStream out = fs.create(outFile);
@@ -177,31 +109,5 @@
e.printStackTrace();
}
}
//TODO find another measure that also takes into account the elements outside of the group
//RandIndex = number of pairwise correct predictions/total number of possible pairs (in the same cluster) -> bounded between 0 and 1
public double randIndex(JavaRDD<List<String>> clusters) {
Tuple2<Integer, Integer> reduce = clusters.map(c -> {
int num = 0;
for (String id : c.stream().distinct().filter(s -> !s.isEmpty()).collect(Collectors.toList())) {
int n = (int) c.stream().filter(i -> i.equals(id)).count();
num += binomialCoefficient(n);
}
int den = binomialCoefficient(c.size());
return new Tuple2<>(num, den);
})
.reduce((a, b) -> new Tuple2<>(a._1() + b._1(), a._2() + b._2()));
return (double)reduce._1()/ reduce._2();
}
private static int binomialCoefficient(int n)
{
return n*(n-1)/2;
}
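// Worked example (illustrative, not part of the source): for a cluster
// c = ["A", "A", "B"], num = binomialCoefficient(2) = 1 (the single correct
// "A"-"A" pair) while den = binomialCoefficient(3) = 3 possible pairs,
// so this cluster contributes (1, 3), i.e. a Rand index of 1/3 on its own.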
//V-measure = harmonic mean of homogeneity and completeness; homogeneity: each cluster contains only members of a single class; completeness: all members of a given class are assigned to the same cluster
}

View File

@@ -7,7 +7,6 @@ import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ArgumentApplicationParser;
import eu.dnetlib.support.ConnectedComponent;
import eu.dnetlib.support.Relation;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
@@ -17,32 +16,29 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import scala.Tuple3;
import java.io.IOException;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import java.util.Optional;
public class SparkCreateGroupEntity extends AbstractSparkJob {
public class SparkCreateDedupEntity extends AbstractSparkJob {
private static final Logger log = LoggerFactory.getLogger(SparkCreateGroupEntity.class);
private static final Logger log = LoggerFactory.getLogger(eu.dnetlib.jobs.SparkCreateDedupEntity.class);
public SparkCreateGroupEntity(ArgumentApplicationParser parser, SparkSession spark) {
public SparkCreateDedupEntity(ArgumentApplicationParser parser, SparkSession spark) {
super(parser, spark);
}
public static void main(String[] args) throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser(
Utility.readResource("/jobs/parameters/createGroupEntity_parameters.json", SparkCreateGroupEntity.class)
Utility.readResource("/jobs/parameters/createDedupEntity_parameters.json", SparkCreateDedupEntity.class)
);
parser.parseArgument(args);
SparkConf conf = new SparkConf();
new SparkCreateGroupEntity(
new SparkCreateDedupEntity(
parser,
getSparkSession(conf)
).run();
@@ -67,7 +63,6 @@ public class SparkCreateGroupEntity extends AbstractSparkJob {
DedupConfig dedupConf = DedupConfig.load(readFileFromHDFS(dedupConfPath));
// <raw_id, json>
JavaPairRDD<String, String> entities = spark
.read()
.textFile(entitiesPath)
@@ -77,15 +72,7 @@
.toJavaRDD()
.mapToPair(t -> t);
// <source_raw_id, relation(source, target)>
JavaPairRDD<String, Relation> simRels = spark
.read()
.load(workingPath + "/simrels")
.as(Encoders.bean(Relation.class))
.toJavaRDD()
.mapToPair(r-> new Tuple2<>(r.getSource(), r));
// <raw_id, relation(dedup_id, raw_id)>
// <source, target>: source is the dedup_id, target is the id of the mergedIn
JavaPairRDD<String, Relation> mergeRels = spark
.read()
.load(workingPath + "/mergerels")
@@ -93,23 +80,12 @@
.toJavaRDD()
.mapToPair(r -> new Tuple2<>(r.getTarget(), r));
// <dedup_id, simrel>
JavaPairRDD<String, Iterable<Relation>> simRelsWithDedupId = simRels
.join(mergeRels)
.mapToPair(x -> new Tuple2<>(x._2()._2().getSource(), x._2()._1()))
.groupByKey();
JavaRDD<ConnectedComponent> groupEntity = mergeRels.join(entities)
JavaRDD<ConnectedComponent> dedupEntities = mergeRels.join(entities)
.mapToPair(t -> new Tuple2<>(t._2()._1().getSource(), t._2()._2()))
.groupByKey()
.join(simRelsWithDedupId)
.map(x -> new ConnectedComponent(
x._1(),
x._2()._1(),
x._2()._2())
);
.map(t-> Deduper.entityMerger(t._1(), t._2().iterator()));
groupEntity.saveAsTextFile(workingPath + "/groupentities", GzipCodec.class);
dedupEntities.saveAsTextFile(workingPath + "dedupentity");
}

View File

@@ -1,7 +1,7 @@
package eu.dnetlib.jobs;
import eu.dnetlib.Deduper;
import eu.dnetlib.graph.JavaGraphProcessor;
import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.MapDocumentUtil;
import eu.dnetlib.pace.utils.Utility;
@@ -24,7 +24,6 @@ import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.io.IOException;
import java.util.List;
import java.util.Optional;
import static eu.dnetlib.Deduper.hash;
@@ -79,18 +78,20 @@ public class SparkCreateMergeRels extends AbstractSparkJob {
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
.mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
final JavaRDD<Edge<String>> edgeRdd = spark
final RDD<Edge<String>> edgeRdd = spark
.read()
.load(workingPath + "/simrels")
.as(Encoders.bean(Relation.class))
.javaRDD()
.map(Relation::toEdgeRdd);
.map(Relation::toEdgeRdd)
.rdd();
JavaPairRDD<String, List<String>> ccs = JavaGraphProcessor
.findCCs(vertexes, edgeRdd, dedupConf.getWf().getMaxIterations());
JavaRDD<ConnectedComponent> ccs = GraphProcessor
.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations())
.toJavaRDD();
JavaRDD<Relation> mergeRel = ccs
.filter(cc -> cc._2().size() > 1)
.filter(k -> k.getDocs().size() > 1)
.flatMap(cc -> Deduper.ccToMergeRel(cc, dedupConf))
.map(it -> new Relation(it._1(), it._2(), "mergeRel"));

View File

@@ -14,7 +14,6 @@ import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

View File

@@ -1,7 +1,10 @@
package eu.dnetlib.support;
import java.io.Serializable;
import java.util.*;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
@@ -9,7 +12,6 @@ import java.util.stream.StreamSupport;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.model.MapDocument;
import org.codehaus.jackson.annotate.JsonIgnore;
public class Block implements Serializable {
@@ -21,11 +23,6 @@ public class Block implements Serializable {
super();
}
public Block(String key, List<MapDocument> documents) {
this.key = key;
this.documents = documents;
}
public Block(String key, Iterable<MapDocument> documents) {
this.key = key;
this.documents = Lists.newArrayList(documents);

View File

@@ -5,35 +5,54 @@ import java.io.Serializable;
import java.util.HashSet;
import java.util.Set;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.utils.Utility;
import org.apache.commons.lang.StringUtils;
import org.codehaus.jackson.annotate.JsonIgnore;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.map.ObjectMapper;
public class ConnectedComponent implements Serializable {
private HashSet<String> docs;
private String ccId;
private HashSet<Relation> simrels;
public ConnectedComponent() {
}
public ConnectedComponent(String ccId, Set<String> docs, Set<Relation> simrels) {
this.docs = new HashSet<>(docs);
this.ccId = ccId;
this.simrels = new HashSet<>(simrels);
}
public ConnectedComponent(Set<String> docs) {
this.docs = new HashSet<>(docs);
//initialization of id and relations missing
createID();
}
public ConnectedComponent(String ccId, Iterable<String> docs, Iterable<Relation> simrels) {
this.ccId = ccId;
this.docs = Sets.newHashSet(docs);
this.simrels = Sets.newHashSet(simrels);
public String createID() {
if (docs.size() > 1) {
final String s = getMin();
ccId = "dedup::" + Utility.md5(s);
return ccId;
} else {
return docs.iterator().next();
}
}
@JsonIgnore
public String getMin() {
final StringBuilder min = new StringBuilder();
docs
.forEach(
i -> {
if (StringUtils.isBlank(min.toString())) {
min.append(i);
} else {
if (min.toString().compareTo(i) > 0) {
min.setLength(0);
min.append(i);
}
}
});
return min.toString();
}
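// Illustrative (not in the source): with docs = {"50|b", "50|a"}, getMin()
// returns "50|a", so createID() yields "dedup::" + Utility.md5("50|a");
// a singleton component instead just returns its only document id.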
@Override
@@ -61,12 +80,4 @@ public class ConnectedComponent implements Serializable {
public void setCcId(String ccId) {
this.ccId = ccId;
}
public void setSimrels(HashSet<Relation> simrels) {
this.simrels = simrels;
}
public HashSet<Relation> getSimrels() {
return simrels;
}
}

View File

@@ -16,10 +16,6 @@
<name>dedupConfPath</name>
<description>path for the dedup configuration file</description>
</property>
<property>
<name>groundTruthFieldJPath</name>
<description>jpath of the field to be used as ground truth</description>
</property>
<property>
<name>sparkDriverMemory</name>
<description>memory for driver process</description>
@@ -142,33 +138,6 @@
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
<arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
</spark>
<ok to="CreateGroupEntities"/>
<error to="Kill"/>
</action>
<action name="CreateGroupEntities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Create Group Entities</name>
<class>eu.dnetlib.jobs.SparkCreateGroupEntity</class>
<jar>dnet-dedup-test-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--entitiesPath</arg><arg>${entitiesPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
<arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
</spark>
<ok to="ComputeStatistics"/>
<error to="Kill"/>
</action>
@@ -193,12 +162,36 @@
<arg>--entitiesPath</arg><arg>${entitiesPath}</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>${numPartitions}</arg>
<arg>--dedupConfPath</arg><arg>${dedupConfPath}</arg>
<arg>--groundTruthFieldJPath</arg><arg>${groundTruthFieldJPath}</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<!--<action name="CreateDedupEntities">-->
<!--<spark xmlns="uri:oozie:spark-action:0.2">-->
<!--<master>yarn</master>-->
<!--<mode>cluster</mode>-->
<!--<name>Create Dedup Entities</name>-->
<!--<class>eu.dnetlib.jobs.SparkCreateDedupEntity</class>-->
<!--<jar>dnet-dedup-test-${projectVersion}.jar</jar>-->
<!--<spark-opts>-->
<!--&#45;&#45;executor-memory=${sparkExecutorMemory}-->
<!--&#45;&#45;executor-cores=${sparkExecutorCores}-->
<!--&#45;&#45;driver-memory=${sparkDriverMemory}-->
<!--&#45;&#45;conf spark.extraListeners=${spark2ExtraListeners}-->
<!--&#45;&#45;conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}-->
<!--&#45;&#45;conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}-->
<!--&#45;&#45;conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}-->
<!--&#45;&#45;conf spark.sql.shuffle.partitions=3840-->
<!--</spark-opts>-->
<!--<arg>&#45;&#45;entitiesPath</arg><arg>${entitiesPath}</arg>-->
<!--<arg>&#45;&#45;workingPath</arg><arg>${workingPath}</arg>-->
<!--<arg>&#45;&#45;numPartitions</arg><arg>${numPartitions}</arg>-->
<!--<arg>&#45;&#45;dedupConfPath</arg><arg>${dedupConfPath}</arg>-->
<!--</spark>-->
<!--<ok to="End"/>-->
<!--<error to="Kill"/>-->
<!--</action>-->
<end name="End"/>
</workflow-app>

View File

@@ -16,17 +16,5 @@
"paramLongName": "numPartitions",
"paramDescription": "number of partitions for the similarity relations intermediate phases",
"paramRequired": false
},
{
"paramName": "dc",
"paramLongName": "dedupConfPath",
"paramDescription": "dedup configuration to be used",
"paramRequired": true
},
{
"paramName": "gt",
"paramLongName": "groundTruthFieldJPath",
"paramDescription": "field to be used as groundtruth",
"paramRequired": true
}
]

File diff suppressed because one or more lines are too long

View File

@@ -1,134 +0,0 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "author",
"subEntityType": "author",
"subEntityValue": "author",
"orderField": "fullname",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "lnfi", "fields" : [ "fullname" ], "params" : {} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "orcid",
"comparator": "exactMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "orcids",
"ignoreUndefined": "true"
},
"orcids": {
"fields": [
{
"field": "orcids",
"comparator": "stringListMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 3.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "coauthors",
"undefined": "coauthors",
"ignoreUndefined": "true"
},
"coauthors": {
"fields": [
{
"field": "coauthors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "topicsMatch",
"negative": "NO_MATCH",
"undefined": "topicsMatch",
"ignoreUndefined": "true"
},
"topicsMatch": {
"fields": [
{
"field": "topics",
"comparator": "cosineSimilarity",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.5,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
}
},
"model": [
{
"name": "topics",
"type": "DoubleArray",
"path": "$.topics"
},
{
"name": "fullname",
"type": "String",
"path": "$.fullname"
},
{
"name": "orcid",
"type": "String",
"path": "$.orcid"
},
{
"name": "coauthors",
"type": "List",
"path": "$.coAuthors[*].fullname"
},
{
"name": "orcids",
"type": "List",
"path": "$.coAuthors[*].orcid"
}
],
"blacklists": {},
"synonyms": {}
}
}

View File

@@ -1,134 +0,0 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "author",
"subEntityType": "author",
"subEntityValue": "author",
"orderField": "fullname",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "lnfi", "fields" : [ "fullname" ], "params" : {} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "orcid",
"comparator": "exactMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "orcids",
"ignoreUndefined": "true"
},
"orcids": {
"fields": [
{
"field": "orcids",
"comparator": "stringListMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 3.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "coauthors",
"undefined": "coauthors",
"ignoreUndefined": "true"
},
"coauthors": {
"fields": [
{
"field": "coauthors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "topicsMatch",
"negative": "NO_MATCH",
"undefined": "topicsMatch",
"ignoreUndefined": "true"
},
"topicsMatch": {
"fields": [
{
"field": "topics",
"comparator": "cosineSimilarity",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
}
},
"model": [
{
"name": "topics",
"type": "DoubleArray",
"path": "$.topics"
},
{
"name": "fullname",
"type": "String",
"path": "$.fullname"
},
{
"name": "orcid",
"type": "String",
"path": "$.orcid"
},
{
"name": "coauthors",
"type": "List",
"path": "$.coAuthors[*].fullname"
},
{
"name": "orcids",
"type": "List",
"path": "$.coAuthors[*].orcid"
}
],
"blacklists": {},
"synonyms": {}
}
}

View File

@@ -3,7 +3,7 @@
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "datasource",
"orderField" : "englishname",
"orderField" : "name",
"queueMaxSize" : "2000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
@@ -14,9 +14,8 @@
},
"pace" : {
"clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 2, "ngramLen" : "3", "collapseOn:name": "0"} },
{ "name" : "suffixprefix", "fields" : [ "officialname", "englishname" ], "params" : { "max" : 1, "len" : "3", "collapseOn:name": "0" } },
{"name" : "ngrams", "fields" : ["officialname", "englishname"], "params" : {"ngramLen": 4, "max" : 2, "maxPerToken": 2, "minNgramLen": 1, "collapseOn:name": "0"}},
{ "name" : "sortedngrampairs", "fields" : [ "name" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "name" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
],
"decisionTree" : {
@@ -40,36 +39,16 @@
"layer2": {
"fields": [
{
"field": "officialname",
"field": "name",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"threshold": 0.9
}
},
{
"field": "englishname",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"threshold": 0.9
}
},
{
"field": "officialname",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"crossCompare": "englishname",
"threshold": 0.9
}
}
],
"threshold": 0.9,
"aggregation": "MAX",
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
@@ -77,11 +56,12 @@
}
},
"model" : [
{ "name" : "englishname", "type" : "String", "path" : "$.englishname" },
{ "name" : "officialname", "type" : "String", "path" : "$.officialname" },
{ "name" : "name", "type" : "String", "path" : "$.name" },
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl" }
],
"blacklists" : {},
"blacklists" : {
"legalname" : []
},
"synonyms": {}
}
}

View File

@@ -51,6 +51,37 @@
],
"decisionTree": {
"start": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
},
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"crossCompare": "alternateid"
}
}
],
"threshold": 0.5,
"aggregation": "MAX",
"positive": "layer1",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"layer1": {
"fields": [
{
"field": "title",
@@ -63,8 +94,49 @@
"threshold": 0.9,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "MATCH",
"undefined": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"layer2": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"layer3": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},

View File

@@ -6,9 +6,9 @@
"subEntityType": "resulttype",
"subEntityValue": "publication",
"orderField": "title",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"queueMaxSize": "5000",
"groupMaxSize": "2000",
"maxChildren": "1000",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
@@ -28,26 +28,9 @@
"idPath": "$.id"
},
"pace": {
"clustering": [
{
"name": "wordsStatsSuffixPrefixChain",
"fields": [
"title"
],
"params": {
"mod": "10"
}
},
{
"name": "lowercase",
"fields": [
"doi",
"altdoi"
],
"params": {
"collapseOn:pid": "0"
}
}
"clustering" : [
{ "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
],
"decisionTree": {
"start": {
@@ -59,75 +42,18 @@
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"mode": "count"
"jpath_classid": "$.qualifier.classid"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "instanceTypeCheck",
"undefined": "instanceTypeCheck",
"ignoreUndefined": "false"
},
"instanceTypeCheck": {
"fields": [
{
"field": "instance",
"comparator": "instanceTypeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.5,
"aggregation": "MAX",
"positive": "pidVSaltid",
"negative": "NO_MATCH",
"undefined": "pidVSaltid",
"ignoreUndefined": "true"
},
"pidVSaltid": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"crossCompare": "alternateid",
"mode": "count"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "softCheck",
"negative": "earlyExits",
"undefined": "earlyExits",
"ignoreUndefined": "true"
},
"softCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.9,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"earlyExits": {
"layer2": {
"fields": [
{
"field": "title",
@@ -146,12 +72,12 @@
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "strongCheck",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "strongCheck",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"strongCheck": {
"layer3": {
"fields": [
{
"field": "title",
@@ -163,30 +89,9 @@
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "surnames",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"surnames": {
"fields": [
{
"field": "authors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"surname_th": 0.75,
"fullname_th": 0.75,
"mode": "full"
}
}
],
"threshold": 0.6,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},
@@ -194,29 +99,18 @@
{
"name": "doi",
"type": "String",
"path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "altdoi",
"type": "String",
"path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
"path": "$.pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "pid",
"type": "JSON",
"path": "$.instance[*].pid[*]",
"overrideMatch": "true"
},
{
"name": "alternateid",
"type": "JSON",
"path": "$.instance[*].alternateIdentifier[*]",
"path": "$.pid",
"overrideMatch": "true"
},
{
"name": "title",
"type": "StringConcat",
"path": "$.title[?(@.qualifier.classid == 'main title')].value|||$.title[?(@.qualifier.classid == 'subtitle')].value",
"type": "String",
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
"length": 250,
"size": 5
},
@@ -230,11 +124,6 @@
"name": "resulttype",
"type": "String",
"path": "$.resulttype.classid"
},
{
"name": "instance",
"type": "List",
"path": "$.instance[*].instancetype.classname"
}
],
"blacklists": {
@@ -465,16 +354,7 @@
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$",
"^Data [mM]anagement [sS]ervices\\.$",
"Research and Advanced Technology for Digital Libraries",
"(?i)^risky business$",
"(?i)^great expectations\\.?$",
"(?i)^what's in a name\\?$",
"(?i)^decisions, decisions\\.?$",
"(?i)^update to our reader, reviewer, and author communities.*",
"(?i)^lest we forget$",
"(?i)^measure for measure$"
"(?i)^.*authors[']? response\\.?$"
]
},
"synonyms": {}

View File

@@ -1,381 +0,0 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "result",
"subEntityType": "resulttype",
"subEntityValue": "publication",
"orderField": "title",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "100",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid": "0"} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid",
"mode": "count"
}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "versionCheck",
"undefined": "versionCheck",
"ignoreUndefined": "true"
},
"versionCheck": {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "titleCheck",
"negative": "NO_MATCH",
"undefined": "titleCheck",
"ignoreUndefined": "false"
},
"titleCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 0.9,
"aggregation": "MAX",
"positive": "authorsCheck",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"authorsCheck": {
"fields": [
{
"field": "authors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.6,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"ignoreUndefined": "false"
}
},
"model": [
{
"name": "doi",
"type": "String",
"path": "$.instance[*].pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "altdoi",
"type": "String",
"path": "$.instance[*].alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
},
{
"name": "pid",
"type": "JSON",
"path": "$.instance[*].pid[*]",
"overrideMatch": "true"
},
{
"name": "title",
"type": "String",
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
"length": 250,
"size": 5
},
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname",
"size": 200
},
{
"name": "resulttype",
"type": "String",
"path": "$.resulttype.classid"
}
],
"blacklists": {
"title": [
"(?i)^Data Management Plan",
"^Inside Front Cover$",
"(?i)^Poster presentations$",
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
"^Problems with perinatal pathology\\.?$",
"(?i)^Cases? of Puerperal Convulsions$",
"(?i)^Operative Gyna?ecology$",
"(?i)^Mind the gap\\!?\\:?$",
"^Chronic fatigue syndrome\\.?$",
"^Cartas? ao editor Letters? to the Editor$",
"^Note from the Editor$",
"^Anesthesia Abstract$",
"^Annual report$",
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
"(?i)^Graph and Table of Infectious Diseases?$",
"^Presentation$",
"(?i)^Reviews and Information on Publications$",
"(?i)^PUBLIC HEALTH SERVICES?$",
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
"(?i)^Adrese autora$",
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
"(?i)^Acknowledgement to Referees$",
"(?i)^Behçet's disease\\.?$",
"(?i)^Isolation and identification of restriction endonuclease.*$",
"(?i)^CEREBROVASCULAR DISEASES?.?$",
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
"^Event management$",
"(?i)^Breakfast and Crohn's disease.*\\.?$",
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
"^Gushi hakubutsugaku$",
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
"^Intestinal spirocha?etosis$",
"^Treatment of Rodent Ulcer$",
"(?i)^\\W*Cloud Computing\\W*$",
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
"^Free Communications, Poster Presentations: Session [A-F]$",
"^“The Historical Aspects? of Quackery\\.?”$",
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
"(?i)^Case Report$",
"^Boletín Informativo$",
"(?i)^Glioblastoma Multiforme$",
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
"^Zaměstnanecké výhody$",
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
"(?i)^Carotid body tumours?\\.?$",
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
"^Avant-propos$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
"^Viñetas de Cortázar$",
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
"^Aus der AGMB$",
"^Znanstveno-stručni prilozi$",
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
"^Finanční analýza podniku$",
"^Financial analysis( of business)?$",
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
"^Jikken nihon shūshinsho$",
"(?i)^CORONER('|s)(s|') INQUESTS$",
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
"(?i)^Consultants' contract(s)?$",
"(?i)^Upute autorima$",
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
"^Joshi shin kokubun$",
"^Kōtō shōgaku dokuhon nōson'yō$",
"^Jinjō shōgaku shōka$",
"^Shōgaku shūjichō$",
"^Nihon joshi dokuhon$",
"^Joshi shin dokuhon$",
"^Chūtō kanbun dokuhon$",
"^Wabun dokuhon$",
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
"(?i)^cardiac rehabilitation$",
"(?i)^Analytical summary$",
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
"^Prikazi i osvrti$",
"^Rodinný dům s provozovnou$",
"^Family house with an establishment$",
"^Shinsei chūtō shin kokugun$",
"^Pulmonary alveolar proteinosis(\\.?)$",
"^Shinshū kanbun$",
"^Viñeta(s?) de Rodríguez$",
"(?i)^RUBRIKA UREDNIKA$",
"^A Matching Model of the Academic Publication Market$",
"^Yōgaku kōyō$",
"^Internetový marketing$",
"^Internet marketing$",
"^Chūtō kokugo dokuhon$",
"^Kokugo dokuhon$",
"^Antibiotic Cover for Dental Extraction(s?)$",
"^Strategie podniku$",
"^Strategy of an Enterprise$",
"(?i)^respiratory disease(s?)(\\.?)$",
"^Award(s?) for Gallantry in Civil Defence$",
"^Podniková kultura$",
"^Corporate Culture$",
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
"^Pracovní motivace$",
"^Work Motivation$",
"^Kaitei kōtō jogaku dokuhon$",
"^Konsolidovaná účetní závěrka$",
"^Consolidated Financial Statements$",
"(?i)^intracranial tumour(s?)$",
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
"^The level of motivation process as a leadership$",
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
"(?i)^news and events$",
"(?i)^NOVOSTI I DOGAĐAJI$",
"^Sansū no gakushū$",
"^Posouzení informačního systému firmy a návrh změn$",
"^Information System Assessment and Proposal for ICT Modification$",
"^Stresové zatížení pracovníků ve vybrané profesi$",
"^Stress load in a specific job$",
"^Sunday: Poster Sessions, Pt.*$",
"^Monday: Poster Sessions, Pt.*$",
"^Wednesday: Poster Sessions, Pt.*",
"^Tuesday: Poster Sessions, Pt.*$",
"^Analýza reklamy$",
"^Analysis of advertising$",
"^Shōgaku shūshinsho$",
"^Shōgaku sansū$",
"^Shintei joshi kokubun$",
"^Taishō joshi kokubun dokuhon$",
"^Joshi kokubun$",
"^Účetní uzávěrka a účetní závěrka v ČR$",
"(?i)^The \"?Causes\"? of Cancer$",
"^Normas para la publicación de artículos$",
"^Editor('|s)(s|') [Rr]eply$",
"^Editor(|s)(s|) letter$",
"^Redaktoriaus žodis$",
"^DISCUSSION ON THE PRECEDING PAPER$",
"^Kōtō shōgaku shūshinsho jidōyō$",
"^Shōgaku nihon rekishi$",
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
"^Préface$",
"^Occupational [Hh]ealth [Ss]ervices.$",
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
"^Účetní závěrka ve vybraném podniku.*$",
"^Financial statements in selected company$",
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
"^Pseudomyxoma peritonei$",
"^Kazalo autora$",
"(?i)^uvodna riječ$",
"^Motivace jako způsob vedení lidí$",
"^Motivation as a leadership$",
"^Polyfunkční dům$",
"^Multi\\-funkcional building$",
"^Podnikatelský plán$",
"(?i)^Podnikatelský záměr$",
"(?i)^Business Plan$",
"^Oceňování nemovitostí$",
"^Marketingová komunikace$",
"^Marketing communication$",
"^Sumario Analítico$",
"^Riječ uredništva$",
"^Savjetovanja i priredbe$",
"^Índice$",
"^(Starobosanski nadpisi).*$",
"^Vzdělávání pracovníků v organizaci$",
"^Staff training in organization$",
"^(Life Histories of North American Geometridae).*$",
"^Strategická analýza podniku$",
"^Strategic Analysis of an Enterprise$",
"^Sadržaj$",
"^Upute suradnicima$",
"^Rodinný dům$",
"(?i)^Fami(l)?ly house$",
"^Upute autorima$",
"^Strategic Analysis$",
"^Finanční analýza vybraného podniku$",
"^Finanční analýza$",
"^Riječ urednika$",
"(?i)^Content(s?)$",
"(?i)^Inhalt$",
"^Jinjō shōgaku shūshinsho jidōyō$",
"(?i)^Index$",
"^Chūgaku kokubun kyōkasho$",
"^Retrato de una mujer$",
"^Retrato de un hombre$",
"^Kōtō shōgaku dokuhon$",
"^Shotōka kokugo$",
"^Shōgaku dokuhon$",
"^Jinjō shōgaku kokugo dokuhon$",
"^Shinsei kokugo dokuhon$",
"^Teikoku dokuhon$",
"^Instructions to Authors$",
"^KİTAP TAHLİLİ$",
"^PRZEGLĄD PIŚMIENNICTWA$",
"(?i)^Presentación$",
"^İçindekiler$",
"(?i)^Tabl?e of contents$",
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
"^Editorial( Board)?$",
"(?i)^Editorial \\(English\\)$",
"^Editörden$",
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$",
"^Data [mM]anagement [sS]ervices\\.$",
"Research and Advanced Technology for Digital Libraries",
"Food and Nutrition"
]
},
"synonyms": {}
}
}

View File

@@ -1,150 +0,0 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"subEntityType" : "resulttype",
"subEntityValue" : "software",
"orderField" : "title",
"queueMaxSize" : "200",
"groupMaxSize" : "100",
"maxChildren" : "100",
"slidingWindowSize" : "50",
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "wordsStatsSuffixPrefixChain", "fields" : [ "title" ], "params" : { "mod" : "10" } },
{ "name" : "lowercase", "fields" : [ "doi", "altdoi" ], "params" : { "collapseOn:pid":"0"} },
{ "name" : "ngrams", "fields" : [ "title" ], "params" : {"ngramLen": 3, "max": 4, "maxPerToken":1, "minNgramLen":3}},
{ "name" : "urlclustering", "fields": [ "url" ], "params" : {}}
],
"decisionTree": {
"start": {
"fields": [
{
"field": "doi",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "titleCheck",
"undefined": "titleCheck",
"ignoreUndefined": "false"
},
"titleCheck": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitleIgnoreVersion",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.95,
"aggregation": "AVG",
"positive": "pidCheck",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
},
"pidCheck": {
"fields": [
{
"field": "altdoi",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
},
{
"field": "doi",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {"crossCompare": "altdoi"}
},
{
"field": "url",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1,
"aggregation": "OR",
"positive": "MATCH",
"negative": "authorsCheck",
"undefined": "authorsCheck",
"ignoreUndefined": "false"
},
"authorsCheck": {
"fields": [
{
"field": "authors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"surname_th": 0.70,
"fullname_th": 0.70,
"size_th": 20,
"mode": "surname"
}
}
],
"threshold": 1,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "MATCH",
"ignoreUndefined": "false"
}
},
"model" : [
{
"name" : "doi",
"type" : "String",
"path" : "$.instance.pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name" : "altdoi",
"type" : "String",
"path" : "$.instance.alternateIdentifier[?(@.qualifier.classid == 'doi')].value"
},
{
"name" : "title",
"type" : "String",
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
"length" : 250,
"size" : 5
},
{
"name" : "url",
"type" : "String",
"path" : "$.instance.url"
},
{
"name" : "resulttype",
"type" : "String",
"path" : "$.resulttype.classid"
},
{
"name": "authors",
"type": "List",
"path": "$.author[*].fullname",
"size": 200
}
],
"blacklists" : {},
"synonyms": {}
}
}
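
Read top to bottom, the tree above short-circuits: an exact DOI match is an immediate MATCH, a title below 0.95 similarity is an immediate NO_MATCH, and only then do the alternate PIDs, URLs and author lists get a say. A rough sketch of the control flow it encodes, with simplified stand-ins for the eu.dnetlib.pace comparators named in the config (exactMatch, levensteinTitleIgnoreVersion, authorsMatch):

import java.util.List;

public class SoftwareTreeSketch {

    // minimal record holding only the fields the tree consults
    static class Software {
        String doi, altDoi, url, title;
        List<String> authors;
    }

    // stand-ins for the real comparators; -1.0 means "undefined" as in the framework
    static double exactMatch(String a, String b) {
        return (a == null || b == null) ? -1.0 : (a.equals(b) ? 1.0 : 0.0);
    }

    // placeholder for levensteinTitleIgnoreVersion
    static double titleSimilarity(String a, String b) {
        return (a != null && a.equalsIgnoreCase(b)) ? 1.0 : 0.0;
    }

    // placeholder for authorsMatch in the surname mode configured above
    static double authorsSimilarity(List<String> a, List<String> b) {
        return (a == null || b == null) ? -1.0 : (a.equals(b) ? 1.0 : 0.0);
    }

    static boolean sameSoftware(Software a, Software b) {
        if (exactMatch(a.doi, b.doi) == 1.0) return true;            // start -> MATCH
        if (titleSimilarity(a.title, b.title) < 0.95) return false;  // titleCheck -> NO_MATCH
        if (exactMatch(a.altDoi, b.altDoi) == 1.0                    // pidCheck: OR aggregation
                || exactMatch(a.doi, b.altDoi) == 1.0                // crossCompare doi vs altdoi (the real tree checks both directions)
                || exactMatch(a.url, b.url) == 1.0) return true;
        double authors = authorsSimilarity(a.authors, b.authors);    // authorsCheck
        return authors == -1.0 || authors >= 1.0;                    // undefined counts as MATCH
    }
}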

View File

@ -1,4 +0,0 @@
{"websiteurl": "https://fairsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "eosc________::oxford_e-research_centre::oxford_e-research_centre.fairsharing"}
{"websiteurl": "https://FAIRsharing.org", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "fairsharing_::2521"}
{"websiteurl": "https://fairsharing.org/", "englishname": "formerly: biosharing", "officialname": "FAIRsharing", "id": "re3data_____::r3d100010142"}
{"websiteurl": "https://fairsharing.org/", "englishname": "FAIRsharing", "officialname": "FAIRsharing", "id": "openaire____::fairsharing"}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,32 +0,0 @@
[
{
"paramName": "e",
"paramLongName": "entitiesPath",
"paramDescription": "the input entities",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingPath",
"paramDescription": "path of the working directory",
"paramRequired": true
},
{
"paramName": "np",
"paramLongName": "numPartitions",
"paramDescription": "number of partitions for the similarity relations intermediate phases",
"paramRequired": false
},
{
"paramName": "dc",
"paramLongName": "dedupConfPath",
"paramDescription": "dedup configuration to be used",
"paramRequired": false
},
{
"paramName": "gt",
"paramLongName": "groundTruthFieldJPath",
"paramDescription": "field to be used as groundtruth",
"paramRequired": false
}
]

View File

@ -6,7 +6,7 @@
<parent>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.12</version>
<relativePath>../pom.xml</relativePath>
</parent>

View File

@ -1,59 +1,59 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Document;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.MapDocument;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.Set;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Document;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.MapDocument;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class BlacklistAwareClusteringCombiner extends ClusteringCombiner {
public static Collection<String> filterAndCombine(final MapDocument a, final Config conf) {
Document filtered = filter(a, conf.blacklists());
return combine(filtered, conf);
}
private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombiner.class);
private static MapDocument filter(final MapDocument a, final Map<String, List<Pattern>> blacklists) {
if (blacklists == null || blacklists.isEmpty()) {
return a;
}
public static Collection<String> filterAndCombine(final MapDocument a, final Config conf) {
final Map<String, Field> filtered = Maps.newHashMap(a.getFieldMap());
final Document filtered = new BlacklistAwareClusteringCombiner().filter(a, conf.blacklists());
return combine(filtered, conf);
}
for (final Entry<String, List<Pattern>> e : blacklists.entrySet()) {
Field fields = a.getFieldMap().get(e.getKey());
if (fields != null) {
final FieldListImpl fl = new FieldListImpl();
private MapDocument filter(final MapDocument a, final Map<String, List<String>> blacklists) {
final Map<String, Field> filtered = Maps.newHashMap(a.getFieldMap());
if (blacklists != null) {
for (final Entry<String, Field> e : filtered.entrySet()) {
for (Field f : fields) {
if (!isBlackListed(f.stringValue(), e.getValue())) {
fl.add(f);
}
}
filtered.put(e.getKey(), fl);
}
}
return new MapDocument(a.getIdentifier(), filtered);
}
private static boolean isBlackListed(String value, List<Pattern> blacklist) {
for (Pattern pattern : blacklist) {
if (pattern.matcher(value).matches()) {
return true;
}
}
return false;
}
final FieldListImpl fl = new FieldListImpl();
fl.addAll(Lists.newArrayList(Iterables.filter(e.getValue(), new FieldFilter(e.getKey(), blacklists))));
filtered.put(e.getKey(), fl);
}
}
return new MapDocument(a.getIdentifier(), filtered);
}
/**
 * Tries to match a field value against the regex blacklist.
 *
 * @param fieldName the name of the field whose blacklist entries apply
 * @param value the field value to test
 * @return true if the value matches a blacklisted regex, false otherwise
 */
protected boolean regexMatches(final String fieldName, final String value, final Map<String, Set<String>> blacklists) {
if (blacklists.containsKey(fieldName)) {
for (final String regex : blacklists.get(fieldName)) {
if (value.matches(regex)) return true;
}
}
return false;
}
}

View File

@ -20,6 +20,10 @@ public class ClusteringCombiner {
private static String COLLAPSE_ON= "collapseOn";
public static Collection<String> combine(final Document a, final Config conf) {
return new ClusteringCombiner().doCombine(a, conf);
}
private Collection<String> doCombine(final Document a, final Config conf) {
final Collection<String> res = Sets.newLinkedHashSet();
for (final ClusteringDef cd : conf.clusterings()) {
for (final String fieldName : cd.getFields()) {
@ -47,7 +51,7 @@ public class ClusteringCombiner {
return res;
}
private static String getPrefix(ClusteringDef cd, String fieldName) {
private String getPrefix(ClusteringDef cd, String fieldName) {
return cd.getName()+ SEPARATOR +
cd.getParams().keySet()
.stream()

View File

@ -0,0 +1,48 @@
package eu.dnetlib.pace.clustering;
import java.util.List;
import java.util.Map;
import com.google.common.base.Predicate;
import eu.dnetlib.pace.model.Field;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class FieldFilter implements Predicate<Field> {
private static final Log log = LogFactory.getLog(FieldFilter.class);
private Map<String, List<String>> blacklists;
private String fieldName;
public FieldFilter(final String fieldName, final Map<String, List<String>> blacklists) {
this.fieldName = fieldName;
this.blacklists = blacklists;
}
@Override
public boolean apply(final Field f) {
return !regexMatches(fieldName, f.stringValue(), blacklists);
}
/**
 * Tries to match a field value against the regex blacklist.
 *
 * @param fieldName the name of the field whose blacklist entries apply
 * @param value the field value to test
 * @return true if the value matches a blacklisted regex, false otherwise
 */
protected boolean regexMatches(final String fieldName, final String value, final Map<String, List<String>> blacklists) {
if (blacklists.containsKey(fieldName)) {
final Iterable<String> regexes = blacklists.get(fieldName);
for (final String regex : regexes) {
if (StringUtils.isBlank(regex)) return false;
if (value.matches(regex)) return true;
}
}
return false;
}
}
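
Being a Guava Predicate, FieldFilter plugs straight into the Iterables.filter call used by the combiner above. A minimal usage sketch (the blacklist map and field values are made up; it assumes the pace model classes shown elsewhere in this diff are on the classpath):

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.clustering.FieldFilter;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldValueImpl;

import java.util.Arrays;
import java.util.List;
import java.util.Map;

public class FieldFilterSketch {
    public static void main(String[] args) {
        // one blacklisted regex on the "title" field
        Map<String, List<String>> blacklists =
                ImmutableMap.of("title", Arrays.asList("(?i)^Editorial$"));

        List<Field> titles = Lists.<Field>newArrayList(
                new FieldValueImpl(Type.String, "title", "Editorial"),
                new FieldValueImpl(Type.String, "title", "A real title"));

        // only the non-blacklisted value survives
        for (Field f : Iterables.filter(titles, new FieldFilter("title", blacklists)))
            System.out.println(f.stringValue()); // prints "A real title"
    }
}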

View File

@ -1,77 +0,0 @@
package eu.dnetlib.pace.clustering;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.Person;
import org.apache.commons.lang3.StringUtils;
import java.util.*;
import java.util.stream.Collectors;
@ClusteringClass("lnfi")
public class LastNameFirstInitial extends AbstractClusteringFunction{
private boolean DEFAULT_AGGRESSIVE = true;
public LastNameFirstInitial(final Map<String, Integer> params) {
super(params);
}
@Override
public Collection<String> apply(Config conf, List<Field> fields) {
return fields.stream().filter(f -> !f.isEmpty())
.map(Field::stringValue)
.map(this::normalize)
.map(s -> doApply(conf, s))
.map(c -> filterBlacklisted(c, ngramBlacklist))
.flatMap(c -> c.stream())
.filter(StringUtils::isNotBlank)
.collect(Collectors.toCollection(HashSet::new));
}
@Override
protected String normalize(final String s) {
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
// do not compact the regexes into a single expression, as it would cause a StackOverflowError on large input strings
.replaceAll("[^ \\w]+", "")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
@Override
protected Collection<String> doApply(final Config conf, final String s) {
final List<String> res = Lists.newArrayList();
final boolean aggressive = (Boolean) (getParams().containsKey("aggressive") ? getParams().get("aggressive") : DEFAULT_AGGRESSIVE);
Person p = new Person(s, aggressive);
if (p.isAccurate()) {
String lastName = p.getNormalisedSurname().toLowerCase();
String firstInitial = p.getNormalisedFirstName().toLowerCase().substring(0,1);
res.add(firstInitial.concat(lastName));
}
else { // is not accurate, meaning it has no defined name and surname
List<String> fullname = Arrays.asList(p.getNormalisedFullname().split(" "));
if (fullname.size() == 1) {
res.add(p.getNormalisedFullname().toLowerCase());
}
else if (fullname.size() == 2) {
res.add(fullname.get(0).substring(0,1).concat(fullname.get(1)).toLowerCase());
res.add(fullname.get(1).substring(0,1).concat(fullname.get(0)).toLowerCase());
}
else {
res.add(fullname.get(0).substring(0,1).concat(fullname.get(fullname.size()-1)).toLowerCase());
res.add(fullname.get(fullname.size()-1).substring(0,1).concat(fullname.get(0)).toLowerCase());
}
}
return res;
}
}
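
A quick worked example of the keys emitted above: a parse that confidently identifies surname and first name (Person.isAccurate) yields a single key, first initial plus surname, e.g. 'pmanghi' for Paolo Manghi; an ambiguous two-token fullname instead yields both orderings ('pmanghi' and 'mpaolo'), so either name/surname reading lands in a shared cluster.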

View File

@ -9,7 +9,7 @@ import com.google.common.collect.Lists;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Person;
@ClusteringClass("personHash")
@ClusteringClass("personhash")
public class PersonHash extends AbstractClusteringFunction {
private boolean DEFAULT_AGGRESSIVE = false;

View File

@ -3,23 +3,28 @@ package eu.dnetlib.pace.common;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.*;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import com.ibm.icu.text.Transliterator;
/**
* Set of common functions for the framework
@ -128,12 +133,10 @@ public abstract class AbstractPaceFunctions {
protected static String fixAliases(final String s) {
final StringBuilder sb = new StringBuilder();
s.chars().forEach(ch -> {
for (final char ch : Lists.charactersOf(s)) {
final int i = StringUtils.indexOf(aliases_from, ch);
sb.append(i >= 0 ? aliases_to.charAt(i) : (char)ch);
});
sb.append(i >= 0 ? aliases_to.charAt(i) : ch);
}
return sb.toString();
}
@ -149,10 +152,9 @@ public abstract class AbstractPaceFunctions {
protected String removeSymbols(final String s) {
final StringBuilder sb = new StringBuilder();
s.chars().forEach(ch -> {
sb.append(StringUtils.contains(alpha, ch) ? (char)ch : ' ');
});
for (final char ch : Lists.charactersOf(s)) {
sb.append(StringUtils.contains(alpha, ch) ? ch : " ");
}
return sb.toString().replaceAll("\\s+", " ");
}
@ -239,7 +241,7 @@ public abstract class AbstractPaceFunctions {
final Set<String> h = Sets.newHashSet();
try {
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
for (final String s : IOUtils.readLines(NGramUtils.class.getResourceAsStream(classpath))) {
h.add(fixAliases(transliterator.transliterate(s))); //transliteration of the stopwords
}
} catch (final Throwable e) {
@ -254,7 +256,7 @@ public abstract class AbstractPaceFunctions {
final Map<String, String> m = new HashMap<>();
try {
for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
for (final String s : IOUtils.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath))) {
//string is like this: code;word1;word2;word3
String[] line = s.split(";");
String value = line[0];
@ -347,7 +349,7 @@ public abstract class AbstractPaceFunctions {
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
final StringWriter sw = new StringWriter();
try {
IOUtils.copy(clazz.getResourceAsStream(filename), sw, StandardCharsets.UTF_8);
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
return sw.toString();
} catch (final IOException e) {
throw new RuntimeException("cannot load resource from classpath: " + filename);
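
The fixAliases rewrite above swaps a Guava character loop for an IntStream over s.chars(); a self-contained sketch of the same alias-substitution idea (the two alias tables here are illustrative stand-ins for the framework's real ones):

import org.apache.commons.lang3.StringUtils;

public class FixAliasesSketch {
    // each char in FROM is rewritten to the char at the same index in TO
    private static final String ALIASES_FROM = "⁰¹²³";
    private static final String ALIASES_TO = "0123";

    static String fixAliases(final String s) {
        final StringBuilder sb = new StringBuilder();
        s.chars().forEach(ch -> {
            final int i = StringUtils.indexOf(ALIASES_FROM, ch);
            sb.append(i >= 0 ? ALIASES_TO.charAt(i) : (char) ch);
        });
        return sb.toString();
    }

    public static void main(String[] args) {
        System.out.println(fixAliases("x² + y³")); // prints "x2 + y3"
    }
}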

View File

@ -2,7 +2,6 @@ package eu.dnetlib.pace.config;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
@ -48,7 +47,7 @@ public interface Config {
*
* @return the map
*/
public Map<String, List<Pattern>> blacklists();
public Map<String, List<String>> blacklists();
/**

View File

@ -1,6 +1,5 @@
package eu.dnetlib.pace.config;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.model.ClusteringDef;
@ -8,19 +7,15 @@ import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.util.PaceException;
import org.antlr.stringtemplate.StringTemplate;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
@ -36,9 +31,6 @@ public class DedupConfig implements Config, Serializable {
private WfConfig wf;
@JsonIgnore
private Map<String, List<Pattern>> blacklists;
private static Map<String, String> defaults = Maps.newHashMap();
static {
@ -65,12 +57,6 @@ public class DedupConfig implements Config, Serializable {
config = new ObjectMapper().readValue(json, DedupConfig.class);
config.getPace().initModel();
config.getPace().initTranslationMap();
config.blacklists = config.getPace().getBlacklists().entrySet()
.stream()
.collect(Collectors.toMap(e -> e.getKey(),
e ->e.getValue().stream().filter(s -> !StringUtils.isBlank(s)).map(Pattern::compile).collect(Collectors.toList()) ));
return config;
} catch (IOException e) {
throw new PaceException("Error in parsing configuration json", e);
@ -102,7 +88,7 @@ public class DedupConfig implements Config, Serializable {
}
private String readFromClasspath(final String resource) throws IOException {
return IOUtils.toString(getClass().getResource(resource), StandardCharsets.UTF_8);
return IOUtils.toString(getClass().getResource(resource));
}
public PaceConfig getPace() {
@ -151,8 +137,8 @@ public class DedupConfig implements Config, Serializable {
}
@Override
public Map<String, List<Pattern>> blacklists() {
return blacklists;
public Map<String, List<String>> blacklists() {
return getPace().getBlacklists();
}
@Override

View File

@ -1,5 +1,5 @@
package eu.dnetlib.pace.config;
public enum Type {
String, Int, List, JSON, URL, StringConcat, DoubleArray
String, Int, List, JSON, URL, StringConcat
}

View File

@ -20,6 +20,4 @@ public interface FieldValue extends Field {
*/
public void setValue(final Object value);
public double[] doubleArrayValue();
}

View File

@ -58,10 +58,8 @@ public class FieldValueImpl extends AbstractField implements FieldValue {
throw new RuntimeException(value.toString());
}
case URL:
String str = value.toString();
return StringUtils.isBlank(str) || !isValidURL(str);
case DoubleArray:
return doubleArrayValue().length==0;
String str = value.toString();
return StringUtils.isBlank(str) || !isValidURL(str);
default:
return true;
}
@ -118,10 +116,6 @@ public class FieldValueImpl extends AbstractField implements FieldValue {
// }
}
public double[] doubleArrayValue() {
return (double[])getValue();
}
/*
* (non-Javadoc)
*

View File

@ -43,7 +43,7 @@ public class Person {
// s = s.replaceAll("[\\W&&[^,-]]", "");
}
if (s.contains(",")) { //if the name contains a comma, the name and the surname are easily derivable
if (s.contains(",")) {
final String[] arr = s.split(",");
if (arr.length == 1) {
fullname = splitTerms(arr[0]);

View File

@ -26,7 +26,6 @@ public class AuthorsMatch extends AbstractComparator {
private double FULLNAME_THRESHOLD;
private String MODE; //full or surname
private int SIZE_THRESHOLD;
private String TYPE; //count or percentage
private int common;
public AuthorsMatch(Map<String, String> params){
@ -38,7 +37,6 @@ public class AuthorsMatch extends AbstractComparator {
NAME_THRESHOLD = Double.parseDouble(params.getOrDefault("name_th", "0.95"));
FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
TYPE = params.getOrDefault("type", "percentage");
common = 0;
}
@ -52,7 +50,7 @@ public class AuthorsMatch extends AbstractComparator {
if (a.isEmpty() || b.isEmpty())
return -1;
if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) b).size() > SIZE_THRESHOLD)
if (((FieldList) a).size() > SIZE_THRESHOLD || ((FieldList) b).size() > SIZE_THRESHOLD)
return 1.0;
List<Person> aList = ((FieldList) a).stringList().stream().map(author -> new Person(author, false)).collect(Collectors.toList());
@ -125,12 +123,7 @@ public class AuthorsMatch extends AbstractComparator {
//normalization factor to compute the score
int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
if(TYPE.equals("percentage")) {
return (double) common / normFactor;
}
else {
return (double) common;
}
return (double)common / normFactor;
}
public boolean compareSurname(Person p1, Person p2) {
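
Worked example of the normalization factor above: author lists of sizes 3 and 4 with common = 2 give normFactor = 3 + 4 − 2 = 5, hence a percentage score of 2/5 = 0.4; only when the two lists have equal size is the factor simply that size. Note that the 4.1.12 side drops the 'count' variant, so the percentage is always returned.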

View File

@ -1,53 +0,0 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.Person;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@ComparatorClass("cosineSimilarity")
public class CosineSimilarity extends AbstractComparator {
Map<String, String> params;
public CosineSimilarity(Map<String,String> params) {
super(params);
}
@Override
public double compare(final Field a, final Field b, final Config conf) {
if (a.isEmpty() || b.isEmpty())
return -1;
double[] aVector = ((FieldValueImpl) a).doubleArrayValue();
double[] bVector = ((FieldValueImpl) b).doubleArrayValue();
return cosineSimilarity(aVector, bVector);
}
double cosineSimilarity(double[] a, double[] b) {
double dotProduct = 0;
double normASum = 0;
double normBSum = 0;
for(int i = 0; i < a.length; i ++) {
dotProduct += a[i] * b[i];
normASum += a[i] * a[i];
normBSum += b[i] * b[i];
}
double normProduct = Math.sqrt(normASum) * Math.sqrt(normBSum);
return dotProduct / normProduct;
}
}
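
The removed comparator computes the standard cosine score, dot(a, b) / (||a|| * ||b||). A usage sketch mirroring the unit test deleted further down (it assumes the master-branch classes above are on the classpath; the Config argument is not consulted by the code shown, so null stands in):

import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.tree.CosineSimilarity;

import java.util.HashMap;

public class CosineSketch {
    public static void main(String[] args) {
        CosineSimilarity cosine = new CosineSimilarity(new HashMap<>());
        Field a = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1, 2, 3});
        Field b = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1, 2, 3});
        System.out.println(cosine.compare(a, b, null)); // identical vectors -> 1.0
    }
}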

View File

@ -16,7 +16,6 @@ public class DomainExactMatch extends ExactMatchIgnoreCase {
@Override
protected String getValue(final Field f) {
try {
return asUrl(super.getValue(f)).getHost();
} catch (MalformedURLException e) {

View File

@ -1,34 +0,0 @@
package eu.dnetlib.pace.tree;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import java.util.Map;
@ComparatorClass("numbersComparator")
public class NumbersComparator extends AbstractComparator {
Map<String, String> params;
public NumbersComparator(Map<String, String> params) {
super(params);
this.params = params;
}
@Override
public double distance(String a, String b, Config conf) {
//extracts numbers from the field
String numbers1 = getNumbers(nfd(a));
String numbers2 = getNumbers(nfd(b));
if (numbers1.isEmpty() || numbers2.isEmpty())
return -1.0;
int n1 = Integer.parseInt(numbers1);
int n2 = Integer.parseInt(numbers2);
return Math.abs(n1 - n2);
}
}
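
Worked example: for field values 'Volume 3' and 'Volume 15' the digits extracted are '3' and '15', so the distance is |3 − 15| = 12; if either value carries no digits at all the comparator returns the framework's −1.0 'undefined' marker (this assumes getNumbers simply concatenates the digits it finds).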

View File

@ -42,25 +42,22 @@ public class StringContainsMatch extends AbstractComparator {
STRING = STRING.toLowerCase();
}
if (AGGREGATOR != null) {
switch (AGGREGATOR) {
case "AND":
if (ca.contains(STRING) && cb.contains(STRING))
return 1.0;
break;
case "OR":
if (ca.contains(STRING) || cb.contains(STRING))
return 1.0;
break;
case "XOR":
if (ca.contains(STRING) ^ cb.contains(STRING))
return 1.0;
break;
default:
return 0.0;
}
switch(AGGREGATOR) {
case "AND":
if(ca.contains(STRING) && cb.contains(STRING))
return 1.0;
break;
case "OR":
if(ca.contains(STRING) || cb.contains(STRING))
return 1.0;
break;
case "XOR":
if(ca.contains(STRING) ^ cb.contains(STRING))
return 1.0;
break;
default:
return 0.0;
}
return 0.0;
}
}
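
The aggregator parameter above accepts AND, OR and XOR; a usage sketch mirroring the comparator test deleted further down (the Config argument is not consulted by the fragment shown, so null stands in):

import eu.dnetlib.pace.tree.StringContainsMatch;

import java.util.HashMap;
import java.util.Map;

public class StringContainsSketch {
    public static void main(String[] args) {
        Map<String, String> params = new HashMap<>();
        params.put("string", "openorgs");
        params.put("aggregator", "XOR"); // exactly one side may contain the string
        params.put("caseSensitive", "false");
        // both sides contain "openorgs": XOR is false -> 0.0
        System.out.println(new StringContainsMatch(params).distance("openorgs", "openorgs", null));

        params.put("aggregator", "AND"); // both sides must contain it
        System.out.println(new StringContainsMatch(params).distance("the openorgs db", "an openorgs fork", null)); // 1.0
    }
}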

View File

@ -19,13 +19,9 @@ public class StringListMatch extends AbstractComparator {
private static final Log log = LogFactory.getLog(StringListMatch.class);
private Map<String, String> params;
final private String TYPE; //percentage or count
public StringListMatch(final Map<String, String> params) {
super(params);
this.params = params;
TYPE = params.getOrDefault("type", "percentage");
}
@Override
@ -35,7 +31,7 @@ public class StringListMatch extends AbstractComparator {
final Set<String> pb = new HashSet<>(((FieldList) b).stringList());
if (pa.isEmpty() || pb.isEmpty()) {
return -1; //return undefined if one of the two lists is empty
return -1; //return undefined if one of the two lists of pids is empty
}
int incommon = Sets.intersection(pa, pb).size();
@ -45,10 +41,7 @@ public class StringListMatch extends AbstractComparator {
return 0.0;
}
if(TYPE.equals("percentage"))
return (double)incommon / (incommon + simDiff);
else
return incommon;
return (double)incommon / (incommon + simDiff);
}
}
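
Worked example of the percentage score kept on the 4.1.12 side: pa = {A, B, C} and pb = {B, C, D} give incommon = 2 and simDiff = 2, hence 2 / (2 + 2) = 0.5; the master branch could alternatively return the raw count 2 via type=count, an option this release drops.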

View File

@ -1,6 +1,5 @@
package eu.dnetlib.pace.tree.support;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.PaceConfig;
@ -10,7 +9,6 @@ import org.apache.commons.lang3.StringUtils;
import java.io.IOException;
import java.io.Serializable;
import java.io.StringWriter;
import java.util.List;
public class TreeNodeDef implements Serializable {
@ -59,9 +57,8 @@ public class TreeNodeDef implements Serializable {
double result2 = comparator(fieldConf).compare(doc1.getFieldMap().get(crossField), doc2.getFieldMap().get(fieldConf.getField()), conf);
result = Math.max(result1,result2);
}
else {
else
result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
}
stats.addFieldStats(
fieldConf.getComparator() + " on " + fieldConf.getField() + " " + fields.indexOf(fieldConf),

View File

@ -161,14 +161,11 @@ public class BlockProcessorForTesting {
}
else {
//use the decision tree implementation or the "normal" implementation of the similarity score (valid only for publications)
if (useTree)
if(useTree)
emitOutput(new TreeProcessor(dedupConf).compare(pivot, curr), idPivot, idCurr, context);
else
emitOutput(publicationCompare(pivot, curr, dedupConf), idPivot, idCurr, context);
}
// if(new TreeProcessor(dedupConf).compare(pivot, curr) != publicationCompare(pivot, curr, dedupConf)) {
// emitOutput(true, idPivot, idCurr, context);
// }
}
}
@ -183,45 +180,38 @@ public class BlockProcessorForTesting {
return compare>=1.0;
}
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
//if the score gives 1, the publications are equivalent
Map<String, String> params = new HashMap<>();
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
params.put("mode", "count");
private boolean publicationCompare(MapDocument a, MapDocument b, DedupConfig config) {
double score = 0.0;
double score = 0.0;
//LAYER 1 - comparison of the PIDs json lists
Map<String, String> params = new HashMap<>();
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
JsonListMatch jsonListMatch = new JsonListMatch(params);
double result = jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config);
if (result >= 0.5) //if the result of the comparison is greater than the threshold
score += 10.0; //high score because it should match when the first condition is satisfied
else
score += 0.0;
//levenstein title
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
if(levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config) >= 0.9) {
score += 0.2;
//LAYER 2 - comparison of the title version and the size of the authors lists
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
SizeMatch sizeMatch = new SizeMatch(params);
double result2 = sizeMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
if (Math.min(result1, result2) != 0)
score+=0;
else
score-=2;
//LAYER 3 - computation of levenshtein on titles
LevensteinTitle levensteinTitle = new LevensteinTitle(params);
double result3 = levensteinTitle.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
score += Double.isNaN(result3)?0.0:result3;
return score >= 0.99;
}
//pid
JsonListMatch jsonListMatch = new JsonListMatch(params);
if (jsonListMatch.compare(a.getFieldMap().get("pid"), b.getFieldMap().get("pid"), config) >= 1.0) {
score += 0.5;
}
//title version
TitleVersionMatch titleVersionMatch = new TitleVersionMatch(params);
double result1 = titleVersionMatch.compare(a.getFieldMap().get("title"), b.getFieldMap().get("title"), config);
if(result1<0 || result1>=1.0) {
score += 0.1;
}
//authors match
params.remove("mode");
AuthorsMatch authorsMatch = new AuthorsMatch(params);
double result2 = authorsMatch.compare(a.getFieldMap().get("authors"), b.getFieldMap().get("authors"), config);
if(result2 <0|| result2>=0.6) {
score += 0.2;
}
return score>=0.5;
}
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
if (result) {
@ -244,5 +234,6 @@ public class BlockProcessorForTesting {
final String type = dedupConf.getWf().getEntityType();
context.emit(type, from, to);
context.emit(type, to, from);
}
}

View File

@ -7,10 +7,12 @@ import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.Option;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.*;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument;
import net.minidev.json.JSONArray;
import java.math.BigDecimal;
import java.util.*;
import java.util.function.Predicate;
import java.util.stream.Collectors;
@ -44,14 +46,6 @@ public class MapDocumentUtil {
.forEach(fi::add);
stringField.put(fdef.getName(), fi);
break;
case DoubleArray:
stringField.put(
fdef.getName(),
new FieldValueImpl(Type.DoubleArray,
fdef.getName(),
getJPathArray(fdef.getPath(), json))
);
break;
case StringConcat:
String[] jpaths = fdef.getPath().split("\\|\\|\\|");
stringField.put(
@ -121,30 +115,6 @@ public class MapDocumentUtil {
}
}
public static double[] getJPathArray(final String jsonPath, final String json) {
try {
Object o = JsonPath.read(json, jsonPath);
if (o instanceof double[])
return (double[]) o;
if (o instanceof JSONArray) {
Object[] objects = ((JSONArray) o).toArray();
double[] array = new double[objects.length];
for (int i = 0; i < objects.length; i++) {
if (objects[i] instanceof BigDecimal)
array[i] = ((BigDecimal)objects[i]).doubleValue();
else
array[i] = (double) objects[i];
}
return array;
}
return new double[0];
}
catch (Exception e) {
e.printStackTrace();
return new double[0];
}
}
public static String truncateValue(String value, int length) {
if (value == null)
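
The removed getJPathArray resolves a JSONPath expression to a double[], converting BigDecimal entries along the way. A usage sketch against a topics vector like the one in the author record deleted further down (it assumes the master-branch class above is on the classpath):

import eu.dnetlib.pace.util.MapDocumentUtil;

public class JPathArraySketch {
    public static void main(String[] args) {
        String json = "{\"topics\": [0.9522090839562252, 0.04779091604377485]}";
        double[] topics = MapDocumentUtil.getJPathArray("$.topics", json);
        System.out.println(topics.length); // 2; missing or unreadable paths yield an empty array
    }
}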

View File

@ -9,7 +9,6 @@ import org.apache.commons.io.IOUtils;
import java.io.IOException;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.stream.Collectors;
@ -18,7 +17,7 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions {
protected String readFromClasspath(final String filename) {
final StringWriter sw = new StringWriter();
try {
IOUtils.copy(getClass().getResourceAsStream(filename), sw, StandardCharsets.UTF_8);
IOUtils.copy(getClass().getResourceAsStream(filename), sw);
return sw.toString();
} catch (final IOException e) {
throw new RuntimeException("cannot load resource from classpath: " + filename);
@ -37,10 +36,6 @@ public abstract class AbstractPaceTest extends AbstractPaceFunctions {
return new FieldValueImpl(Type.URL, "url", s);
}
protected Field array(final double[] a) {
return new FieldValueImpl(Type.DoubleArray, "array", a);
}
protected Field createFieldList(List<String> strings, String fieldName){
List<FieldValueImpl> fieldValueStream = strings.stream().map(s -> new FieldValueImpl(Type.String, fieldName, s)).collect(Collectors.toList());

View File

@ -103,11 +103,6 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
final String s = "Search for the Standard Model Higgs Boson";
System.out.println(s);
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
params.put("len", 3);
params.put("max", 1);
System.out.println(sp.apply(conf, Lists.newArrayList(title("Framework for general-purpose deduplication"))));
}
@Test
@ -153,10 +148,6 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
System.out.println(s);
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
s = "niivue/niivue: 0.21.1";
System.out.println(s);
System.out.println(sp.apply(conf, Lists.newArrayList(title(s))));
}
@Test
@ -209,41 +200,4 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
}
@Test
public void testPersonClustering(){
final ClusteringFunction cf = new PersonClustering(params);
final String s = "Abd-Alla, Abo-el-nour N.";
System.out.println("s = " + s);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
final String s1 = "Manghi, Paolo";
System.out.println("s1 = " + s1);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
}
@Test
public void testPersonHash(){
final ClusteringFunction cf = new PersonHash(params);
final String s = "Manghi, Paolo";
System.out.println("s = " + s);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
final String s1 = "Manghi, P.";
System.out.println("s = " + s1);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s1))));
}
@Test
public void testLastNameFirstInitial(){
final ClusteringFunction cf = new LastNameFirstInitial(params);
final String s = "LI Yonghong";
System.out.println("s = " + s);
System.out.println(cf.apply(conf, Lists.newArrayList(title(s))));
}
}
}

View File

@ -2,16 +2,13 @@ package eu.dnetlib.pace.comparators;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.tree.*;
import eu.dnetlib.pace.config.DedupConfig;
import org.junit.jupiter.api.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
@ -24,20 +21,15 @@ public class ComparatorTest extends AbstractPaceTest {
@BeforeAll
public void setup() {
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
}
@BeforeEach
public void beforeEachTest() {
params = new HashMap<>();
params.put("weight", "1.0");
params.put("surname_th", "0.99");
params.put("name_th", "0.95");
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf.json", ComparatorTest.class));
}
@Test
public void testCleanForSorting() {
NGramUtils utils = new NGramUtils();
@ -64,10 +56,7 @@ public class ComparatorTest extends AbstractPaceTest {
//particular cases
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
assertEquals(1.0, cityMatch.distance("Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology", conf));
// failing because 'Allen' is a transliterated Greek stopword
// assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
assertEquals(-1.0, cityMatch.distance("Washington (United States)", "United States Military Academy", conf));
assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
}
@Test
@ -81,7 +70,7 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
assertEquals(2.0/3.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
assertEquals(0.5, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf));
assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf));
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
@ -115,7 +104,7 @@ public class ComparatorTest extends AbstractPaceTest {
public void stringContainsMatchTest(){
params.put("string", "openorgs");
params.put("aggregator", "XOR");
params.put("bool", "XOR");
params.put("caseSensitive", "false");
StringContainsMatch stringContainsMatch = new StringContainsMatch(params);
@ -123,7 +112,7 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(0.0, stringContainsMatch.distance("openorgs", "openorgs", conf));
params.put("string", "openorgs");
params.put("aggregator", "AND");
params.put("bool", "AND");
params.put("caseSensitive", "false");
stringContainsMatch = new StringContainsMatch(params);
@ -257,10 +246,6 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(0.25, result);
Field f = createFieldList(new ArrayList<>(), "authors");
result = authorsMatch.compare(f,f, conf);
System.out.println("result = " + result);
}
@Test
@ -282,30 +267,5 @@ public class ComparatorTest extends AbstractPaceTest {
assertEquals(1.0, result);
}
@Test
public void domainExactMatch() {
DomainExactMatch domainExactMatch = new DomainExactMatch(params);
Field a = url("http://www.flowrepository.org");
Field b = url("http://flowrepository.org/");
double compare = domainExactMatch.compare(a, b, conf);
System.out.println("compare = " + compare);
}
@Test
public void cosineSimilarity() {
CosineSimilarity cosineSimilarity = new CosineSimilarity(params);
Field a = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1,2,3});
Field b = new FieldValueImpl(Type.DoubleArray, "array", new double[]{1,2,3});
double compare = cosineSimilarity.compare(a, b, conf);
System.out.println("compare = " + compare);
}
}

View File

@ -7,7 +7,6 @@ import eu.dnetlib.pace.clustering.ClusteringClass;
import eu.dnetlib.pace.clustering.ClusteringCombiner;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldList;
import eu.dnetlib.pace.model.FieldValue;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.tree.JsonListMatch;
import eu.dnetlib.pace.tree.support.AggType;
@ -21,7 +20,10 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.*;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@ -83,7 +85,7 @@ public class ConfigTest extends AbstractPaceTest {
}
@Test
public void asMapDocumentTest1() {
public void asMapDocumentTest() {
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("publication.current.conf.json"));
@ -101,19 +103,6 @@ public class ConfigTest extends AbstractPaceTest {
}
@Test
public void authorAsMapDocument() {
DedupConfig dedupConf = DedupConfig.load(readFromClasspath("author.fdup.conf.json"));
final String json = readFromClasspath("author.json");
final MapDocument mapDocument = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, json);
System.out.println("mapDocument = " + Arrays.toString(((FieldValue) mapDocument.getFieldMap().get("topics")).doubleArrayValue()));
}
@Test
public void testJPath() {
final String json = readFromClasspath("organization.json");

View File

@ -1,6 +1,7 @@
package eu.dnetlib.pace.util;
import eu.dnetlib.pace.model.Person;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.*;
import java.util.HashMap;
@ -17,6 +18,7 @@ public class UtilTest {
}
@Test
@Disabled
public void paceResolverTest() {
PaceResolver paceResolver = new PaceResolver();
paceResolver.getComparator("keywordMatch", params);
@ -28,11 +30,6 @@ public class UtilTest {
assertEquals("kennedy", p.getSurnameString());
assertEquals("j f", p.getNameString());
p = new Person("Guan-Hua Du", false);
System.out.println("surname = " + p.getSurnameString());
System.out.println("name = " + p.getNameString());
}
}

View File

@ -1,134 +0,0 @@
{
"wf": {
"threshold": "0.99",
"dedupRun": "001",
"entityType": "author",
"subEntityType": "author",
"subEntityValue": "author",
"orderField": "fullname",
"queueMaxSize": "200",
"groupMaxSize": "100",
"maxChildren": "100",
"slidingWindowSize": "50",
"rootBuilder": [
"result",
"resultProject_outcome_isProducedBy",
"resultResult_publicationDataset_isRelatedTo",
"resultResult_similarity_isAmongTopNSimilarDocuments",
"resultResult_similarity_hasAmongTopNSimilarDocuments",
"resultOrganization_affiliation_isAffiliatedWith",
"resultResult_part_hasPart",
"resultResult_part_isPartOf",
"resultResult_supplement_isSupplementTo",
"resultResult_supplement_isSupplementedBy",
"resultResult_version_isVersionOf"
],
"includeChildren": "true",
"maxIterations": 20,
"idPath": "$.id"
},
"pace": {
"clustering" : [
{ "name" : "lnfi", "fields" : [ "fullname" ], "params" : {} }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "orcid",
"comparator": "exactMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "orcids",
"ignoreUndefined": "true"
},
"orcids": {
"fields": [
{
"field": "orcids",
"comparator": "stringListMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 3.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "coauthors",
"undefined": "coauthors",
"ignoreUndefined": "true"
},
"coauthors": {
"fields": [
{
"field": "coauthors",
"comparator": "authorsMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {"type": "count"}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "topicsMatch",
"negative": "NO_MATCH",
"undefined": "topicsMatch",
"ignoreUndefined": "true"
},
"topicsMatch": {
"fields": [
{
"field": "topics",
"comparator": "cosineSimilarity",
"weight": 1.0,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "MAX",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
}
},
"model": [
{
"name": "topics",
"type": "DoubleArray",
"path": "$.topics"
},
{
"name": "fullname",
"type": "String",
"path": "$.fullname"
},
{
"name": "orcid",
"type": "String",
"path": "$.orcid"
},
{
"name": "coauthors",
"type": "List",
"path": "$.coAuthors[*].fullname"
},
{
"name": "orcids",
"type": "List",
"path": "$.coAuthors[*].orcid"
}
],
"blacklists": {},
"synonyms": {}
}
}

View File

@ -1 +0,0 @@
{"fullname":"Zaragoza, Maria Cleofé","firstname":"Maria Cleofé","lastname":"Zaragoza","coAuthors":[{"fullname":"Cambras, Trinitat","lastname":"Cambras","firstname":"Trinitat","orcid":"0000-0002-9009-4690"},{"fullname":"Castro-Marrero, Jesús","lastname":"Castro-Marrero","firstname":"Jesús","orcid":""},{"fullname":"Díez-Noguera, Antoni","lastname":"Díez-Noguera","firstname":"Antoni","orcid":""},{"fullname":"Alegre, José","lastname":"Alegre","firstname":"José","orcid":"0000-0002-7582-7585"}],"topics":[0.9522090839562252,0.04779091604377485],"orcid":"0000-0002-9797-0219","id":"author::1a10826c83c7f9f0dcebe7df05e37a2a","pubId":"50|pmid________::db7fd19db5a620eafad40cfb97f9690d"}

30
pom.xml
View File

@ -5,7 +5,7 @@
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-dedup</artifactId>
<version>4.1.13-SNAPSHOT</version>
<version>4.1.12</version>
<packaging>pom</packaging>
@ -22,7 +22,7 @@
<scm>
<developerConnection>scm:git:https://code-repo.d4science.org/D-Net/dnet-dedup.git</developerConnection>
<tag>dnet-dedup-4.0.3</tag>
<tag>dnet-dedup-4.1.12</tag>
</scm>
<modules>
@ -144,7 +144,14 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.0</version>
<version>2.19.1</version>
<dependencies>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
<version>${junit-jupiter.version}</version>
</dependency>
</dependencies>
<configuration>
<redirectTestOutputToFile>false</redirectTestOutputToFile>
</configuration>
@ -254,7 +261,7 @@
<oozie.use.system.libpath>true</oozie.use.system.libpath>
<properties.maven.plugin.version>2.0.1</properties.maven.plugin.version>
<junit-jupiter.version>5.6.1</junit-jupiter.version>
<maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>../dhp-build/dhp-build-assembly-resources/target/dhp-build-assembly-resources-${project.version}.jar</maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>
<maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>../dhp-build/dhp-build-assembly-resources/target/dhp-build-assembly-resources-4.1.8-SNAPSHOT.jar</maven.dependency.eu.dnetlib.dhp.dhp-build-assembly-resources.jar.path>
</properties>
@ -403,12 +410,27 @@
<version>2.4.0</version>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>3.3.3</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-junit-jupiter</artifactId>
<version>3.3.3</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>70.1</version>
</dependency>
</dependencies>
</dependencyManagement>