From 85070ce3fe24d1b351619e7ec607de4a1489a171 Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 6 Aug 2019 12:09:34 +0200 Subject: [PATCH] addition of the BlockUtils class for meta-blocking, implementation of a new local test with edge filtering example --- dnet-dedup-test/dependency-reduced-pom.xml | 119 +++++++++++ dnet-dedup-test/pom.xml | 27 +++ .../src/main/java/eu/dnetlib/Block.java | 50 +++++ .../main/java/eu/dnetlib/SparkLocalTest.java | 41 ++-- .../main/java/eu/dnetlib/SparkLocalTest2.java | 118 +++++++++++ .../eu/dnetlib/graph/GraphProcessor.scala | 2 +- .../eu/dnetlib/pace/utils/BlockUtils.java | 168 +++++++++++++++ .../dnetlib/reporter/SparkBlockProcessor.java | 2 - .../reporter/SparkBlockProcessor2.java | 193 ++++++++++++++++++ .../eu/dnetlib/pace/org.curr.beta.conf | 35 ++++ .../resources/eu/dnetlib/pace/org.curr.conf | 2 +- .../eu/dnetlib/pace/organization.to.fix.json | 37 ++-- .../java/eu/dnetlib/pace/DedupTestIT.java | 1 - .../dnetlib/pace/config/translation_map.csv | 2 +- .../clustering/ClusteringFunctionTest.java | 2 +- .../pace/distance/DistanceAlgoTest.java | 7 + 16 files changed, 753 insertions(+), 53 deletions(-) create mode 100644 dnet-dedup-test/dependency-reduced-pom.xml create mode 100644 dnet-dedup-test/src/main/java/eu/dnetlib/Block.java create mode 100644 dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest2.java create mode 100644 dnet-dedup-test/src/main/java/eu/dnetlib/pace/utils/BlockUtils.java create mode 100644 dnet-dedup-test/src/main/java/eu/dnetlib/reporter/SparkBlockProcessor2.java create mode 100644 dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.beta.conf diff --git a/dnet-dedup-test/dependency-reduced-pom.xml b/dnet-dedup-test/dependency-reduced-pom.xml new file mode 100644 index 0000000..07b9268 --- /dev/null +++ b/dnet-dedup-test/dependency-reduced-pom.xml @@ -0,0 +1,119 @@ + + + + dnet-dedup + eu.dnetlib + 3.0.14-SNAPSHOT + + 4.0.0 + dnet-dedup-test + + + + maven-shade-plugin + 2.4.3 + + + package + + shade + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + + + maven-deploy-plugin + 2.7 + + true + + + + maven-compiler-plugin + + 1.8 + 1.8 + + **/*.java + + + + + net.alchim31.maven + scala-maven-plugin + 4.0.1 + + + scala-compile-first + initialize + + add-source + compile + + + + scala-test-compile + process-test-resources + + testCompile + + + + + ${scala.version} + + + + + + + junit + junit + 4.9 + test + + + hamcrest-core + org.hamcrest + + + + + org.apache.oozie + oozie-client + 5.1.0 + test + + + json-simple + com.googlecode.json-simple + + + jms + javax.jms + + + slf4j-simple + org.slf4j + + + oozie-fluent-job-api + org.apache.oozie + + + + + + diff --git a/dnet-dedup-test/pom.xml b/dnet-dedup-test/pom.xml index 601de65..e5d429b 100644 --- a/dnet-dedup-test/pom.xml +++ b/dnet-dedup-test/pom.xml @@ -15,6 +15,33 @@ + + + org.apache.maven.plugins + maven-shade-plugin + 2.4.3 + + + package + + shade + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + + org.apache.maven.plugins maven-deploy-plugin diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/Block.java b/dnet-dedup-test/src/main/java/eu/dnetlib/Block.java new file mode 100644 index 0000000..5d8aa98 --- /dev/null +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/Block.java @@ -0,0 +1,50 @@ +package eu.dnetlib; + +import eu.dnetlib.pace.model.MapDocument; + +import java.io.Serializable; +import java.util.Arrays; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +public class Block implements Serializable { + + String key; + List elements; + + public Block(String key, Iterable elements){ + this.key = key; + this.elements = StreamSupport.stream(elements.spliterator(), false).collect(Collectors.toList()); + } + + public Block(String key, List elements){ + this.key = key; + this.elements = elements; + } + + public String getKey() { + return key; + } + + public void setKey(String key) { + this.key = key; + } + + public List getElements() { + return elements; + } + + public void setElements(List elements) { + this.elements = elements; + } + + public int comparisons(){ + int size = elements.size(); + return (size*(size-1)/2); + } + + public int elements(){ + return elements.size(); + } +} diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java index 8a1783a..851c82e 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest.java @@ -1,12 +1,12 @@ package eu.dnetlib; -import com.google.common.collect.Iterables; import eu.dnetlib.graph.GraphProcessor; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.utils.PaceUtils; import eu.dnetlib.reporter.SparkBlockProcessor; import eu.dnetlib.reporter.SparkReporter; + import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -17,13 +17,15 @@ import org.apache.spark.util.LongAccumulator; import scala.Tuple2; import java.net.URL; -import java.util.Map; +import java.util.*; import java.util.stream.Collectors; public class SparkLocalTest { public static void main(String[] args) { + double startTime = System.currentTimeMillis(); + final SparkSession spark = SparkSession .builder() .appName("Deduplication") @@ -33,7 +35,7 @@ public class SparkLocalTest { final JavaSparkContext context = new JavaSparkContext(spark.sparkContext()); final URL dataset = SparkLocalTest.class.getResource("/eu/dnetlib/pace/organization.to.fix.json"); - final JavaRDD dataRDD = context.textFile(dataset.getPath()); + JavaRDD dataRDD = context.textFile(dataset.getPath()); //read the configuration from the classpath final DedupConfig config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/org.curr.conf", SparkLocalTest.class)); @@ -46,32 +48,24 @@ public class SparkLocalTest { return new Tuple2<>(mapDocument.getIdentifier(), mapDocument); }); -// mapDocs.foreach(doc -> System.out.println("doc = " + doc._2().getFieldMap().get("legalname"))); - -// mapDocs.filter(d -> d._2().getFieldMap().get("doi").stringValue().length() > 0).foreach(d -> System.out.println(d)); -// mapDocs.filter(d -> d._2().getFieldMap().get("documentationUrl").stringValue().length() > 0).foreach(d -> System.out.println(d)); +// System.out.println("mapDocs = " + mapDocs.count()); RDD> vertexes = mapDocs.mapToPair(t -> new Tuple2( (long) t._1().hashCode(), t._2())).rdd(); //create relations between documents - JavaPairRDD> blocks = mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id + JavaRDD blocks = mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id //Clustering: from to List .flatMapToPair(a -> { final MapDocument currentDocument = a._2(); return Utility.getGroupingKeys(config, currentDocument).stream() .map(it -> new Tuple2<>(it, currentDocument)).collect(Collectors.toList()).iterator(); - }).groupByKey();//group documents basing on the key - - //print blocks -// blocks = blocks.filter(b -> Iterables.size(b._2())>1); -//// vertexes = blocks.flatMap(b -> b._2().iterator()).map(t -> new Tuple2((long) t.getIdentifier().hashCode(), t)).rdd(); -// blocks.map(group -> new DocumentsBlock(group._1(), group._2())).foreach(b -> System.out.println(b)); + }).groupByKey().map(b -> new Block(b._1(), b._2())).filter(b -> b.getElements().size()>1); //create relations by comparing only elements in the same group final JavaPairRDD relationRDD = blocks.flatMapToPair(it -> { final SparkReporter reporter = new SparkReporter(); - new SparkBlockProcessor(config).process(it._1(), it._2(), reporter, accumulators); + new SparkBlockProcessor(config).process(it.getKey(), it.getElements(), reporter, accumulators); return reporter.getReport().iterator(); }); @@ -79,6 +73,14 @@ public class SparkLocalTest { JavaRDD ccs = GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD(); + System.out.println("total time = " + (System.currentTimeMillis()-startTime)); + + printStatistics(ccs); + accumulators.forEach((name, acc) -> System.out.println(name + " -> " + acc.value())); + + } + + public static void printStatistics(JavaRDD ccs){ final JavaRDD connectedComponents = ccs.filter(cc -> cc.getDocs().size()>1); final JavaRDD nonDeduplicated = ccs.filter(cc -> cc.getDocs().size()==1); @@ -86,6 +88,9 @@ public class SparkLocalTest { connectedComponents.foreach(cc -> { System.out.println(cc); }); + connectedComponents.foreach(cc -> { + cc.getDocs().stream().forEach(d -> System.out.println(d.getFieldMap().get("legalname") + " | " + d.getFieldMap().get("legalshortname"))); + }); //print nondeduped nonDeduplicated.foreach(cc -> { System.out.println(cc); @@ -95,12 +100,6 @@ public class SparkLocalTest { System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count()); System.out.println("Connected Components: " + connectedComponents.count()); - accumulators.forEach((name, acc) -> System.out.println(name + " -> " + acc.value())); - -// //print ids -// ccs.foreach(cc -> System.out.println(cc.getId())); -// connectedComponents.saveAsTextFile("file:///Users/miconis/Downloads/dumps/organizations_dedup"); - } } \ No newline at end of file diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest2.java b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest2.java new file mode 100644 index 0000000..3ce188d --- /dev/null +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/SparkLocalTest2.java @@ -0,0 +1,118 @@ +package eu.dnetlib; + +import com.google.common.collect.Lists; +import eu.dnetlib.graph.GraphProcessor; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.utils.PaceUtils; +import eu.dnetlib.reporter.SparkBlockProcessor2; +import eu.dnetlib.reporter.SparkReporter; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.graphx.Edge; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.SparkSession; +import org.apache.spark.util.LongAccumulator; +import scala.Tuple2; + +import java.math.BigInteger; +import java.net.URL; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +public class SparkLocalTest2 { + + public static void main(String[] args) { + + double startTime = System.currentTimeMillis(); + + final SparkSession spark = SparkSession + .builder() + .appName("Deduplication") + .master("local[*]") + .getOrCreate(); + + final JavaSparkContext context = new JavaSparkContext(spark.sparkContext()); + + final URL dataset = SparkLocalTest2.class.getResource("/eu/dnetlib/pace/softwares.huge.json"); + JavaRDD dataRDD = context.textFile(dataset.getPath()); + + //read the configuration from the classpath + final DedupConfig config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/software.pace.conf", SparkLocalTest2.class)); + + Map accumulators = Utility.constructAccumulator(config, context.sc()); + + //create vertexes of the graph: + JavaPairRDD mapDocs = dataRDD.mapToPair(it -> { + MapDocument mapDocument = PaceUtils.asMapDocument(config, it); + return new Tuple2<>(mapDocument.getIdentifier(), mapDocument); + }); + +// System.out.println("mapDocs = " + mapDocs.count()); + + RDD> vertexes = mapDocs.mapToPair(t -> new Tuple2( (long) t._1().hashCode(), t._2())).rdd(); + + //create relations between documents + JavaRDD blocks = mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id + //Clustering: from to List + .flatMapToPair(a -> { + final MapDocument currentDocument = a._2(); + + return Utility.getGroupingKeys(config, currentDocument).stream() + .map(it -> new Tuple2<>(it, currentDocument)).collect(Collectors.toList()).iterator(); + }).groupByKey().map(b -> new Block(b._1(), b._2())).filter(b -> b.getElements().size()>1); + +// //BLOCK PURGING +// blocks = BlockUtils.blockPurging2(blocks); +//// blockPurging(blocks); +// +//// //BLOCK FILTERING +// blocks = BlockUtils.blockFiltering(blocks); + + JavaPairRDD, Integer> edge = blocks.flatMap(it -> { + final SparkReporter reporter = new SparkReporter(); + return new SparkBlockProcessor2(config).process(it.getKey(), it.getElements(), reporter, accumulators); + }).mapToPair(candidate -> new Tuple2<>(candidate, 1)) + .reduceByKey((a, b) -> a + b); + + final JavaPairRDD relationRDD = edge.filter(e -> { + final SparkReporter reporter = new SparkReporter(); + return new SparkBlockProcessor2(config).isSimilar(e._1(), reporter, accumulators); + }).mapToPair(t -> new Tuple2<>(t._1()._1().getIdentifier(), t._1()._2().getIdentifier())); + + System.out.println("relationRDD = " + relationRDD.count()); + + final RDD> edgeRdd = relationRDD.map(it -> new Edge<>(it._1().hashCode(),it._2().hashCode(), "similarTo")).rdd(); + + JavaRDD ccs = GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD(); + + System.out.println("total time = " + (System.currentTimeMillis()-startTime)); + + printStatistics(ccs); + accumulators.forEach((name, acc) -> System.out.println(name + " -> " + acc.value())); + + } + + public static void printStatistics(JavaRDD ccs){ + final JavaRDD connectedComponents = ccs.filter(cc -> cc.getDocs().size()>1); + final JavaRDD nonDeduplicated = ccs.filter(cc -> cc.getDocs().size()==1); + +// //print deduped +// connectedComponents.foreach(cc -> { +// System.out.println(cc); +// }); +// //print nondeduped +// nonDeduplicated.foreach(cc -> { +// System.out.println(cc); +// }); + + System.out.println("Non duplicates: " + nonDeduplicated.count()); + System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count()); + System.out.println("Connected Components: " + connectedComponents.count()); + + } + +} \ No newline at end of file diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/graph/GraphProcessor.scala b/dnet-dedup-test/src/main/java/eu/dnetlib/graph/GraphProcessor.scala index bc63101..2d815c7 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/graph/GraphProcessor.scala +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/graph/GraphProcessor.scala @@ -10,7 +10,7 @@ import scala.collection.JavaConversions; object GraphProcessor { def findCCs(vertexes: RDD[(VertexId, MapDocument)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = { - val graph: Graph[MapDocument, String] = Graph(vertexes, edges) + val graph: Graph[MapDocument, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby val cc = graph.connectedComponents(maxIterations).vertices val joinResult = vertexes.leftOuterJoin(cc).map { diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/pace/utils/BlockUtils.java b/dnet-dedup-test/src/main/java/eu/dnetlib/pace/utils/BlockUtils.java new file mode 100644 index 0000000..99152dc --- /dev/null +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/pace/utils/BlockUtils.java @@ -0,0 +1,168 @@ +package eu.dnetlib.pace.utils; + +import com.google.common.collect.Lists; +import eu.dnetlib.Block; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import scala.Tuple2; + +import java.io.Serializable; +import java.math.BigInteger; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; + +public class BlockUtils implements Serializable { + + public static double getOptimalComparisonNumber(JavaRDD blocks) { + + double SMOOTHING_FACTOR = 1.05; + + //pairRDD: cardinality, #elements + List>> collect = blocks.mapToPair(b -> new Tuple2<>(b.comparisons(), b.elements())) + .mapToPair(bs -> new Tuple2<>(bs._1(), new Tuple2<>(bs._1(), bs._2()))) + .reduceByKey((a, b) -> new Tuple2<>(a._1() + b._1(), a._2() + b._2())).collect(); + + collect = new ArrayList<>(collect); + collect.sort(Comparator.comparing(Tuple2::_1)); + + double[] blockAssignments = new double[collect.size()]; + double[] comparisonsLevel = new double[collect.size()]; + double[] totalComparisonsPerLevel = new double[collect.size()]; + Integer totalComparisons = collect.get(0)._2()._1(); + Integer totalBlockSize = collect.get(0)._2()._2(); + blockAssignments[0] = totalBlockSize; + comparisonsLevel[0] = collect.get(0)._1(); + totalComparisonsPerLevel[0] = totalComparisons; + for (int i=1; i blocks){ + + BigInteger numberOfComparisons = BigInteger.ZERO; + BigInteger totalSizeOfBlocks = BigInteger.ZERO; + BigInteger blockSize; + + //block_size, frequency + JavaPairRDD blocksFreq = blocks.mapToPair(b -> new Tuple2<>(b.getKey(), b.elements())) + .mapToPair(bs -> new Tuple2<>(bs._2(),1)) + .reduceByKey((a,b) -> a+b).sortByKey(); + + ArrayList> blockSizesAndFreq = new ArrayList<>(blocksFreq.collect()); + + double CC = 0d; + int freq; + + /* + * statistics: array of pairs (blockSize, CC) for every blockSize + */ + ArrayList> statistics = new ArrayList<>(); + + for (int i = 0; i < blockSizesAndFreq.size(); i++) { + blockSize = new BigInteger(blockSizesAndFreq.get(i)._1.toString()); + + freq = blockSizesAndFreq.get(i)._2; + + totalSizeOfBlocks = totalSizeOfBlocks.add(BigInteger.valueOf(freq).multiply(blockSize)); + + //accumulated number of comparisons + numberOfComparisons = numberOfComparisons.add(BigInteger.valueOf(freq) + .multiply(blockSize.multiply(blockSize.subtract(BigInteger.ONE)).shiftLeft(1))); + + CC = totalSizeOfBlocks.doubleValue() / numberOfComparisons.doubleValue(); + + Tuple2 st = new Tuple2<>(blockSize.intValue(), CC); + + statistics.add(st); + + } + + int optimalBlockSize = statistics.get(statistics.size() - 1)._1;// lastBlockSize; + + double eps = 1d; //smoothing factor + + /* + * find minimum difference for every adjacent pair i,i-1 the minimum difference + * represents the optimal blockSize + */ + for (int i = statistics.size() - 1; i >= 1; i--) { + if (Math.abs(statistics.get(i)._2 - statistics.get(i - 1)._2) < eps) { + + eps = Math.abs(statistics.get(i)._2 - statistics.get(i - 1)._2); + + optimalBlockSize = statistics.get(i)._1; + } + } + return optimalBlockSize; + } + + //cut blocks basing on number of elements + public static JavaRDD blockPurging(JavaRDD blocks) { + + int optimalBlockSize = getOptimalBlockSize(blocks); + + System.out.println("optimalBlockSize = " + optimalBlockSize); + + return blocks.filter(b -> b.getElements().size() < optimalBlockSize); + } + + //cut blocks basing on number of comparisons + public static JavaRDD blockPurging2(JavaRDD blocks) { + + double optimalComparisonNumber = getOptimalComparisonNumber(blocks); + + System.out.println("optimalComparisonNumber = " + optimalComparisonNumber); + + return blocks.filter(b -> b.comparisons() < optimalComparisonNumber); + } + + public static JavaRDD blockFiltering(JavaRDD blocks) { + double RATIO = 0.85; + + return blocks + .flatMapToPair(b -> b.getElements().stream().map(e -> new Tuple2<>(e, new Tuple2<>(b.getKey(), b.comparisons()))).iterator()) + .groupByKey() + .mapToPair(es -> { + List> b = Lists.newArrayList(es._2()); + b.sort(Comparator.comparing(Tuple2::_2)); + int size = b.size(); + long limit = Math.round(size*RATIO); + return new Tuple2<>(es._1(),b.subList(0,(int)limit)); + }) + .flatMapToPair(es -> es._2().stream().map(it -> new Tuple2<>(it._1(), es._1())).collect(Collectors.toList()).iterator()) + .groupByKey().map(b -> new Block(b._1(), b._2())); + } + +} diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/reporter/SparkBlockProcessor.java b/dnet-dedup-test/src/main/java/eu/dnetlib/reporter/SparkBlockProcessor.java index 21e6fd2..5402536 100644 --- a/dnet-dedup-test/src/main/java/eu/dnetlib/reporter/SparkBlockProcessor.java +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/reporter/SparkBlockProcessor.java @@ -32,8 +32,6 @@ public class SparkBlockProcessor { final Queue q = prepare(documents); if (q.size() > 1) { -// log.info("reducing key: '" + key + "' records: " + q.size()); - //process(q, context); process(simplifyQueue(q, key, context, accumulators), context, accumulators); } else { diff --git a/dnet-dedup-test/src/main/java/eu/dnetlib/reporter/SparkBlockProcessor2.java b/dnet-dedup-test/src/main/java/eu/dnetlib/reporter/SparkBlockProcessor2.java new file mode 100644 index 0000000..aa30db8 --- /dev/null +++ b/dnet-dedup-test/src/main/java/eu/dnetlib/reporter/SparkBlockProcessor2.java @@ -0,0 +1,193 @@ +package eu.dnetlib.reporter; +import com.google.common.collect.Lists; +import eu.dnetlib.pace.clustering.NGramUtils; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.config.WfConfig; +import eu.dnetlib.pace.distance.PaceDocumentDistance; +import eu.dnetlib.pace.distance.eval.ScoreResult; +import eu.dnetlib.pace.model.Field; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.model.MapDocumentComparator; +import org.apache.commons.lang.StringUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.spark.util.LongAccumulator; +import scala.Tuple2; + +import java.util.*; + +public class SparkBlockProcessor2 { + + private static final Log log = LogFactory.getLog(SparkBlockProcessor2.class); + + private DedupConfig dedupConf; + + public SparkBlockProcessor2(DedupConfig dedupConf) { + this.dedupConf = dedupConf; + } + + public boolean isSimilar(Tuple2 t, SparkReporter context, Map accumulators) { + + final PaceDocumentDistance algo = new PaceDocumentDistance(); + + final ScoreResult sr = similarity(algo, t._1(), t._2()); + + final double d = sr.getScore(); + + if (d >= dedupConf.getWf().getThreshold()) { + context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1, accumulators); + return true; + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1, accumulators); + return false; + } + } + + public Iterator> process(final String key, final Iterable documents, final SparkReporter context, Map accumulators) { + + final Queue q = prepare(documents); + + if (q.size() > 1) { + return process(simplifyQueue(q, key, context, accumulators), context, accumulators); + + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1, accumulators); + return new ArrayList>().iterator(); + } + } + + private Queue prepare(final Iterable documents) { + final Queue queue = new PriorityQueue<>(100, new MapDocumentComparator(dedupConf.getWf().getOrderField())); + + final Set seen = new HashSet(); + final int queueMaxSize = dedupConf.getWf().getQueueMaxSize(); + + documents.forEach(doc -> { + if (queue.size() <= queueMaxSize) { + final String id = doc.getIdentifier(); + + if (!seen.contains(id)) { + seen.add(id); + queue.add(doc); + } + } + }); + + return queue; + } + + private Queue simplifyQueue(final Queue queue, final String ngram, final SparkReporter context, Map accumulators) { + final Queue q = new LinkedList<>(); + + String fieldRef = ""; + final List tempResults = Lists.newArrayList(); + + while (!queue.isEmpty()) { + final MapDocument result = queue.remove(); + + final String orderFieldName = dedupConf.getWf().getOrderField(); + final Field orderFieldValue = result.values(orderFieldName); + if (!orderFieldValue.isEmpty()) { + final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue()); + if (field.equals(fieldRef)) { + tempResults.add(result); + } else { + populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram, accumulators); + tempResults.clear(); + tempResults.add(result); + fieldRef = field; + } + } else { + context.incrementCounter(dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1, accumulators); + } + } + populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram, accumulators); + + return q; + } + + private void populateSimplifiedQueue(final Queue q, + final List tempResults, + final SparkReporter context, + final String fieldRef, + final String ngram, + Map accumulators) { + WfConfig wf = dedupConf.getWf(); + if (tempResults.size() < wf.getGroupMaxSize()) { + q.addAll(tempResults); + } else { + context.incrementCounter(wf.getEntityType(), String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), tempResults.size(), accumulators); +// log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram); + } + } + + private Iterator> process(final Queue queue, final SparkReporter context, Map accumulators) { + + final PaceDocumentDistance algo = new PaceDocumentDistance(); + + List> ret = new ArrayList<>(); + + while (!queue.isEmpty()) { + + final MapDocument pivot = queue.remove(); + final String idPivot = pivot.getIdentifier(); + + WfConfig wf = dedupConf.getWf(); + final Field fieldsPivot = pivot.values(wf.getOrderField()); + final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? null : fieldsPivot.stringValue(); + + if (fieldPivot != null) { + // System.out.println(idPivot + " --> " + fieldPivot); + + int i = 0; + for (final MapDocument curr : queue) { + final String idCurr = curr.getIdentifier(); + + if (mustSkip(idCurr)) { + + context.incrementCounter(wf.getEntityType(), "skip list", 1, accumulators); + + break; + } + + if (i > wf.getSlidingWindowSize()) { + break; + } + + final Field fieldsCurr = curr.values(wf.getOrderField()); + final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null : fieldsCurr.stringValue(); + + if (!idCurr.equals(idPivot) && (fieldCurr != null)) { + + if (pivot.getIdentifier().compareTo(curr.getIdentifier())<0){ + ret.add(new Tuple2<>(pivot, curr)); + } else { + ret.add(new Tuple2<>(curr, pivot)); + } + i++; + } + } + } + } + + return ret.iterator(); + } + + private ScoreResult similarity(final PaceDocumentDistance algo, final MapDocument a, final MapDocument b) { + try { + return algo.between(a, b, dedupConf); + } catch(Throwable e) { + log.error(String.format("\nA: %s\n----------------------\nB: %s", a, b), e); + throw new IllegalArgumentException(e); + } + } + + private boolean mustSkip(final String idPivot) { + return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot)); + } + + private String getNsPrefix(final String id) { + return StringUtils.substringBetween(id, "|", "::"); + } + +} diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.beta.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.beta.conf new file mode 100644 index 0000000..d184605 --- /dev/null +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.beta.conf @@ -0,0 +1,35 @@ +{ + "wf" : { + "threshold" : "0.9", + "dedupRun" : "001", + "entityType" : "organization", + "orderField" : "legalname", + "queueMaxSize" : "2000", + "groupMaxSize" : "50", + "slidingWindowSize" : "200", + "rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ], + "includeChildren" : "true" + }, + "pace" : { + "clustering" : [ + { "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, + { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } } + ], + "strictConditions" : [ + { "name" : "exactMatch", "fields" : [ "gridid" ] } + ], + "conditions" : [ + { "name" : "exactMatch", "fields" : [ "country" ] }, + { "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] } + ], + "model" : [ + { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" }, + { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, + { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.5} }, + { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }, + { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" } + ], + "blacklists" : { } + } +} \ No newline at end of file diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf index 4384df8..21fd1e2 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/org.curr.conf @@ -27,7 +27,7 @@ "model" : [ { "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" }, { "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" }, - { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.5} }, + { "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} }, { "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }, { "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" } ], diff --git a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json index ab544bf..1e335ed 100644 --- a/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json +++ b/dnet-dedup-test/src/main/resources/eu/dnetlib/pace/organization.to.fix.json @@ -1,25 +1,12 @@ -{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::999864846"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPV"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upv.es"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITAT POLITECNICA DE VALENCIA"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::542f36b24ca4203a5b4dfc8396ef7475"} -{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::999828859"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPCT"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upct.es"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSIDAD POLITECNICA DE CARTAGENA"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::9df7abeaef39ed5bac9c3e3a10a2be91"} -{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::999974844"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPM"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upm.es"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSIDAD POLITECNICA DE MADRID"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::7a44a773d0dc629e9af5cef563c2478a"} -{"dateoftransformation":"2018-11-20","originalId":["corda_______::999976202"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPC"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"websiteurl":{"value":"http://www.upc.edu"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITAT POLITECNICA DE CATALUNYA"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::0a1f410e6b3374e015b9aead9d97731a"} -{"dateoftransformation":"2018-11-20","originalId":["corda_______::999864846"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPV"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upv.es"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITAT POLITECNICA DE VALENCIA"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::542f36b24ca4203a5b4dfc8396ef7475"} -{"dateoftransformation":"2018-11-20","originalId":["corda_______::999974844"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPM"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upm.es"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSIDAD POLITECNICA DE MADRID"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::7a44a773d0dc629e9af5cef563c2478a"} -{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::999976202"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPC"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upc.edu"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITAT POLITECNICA DE CATALUNYA"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::0a1f410e6b3374e015b9aead9d97731a"} -{"dateoftransformation":"2018-11-20","originalId":["corda_______::999828859"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"legalshortname":{"value":"UPCT"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"websiteurl":{"value":"http://www.upct.es"},"ecnutscode":{"value":"false"},"legalname":{"value":"UNIVERSIDAD POLITECNICA DE CARTAGENA"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::9df7abeaef39ed5bac9c3e3a10a2be91"} -{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Universidad_Politécnica_de_Madrid"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Universidad Politécnica de Madrid"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universidad Politécnica de Madrid"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::3cdfe6eec5fafec302e02cb7be692318"} -{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Universitat_Politècnica_de_València"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Universitat Politècnica de València"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universitat Politècnica de València"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::b9fc5f6fb04d0f8d7d0b163718f9785c"} -{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Universidad_Politécnica_de_Cartagena"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Universidad Politécnica de Cartagena"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universidad Politécnica de Cartagena"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::cf3bddf37ca3440dbd8e5ba6e81f6df3"} -{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Universitat_Politècnica_de_Catalunya"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Universitat Politècnica de Catalunya"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universitat Politècnica de Catalunya"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::7e232375e9902c823cf3cd4e1a16de5e"} -{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Universidad_Politécnica_de_Cataluña"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Universidad Politécnica de Cataluña"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universidad Politécnica de Cataluña"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::98e85e22b787db82b4c806e4f452f456"} -{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Universitat_Politècnica_deValència"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Universitat Politècnica deValència"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universitat Politècnica deValència"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::637209da5f9f175227735f91e63d4999"} -{"dateoftransformation":"2018-09-13","originalId":["re3data_____::c9820581fcddf3a7a50652e4ac5e5aab"],"collectedfrom":[{"value":"Registry of Research Data Repository","key":"10|openaire____::21f8a223b9925c2f87c404096080b046"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"https://www.upc.edu"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universitat Politècnica de Catalunya"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|re3data_____::044d264e43ee7cd7bfbc3deb1532037a"} -{"dateoftransformation":"2019-04-11","originalId":["re3data_____::211b4cb099c317e9cd9073476566ca47"],"collectedfrom":[{"value":"Registry of Research Data Repository","key":"10|openaire____::21f8a223b9925c2f87c404096080b046"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upv.es/index-en.html"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universitat Politècnica de València"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2019-04-11","type":20,"id":"20|re3data_____::465c66fba7c037fde2dcc962f94e44a9"} -{"dateoftransformation":"2018-09-13","originalId":["re3data_____::fbdc48c28389453ae6008622c1e5b0e1"],"collectedfrom":[{"value":"Registry of Research Data Repository","key":"10|openaire____::21f8a223b9925c2f87c404096080b046"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPM"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upm.es/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universidad Politécnica de Madrid"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-02-13","type":20,"id":"20|re3data_____::0cbf4c44e9d045ffb0aa303354fff947"} -{"dateoftransformation":"2018-09-13","originalId":["snsf________::Universitat_Politècnica_de_Catalunya"],"collectedfrom":[{"value":"SNSF - Swiss National Science Foundation","key":"10|openaire____::d8f3c25e18304608ce8e816e99603d7a"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universitat Politècnica de Catalunya"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-09-23","type":20,"id":"20|snsf________::7e232375e9902c823cf3cd4e1a16de5e"} -{"dateoftransformation":"2018-09-13","originalId":["opendoar____::Universidad_Politécnica_de_Cartagena"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPCT"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upct.es/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universidad Politécnica de Cartagena"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2015-08-24","type":20,"id":"20|opendoar____::cf3bddf37ca3440dbd8e5ba6e81f6df3"} -{"dateoftransformation":"2018-09-13","originalId":["opendoar____::Universitat_Politècnica_de_València"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upv.es/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universitat Politècnica de València"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2015-08-24","type":20,"id":"20|opendoar____::b9fc5f6fb04d0f8d7d0b163718f9785c"} -{"dateoftransformation":"2018-09-13","originalId":["opendoar____::Universidad_Politecnica_de_Madrid"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upm.es/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universidad Politecnica de Madrid"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2015-08-24","type":20,"id":"20|opendoar____::c4b8d1e68de570b064c666c370bc2168"} -{"dateoftransformation":"2018-11-12","originalId":["opendoar____::Universitat_Politènica_de_Catalunya"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPC"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upc.edu/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universitat Politènica de Catalunya"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-11-12","type":20,"id":"20|opendoar____::54e1f400c1b5dba4fa5a18571d50c3f3"} -{"dateoftransformation":"2019-05-19","originalId":["rcuk________::B0DA50CA-D11E-4251-9678-4AA2F93DB545"],"collectedfrom":[{"value":"Research Councils UK","key":"10|openaire____::ab2d3310741ea80d3b8726f651502858"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Polytechnic University of Catalonia"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-11-04","type":20,"id":"20|rcuk________::5cd82a9ed265a9cb392d185a688cad40"} -{"dateoftransformation":"2018-09-13","originalId":["openaire____::UE9MSS1SRUQgKFJldmlzdGFzIERpZ2l0YWxlcyBQb2xpdMOpY25pY2FzKQ==::UNIVERSIDAD POLITECNICA DE MADRID"],"collectedfrom":[{"value":"","key":""}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSIDAD POLITECNICA DE MADRID"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-11-03","type":20,"id":"20|openaire____::9f185b9f2f1c932b492f6fb53c8c5caf"} -{"dateoftransformation":"2018-09-13","originalId":["openaire____::issn24441309::UNIVERSIDAD POLITECNICA DE MADRID"],"collectedfrom":[{"value":"","key":""}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSIDAD POLITECNICA DE MADRID"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-11-03","type":20,"id":"20|openaire____::935af0561af9c6e33326818b32805241"} \ No newline at end of file +{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIOK"},"websiteurl":{"value":"http://www.niok.eu/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"NIOK"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.450158.d"}],"type":20,"id":"20|grid________::6183d331a1920dd81b8c10620a8b3a8a"} +{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIVEL"},"websiteurl":{"value":"http://www.nivel.nl/en"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"NIVEL"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.416005.6"}],"type":20,"id":"20|grid________::8f65fd4e764086db897cc648e9cbbaed"} +{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"SCP"},"websiteurl":{"value":"http://www.scp.nl/english/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Netherlands Institute for Social Research"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.438038.4"}],"type":20,"id":"20|grid________::c69cffc4997b54bb2eb5ca6aebcda18b"} +{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIVEL"},"websiteurl":{"value":"http://www.nivel.nl/en"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Netherlands Institute for Health Services Research"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.416005.6"}],"type":20,"id":"20|grid________::5b72dc608480f3d5569a7bfe3cbdaf07"} +{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"SCP"},"websiteurl":{"value":"http://www.scp.nl/english/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"SCP"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.438038.4"}],"type":20,"id":"20|grid________::6b7b927a3ae25f1639a6ef27b35021b5"} +{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIOK"},"websiteurl":{"value":"http://www.niok.eu/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Netherlands Institute for Catalysis Research"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.450158.d"}],"type":20,"id":"20|grid________::6af340f03c44041737859d3e1354d1fe"} +{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIVEL"},"websiteurl":{"value":"http://www.nivel.nl/en"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Nederlands Instituut voor Onderzoek van de Gezondheidszorg"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.416005.6"}],"type":20,"id":"20|grid________::69ab0f5ed7da9d961355cb4eb24b8613"} +{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIOK"},"websiteurl":{"value":"http://www.niok.eu/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Nederlands Instituut voor Onderzoek in de Katalyse"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.450158.d"}],"type":20,"id":"20|grid________::267cf3ce23903e0a8403653019ce8187"} +{"dateoftransformation":"2018-11-20","originalId":["corda_______::998294125"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NIVEL"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nivel.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NEDERLANDS INSTITUUT VOOR ONDERZOEK VAN DE GEZONDHEIDSZORG"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::755737ed505484ea374062762ef05ef6"} +{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::998294125"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NIVEL"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nivel.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NEDERLANDS INSTITUUT VOOR ONDERZOEK VAN DE GEZONDHEIDSZORG"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::755737ed505484ea374062762ef05ef6"} +{"dateoftransformation":"2018-09-13","originalId":["snsf________::The_Netherlands_Institute_of_Health_Services_Research_NIVEL"],"collectedfrom":[{"value":"SNSF - Swiss National Science Foundation","key":"10|openaire____::d8f3c25e18304608ce8e816e99603d7a"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"The Netherlands Institute of Health Services Research NIVEL"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-09-22","type":20,"id":"20|snsf________::10653be4e9c170181486aa9782346d81"} +{"dateoftransformation":"2018-09-13","originalId":["openaire____::088a0087-4bc6-4c38-a052-b446c3b225a7::The Netherlands Institute for Social Research"],"collectedfrom":[{"value":"","key":""}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"The Netherlands Institute for Social Research"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-03-30","type":20,"id":"20|openaire____::857b30f258c43852a2cb57875ac40892"} \ No newline at end of file diff --git a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java index 3f25bbd..702b4ab 100644 --- a/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java +++ b/dnet-dedup-test/src/test/java/eu/dnetlib/pace/DedupTestIT.java @@ -55,7 +55,6 @@ public class DedupTestIT { } - static Properties readProperties(final String propFile) { Properties prop = new Properties(); diff --git a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv index bab6836..4aad426 100644 --- a/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv +++ b/dnet-pace-core/src/main/resources/eu/dnetlib/pace/config/translation_map.csv @@ -58,7 +58,7 @@ key::57;social;sociali;social;social;Sozial;sociaal;maatschappelijk;κοινων key::58;environmental;ambiente;medioambiental;ambiente;medioambiente;meioambiente;Umwelt;milieu;milieuwetenschap;milieukunde;περιβαλλοντικός;περιβαλλοντική;περιβαλλοντικό;περιβαλλοντικά;çevre;környezeti;okoliški;keskonna;; key::59;business;economia;economiche;economica;negocio;empresa;negócio;Unternehmen;bedrijf;bedrijfskunde;επιχείρηση;iş;üzleti;posel;ettevõte/äri; key::60;pharmaceuticals;pharmacy;farmacia;farmaceutica;farmacéutica;farmacia;farmacêutica;farmácia;Pharmazeutika;Arzneimittelkunde;farmaceutica;geneesmiddelen;apotheek;φαρμακευτικός;φαρμακευτική;φαρμακευτικό;φαρμακευτικά;φαρμακείο;ilaç;eczane;gyógyszerészeti;gyógyszertár;farmacevtika;lekarništvo;farmaatsia;farmatseutiline; -key::61;healthcare;salute;atenciónmédica;cuidadodelasalud;cuidadoscomasaúde;Gesundheitswesen;gezondheidszorg;ιατροφαρμακευτικήπερίθαλψη;sağlıkhizmeti;egészségügy;zdravstvo;tervishoid;tervishoiu; +key::61;healthcare;health services;salute;atenciónmédica;cuidadodelasalud;cuidadoscomasaúde;Gesundheitswesen;gezondheidszorg;ιατροφαρμακευτικήπερίθαλψη;sağlıkhizmeti;egészségügy;zdravstvo;tervishoid;tervishoiu; key::62;history;storia;historia;história;Geschichte;geschiedenis;geschiedkunde;ιστορία;tarih;történelem;zgodovina;ajalugu; key::63;materials;materiali;materia;materiales;materiais;materialen;υλικά;τεκμήρια;malzemeler;anyagok;materiali;materjalid;vahendid; key::64;economics;economia;economiche;economica;economía;economia;Wirtschaft;economie;οικονομικά;οικονομικέςεπιστήμες;ekonomi;közgazdaságtan;gospodarstvo;ekonomija;majanduslik;majandus; diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index 58f86d0..265f397 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -46,7 +46,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { @Test public void testNgramPairs() { params.put("ngramLen", 3); - params.put("max", 3); + params.put("max", 1); final ClusteringFunction np = new NgramPairs(params); diff --git a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java index 11ae418..1cce9a6 100644 --- a/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java +++ b/dnet-pace-core/src/test/java/eu/dnetlib/pace/distance/DistanceAlgoTest.java @@ -1,5 +1,6 @@ package eu.dnetlib.pace.distance; +import eu.dnetlib.pace.clustering.NGramUtils; import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName; import org.junit.Before; import org.junit.Test; @@ -25,6 +26,12 @@ public class DistanceAlgoTest extends AbstractPaceFunctions { params.put("weight", 1.0); } + @Test + public void testCleanForSorting() { + NGramUtils utils = new NGramUtils(); + System.out.println("utils = " + utils.cleanupForOrdering("University of Pisa")); + } + @Test public void testGetNumbers() { System.out.println("Numbers : " + getNumbers(TEST_STRING));