addition of the BlockUtils class for meta-blocking, implementation of a new local test with edge filtering example

This commit is contained in:
miconis 2019-08-06 12:09:34 +02:00
parent 2472f2b1e8
commit 85070ce3fe
16 changed files with 753 additions and 53 deletions

View File

@ -0,0 +1,119 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<parent>
<artifactId>dnet-dedup</artifactId>
<groupId>eu.dnetlib</groupId>
<version>3.0.14-SNAPSHOT</version>
</parent>
<modelVersion>4.0.0</modelVersion>
<artifactId>dnet-dedup-test</artifactId>
<build>
<plugins>
<plugin>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-deploy-plugin</artifactId>
<version>2.7</version>
<configuration>
<skip>true</skip>
</configuration>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>1.8</source>
<target>1.8</target>
<includes>
<include>**/*.java</include>
</includes>
</configuration>
</plugin>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.0.1</version>
<executions>
<execution>
<id>scala-compile-first</id>
<phase>initialize</phase>
<goals>
<goal>add-source</goal>
<goal>compile</goal>
</goals>
</execution>
<execution>
<id>scala-test-compile</id>
<phase>process-test-resources</phase>
<goals>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
</configuration>
</plugin>
</plugins>
</build>
<dependencies>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.9</version>
<scope>test</scope>
<exclusions>
<exclusion>
<artifactId>hamcrest-core</artifactId>
<groupId>org.hamcrest</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.oozie</groupId>
<artifactId>oozie-client</artifactId>
<version>5.1.0</version>
<scope>test</scope>
<exclusions>
<exclusion>
<artifactId>json-simple</artifactId>
<groupId>com.googlecode.json-simple</groupId>
</exclusion>
<exclusion>
<artifactId>jms</artifactId>
<groupId>javax.jms</groupId>
</exclusion>
<exclusion>
<artifactId>slf4j-simple</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
<exclusion>
<artifactId>oozie-fluent-job-api</artifactId>
<groupId>org.apache.oozie</groupId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</project>

View File

@ -15,6 +15,33 @@
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>2.4.3</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
</excludes>
</filter>
</filters>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-deploy-plugin</artifactId>

View File

@ -0,0 +1,50 @@
package eu.dnetlib;
import eu.dnetlib.pace.model.MapDocument;
import java.io.Serializable;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
public class Block implements Serializable {
String key;
List<MapDocument> elements;
public Block(String key, Iterable<MapDocument> elements){
this.key = key;
this.elements = StreamSupport.stream(elements.spliterator(), false).collect(Collectors.toList());
}
public Block(String key, List<MapDocument> elements){
this.key = key;
this.elements = elements;
}
public String getKey() {
return key;
}
public void setKey(String key) {
this.key = key;
}
public List<MapDocument> getElements() {
return elements;
}
public void setElements(List<MapDocument> elements) {
this.elements = elements;
}
public int comparisons(){
int size = elements.size();
return (size*(size-1)/2);
}
public int elements(){
return elements.size();
}
}

View File

@ -1,12 +1,12 @@
package eu.dnetlib;
import com.google.common.collect.Iterables;
import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.utils.PaceUtils;
import eu.dnetlib.reporter.SparkBlockProcessor;
import eu.dnetlib.reporter.SparkReporter;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
@ -17,13 +17,15 @@ import org.apache.spark.util.LongAccumulator;
import scala.Tuple2;
import java.net.URL;
import java.util.Map;
import java.util.*;
import java.util.stream.Collectors;
public class SparkLocalTest {
public static void main(String[] args) {
double startTime = System.currentTimeMillis();
final SparkSession spark = SparkSession
.builder()
.appName("Deduplication")
@ -33,7 +35,7 @@ public class SparkLocalTest {
final JavaSparkContext context = new JavaSparkContext(spark.sparkContext());
final URL dataset = SparkLocalTest.class.getResource("/eu/dnetlib/pace/organization.to.fix.json");
final JavaRDD<String> dataRDD = context.textFile(dataset.getPath());
JavaRDD<String> dataRDD = context.textFile(dataset.getPath());
//read the configuration from the classpath
final DedupConfig config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/org.curr.conf", SparkLocalTest.class));
@ -46,32 +48,24 @@ public class SparkLocalTest {
return new Tuple2<>(mapDocument.getIdentifier(), mapDocument);
});
// mapDocs.foreach(doc -> System.out.println("doc = " + doc._2().getFieldMap().get("legalname")));
// mapDocs.filter(d -> d._2().getFieldMap().get("doi").stringValue().length() > 0).foreach(d -> System.out.println(d));
// mapDocs.filter(d -> d._2().getFieldMap().get("documentationUrl").stringValue().length() > 0).foreach(d -> System.out.println(d));
// System.out.println("mapDocs = " + mapDocs.count());
RDD<Tuple2<Object, MapDocument>> vertexes = mapDocs.mapToPair(t -> new Tuple2<Object, MapDocument>( (long) t._1().hashCode(), t._2())).rdd();
//create relations between documents
JavaPairRDD<String, Iterable<MapDocument>> blocks = mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id
JavaRDD<Block> blocks = mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id
//Clustering: from <id, doc> to List<groupkey,doc>
.flatMapToPair(a -> {
final MapDocument currentDocument = a._2();
return Utility.getGroupingKeys(config, currentDocument).stream()
.map(it -> new Tuple2<>(it, currentDocument)).collect(Collectors.toList()).iterator();
}).groupByKey();//group documents basing on the key
//print blocks
// blocks = blocks.filter(b -> Iterables.size(b._2())>1);
//// vertexes = blocks.flatMap(b -> b._2().iterator()).map(t -> new Tuple2<Object, MapDocument>((long) t.getIdentifier().hashCode(), t)).rdd();
// blocks.map(group -> new DocumentsBlock(group._1(), group._2())).foreach(b -> System.out.println(b));
}).groupByKey().map(b -> new Block(b._1(), b._2())).filter(b -> b.getElements().size()>1);
//create relations by comparing only elements in the same group
final JavaPairRDD<String, String> relationRDD = blocks.flatMapToPair(it -> {
final SparkReporter reporter = new SparkReporter();
new SparkBlockProcessor(config).process(it._1(), it._2(), reporter, accumulators);
new SparkBlockProcessor(config).process(it.getKey(), it.getElements(), reporter, accumulators);
return reporter.getReport().iterator();
});
@ -79,6 +73,14 @@ public class SparkLocalTest {
JavaRDD<ConnectedComponent> ccs = GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD();
System.out.println("total time = " + (System.currentTimeMillis()-startTime));
printStatistics(ccs);
accumulators.forEach((name, acc) -> System.out.println(name + " -> " + acc.value()));
}
public static void printStatistics(JavaRDD<ConnectedComponent> ccs){
final JavaRDD<ConnectedComponent> connectedComponents = ccs.filter(cc -> cc.getDocs().size()>1);
final JavaRDD<ConnectedComponent> nonDeduplicated = ccs.filter(cc -> cc.getDocs().size()==1);
@ -86,6 +88,9 @@ public class SparkLocalTest {
connectedComponents.foreach(cc -> {
System.out.println(cc);
});
connectedComponents.foreach(cc -> {
cc.getDocs().stream().forEach(d -> System.out.println(d.getFieldMap().get("legalname") + " | " + d.getFieldMap().get("legalshortname")));
});
//print nondeduped
nonDeduplicated.foreach(cc -> {
System.out.println(cc);
@ -95,12 +100,6 @@ public class SparkLocalTest {
System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count());
System.out.println("Connected Components: " + connectedComponents.count());
accumulators.forEach((name, acc) -> System.out.println(name + " -> " + acc.value()));
// //print ids
// ccs.foreach(cc -> System.out.println(cc.getId()));
// connectedComponents.saveAsTextFile("file:///Users/miconis/Downloads/dumps/organizations_dedup");
}
}

View File

@ -0,0 +1,118 @@
package eu.dnetlib;
import com.google.common.collect.Lists;
import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.utils.PaceUtils;
import eu.dnetlib.reporter.SparkBlockProcessor2;
import eu.dnetlib.reporter.SparkReporter;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.graphx.Edge;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;
import scala.Tuple2;
import java.math.BigInteger;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
public class SparkLocalTest2 {
public static void main(String[] args) {
double startTime = System.currentTimeMillis();
final SparkSession spark = SparkSession
.builder()
.appName("Deduplication")
.master("local[*]")
.getOrCreate();
final JavaSparkContext context = new JavaSparkContext(spark.sparkContext());
final URL dataset = SparkLocalTest2.class.getResource("/eu/dnetlib/pace/softwares.huge.json");
JavaRDD<String> dataRDD = context.textFile(dataset.getPath());
//read the configuration from the classpath
final DedupConfig config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/software.pace.conf", SparkLocalTest2.class));
Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());
//create vertexes of the graph: <ID, MapDocument>
JavaPairRDD<String, MapDocument> mapDocs = dataRDD.mapToPair(it -> {
MapDocument mapDocument = PaceUtils.asMapDocument(config, it);
return new Tuple2<>(mapDocument.getIdentifier(), mapDocument);
});
// System.out.println("mapDocs = " + mapDocs.count());
RDD<Tuple2<Object, MapDocument>> vertexes = mapDocs.mapToPair(t -> new Tuple2<Object, MapDocument>( (long) t._1().hashCode(), t._2())).rdd();
//create relations between documents
JavaRDD<Block> blocks = mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id
//Clustering: from <id, doc> to List<groupkey,doc>
.flatMapToPair(a -> {
final MapDocument currentDocument = a._2();
return Utility.getGroupingKeys(config, currentDocument).stream()
.map(it -> new Tuple2<>(it, currentDocument)).collect(Collectors.toList()).iterator();
}).groupByKey().map(b -> new Block(b._1(), b._2())).filter(b -> b.getElements().size()>1);
// //BLOCK PURGING
// blocks = BlockUtils.blockPurging2(blocks);
//// blockPurging(blocks);
//
//// //BLOCK FILTERING
// blocks = BlockUtils.blockFiltering(blocks);
JavaPairRDD<Tuple2<MapDocument, MapDocument>, Integer> edge = blocks.flatMap(it -> {
final SparkReporter reporter = new SparkReporter();
return new SparkBlockProcessor2(config).process(it.getKey(), it.getElements(), reporter, accumulators);
}).mapToPair(candidate -> new Tuple2<>(candidate, 1))
.reduceByKey((a, b) -> a + b);
final JavaPairRDD<String, String> relationRDD = edge.filter(e -> {
final SparkReporter reporter = new SparkReporter();
return new SparkBlockProcessor2(config).isSimilar(e._1(), reporter, accumulators);
}).mapToPair(t -> new Tuple2<>(t._1()._1().getIdentifier(), t._1()._2().getIdentifier()));
System.out.println("relationRDD = " + relationRDD.count());
final RDD<Edge<String>> edgeRdd = relationRDD.map(it -> new Edge<>(it._1().hashCode(),it._2().hashCode(), "similarTo")).rdd();
JavaRDD<ConnectedComponent> ccs = GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD();
System.out.println("total time = " + (System.currentTimeMillis()-startTime));
printStatistics(ccs);
accumulators.forEach((name, acc) -> System.out.println(name + " -> " + acc.value()));
}
public static void printStatistics(JavaRDD<ConnectedComponent> ccs){
final JavaRDD<ConnectedComponent> connectedComponents = ccs.filter(cc -> cc.getDocs().size()>1);
final JavaRDD<ConnectedComponent> nonDeduplicated = ccs.filter(cc -> cc.getDocs().size()==1);
// //print deduped
// connectedComponents.foreach(cc -> {
// System.out.println(cc);
// });
// //print nondeduped
// nonDeduplicated.foreach(cc -> {
// System.out.println(cc);
// });
System.out.println("Non duplicates: " + nonDeduplicated.count());
System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count());
System.out.println("Connected Components: " + connectedComponents.count());
}
}

View File

@ -10,7 +10,7 @@ import scala.collection.JavaConversions;
object GraphProcessor {
def findCCs(vertexes: RDD[(VertexId, MapDocument)], edges: RDD[Edge[String]], maxIterations: Int): RDD[ConnectedComponent] = {
val graph: Graph[MapDocument, String] = Graph(vertexes, edges)
val graph: Graph[MapDocument, String] = Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby
val cc = graph.connectedComponents(maxIterations).vertices
val joinResult = vertexes.leftOuterJoin(cc).map {

View File

@ -0,0 +1,168 @@
package eu.dnetlib.pace.utils;
import com.google.common.collect.Lists;
import eu.dnetlib.Block;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;
import java.io.Serializable;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;
public class BlockUtils implements Serializable {
public static double getOptimalComparisonNumber(JavaRDD<Block> blocks) {
double SMOOTHING_FACTOR = 1.05;
//pairRDD: cardinality, #elements
List<Tuple2<Integer, Tuple2<Integer, Integer>>> collect = blocks.mapToPair(b -> new Tuple2<>(b.comparisons(), b.elements()))
.mapToPair(bs -> new Tuple2<>(bs._1(), new Tuple2<>(bs._1(), bs._2())))
.reduceByKey((a, b) -> new Tuple2<>(a._1() + b._1(), a._2() + b._2())).collect();
collect = new ArrayList<>(collect);
collect.sort(Comparator.comparing(Tuple2::_1));
double[] blockAssignments = new double[collect.size()];
double[] comparisonsLevel = new double[collect.size()];
double[] totalComparisonsPerLevel = new double[collect.size()];
Integer totalComparisons = collect.get(0)._2()._1();
Integer totalBlockSize = collect.get(0)._2()._2();
blockAssignments[0] = totalBlockSize;
comparisonsLevel[0] = collect.get(0)._1();
totalComparisonsPerLevel[0] = totalComparisons;
for (int i=1; i<collect.size(); i++){
Integer comparisonLevel = collect.get(i)._1();
totalComparisons += collect.get(i)._2()._1();
totalBlockSize += collect.get(i)._2()._2();
blockAssignments[i] = totalBlockSize;
comparisonsLevel[i] = comparisonLevel;
totalComparisonsPerLevel[i] = totalComparisons;
}
double currentBC = 0;
double currentCC = 0;
double currentSize = 0;
double previousBC = 0;
double previousCC = 0;
double previousSize = 0;
int arraySize = blockAssignments.length;
for (int i = arraySize-1; 0 <= i; i--) {
previousSize = currentSize;
previousBC = currentBC;
previousCC = currentCC;
currentSize = comparisonsLevel[i];
currentBC = blockAssignments[i];
currentCC = totalComparisonsPerLevel[i];
if (currentBC*previousCC < SMOOTHING_FACTOR*currentCC*previousBC) {
break;
}
}
return previousSize;
}
public static int getOptimalBlockSize(JavaRDD<Block> blocks){
BigInteger numberOfComparisons = BigInteger.ZERO;
BigInteger totalSizeOfBlocks = BigInteger.ZERO;
BigInteger blockSize;
//block_size, frequency
JavaPairRDD<Integer, Integer> blocksFreq = blocks.mapToPair(b -> new Tuple2<>(b.getKey(), b.elements()))
.mapToPair(bs -> new Tuple2<>(bs._2(),1))
.reduceByKey((a,b) -> a+b).sortByKey();
ArrayList<Tuple2<Integer, Integer>> blockSizesAndFreq = new ArrayList<>(blocksFreq.collect());
double CC = 0d;
int freq;
/*
* statistics: array of pairs (blockSize, CC) for every blockSize
*/
ArrayList<Tuple2<Integer, Double>> statistics = new ArrayList<>();
for (int i = 0; i < blockSizesAndFreq.size(); i++) {
blockSize = new BigInteger(blockSizesAndFreq.get(i)._1.toString());
freq = blockSizesAndFreq.get(i)._2;
totalSizeOfBlocks = totalSizeOfBlocks.add(BigInteger.valueOf(freq).multiply(blockSize));
//accumulated number of comparisons
numberOfComparisons = numberOfComparisons.add(BigInteger.valueOf(freq)
.multiply(blockSize.multiply(blockSize.subtract(BigInteger.ONE)).shiftLeft(1)));
CC = totalSizeOfBlocks.doubleValue() / numberOfComparisons.doubleValue();
Tuple2<Integer, Double> st = new Tuple2<>(blockSize.intValue(), CC);
statistics.add(st);
}
int optimalBlockSize = statistics.get(statistics.size() - 1)._1;// lastBlockSize;
double eps = 1d; //smoothing factor
/*
* find minimum difference for every adjacent pair i,i-1 the minimum difference
* represents the optimal blockSize
*/
for (int i = statistics.size() - 1; i >= 1; i--) {
if (Math.abs(statistics.get(i)._2 - statistics.get(i - 1)._2) < eps) {
eps = Math.abs(statistics.get(i)._2 - statistics.get(i - 1)._2);
optimalBlockSize = statistics.get(i)._1;
}
}
return optimalBlockSize;
}
//cut blocks basing on number of elements
public static JavaRDD<Block> blockPurging(JavaRDD<Block> blocks) {
int optimalBlockSize = getOptimalBlockSize(blocks);
System.out.println("optimalBlockSize = " + optimalBlockSize);
return blocks.filter(b -> b.getElements().size() < optimalBlockSize);
}
//cut blocks basing on number of comparisons
public static JavaRDD<Block> blockPurging2(JavaRDD<Block> blocks) {
double optimalComparisonNumber = getOptimalComparisonNumber(blocks);
System.out.println("optimalComparisonNumber = " + optimalComparisonNumber);
return blocks.filter(b -> b.comparisons() < optimalComparisonNumber);
}
public static JavaRDD<Block> blockFiltering(JavaRDD<Block> blocks) {
double RATIO = 0.85;
return blocks
.flatMapToPair(b -> b.getElements().stream().map(e -> new Tuple2<>(e, new Tuple2<>(b.getKey(), b.comparisons()))).iterator())
.groupByKey()
.mapToPair(es -> {
List<Tuple2<String, Integer>> b = Lists.newArrayList(es._2());
b.sort(Comparator.comparing(Tuple2::_2));
int size = b.size();
long limit = Math.round(size*RATIO);
return new Tuple2<>(es._1(),b.subList(0,(int)limit));
})
.flatMapToPair(es -> es._2().stream().map(it -> new Tuple2<>(it._1(), es._1())).collect(Collectors.toList()).iterator())
.groupByKey().map(b -> new Block(b._1(), b._2()));
}
}

View File

@ -32,8 +32,6 @@ public class SparkBlockProcessor {
final Queue<MapDocument> q = prepare(documents);
if (q.size() > 1) {
// log.info("reducing key: '" + key + "' records: " + q.size());
//process(q, context);
process(simplifyQueue(q, key, context, accumulators), context, accumulators);
} else {

View File

@ -0,0 +1,193 @@
package eu.dnetlib.reporter;
import com.google.common.collect.Lists;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.config.WfConfig;
import eu.dnetlib.pace.distance.PaceDocumentDistance;
import eu.dnetlib.pace.distance.eval.ScoreResult;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.model.MapDocumentComparator;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.util.LongAccumulator;
import scala.Tuple2;
import java.util.*;
public class SparkBlockProcessor2 {
private static final Log log = LogFactory.getLog(SparkBlockProcessor2.class);
private DedupConfig dedupConf;
public SparkBlockProcessor2(DedupConfig dedupConf) {
this.dedupConf = dedupConf;
}
public boolean isSimilar(Tuple2<MapDocument, MapDocument> t, SparkReporter context, Map<String, LongAccumulator> accumulators) {
final PaceDocumentDistance algo = new PaceDocumentDistance();
final ScoreResult sr = similarity(algo, t._1(), t._2());
final double d = sr.getScore();
if (d >= dedupConf.getWf().getThreshold()) {
context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1, accumulators);
return true;
} else {
context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1, accumulators);
return false;
}
}
public Iterator<Tuple2<MapDocument, MapDocument>> process(final String key, final Iterable<MapDocument> documents, final SparkReporter context, Map<String, LongAccumulator> accumulators) {
final Queue<MapDocument> q = prepare(documents);
if (q.size() > 1) {
return process(simplifyQueue(q, key, context, accumulators), context, accumulators);
} else {
context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1, accumulators);
return new ArrayList<Tuple2<MapDocument,MapDocument>>().iterator();
}
}
private Queue<MapDocument> prepare(final Iterable<MapDocument> documents) {
final Queue<MapDocument> queue = new PriorityQueue<>(100, new MapDocumentComparator(dedupConf.getWf().getOrderField()));
final Set<String> seen = new HashSet<String>();
final int queueMaxSize = dedupConf.getWf().getQueueMaxSize();
documents.forEach(doc -> {
if (queue.size() <= queueMaxSize) {
final String id = doc.getIdentifier();
if (!seen.contains(id)) {
seen.add(id);
queue.add(doc);
}
}
});
return queue;
}
private Queue<MapDocument> simplifyQueue(final Queue<MapDocument> queue, final String ngram, final SparkReporter context, Map<String, LongAccumulator> accumulators) {
final Queue<MapDocument> q = new LinkedList<>();
String fieldRef = "";
final List<MapDocument> tempResults = Lists.newArrayList();
while (!queue.isEmpty()) {
final MapDocument result = queue.remove();
final String orderFieldName = dedupConf.getWf().getOrderField();
final Field orderFieldValue = result.values(orderFieldName);
if (!orderFieldValue.isEmpty()) {
final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue());
if (field.equals(fieldRef)) {
tempResults.add(result);
} else {
populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram, accumulators);
tempResults.clear();
tempResults.add(result);
fieldRef = field;
}
} else {
context.incrementCounter(dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1, accumulators);
}
}
populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram, accumulators);
return q;
}
private void populateSimplifiedQueue(final Queue<MapDocument> q,
final List<MapDocument> tempResults,
final SparkReporter context,
final String fieldRef,
final String ngram,
Map<String, LongAccumulator> accumulators) {
WfConfig wf = dedupConf.getWf();
if (tempResults.size() < wf.getGroupMaxSize()) {
q.addAll(tempResults);
} else {
context.incrementCounter(wf.getEntityType(), String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), tempResults.size(), accumulators);
// log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram);
}
}
private Iterator<Tuple2<MapDocument, MapDocument>> process(final Queue<MapDocument> queue, final SparkReporter context, Map<String, LongAccumulator> accumulators) {
final PaceDocumentDistance algo = new PaceDocumentDistance();
List<Tuple2<MapDocument, MapDocument>> ret = new ArrayList<>();
while (!queue.isEmpty()) {
final MapDocument pivot = queue.remove();
final String idPivot = pivot.getIdentifier();
WfConfig wf = dedupConf.getWf();
final Field fieldsPivot = pivot.values(wf.getOrderField());
final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? null : fieldsPivot.stringValue();
if (fieldPivot != null) {
// System.out.println(idPivot + " --> " + fieldPivot);
int i = 0;
for (final MapDocument curr : queue) {
final String idCurr = curr.getIdentifier();
if (mustSkip(idCurr)) {
context.incrementCounter(wf.getEntityType(), "skip list", 1, accumulators);
break;
}
if (i > wf.getSlidingWindowSize()) {
break;
}
final Field fieldsCurr = curr.values(wf.getOrderField());
final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null : fieldsCurr.stringValue();
if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
if (pivot.getIdentifier().compareTo(curr.getIdentifier())<0){
ret.add(new Tuple2<>(pivot, curr));
} else {
ret.add(new Tuple2<>(curr, pivot));
}
i++;
}
}
}
}
return ret.iterator();
}
private ScoreResult similarity(final PaceDocumentDistance algo, final MapDocument a, final MapDocument b) {
try {
return algo.between(a, b, dedupConf);
} catch(Throwable e) {
log.error(String.format("\nA: %s\n----------------------\nB: %s", a, b), e);
throw new IllegalArgumentException(e);
}
}
private boolean mustSkip(final String idPivot) {
return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
}
private String getNsPrefix(final String id) {
return StringUtils.substringBetween(id, "|", "::");
}
}

View File

@ -0,0 +1,35 @@
{
"wf" : {
"threshold" : "0.9",
"dedupRun" : "001",
"entityType" : "organization",
"orderField" : "legalname",
"queueMaxSize" : "2000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }
],
"strictConditions" : [
{ "name" : "exactMatch", "fields" : [ "gridid" ] }
],
"conditions" : [
{ "name" : "exactMatch", "fields" : [ "country" ] },
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] }
],
"model" : [
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.5} },
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
],
"blacklists" : { }
}
}

View File

@ -27,7 +27,7 @@
"model" : [
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" },
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.5} },
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} },
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
],

View File

@ -1,25 +1,12 @@
{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::999864846"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPV"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upv.es"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITAT POLITECNICA DE VALENCIA"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::542f36b24ca4203a5b4dfc8396ef7475"}
{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::999828859"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPCT"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upct.es"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSIDAD POLITECNICA DE CARTAGENA"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::9df7abeaef39ed5bac9c3e3a10a2be91"}
{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::999974844"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPM"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upm.es"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSIDAD POLITECNICA DE MADRID"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::7a44a773d0dc629e9af5cef563c2478a"}
{"dateoftransformation":"2018-11-20","originalId":["corda_______::999976202"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPC"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"websiteurl":{"value":"http://www.upc.edu"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITAT POLITECNICA DE CATALUNYA"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::0a1f410e6b3374e015b9aead9d97731a"}
{"dateoftransformation":"2018-11-20","originalId":["corda_______::999864846"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPV"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upv.es"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITAT POLITECNICA DE VALENCIA"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::542f36b24ca4203a5b4dfc8396ef7475"}
{"dateoftransformation":"2018-11-20","originalId":["corda_______::999974844"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPM"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upm.es"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSIDAD POLITECNICA DE MADRID"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::7a44a773d0dc629e9af5cef563c2478a"}
{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::999976202"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPC"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upc.edu"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITAT POLITECNICA DE CATALUNYA"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::0a1f410e6b3374e015b9aead9d97731a"}
{"dateoftransformation":"2018-11-20","originalId":["corda_______::999828859"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"legalshortname":{"value":"UPCT"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"websiteurl":{"value":"http://www.upct.es"},"ecnutscode":{"value":"false"},"legalname":{"value":"UNIVERSIDAD POLITECNICA DE CARTAGENA"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::9df7abeaef39ed5bac9c3e3a10a2be91"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Universidad_Politécnica_de_Madrid"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Universidad Politécnica de Madrid"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universidad Politécnica de Madrid"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::3cdfe6eec5fafec302e02cb7be692318"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Universitat_Politècnica_de_València"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Universitat Politècnica de València"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universitat Politècnica de València"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::b9fc5f6fb04d0f8d7d0b163718f9785c"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Universidad_Politécnica_de_Cartagena"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Universidad Politécnica de Cartagena"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universidad Politécnica de Cartagena"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::cf3bddf37ca3440dbd8e5ba6e81f6df3"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Universitat_Politècnica_de_Catalunya"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Universitat Politècnica de Catalunya"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universitat Politècnica de Catalunya"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::7e232375e9902c823cf3cd4e1a16de5e"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Universidad_Politécnica_de_Cataluña"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Universidad Politécnica de Cataluña"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universidad Politécnica de Cataluña"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::98e85e22b787db82b4c806e4f452f456"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Universitat_Politècnica_deValència"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Universitat Politècnica deValència"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universitat Politècnica deValència"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::637209da5f9f175227735f91e63d4999"}
{"dateoftransformation":"2018-09-13","originalId":["re3data_____::c9820581fcddf3a7a50652e4ac5e5aab"],"collectedfrom":[{"value":"Registry of Research Data Repository","key":"10|openaire____::21f8a223b9925c2f87c404096080b046"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"https://www.upc.edu"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universitat Politècnica de Catalunya"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|re3data_____::044d264e43ee7cd7bfbc3deb1532037a"}
{"dateoftransformation":"2019-04-11","originalId":["re3data_____::211b4cb099c317e9cd9073476566ca47"],"collectedfrom":[{"value":"Registry of Research Data Repository","key":"10|openaire____::21f8a223b9925c2f87c404096080b046"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upv.es/index-en.html"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universitat Politècnica de València"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2019-04-11","type":20,"id":"20|re3data_____::465c66fba7c037fde2dcc962f94e44a9"}
{"dateoftransformation":"2018-09-13","originalId":["re3data_____::fbdc48c28389453ae6008622c1e5b0e1"],"collectedfrom":[{"value":"Registry of Research Data Repository","key":"10|openaire____::21f8a223b9925c2f87c404096080b046"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPM"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upm.es/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universidad Politécnica de Madrid"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-02-13","type":20,"id":"20|re3data_____::0cbf4c44e9d045ffb0aa303354fff947"}
{"dateoftransformation":"2018-09-13","originalId":["snsf________::Universitat_Politècnica_de_Catalunya"],"collectedfrom":[{"value":"SNSF - Swiss National Science Foundation","key":"10|openaire____::d8f3c25e18304608ce8e816e99603d7a"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universitat Politècnica de Catalunya"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-09-23","type":20,"id":"20|snsf________::7e232375e9902c823cf3cd4e1a16de5e"}
{"dateoftransformation":"2018-09-13","originalId":["opendoar____::Universidad_Politécnica_de_Cartagena"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPCT"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upct.es/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universidad Politécnica de Cartagena"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2015-08-24","type":20,"id":"20|opendoar____::cf3bddf37ca3440dbd8e5ba6e81f6df3"}
{"dateoftransformation":"2018-09-13","originalId":["opendoar____::Universitat_Politècnica_de_València"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upv.es/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universitat Politècnica de València"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2015-08-24","type":20,"id":"20|opendoar____::b9fc5f6fb04d0f8d7d0b163718f9785c"}
{"dateoftransformation":"2018-09-13","originalId":["opendoar____::Universidad_Politecnica_de_Madrid"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upm.es/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universidad Politecnica de Madrid"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2015-08-24","type":20,"id":"20|opendoar____::c4b8d1e68de570b064c666c370bc2168"}
{"dateoftransformation":"2018-11-12","originalId":["opendoar____::Universitat_Politènica_de_Catalunya"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UPC"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.upc.edu/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Universitat Politènica de Catalunya"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-11-12","type":20,"id":"20|opendoar____::54e1f400c1b5dba4fa5a18571d50c3f3"}
{"dateoftransformation":"2019-05-19","originalId":["rcuk________::B0DA50CA-D11E-4251-9678-4AA2F93DB545"],"collectedfrom":[{"value":"Research Councils UK","key":"10|openaire____::ab2d3310741ea80d3b8726f651502858"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Polytechnic University of Catalonia"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-11-04","type":20,"id":"20|rcuk________::5cd82a9ed265a9cb392d185a688cad40"}
{"dateoftransformation":"2018-09-13","originalId":["openaire____::UE9MSS1SRUQgKFJldmlzdGFzIERpZ2l0YWxlcyBQb2xpdMOpY25pY2FzKQ==::UNIVERSIDAD POLITECNICA DE MADRID"],"collectedfrom":[{"value":"","key":""}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSIDAD POLITECNICA DE MADRID"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-11-03","type":20,"id":"20|openaire____::9f185b9f2f1c932b492f6fb53c8c5caf"}
{"dateoftransformation":"2018-09-13","originalId":["openaire____::issn24441309::UNIVERSIDAD POLITECNICA DE MADRID"],"collectedfrom":[{"value":"","key":""}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSIDAD POLITECNICA DE MADRID"},"country":{"classid":"ES","classname":"Spain","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-11-03","type":20,"id":"20|openaire____::935af0561af9c6e33326818b32805241"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIOK"},"websiteurl":{"value":"http://www.niok.eu/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"NIOK"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.450158.d"}],"type":20,"id":"20|grid________::6183d331a1920dd81b8c10620a8b3a8a"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIVEL"},"websiteurl":{"value":"http://www.nivel.nl/en"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"NIVEL"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.416005.6"}],"type":20,"id":"20|grid________::8f65fd4e764086db897cc648e9cbbaed"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"SCP"},"websiteurl":{"value":"http://www.scp.nl/english/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Netherlands Institute for Social Research"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.438038.4"}],"type":20,"id":"20|grid________::c69cffc4997b54bb2eb5ca6aebcda18b"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIVEL"},"websiteurl":{"value":"http://www.nivel.nl/en"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Netherlands Institute for Health Services Research"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.416005.6"}],"type":20,"id":"20|grid________::5b72dc608480f3d5569a7bfe3cbdaf07"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"SCP"},"websiteurl":{"value":"http://www.scp.nl/english/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"SCP"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.438038.4"}],"type":20,"id":"20|grid________::6b7b927a3ae25f1639a6ef27b35021b5"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIOK"},"websiteurl":{"value":"http://www.niok.eu/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Netherlands Institute for Catalysis Research"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.450158.d"}],"type":20,"id":"20|grid________::6af340f03c44041737859d3e1354d1fe"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIVEL"},"websiteurl":{"value":"http://www.nivel.nl/en"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Nederlands Instituut voor Onderzoek van de Gezondheidszorg"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.416005.6"}],"type":20,"id":"20|grid________::69ab0f5ed7da9d961355cb4eb24b8613"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIOK"},"websiteurl":{"value":"http://www.niok.eu/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Nederlands Instituut voor Onderzoek in de Katalyse"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.450158.d"}],"type":20,"id":"20|grid________::267cf3ce23903e0a8403653019ce8187"}
{"dateoftransformation":"2018-11-20","originalId":["corda_______::998294125"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NIVEL"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nivel.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NEDERLANDS INSTITUUT VOOR ONDERZOEK VAN DE GEZONDHEIDSZORG"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::755737ed505484ea374062762ef05ef6"}
{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::998294125"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NIVEL"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nivel.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NEDERLANDS INSTITUUT VOOR ONDERZOEK VAN DE GEZONDHEIDSZORG"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::755737ed505484ea374062762ef05ef6"}
{"dateoftransformation":"2018-09-13","originalId":["snsf________::The_Netherlands_Institute_of_Health_Services_Research_NIVEL"],"collectedfrom":[{"value":"SNSF - Swiss National Science Foundation","key":"10|openaire____::d8f3c25e18304608ce8e816e99603d7a"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"The Netherlands Institute of Health Services Research NIVEL"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-09-22","type":20,"id":"20|snsf________::10653be4e9c170181486aa9782346d81"}
{"dateoftransformation":"2018-09-13","originalId":["openaire____::088a0087-4bc6-4c38-a052-b446c3b225a7::The Netherlands Institute for Social Research"],"collectedfrom":[{"value":"","key":""}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"The Netherlands Institute for Social Research"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-03-30","type":20,"id":"20|openaire____::857b30f258c43852a2cb57875ac40892"}

View File

@ -55,7 +55,6 @@ public class DedupTestIT {
}
static Properties readProperties(final String propFile) {
Properties prop = new Properties();

View File

@ -58,7 +58,7 @@ key::57;social;sociali;social;social;Sozial;sociaal;maatschappelijk;κοινων
key::58;environmental;ambiente;medioambiental;ambiente;medioambiente;meioambiente;Umwelt;milieu;milieuwetenschap;milieukunde;περιβαλλοντικός;περιβαλλοντική;περιβαλλοντικό;περιβαλλοντικά;çevre;környezeti;okoliški;keskonna;;
key::59;business;economia;economiche;economica;negocio;empresa;negócio;Unternehmen;bedrijf;bedrijfskunde;επιχείρηση;iş;üzleti;posel;ettevõte/äri;
key::60;pharmaceuticals;pharmacy;farmacia;farmaceutica;farmacéutica;farmacia;farmacêutica;farmácia;Pharmazeutika;Arzneimittelkunde;farmaceutica;geneesmiddelen;apotheek;φαρμακευτικός;φαρμακευτική;φαρμακευτικό;φαρμακευτικά;φαρμακείο;ilaç;eczane;gyógyszerészeti;gyógyszertár;farmacevtika;lekarništvo;farmaatsia;farmatseutiline;
key::61;healthcare;salute;atenciónmédica;cuidadodelasalud;cuidadoscomasaúde;Gesundheitswesen;gezondheidszorg;ιατροφαρμακευτικήπερίθαλψη;sağlıkhizmeti;egészségügy;zdravstvo;tervishoid;tervishoiu;
key::61;healthcare;health services;salute;atenciónmédica;cuidadodelasalud;cuidadoscomasaúde;Gesundheitswesen;gezondheidszorg;ιατροφαρμακευτικήπερίθαλψη;sağlıkhizmeti;egészségügy;zdravstvo;tervishoid;tervishoiu;
key::62;history;storia;historia;história;Geschichte;geschiedenis;geschiedkunde;ιστορία;tarih;történelem;zgodovina;ajalugu;
key::63;materials;materiali;materia;materiales;materiais;materialen;υλικά;τεκμήρια;malzemeler;anyagok;materiali;materjalid;vahendid;
key::64;economics;economia;economiche;economica;economía;economia;Wirtschaft;economie;οικονομικά;οικονομικέςεπιστήμες;ekonomi;közgazdaságtan;gospodarstvo;ekonomija;majanduslik;majandus;

1 key::1;university;università;università studi;universitario;universitaria;université;universitaire;universitaires;universidad;universitade;Universität;universitaet;Uniwersytet;университет;universiteit;πανεπιστήμιο
58 key::58;environmental;ambiente;medioambiental;ambiente;medioambiente;meioambiente;Umwelt;milieu;milieuwetenschap;milieukunde;περιβαλλοντικός;περιβαλλοντική;περιβαλλοντικό;περιβαλλοντικά;çevre;környezeti;okoliški;keskonna;;
59 key::59;business;economia;economiche;economica;negocio;empresa;negócio;Unternehmen;bedrijf;bedrijfskunde;επιχείρηση;iş;üzleti;posel;ettevõte/äri;
60 key::60;pharmaceuticals;pharmacy;farmacia;farmaceutica;farmacéutica;farmacia;farmacêutica;farmácia;Pharmazeutika;Arzneimittelkunde;farmaceutica;geneesmiddelen;apotheek;φαρμακευτικός;φαρμακευτική;φαρμακευτικό;φαρμακευτικά;φαρμακείο;ilaç;eczane;gyógyszerészeti;gyógyszertár;farmacevtika;lekarništvo;farmaatsia;farmatseutiline;
61 key::61;healthcare;salute;atenciónmédica;cuidadodelasalud;cuidadoscomasaúde;Gesundheitswesen;gezondheidszorg;ιατροφαρμακευτικήπερίθαλψη;sağlıkhizmeti;egészségügy;zdravstvo;tervishoid;tervishoiu; key::61;healthcare;health services;salute;atenciónmédica;cuidadodelasalud;cuidadoscomasaúde;Gesundheitswesen;gezondheidszorg;ιατροφαρμακευτικήπερίθαλψη;sağlıkhizmeti;egészségügy;zdravstvo;tervishoid;tervishoiu;
62 key::62;history;storia;historia;história;Geschichte;geschiedenis;geschiedkunde;ιστορία;tarih;történelem;zgodovina;ajalugu;
63 key::63;materials;materiali;materia;materiales;materiais;materialen;υλικά;τεκμήρια;malzemeler;anyagok;materiali;materjalid;vahendid;
64 key::64;economics;economia;economiche;economica;economía;economia;Wirtschaft;economie;οικονομικά;οικονομικέςεπιστήμες;ekonomi;közgazdaságtan;gospodarstvo;ekonomija;majanduslik;majandus;

View File

@ -46,7 +46,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
@Test
public void testNgramPairs() {
params.put("ngramLen", 3);
params.put("max", 3);
params.put("max", 1);
final ClusteringFunction np = new NgramPairs(params);

View File

@ -1,5 +1,6 @@
package eu.dnetlib.pace.distance;
import eu.dnetlib.pace.clustering.NGramUtils;
import eu.dnetlib.pace.distance.algo.JaroWinklerNormalizedName;
import org.junit.Before;
import org.junit.Test;
@ -25,6 +26,12 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
params.put("weight", 1.0);
}
@Test
public void testCleanForSorting() {
NGramUtils utils = new NGramUtils();
System.out.println("utils = " + utils.cleanupForOrdering("University of Pisa"));
}
@Test
public void testGetNumbers() {
System.out.println("Numbers : " + getNumbers(TEST_STRING));