code cleaning, distribution of the classes in packages and implementation of the new configuration

This commit is contained in:
miconis 2019-11-07 12:47:12 +01:00
parent 3ff5be675b
commit 5b3adb3e65
84 changed files with 948 additions and 6359 deletions

View File

@ -26,8 +26,8 @@ if score<\th --- negative result
- positive: specifies the key of the next node in case of positive result
- negative: specifies the key of the next node in case of negative result
- undefined: specifies the key of the next node in case of undefined result
- ignoreMissing: defines the behavior of the treeNode in case of a missing field
> e.g. if a comparator on a particular field produces an undefined result (-1), if ignoreMissing=true that field is simply ignored, otherwise the entire treeNode score is considered to be -1
- countIfUndefined: defines the behavior of the treeNode in case of a missing field
> e.g. when a comparator on a particular field produces an undefined result (-1): if countIfUndefined=true, that field is simply ignored; otherwise, the entire treeNode score is considered to be -1
In order to make the decision tree work, the BlockProcessor has been modified with the following changes:
- if the decision tree is defined in the JSON configuration, the deduplication process relies on it

View File

@ -0,0 +1,50 @@
package eu.dnetlib;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ConnectedComponent;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import java.io.IOException;
/**
 * Command-line entry point of the deduplication Spark job.
 *
 * <p>Loads the entities and the dedup configuration, runs {@code Deduper.dedup},
 * writes the resulting connected components as a text file and logs a few
 * summary counts.
 */
public class DedupSparkJob {

    private static final Log log = LogFactory.getLog(DedupSparkJob.class);

    /**
     * Runs the job.
     *
     * @param args {@code args[0]} path of the input entities, {@code args[1]} path of the dedup
     *             configuration, {@code args[2]} base output path (the suffix {@code "_output"} is appended)
     * @throws IOException declared by this method; presumably raised by the HDFS helpers in {@code Utility}
     * @throws IllegalArgumentException if fewer than three arguments are supplied
     */
    public static void main(String[] args) throws IOException {
        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args == null || args.length < 3) {
            throw new IllegalArgumentException(
                    "Usage: DedupSparkJob <inputSpacePath> <dedupConfigPath> <outputBasePath>");
        }

        final String inputSpacePath = args[0];
        final String dedupConfigPath = args[1];
        final String outputPath = args[2] + "_output";

        final SparkSession spark = SparkSession
                .builder()
                .appName("Deduplication")
                .master("yarn")
                .getOrCreate();

        try {
            final JavaSparkContext context = new JavaSparkContext(spark.sparkContext());

            final JavaRDD<String> entities = Utility.loadDataFromHDFS(inputSpacePath, context);
            final DedupConfig config = Utility.loadConfigFromHDFS(dedupConfigPath);

            // The result feeds four separate actions below (one save, three counts);
            // caching it avoids re-running the whole dedup lineage for each of them.
            final JavaRDD<ConnectedComponent> ccs = Deduper.dedup(context, entities, config).cache();

            //save connected components on textfile
            Utility.deleteIfExists(outputPath);
            ccs.saveAsTextFile(outputPath);

            final JavaRDD<ConnectedComponent> connectedComponents = ccs.filter(cc -> cc.getDocs().size() > 1);
            final JavaRDD<ConnectedComponent> nonDeduplicated = ccs.filter(cc -> cc.getDocs().size() == 1);

            log.info("Non duplicates: " + nonDeduplicated.count());
            log.info("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count());
            log.info("Connected Components: " + connectedComponents.count());
        } finally {
            // Release the Spark resources whether the job succeeded or failed.
            spark.stop();
        }
    }
}

View File

@ -0,0 +1,70 @@
package eu.dnetlib;
import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessor;
import eu.dnetlib.pace.utils.PaceUtils;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.reporter.SparkReporter;
import eu.dnetlib.support.ConnectedComponent;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.graphx.Edge;
import org.apache.spark.rdd.RDD;
import org.apache.spark.util.LongAccumulator;
import scala.Tuple2;
import java.util.Map;
import java.util.stream.Collectors;
/**
 * Builds the deduplication graph for a set of JSON entities and extracts its connected components.
 */
public class Deduper {

    private static final Log log = LogFactory.getLog(Deduper.class);

    /**
     * Deduplicates the given entities.
     *
     * <p>Each entity becomes a graph vertex, entities sharing a grouping key are compared by a
     * {@code BlockProcessor}, every relation it reports becomes an {@code "equalTo"} edge, and the
     * connected components of the resulting graph are returned.
     *
     * @param context  the spark context
     * @param entities list of JSON entities to be deduped
     * @param config   the dedup configuration
     *
     * @return the list of connected components generated by the deduplication
     */
    public static JavaRDD<ConnectedComponent> dedup(JavaSparkContext context, JavaRDD<String> entities, DedupConfig config) {

        Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());

        //create vertexes of the graph: <ID, MapDocument>
        JavaPairRDD<String, MapDocument> mapDocs = entities.mapToPair(it -> {
            MapDocument mapDocument = PaceUtils.asMapDocument(config, it);
            return new Tuple2<>(mapDocument.getIdentifier(), mapDocument);
        });

        // The vertex id is the String hashCode of the document identifier widened to long.
        // NOTE(review): two different identifiers with the same hashCode end up with the same vertex id.
        RDD<Tuple2<Object, MapDocument>> vertexes = mapDocs
                .mapToPair(t -> new Tuple2<Object, MapDocument>((long) t._1().hashCode(), t._2()))
                .rdd();

        //create blocks for deduplication
        JavaPairRDD<String, Iterable<MapDocument>> blocks = mapDocs
                .reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id
                //Clustering: from <id, doc> to List<groupkey,doc>
                .flatMapToPair(a -> {
                    final MapDocument currentDocument = a._2();
                    return Utility.getGroupingKeys(config, currentDocument).stream()
                            .map(it -> new Tuple2<>(it, currentDocument))
                            .collect(Collectors.toList())
                            .iterator();
                })
                .groupByKey();

        //create relations by comparing only elements in the same group
        final JavaPairRDD<String, String> relationRDD = blocks.flatMapToPair(it -> {
            final SparkReporter reporter = new SparkReporter(accumulators);
            new BlockProcessor(config).process(it._1(), it._2(), reporter);
            return reporter.getRelations().iterator();
        });

        // Edge endpoints use the same hashCode-based ids as the vertexes above.
        final RDD<Edge<String>> edgeRdd = relationRDD
                .map(it -> new Edge<>(it._1().hashCode(), it._2().hashCode(), "equalTo"))
                .rdd();

        final JavaRDD<ConnectedComponent> connectedComponents =
                GraphProcessor.findCCs(vertexes, edgeRdd, config.getWf().getMaxIterations()).toJavaRDD();

        // Everything above findCCs is a lazy transformation, so the accumulators can only have been
        // updated by Spark jobs launched during findCCs; they are therefore logged after that call
        // rather than before it.
        // NOTE(review): if findCCs launches no job, the values logged here are still the initial
        // ones - confirm against GraphProcessor.
        accumulators.forEach((name, acc) -> log.info(name + " -> " + acc.value()));

        return connectedComponents;
    }
}

View File

@ -1,52 +0,0 @@
package eu.dnetlib;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.PaceException;
import java.io.IOException;
import java.io.Serializable;
import java.util.Set;
/**
 * A group of documents that share the same key, serializable both with Java serialization and,
 * through {@link #toString()}, as JSON.
 */
public class DocumentsBlock implements Serializable {

    /**
     * Holds the JSON mapper shared by all instances, so {@link #toString()} does not build a new
     * {@code ObjectMapper} on every call. It lives in a nested holder so that the outer
     * {@code Serializable} class gains neither a new field nor a static initializer.
     */
    private static final class MapperHolder {
        static final ObjectMapper MAPPER = new ObjectMapper();
    }

    /** Key shared by all the documents of this block. */
    String key;

    /** Documents belonging to this block. */
    Set<MapDocument> documents;

    /**
     * Creates a block backed directly by the given set (no copy is made).
     *
     * @param key       the key of the block
     * @param documents the documents of the block
     */
    public DocumentsBlock(String key, Set<MapDocument> documents) {
        this.key = key;
        this.documents = documents;
    }

    /**
     * Creates a block whose documents are copied from the given iterable into a new hash set.
     *
     * @param key       the key of the block
     * @param documents the documents of the block
     */
    public DocumentsBlock(String key, Iterable<MapDocument> documents) {
        this.key = key;
        this.documents = Sets.newHashSet(documents);
    }

    /** @return the key of the block */
    public String getKey() {
        return key;
    }

    /** @param key the new key of the block */
    public void setKey(String key) {
        this.key = key;
    }

    /** @return the documents of the block (the internal set, not a copy) */
    public Iterable<MapDocument> getDocuments() {
        return documents;
    }

    /** @param documents the new documents of the block */
    public void setDocuments(Set<MapDocument> documents) {
        this.documents = documents;
    }

    /**
     * Serializes this block as JSON.
     *
     * @return the JSON representation produced by Jackson
     * @throws PaceException if the JSON serialization fails
     */
    @Override
    public String toString() {
        try {
            return MapperHolder.MAPPER.writeValueAsString(this);
        } catch (IOException e) {
            throw new PaceException("Failed to create Json: ", e);
        }
    }
}

View File

@ -1,107 +0,0 @@
package eu.dnetlib;
import eu.dnetlib.graph.GraphProcessor;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.BlockProcessor;
import eu.dnetlib.pace.utils.PaceUtils;
import eu.dnetlib.reporter.SparkReporter;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.graphx.Edge;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;
import scala.Tuple2;
import java.net.URL;
import java.util.*;
import java.util.stream.Collectors;
/**
 * Local (in-process Spark) run of the deduplication workflow on a dataset and a configuration
 * read from the classpath; results and statistics are printed on standard output.
 */
public class SparkLocalTest {

    /**
     * Runs the whole workflow: load data, build blocks, compare documents inside each block,
     * compute the connected components and print the statistics.
     *
     * @param args not used
     * @throws IllegalStateException if the test dataset is not available on the classpath
     */
    public static void main(String[] args) {

        double startTime = System.currentTimeMillis();

        final SparkSession spark = SparkSession
                .builder()
                .appName("Deduplication")
                .master("local[*]")
                .getOrCreate();

        final JavaSparkContext context = new JavaSparkContext(spark.sparkContext());

        final URL dataset = SparkLocalTest.class.getResource("/eu/dnetlib/pace/organization.to.fix.json");
        // getResource returns null for a missing resource: report it explicitly instead of
        // failing with a bare NullPointerException on dataset.getPath().
        if (dataset == null) {
            throw new IllegalStateException("Dataset not found on classpath: /eu/dnetlib/pace/organization.to.fix.json");
        }
        JavaRDD<String> dataRDD = context.textFile(dataset.getPath());

        //read the configuration from the classpath
        final DedupConfig config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/org.curr.conf", SparkLocalTest.class));

        Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());

        //create vertexes of the graph: <ID, MapDocument>
        JavaPairRDD<String, MapDocument> mapDocs = dataRDD.mapToPair(it -> {
            MapDocument mapDocument = PaceUtils.asMapDocument(config, it);
            return new Tuple2<>(mapDocument.getIdentifier(), mapDocument);
        });

        // System.out.println("mapDocs = " + mapDocs.count());

        // The vertex id is the String hashCode of the document identifier widened to long.
        RDD<Tuple2<Object, MapDocument>> vertexes = mapDocs
                .mapToPair(t -> new Tuple2<Object, MapDocument>((long) t._1().hashCode(), t._2()))
                .rdd();

        //create relations between documents
        JavaRDD<Block> blocks = mapDocs
                .reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id
                //Clustering: from <id, doc> to List<groupkey,doc>
                .flatMapToPair(a -> {
                    final MapDocument currentDocument = a._2();
                    return Utility.getGroupingKeys(config, currentDocument).stream()
                            .map(it -> new Tuple2<>(it, currentDocument))
                            .collect(Collectors.toList())
                            .iterator();
                })
                .groupByKey()
                .map(b -> new Block(b._1(), b._2()))
                // blocks with a single element produce no comparison
                .filter(b -> b.getElements().size() > 1);

        //create relations by comparing only elements in the same group
        final JavaPairRDD<String, String> relationRDD = blocks.flatMapToPair(it -> {
            final SparkReporter reporter = new SparkReporter(accumulators);
            new BlockProcessor(config).process(it.getKey(), it.getElements(), reporter);
            return reporter.getRelations().iterator();
        });

        final RDD<Edge<String>> edgeRdd = relationRDD
                .map(it -> new Edge<>(it._1().hashCode(), it._2().hashCode(), "similarTo"))
                .rdd();

        // printStatistics runs five actions on this RDD; caching it avoids recomputing the whole
        // lineage (and re-incrementing the accumulators) for each of them.
        JavaRDD<ConnectedComponent> ccs = GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD().cache();

        System.out.println("total time = " + (System.currentTimeMillis() - startTime));

        printStatistics(ccs);
        accumulators.forEach((name, acc) -> System.out.println(name + " -> " + acc.value()));
    }

    /**
     * Prints every connected component, first those with more than one document and then those
     * with exactly one, followed by the summary counts.
     *
     * @param ccs the connected components to report on
     */
    public static void printStatistics(JavaRDD<ConnectedComponent> ccs) {
        final JavaRDD<ConnectedComponent> connectedComponents = ccs.filter(cc -> cc.getDocs().size() > 1);
        final JavaRDD<ConnectedComponent> nonDeduplicated = ccs.filter(cc -> cc.getDocs().size() == 1);

        //print deduped
        connectedComponents.foreach(cc -> {
            System.out.println(cc);
        });
        // connectedComponents.foreach(cc -> {
        //     cc.getDocs().stream().forEach(d -> {
        //         System.out.println(d.getFieldMap().get("legalname") + " | " + d.getFieldMap().get("legalshortname"));
        //     });
        // });

        //print nondeduped
        nonDeduplicated.foreach(cc -> {
            System.out.println(cc);
        });

        System.out.println("Non duplicates: " + nonDeduplicated.count());
        System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count());
        System.out.println("Connected Components: " + connectedComponents.count());
    }
}

View File

@ -1,118 +0,0 @@
//package eu.dnetlib;
//
//import com.google.common.collect.Lists;
//import eu.dnetlib.graph.GraphProcessor;
//import eu.dnetlib.pace.config.DedupConfig;
//import eu.dnetlib.pace.model.MapDocument;
//import eu.dnetlib.pace.utils.PaceUtils;
//import eu.dnetlib.reporter.SparkBlockProcessor2;
//import eu.dnetlib.reporter.SparkReporter;
//import org.apache.spark.api.java.JavaPairRDD;
//import org.apache.spark.api.java.JavaRDD;
//import org.apache.spark.api.java.JavaSparkContext;
//import org.apache.spark.graphx.Edge;
//import org.apache.spark.rdd.RDD;
//import org.apache.spark.sql.SparkSession;
//import org.apache.spark.util.LongAccumulator;
//import scala.Tuple2;
//
//import java.math.BigInteger;
//import java.net.URL;
//import java.util.ArrayList;
//import java.util.List;
//import java.util.Map;
//import java.util.stream.Collectors;
//
//public class SparkLocalTest2 {
//
// public static void main(String[] args) {
//
// double startTime = System.currentTimeMillis();
//
// final SparkSession spark = SparkSession
// .builder()
// .appName("Deduplication")
// .master("local[*]")
// .getOrCreate();
//
// final JavaSparkContext context = new JavaSparkContext(spark.sparkContext());
//
// final URL dataset = SparkLocalTest2.class.getResource("/eu/dnetlib/pace/softwares.huge.json");
// JavaRDD<String> dataRDD = context.textFile(dataset.getPath());
//
// //read the configuration from the classpath
// final DedupConfig config = DedupConfig.load(Utility.readFromClasspath("/eu/dnetlib/pace/software.pace.conf", SparkLocalTest2.class));
//
// Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());
//
// //create vertexes of the graph: <ID, MapDocument>
// JavaPairRDD<String, MapDocument> mapDocs = dataRDD.mapToPair(it -> {
// MapDocument mapDocument = PaceUtils.asMapDocument(config, it);
// return new Tuple2<>(mapDocument.getIdentifier(), mapDocument);
// });
//
//// System.out.println("mapDocs = " + mapDocs.count());
//
// RDD<Tuple2<Object, MapDocument>> vertexes = mapDocs.mapToPair(t -> new Tuple2<Object, MapDocument>( (long) t._1().hashCode(), t._2())).rdd();
//
// //create relations between documents
// JavaRDD<Block> blocks = mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id
// //Clustering: from <id, doc> to List<groupkey,doc>
// .flatMapToPair(a -> {
// final MapDocument currentDocument = a._2();
//
// return Utility.getGroupingKeys(config, currentDocument).stream()
// .map(it -> new Tuple2<>(it, currentDocument)).collect(Collectors.toList()).iterator();
// }).groupByKey().map(b -> new Block(b._1(), b._2())).filter(b -> b.getElements().size()>1);
//
//// //BLOCK PURGING
//// blocks = BlockUtils.blockPurging2(blocks);
////// blockPurging(blocks);
////
////// //BLOCK FILTERING
//// blocks = BlockUtils.blockFiltering(blocks);
//
// JavaPairRDD<Tuple2<MapDocument, MapDocument>, Integer> edge = blocks.flatMap(it -> {
// final SparkReporter reporter = new SparkReporter(accumulators);
// return new SparkBlockProcessor2(config).process(it.getKey(), it.getElements(), reporter, accumulators);
// }).mapToPair(candidate -> new Tuple2<>(candidate, 1))
// .reduceByKey((a, b) -> a + b);
//
// final JavaPairRDD<String, String> relationRDD = edge.filter(e -> {
// final SparkReporter reporter = new SparkReporter(accumulators);
// return new SparkBlockProcessor2(config).isSimilar(e._1(), reporter, accumulators);
// }).mapToPair(t -> new Tuple2<>(t._1()._1().getIdentifier(), t._1()._2().getIdentifier()));
//
// System.out.println("relationRDD = " + relationRDD.count());
//
// final RDD<Edge<String>> edgeRdd = relationRDD.map(it -> new Edge<>(it._1().hashCode(),it._2().hashCode(), "similarTo")).rdd();
//
// JavaRDD<ConnectedComponent> ccs = GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD();
//
// System.out.println("total time = " + (System.currentTimeMillis()-startTime));
//
// printStatistics(ccs);
// accumulators.forEach((name, acc) -> System.out.println(name + " -> " + acc.value()));
//
// }
//
// public static void printStatistics(JavaRDD<ConnectedComponent> ccs){
// final JavaRDD<ConnectedComponent> connectedComponents = ccs.filter(cc -> cc.getDocs().size()>1);
// final JavaRDD<ConnectedComponent> nonDeduplicated = ccs.filter(cc -> cc.getDocs().size()==1);
//
//// //print deduped
//// connectedComponents.foreach(cc -> {
//// System.out.println(cc);
//// });
//// //print nondeduped
//// nonDeduplicated.foreach(cc -> {
//// System.out.println(cc);
//// });
//
// System.out.println("Non duplicates: " + nonDeduplicated.count());
// System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count());
// System.out.println("Connected Components: " + connectedComponents.count());
//
// }
//
//}

View File

@ -1,90 +0,0 @@
//package eu.dnetlib;
//
//import eu.dnetlib.graph.GraphProcessor;
//import eu.dnetlib.pace.config.DedupConfig;
//import eu.dnetlib.pace.model.MapDocument;
//import eu.dnetlib.pace.utils.PaceUtils;
//import eu.dnetlib.reporter.SparkBlockProcessor;
//import eu.dnetlib.reporter.SparkReporter;
//import org.apache.spark.api.java.JavaPairRDD;
//import org.apache.spark.api.java.JavaRDD;
//import org.apache.spark.api.java.JavaSparkContext;
//import org.apache.spark.graphx.Edge;
//import org.apache.spark.rdd.RDD;
//import org.apache.spark.sql.SparkSession;
//import org.apache.spark.util.LongAccumulator;
//import scala.Tuple2;
//
//import java.io.IOException;
//import java.util.Map;
//import java.util.stream.Collectors;
//
//public class SparkTest {
//
// public static void main(String[] args) throws IOException {
//
// final String inputSpacePath = args[0];
// final String dedupConfigPath = args[1];
// final String groupsPath = args[2] + "_groups";
// final String outputPath = args[2] + "_output";
//
// final SparkSession spark = SparkSession
// .builder()
// .appName("Deduplication")
// .master("yarn")
// .getOrCreate();
//
// final JavaSparkContext context = new JavaSparkContext(spark.sparkContext());
//
// final JavaRDD<String> dataRDD = Utility.loadDataFromHDFS(inputSpacePath, context);
//
// final DedupConfig config = Utility.loadConfigFromHDFS(dedupConfigPath);
//
// Map<String, LongAccumulator> accumulators = Utility.constructAccumulator(config, context.sc());
//
// //create vertexes of the graph: <ID, MapDocument>
// JavaPairRDD<String, MapDocument> mapDocs = dataRDD.mapToPair(it -> {
// MapDocument mapDocument = PaceUtils.asMapDocument(config, it);
// return new Tuple2<>(mapDocument.getIdentifier(), mapDocument);
// });
// RDD<Tuple2<Object, MapDocument>> vertexes = mapDocs.mapToPair(t -> new Tuple2<Object, MapDocument>( (long) t._1().hashCode(), t._2())).rdd();
//
// //group documents basing on clustering
// JavaPairRDD<String, Iterable<MapDocument>> blocks = mapDocs.reduceByKey((a, b) -> a) //the reduce is just to be sure that we haven't document with same id
// //Clustering: from <id, doc> to List<groupkey,doc>
// .flatMapToPair(a -> {
// final MapDocument currentDocument = a._2();
//
// return Utility.getGroupingKeys(config, currentDocument).stream()
// .map(it -> new Tuple2<>(it, currentDocument)).collect(Collectors.toList()).iterator();
// }).groupByKey();
//
// Utility.deleteIfExists(groupsPath);
// blocks.map(group -> new DocumentsBlock(group._1(), group._2())).saveAsTextFile(groupsPath);
//
// //create relations by comparing only elements in the same group
// final JavaPairRDD<String, String> relationRDD = blocks.flatMapToPair(it -> {
// final SparkReporter reporter = new SparkReporter(accumulators);
// new SparkBlockProcessor(config).process(it._1(), it._2(), reporter, accumulators);
// return reporter.getRelations().iterator();
// });
//
// final RDD<Edge<String>> edgeRdd = relationRDD.map(it -> new Edge<>(it._1().hashCode(),it._2().hashCode(), "similarTo")).rdd();
//
// JavaRDD<ConnectedComponent> ccs = GraphProcessor.findCCs(vertexes, edgeRdd, 20).toJavaRDD();
//
// //save connected components on textfile
// Utility.deleteIfExists(outputPath);
// ccs.saveAsTextFile(outputPath);
//
// final JavaRDD<ConnectedComponent> connectedComponents = ccs.filter(cc -> cc.getDocs().size()>1);
// final JavaRDD<ConnectedComponent> nonDeduplicated = ccs.filter(cc -> cc.getDocs().size()==1);
//
// System.out.println("Non duplicates: " + nonDeduplicated.count());
// System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count());
// System.out.println("Connected Components: " + connectedComponents.count());
// accumulators.forEach((name, acc) -> System.out.println(name + " -> " + acc.value()));
//
// }
//
//}

View File

@ -1,7 +1,7 @@
package eu.dnetlib.graph
import eu.dnetlib.ConnectedComponent
import eu.dnetlib.pace.model.MapDocument
import eu.dnetlib.support.ConnectedComponent
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD

View File

@ -1,7 +1,7 @@
package eu.dnetlib.pace.utils;
import com.google.common.collect.Lists;
import eu.dnetlib.Block;
import eu.dnetlib.support.Block;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;

View File

@ -1,4 +1,4 @@
package eu.dnetlib;
package eu.dnetlib.pace.utils;
import com.google.common.collect.Sets;
import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
@ -65,7 +65,7 @@ public class Utility {
}
static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
public static <T> String readFromClasspath(final String filename, final Class<T> clazz) {
final StringWriter sw = new StringWriter();
try {
IOUtils.copy(clazz.getResourceAsStream(filename), sw);
@ -75,7 +75,7 @@ public class Utility {
}
}
static Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
public static Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
}
}

View File

@ -1,189 +0,0 @@
//package eu.dnetlib.reporter;
//import com.google.common.collect.Lists;
//import eu.dnetlib.pace.clustering.NGramUtils;
//import eu.dnetlib.pace.config.DedupConfig;
//import eu.dnetlib.pace.config.WfConfig;
//import eu.dnetlib.pace.distance.PaceDocumentDistance;
//import eu.dnetlib.pace.distance.eval.ScoreResult;
//import eu.dnetlib.pace.model.Field;
//import eu.dnetlib.pace.model.MapDocument;
//import eu.dnetlib.pace.model.MapDocumentComparator;
//import org.apache.commons.lang.StringUtils;
//import org.apache.commons.logging.Log;
//import org.apache.commons.logging.LogFactory;
//import org.apache.spark.util.LongAccumulator;
//
//import java.util.*;
//
//public class SparkBlockProcessor {
//
// public static final List<String> accumulators= new ArrayList<>();
//
// private static final Log log = LogFactory.getLog(SparkBlockProcessor.class);
//
// private DedupConfig dedupConf;
//
// public SparkBlockProcessor(DedupConfig dedupConf) {
// this.dedupConf = dedupConf;
// }
//
// public void process(final String key, final Iterable<MapDocument> documents, final SparkReporter context, Map<String, LongAccumulator> accumulators) {
//
// final Queue<MapDocument> q = prepare(documents);
//
// if (q.size() > 1) {
// process(simplifyQueue(q, key, context, accumulators), context, accumulators);
//
// } else {
// context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1, accumulators);
// }
// }
//
// private Queue<MapDocument> prepare(final Iterable<MapDocument> documents) {
// final Queue<MapDocument> queue = new PriorityQueue<>(100, new MapDocumentComparator(dedupConf.getWf().getOrderField()));
//
// final Set<String> seen = new HashSet<String>();
// final int queueMaxSize = dedupConf.getWf().getQueueMaxSize();
//
// documents.forEach(doc -> {
// if (queue.size() <= queueMaxSize) {
// final String id = doc.getIdentifier();
//
// if (!seen.contains(id)) {
// seen.add(id);
// queue.add(doc);
// }
// }
// });
//
// return queue;
// }
//
// private Queue<MapDocument> simplifyQueue(final Queue<MapDocument> queue, final String ngram, final SparkReporter context, Map<String, LongAccumulator> accumulators) {
// final Queue<MapDocument> q = new LinkedList<>();
//
// String fieldRef = "";
// final List<MapDocument> tempResults = Lists.newArrayList();
//
// while (!queue.isEmpty()) {
// final MapDocument result = queue.remove();
//
// final String orderFieldName = dedupConf.getWf().getOrderField();
// final Field orderFieldValue = result.values(orderFieldName);
// if (!orderFieldValue.isEmpty()) {
// final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue());
// if (field.equals(fieldRef)) {
// tempResults.add(result);
// } else {
// populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram, accumulators);
// tempResults.clear();
// tempResults.add(result);
// fieldRef = field;
// }
// } else {
// context.incrementCounter(dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1, accumulators);
// }
// }
// populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram, accumulators);
//
// return q;
// }
//
// private void populateSimplifiedQueue(final Queue<MapDocument> q,
// final List<MapDocument> tempResults,
// final SparkReporter context,
// final String fieldRef,
// final String ngram,
// Map<String, LongAccumulator> accumulators) {
// WfConfig wf = dedupConf.getWf();
// if (tempResults.size() < wf.getGroupMaxSize()) {
// q.addAll(tempResults);
// } else {
// context.incrementCounter(wf.getEntityType(), String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), tempResults.size(), accumulators);
//// log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram);
// }
// }
//
// private void process(final Queue<MapDocument> queue, final SparkReporter context, Map<String, LongAccumulator> accumulators) {
//
// final PaceDocumentDistance algo = new PaceDocumentDistance();
//
// while (!queue.isEmpty()) {
//
// final MapDocument pivot = queue.remove();
// final String idPivot = pivot.getIdentifier();
//
// WfConfig wf = dedupConf.getWf();
// final Field fieldsPivot = pivot.values(wf.getOrderField());
// final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? null : fieldsPivot.stringValue();
//
// if (fieldPivot != null) {
// // System.out.println(idPivot + " --> " + fieldPivot);
//
// int i = 0;
// for (final MapDocument curr : queue) {
// final String idCurr = curr.getIdentifier();
//
// if (mustSkip(idCurr)) {
//
// context.incrementCounter(wf.getEntityType(), "skip list", 1, accumulators);
//
// break;
// }
//
// if (i > wf.getSlidingWindowSize()) {
// break;
// }
//
// final Field fieldsCurr = curr.values(wf.getOrderField());
// final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null : fieldsCurr.stringValue();
//
// if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
//
// final ScoreResult sr = similarity(algo, pivot, curr);
//// log.info(sr.toString()+"SCORE "+ sr.getScore());
// emitOutput(sr, idPivot, idCurr, context, accumulators);
// i++;
// }
// }
// }
// }
// }
//
// private void emitOutput(final ScoreResult sr, final String idPivot, final String idCurr, final SparkReporter context, Map<String, LongAccumulator> accumulators) {
// final double d = sr.getScore();
//
// if (d >= dedupConf.getWf().getThreshold()) {
//
// writeSimilarity(context, idPivot, idCurr);
// context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1, accumulators);
// } else {
// context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1, accumulators);
// }
// }
//
// private ScoreResult similarity(final PaceDocumentDistance algo, final MapDocument a, final MapDocument b) {
// try {
// return algo.between(a, b, dedupConf);
// } catch(Throwable e) {
// log.error(String.format("\nA: %s\n----------------------\nB: %s", a, b), e);
// throw new IllegalArgumentException(e);
// }
// }
//
// private boolean mustSkip(final String idPivot) {
// return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
// }
//
// private String getNsPrefix(final String id) {
// return StringUtils.substringBetween(id, "|", "::");
// }
//
// private void writeSimilarity(final SparkReporter context, final String from, final String to) {
// final String type = dedupConf.getWf().getEntityType();
//
// context.emit(type, from, to);
//// context.emit(type, to, from);
// }
//
//}

View File

@ -1,193 +0,0 @@
//package eu.dnetlib.reporter;
//import com.google.common.collect.Lists;
//import eu.dnetlib.pace.clustering.NGramUtils;
//import eu.dnetlib.pace.config.DedupConfig;
//import eu.dnetlib.pace.config.WfConfig;
//import eu.dnetlib.pace.distance.PaceDocumentDistance;
//import eu.dnetlib.pace.distance.eval.ScoreResult;
//import eu.dnetlib.pace.model.Field;
//import eu.dnetlib.pace.model.MapDocument;
//import eu.dnetlib.pace.model.MapDocumentComparator;
//import org.apache.commons.lang.StringUtils;
//import org.apache.commons.logging.Log;
//import org.apache.commons.logging.LogFactory;
//import org.apache.spark.util.LongAccumulator;
//import scala.Tuple2;
//
//import java.util.*;
//
//public class SparkBlockProcessor2 {
//
// private static final Log log = LogFactory.getLog(SparkBlockProcessor2.class);
//
// private DedupConfig dedupConf;
//
// public SparkBlockProcessor2(DedupConfig dedupConf) {
// this.dedupConf = dedupConf;
// }
//
// public boolean isSimilar(Tuple2<MapDocument, MapDocument> t, SparkReporter context, Map<String, LongAccumulator> accumulators) {
//
// final PaceDocumentDistance algo = new PaceDocumentDistance();
//
// final ScoreResult sr = similarity(algo, t._1(), t._2());
//
// final double d = sr.getScore();
//
// if (d >= dedupConf.getWf().getThreshold()) {
// context.incrementCounter(dedupConf.getWf().getEntityType(), "dedupSimilarity (x2)", 1, accumulators);
// return true;
// } else {
// context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1, accumulators);
// return false;
// }
// }
//
// public Iterator<Tuple2<MapDocument, MapDocument>> process(final String key, final Iterable<MapDocument> documents, final SparkReporter context, Map<String, LongAccumulator> accumulators) {
//
// final Queue<MapDocument> q = prepare(documents);
//
// if (q.size() > 1) {
// return process(simplifyQueue(q, key, context, accumulators), context, accumulators);
//
// } else {
// context.incrementCounter(dedupConf.getWf().getEntityType(), "records per hash key = 1", 1, accumulators);
// return new ArrayList<Tuple2<MapDocument,MapDocument>>().iterator();
// }
// }
//
// private Queue<MapDocument> prepare(final Iterable<MapDocument> documents) {
// final Queue<MapDocument> queue = new PriorityQueue<>(100, new MapDocumentComparator(dedupConf.getWf().getOrderField()));
//
// final Set<String> seen = new HashSet<String>();
// final int queueMaxSize = dedupConf.getWf().getQueueMaxSize();
//
// documents.forEach(doc -> {
// if (queue.size() <= queueMaxSize) {
// final String id = doc.getIdentifier();
//
// if (!seen.contains(id)) {
// seen.add(id);
// queue.add(doc);
// }
// }
// });
//
// return queue;
// }
//
// private Queue<MapDocument> simplifyQueue(final Queue<MapDocument> queue, final String ngram, final SparkReporter context, Map<String, LongAccumulator> accumulators) {
// final Queue<MapDocument> q = new LinkedList<>();
//
// String fieldRef = "";
// final List<MapDocument> tempResults = Lists.newArrayList();
//
// while (!queue.isEmpty()) {
// final MapDocument result = queue.remove();
//
// final String orderFieldName = dedupConf.getWf().getOrderField();
// final Field orderFieldValue = result.values(orderFieldName);
// if (!orderFieldValue.isEmpty()) {
// final String field = NGramUtils.cleanupForOrdering(orderFieldValue.stringValue());
// if (field.equals(fieldRef)) {
// tempResults.add(result);
// } else {
// populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram, accumulators);
// tempResults.clear();
// tempResults.add(result);
// fieldRef = field;
// }
// } else {
// context.incrementCounter(dedupConf.getWf().getEntityType(), "missing " + dedupConf.getWf().getOrderField(), 1, accumulators);
// }
// }
// populateSimplifiedQueue(q, tempResults, context, fieldRef, ngram, accumulators);
//
// return q;
// }
//
// private void populateSimplifiedQueue(final Queue<MapDocument> q,
// final List<MapDocument> tempResults,
// final SparkReporter context,
// final String fieldRef,
// final String ngram,
// Map<String, LongAccumulator> accumulators) {
// WfConfig wf = dedupConf.getWf();
// if (tempResults.size() < wf.getGroupMaxSize()) {
// q.addAll(tempResults);
// } else {
// context.incrementCounter(wf.getEntityType(), String.format("Skipped records for count(%s) >= %s", wf.getOrderField(), wf.getGroupMaxSize()), tempResults.size(), accumulators);
//// log.info("Skipped field: " + fieldRef + " - size: " + tempResults.size() + " - ngram: " + ngram);
// }
// }
//
// private Iterator<Tuple2<MapDocument, MapDocument>> process(final Queue<MapDocument> queue, final SparkReporter context, Map<String, LongAccumulator> accumulators) {
//
// final PaceDocumentDistance algo = new PaceDocumentDistance();
//
// List<Tuple2<MapDocument, MapDocument>> ret = new ArrayList<>();
//
// while (!queue.isEmpty()) {
//
// final MapDocument pivot = queue.remove();
// final String idPivot = pivot.getIdentifier();
//
// WfConfig wf = dedupConf.getWf();
// final Field fieldsPivot = pivot.values(wf.getOrderField());
// final String fieldPivot = (fieldsPivot == null) || fieldsPivot.isEmpty() ? null : fieldsPivot.stringValue();
//
// if (fieldPivot != null) {
// // System.out.println(idPivot + " --> " + fieldPivot);
//
// int i = 0;
// for (final MapDocument curr : queue) {
// final String idCurr = curr.getIdentifier();
//
// if (mustSkip(idCurr)) {
//
// context.incrementCounter(wf.getEntityType(), "skip list", 1, accumulators);
//
// break;
// }
//
// if (i > wf.getSlidingWindowSize()) {
// break;
// }
//
// final Field fieldsCurr = curr.values(wf.getOrderField());
// final String fieldCurr = (fieldsCurr == null) || fieldsCurr.isEmpty() ? null : fieldsCurr.stringValue();
//
// if (!idCurr.equals(idPivot) && (fieldCurr != null)) {
//
// if (pivot.getIdentifier().compareTo(curr.getIdentifier())<0){
// ret.add(new Tuple2<>(pivot, curr));
// } else {
// ret.add(new Tuple2<>(curr, pivot));
// }
// i++;
// }
// }
// }
// }
//
// return ret.iterator();
// }
//
// private ScoreResult similarity(final PaceDocumentDistance algo, final MapDocument a, final MapDocument b) {
// try {
// return algo.between(a, b, dedupConf);
// } catch(Throwable e) {
// log.error(String.format("\nA: %s\n----------------------\nB: %s", a, b), e);
// throw new IllegalArgumentException(e);
// }
// }
//
// private boolean mustSkip(final String idPivot) {
// return dedupConf.getWf().getSkipList().contains(getNsPrefix(idPivot));
// }
//
// private String getNsPrefix(final String id) {
// return StringUtils.substringBetween(id, "|", "::");
// }
//
//}

View File

@ -1,11 +1,8 @@
package eu.dnetlib.reporter;
import eu.dnetlib.Utility;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.util.Reporter;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.spark.SparkContext;
import org.apache.spark.util.LongAccumulator;
import scala.Serializable;
import scala.Tuple2;

View File

@ -1,4 +1,4 @@
package eu.dnetlib;
package eu.dnetlib.support;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.model.MapDocument;

View File

@ -1,4 +1,4 @@
package eu.dnetlib;
package eu.dnetlib.support;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.pace.model.Field;

View File

@ -1,4 +1,4 @@
package eu.dnetlib;
package eu.dnetlib.support;
import org.apache.commons.logging.Log;

View File

@ -1,4 +1,4 @@
package eu.dnetlib;
package eu.dnetlib.support;
import org.apache.spark.util.AccumulatorV2;

View File

@ -1,2 +0,0 @@
{ "type": 30, "id": "30|author::id1", "person": { "metadata":{"orcid": "orcid1", "fullname": "smith, john", "firstname": "john", "lastname": "smith", "pubID": "pubid1", "pubDOI": "pubdoi1", "coauthors": ["la bruzzo, sandro", "atzori, claudio", "baglioni, miriam", "bardi, alessia"], "topics": [0.0,0.0,0.0], "rank":1, "area":"1"}}}
{ "type": 30, "id": "30|author::id2", "person": { "metadata":{"orcid": "", "fullname": "smith, john", "firstname": "john", "lastname": "smith", "pubID": "pubid2", "pubDOI": "pubdoi2", "coauthors": ["la bruzzo, sandro", "atzori, claudio", "baglioni, miriam", "bardi, alessia"], "topics": [0.0,0.0,0.0], "rank":3, "area":"1"}}}

View File

@ -1,40 +0,0 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "person",
"orderField" : "fullname",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "person" ],
"includeChildren" : "true"
},
"pace": {
"clustering": [
{"name": "personClustering", "fieldsCount": ["fullname"], "params": {}}
],
"necessaryConditions": [],
"decisionTree": {
"start": {"fieldsCount": [{"field":"pubID", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"NO_MATCH", "negative":"layer2", "undefined": "layer2", "ignoreMissing": "false"},
"layer2": {"fieldsCount": [{"field":"orcid", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"ORCID_MATCH", "negative":"NO_MATCH", "undefined": "layer3", "ignoreMissing": "false"},
"layer3": {"fieldsCount": [{"field":"firstname", "comparator":"similar", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"layer4", "negative":"NO_MATCH", "undefined": "layer4", "ignoreMissing": "false"},
"layer4": {"fieldsCount": [{"field":"coauthors", "comparator":"coauthorsMatch", "weight":1.0, "params":{"minCoauthors":6, "maxCoauthors": 200}}], "threshold":5.0, "aggregation": "SUM", "positive":"COAUTHORS_MATCH", "negative":"NO_MATCH", "undefined": "layer5", "ignoreMissing": "false"},
"layer5": {"fieldsCount": [{"field":"area", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"layer6", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"},
"layer6": {"fieldsCount": [{"field":"topics", "comparator":"topicsMatch", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"TOPICS_MATCH", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"}
},
"model": [
{"name": "fullname", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/fullname"},
{"name": "firstname", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/firstname"},
{"name": "lastname", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/lastname"},
{"name": "coauthors", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/coauthors"},
{"name": "orcid", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/orcid"},
{"name": "topics", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/topics"},
{"name": "pubID", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/pubID"},
{"name": "pubDOI", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/pubDOI"},
{"name": "rank", "algo": "Null", "type": "Int", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/rank"},
{"name": "area", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/area"}
],
"blacklists": {}
}
}

View File

@ -1,35 +0,0 @@
{
"wf" : {
"threshold" : "0.9",
"dedupRun" : "001",
"entityType" : "organization",
"orderField" : "legalname",
"queueMaxSize" : "2000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "sortedngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } }
],
"sufficientConditions" : [
{ "name" : "exactMatch", "fieldsCount" : [ "gridid" ] }
],
"necessaryConditions" : [
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] },
{ "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] }
],
"model" : [
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.5} },
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
],
"blacklists" : { }
}
}

View File

@ -1,30 +0,0 @@
{
"wf" : {
"threshold" : "0.98",
"dedupRun" : "001",
"entityType" : "organization",
"orderField" : "legalname",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "spacetrimmingfieldvalue", "fieldsCount" : [ "legalshortname" ], "params" : { "randomLength" : "5" } }
],
"necessaryConditions" : [
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] }
],
"model" : [
{ "name" : "legalname", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.3", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
{ "name" : "legalshortname", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.3", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },
{ "name" : "websiteurl", "algo" : "urlMatcher", "type" : "URL", "weight" : "0.4", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : {"host":"1.0", "path":"0.0"} }
],
"blacklists" : { }
}
}

View File

@ -1,36 +0,0 @@
{
"wf" : {
"threshold" : "0.85",
"dedupRun" : "001",
"entityType" : "organization",
"orderField" : "legalname",
"queueMaxSize" : "20000",
"groupMaxSize" : "20",
"slidingWindowSize" : "400",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3" } },
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "immutablefieldvalue", "fieldsCount" : [ "country" ], "params" : { } },
{ "name" : "spacetrimmingfieldvalue", "fieldsCount" : [ "legalshortname" ], "params" : { "randomLength" : "5" } },
{ "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } }
],
"sufficientConditions":[
{ "name" : "exactMatch", "fieldsCount" : [ "gridid" ] }
],
"necessaryConditions" : [
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] }
],
"model" : [
{ "name" : "legalname", "algo" : "LevensteinTitle", "type" : "String", "weight" : "0.2", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
{ "name" : "legalshortname", "algo" : "LevensteinTitle", "type" : "String", "weight" : "0.2", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
{ "name" : "websiteurl", "algo" : "urlMatcher", "type" : "URL", "weight" : "0.6", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
],
"blacklists" : { }
}
}

View File

@ -1,27 +0,0 @@
{
"wf" : {
"threshold" : "0.98",
"dedupRun" : "001",
"entityType" : "organization",
"orderField" : "legalname",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } }
],
"necessaryConditions" : [
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] }
],
"model" : [
{ "name" : "legalname", "algo" : "Levenstein", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" }
],
"blacklists" : { }
}
}

View File

@ -1,33 +0,0 @@
{
"wf" : {
"threshold" : "0.98",
"dedupRun" : "001",
"entityType" : "organization",
"orderField" : "legalname",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } }
],
"necessaryConditions" : [
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] },
{ "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] }
],
"decisionTree": {},
"model" : [
{ "name" : "legalname", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },
{ "name" : "legalshortname", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.3", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.7", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : { "windowSize" : 4 } },
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } }
],
"blacklists" : { }
}
}

View File

@ -1,18 +0,0 @@
<<<<<<< HEAD
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIOK"},"websiteurl":{"value":"http://www.niok.eu/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"NIOK"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.450158.d"}],"type":20,"id":"20|grid________::6183d331a1920dd81b8c10620a8b3a8a"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIVEL"},"websiteurl":{"value":"http://www.nivel.nl/en"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"NIVEL"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.416005.6"}],"type":20,"id":"20|grid________::8f65fd4e764086db897cc648e9cbbaed"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"SCP"},"websiteurl":{"value":"http://www.scp.nl/english/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Netherlands Institute for Social Research"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.438038.4"}],"type":20,"id":"20|grid________::c69cffc4997b54bb2eb5ca6aebcda18b"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIVEL"},"websiteurl":{"value":"http://www.nivel.nl/en"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Netherlands Institute for Health Services Research"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.416005.6"}],"type":20,"id":"20|grid________::5b72dc608480f3d5569a7bfe3cbdaf07"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"SCP"},"websiteurl":{"value":"http://www.scp.nl/english/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"SCP"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.438038.4"}],"type":20,"id":"20|grid________::6b7b927a3ae25f1639a6ef27b35021b5"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIOK"},"websiteurl":{"value":"http://www.niok.eu/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Netherlands Institute for Catalysis Research"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.450158.d"}],"type":20,"id":"20|grid________::6af340f03c44041737859d3e1354d1fe"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIVEL"},"websiteurl":{"value":"http://www.nivel.nl/en"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Nederlands Instituut voor Onderzoek van de Gezondheidszorg"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.416005.6"}],"type":20,"id":"20|grid________::69ab0f5ed7da9d961355cb4eb24b8613"}
{"collectedfrom":[{"value":"GRID - Global Research Identifier Database","key":"10|openaire____::ff4a008470319a22d9cf3d14af485977"}],"organization":{"metadata":{"legalshortname":{"value":"NIOK"},"websiteurl":{"value":"http://www.niok.eu/"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"legalname":{"value":"Nederlands Instituut voor Onderzoek in de Katalyse"}}},"pid":[{"qualifier":{"classid":"grid","classname":"grid","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.450158.d"}],"type":20,"id":"20|grid________::267cf3ce23903e0a8403653019ce8187"}
{"dateoftransformation":"2018-11-20","originalId":["corda_______::998294125"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NIVEL"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nivel.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NEDERLANDS INSTITUUT VOOR ONDERZOEK VAN DE GEZONDHEIDSZORG"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::755737ed505484ea374062762ef05ef6"}
{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::998294125"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NIVEL"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nivel.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NEDERLANDS INSTITUUT VOOR ONDERZOEK VAN DE GEZONDHEIDSZORG"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::755737ed505484ea374062762ef05ef6"}
{"dateoftransformation":"2018-09-13","originalId":["snsf________::The_Netherlands_Institute_of_Health_Services_Research_NIVEL"],"collectedfrom":[{"value":"SNSF - Swiss National Science Foundation","key":"10|openaire____::d8f3c25e18304608ce8e816e99603d7a"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"The Netherlands Institute of Health Services Research NIVEL"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2017-09-22","type":20,"id":"20|snsf________::10653be4e9c170181486aa9782346d81"}
{"dateoftransformation":"2018-09-13","originalId":["openaire____::088a0087-4bc6-4c38-a052-b446c3b225a7::The Netherlands Institute for Social Research"],"collectedfrom":[{"value":"","key":""}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"The Netherlands Institute for Social Research"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-03-30","type":20,"id":"20|openaire____::857b30f258c43852a2cb57875ac40892"}
=======
{"dateoftransformation":"2018-11-20","originalId":["corda_______::999987066"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NLR"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nlr.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NATIONAAL LUCHT- EN RUIMTEVAARTLABORATORIUM"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::9cb56cf06fbe3926d0c88ee320908848"}
{"dateoftransformation":"2019-06-26","originalId":["corda__h2020::999987066"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NLR"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nlr.nl"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"STICHTING NATIONAAL LUCHT- EN RUIMTEVAARTLABORATORIUM"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::9cb56cf06fbe3926d0c88ee320908848"}
{"dateoftransformation":"2018-11-12","originalId":["opendoar____::Netherlands_Aerospace_Centre"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"NLR"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.nlr.nl/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Netherlands Aerospace Centre"},"country":{"classid":"NL","classname":"Netherlands","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-11-12","type":20,"id":"20|opendoar____::ce12359dec61a8e00837c3e507918812"}
>>>>>>> origin/master

View File

@ -1,5 +0,0 @@
{"dateoftransformation":"2018-09-17","originalId":["corda__h2020::999838074"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIVR"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.univr.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITA DEGLI STUDI DI VERONA"},"country":{"classid":"IT","classname":"IT","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::3898c35de19616484a0e901a92a709f5"}
{"dateoftransformation":"2018-09-13","originalId":["corda_______::999838074"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIVR"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.univr.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITA DEGLI STUDI DI VERONA"},"country":{"classid":"IT","classname":"IT","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::3898c35de19616484a0e901a92a709f5"}
{"dateoftransformation":"2018-09-17","originalId":["corda__h2020::999976687"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIGE"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.unige.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"UNIVERSITA DEGLI STUDI DI GENOVA"},"country":{"classid":"IT","classname":"IT","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::67fba37704a39567853e54615c5371fe"}
{"dateoftransformation":"2018-09-13","originalId":["opendoar____::Università_degli_Studi_di_Genova"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"https://www.unige.it/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Università degli Studi di Genova"},"country":{"classid":"IT","classname":"IT","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2016-09-01","type":20,"id":"20|opendoar____::fcd6c93c2863e6be9c6f6a66d761c92d"}
{"dateoftransformation":"2018-09-13","originalId":["opendoar____::Università_degli_Studi_di_Verona"],"collectedfrom":[{"value":"OpenDOAR","key":"10|openaire____::47ce9e9f4fad46e732cff06419ecaabb"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.univr.it/"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Università degli Studi di Verona"},"country":{"classid":"IT","classname":"IT","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2015-08-24","type":20,"id":"20|opendoar____::3f2f26e4bf71340e806ec956884fe34e"}

View File

@ -1,2 +0,0 @@
{"dateoftransformation":"2018-09-17","originalId":["corda__h2020::999838074"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse - Horizon 2020","key":"10|openaire____::a55eb91348674d853191f4f4fd73d078"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIVR"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.univr.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"VERONA"},"country":{"classid":"IT","classname":"IT","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda__h2020::3898c35de19616484a0e901a92a709f5"}
{"dateoftransformation":"2018-09-13","originalId":["corda_______::999838074"],"collectedfrom":[{"value":"CORDA - COmmon Research DAta Warehouse","key":"10|openaire____::b30dac7baac631f3da7c2bb18dd9891f"}],"organization":{"metadata":{"eclegalbody":{"value":"true"},"eclegalperson":{"value":"true"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"UNIVR"},"ecresearchorganization":{"value":"true"},"ecnonprofit":{"value":"true"},"ecenterprise":{"value":"false"},"websiteurl":{"value":"http://www.univr.it"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"GENOVA"},"country":{"classid":"IT","classname":"IT","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"true"}}},"dateofcollection":"2018-03-12","type":20,"id":"20|corda_______::3898c35de19616484a0e901a92a709f5"}

View File

@ -1,51 +0,0 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "acronyms", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
{ "name" : "ngrampairs", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }
],
"necessaryConditions" : [
{ "name" : "yearMatch", "fieldsCount" : [ "dateofacceptance" ] },
{ "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] },
{ "name" : "sizeMatch", "fieldsCount" : [ "authors" ] } ,
{ "name" : "pidMatch", "fieldsCount" : [ "pid" ] }
],
"model" : [
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
{ "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } ,
{ "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname", "size" : 2 }
],
"blacklists" : {
"title" : [
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*"
] }
}
}

View File

@ -1,22 +0,0 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result" ],
"includeChildren" : "true"
},
"pace" : {
"necessaryConditions" : [ ],
"model" : [
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }
],
"blacklists" : { },
"synonyms" : { }
}
}

View File

@ -1,10 +0,0 @@
{"dateoftransformation":"2016-03-12T12:49:38.412Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1621890.1621915"}],"originalId":["1621915"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1621915"],"dateofacceptance":{"value":"2009-06-16"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Ferm&#237;n Gal&#225;n","rank":1},{"fullname":"Americo Sampaio","rank":2},{"fullname":"Luis Rodero-Merino","rank":3},{"fullname":"Irit Loy","rank":4},{"fullname":"Victor Gil","rank":5},{"fullname":"Luis Vaquero","rank":6}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Service specification in cloud environments based on extensions to open standards"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2009-06-16"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::0002c24f82c295e925a2bdf7bbf49bfc"}
{"dateoftransformation":"2016-03-12T12:49:38.413Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1109/PESOS.2009.5068828"}],"originalId":["1564735"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1564735"],"dateofacceptance":{"value":"2009-05-18"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Domenico Bianculli","rank":1},{"fullname":"Carlo Ghezzi","rank":2},{"fullname":"Cesare Pautasso","rank":3}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Embedding continuous lifelong verification in service life cycles"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2009-05-18"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::000e0060b89af1706db93e289527a88d"}
{"dateoftransformation":"2016-03-12T12:49:38.413Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1966913.1966935"}],"originalId":["1966935"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1966935"],"dateofacceptance":{"value":"2011-03-22"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Francesco Alberti","rank":1},{"fullname":"Alessandro Armando","rank":2},{"fullname":"Silvio Ranise","rank":3}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Efficient symbolic automated analysis of administrative attribute-based RBAC-policies"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2011-03-22"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::00680ab21c76269e780f5e9e7e636619"}
{"dateoftransformation":"2016-03-12T12:49:38.414Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1570433.1570486"}],"originalId":["1570486"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1570486"],"dateofacceptance":{"value":"2009-07-15"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Daniel Schreiber","rank":1},{"fullname":"Melanie Hartmann","rank":2},{"fullname":"Max M&#252;hlh&#228;user","surname":"Hlh User","name":"Max M.","rank":3}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"MundoMonkey"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2009-07-15"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::008169b761b014b88105a9ed96bb0b4c"}
{"dateoftransformation":"2016-03-12T12:49:38.414Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/2002259.2002332"}],"originalId":["2002332"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=2002332"],"dateofacceptance":{"value":"2011-07-11"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Roland St&#252;hmer","rank":1},{"fullname":"Nenad Stojanovic","rank":2}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Large-scale, situation-driven and quality-aware event marketplace"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2011-07-11"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::00b0f37683e305a90c3397f328fb558a"}
{"dateoftransformation":"2016-03-12T12:49:38.414Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1852658.1852664"}],"originalId":["1852664"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1852664"],"dateofacceptance":{"value":"2010-04-13"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Michal Kryczka","rank":1},{"fullname":"Ruben Cuevas","rank":2},{"fullname":"Carmen Guerrero","rank":3},{"fullname":"Eiko Yoneki","rank":4},{"fullname":"Arturo Azcorra","rank":5}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"A first step towards user assisted online social networks"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2010-04-13"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::00e918f80a81af40a5e5770024f9256f"}
{"dateoftransformation":"2016-03-12T12:49:38.415Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1978582.1978584"}],"originalId":["1978584"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1978584"],"dateofacceptance":{"value":"2011-05-11"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Salvatore D'Antonio","surname":"Antonio","name":"Salvatore D.","rank":1},{"fullname":"Luigi Coppolino","rank":2},{"fullname":"Ivano Elia","rank":3},{"fullname":"Valerio Formicola","rank":4}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Security issues of a phasor data concentrator for smart grid infrastructure"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2011-05-11"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::012f02c31a80f63a43772e662aca364f"}
{"dateoftransformation":"2016-03-12T12:49:38.415Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1809400.1809402"}],"originalId":["1809402"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1809402"],"dateofacceptance":{"value":"2010-05-27"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Kai Puolam&#228;ki","rank":1},{"fullname":"Alessio Bertone","rank":2}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Introduction to the special issue on visual analytics and knowledge discovery"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2010-05-27"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::020794cfeedb650987bf93d3e3e09011"}
{"dateoftransformation":"2016-03-12T12:49:38.416Z","pid":[{"qualifier":{"classid":"doi","classname":"doi","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"10.1145/1851275.1851254"}],"originalId":["1851254"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=1851254"],"dateofacceptance":{"value":"2010-08-30"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"S&#233;bastien Barr&#233;","surname":"Bastien Barr","name":"S.","rank":1},{"fullname":"Olivier Bonaventure","rank":2},{"fullname":"Costin Raiciu","rank":3},{"fullname":"Mark Handley","rank":4}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Experimenting with multipath TCP"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2010-08-30"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::023fa75845681e2812d97440d070fb69"}
{"dateoftransformation":"2016-03-12T12:49:38.416Z","originalId":["2043516"],"oaiprovenance":{},"result":{"instance":[{"hostedby":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"url":["http://dl.acm.org/citation.cfm?id=2043516"],"dateofacceptance":{"value":"2011-09-06"},"collectedfrom":{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"},"accessright":{"classid":"CLOSED","classname":"Closed Access","schemename":"dnet:access_modes","schemeid":"dnet:access_modes"},"instancetype":{"classid":"0001","classname":"Article","schemename":"dnet:publication_resource","schemeid":"dnet:publication_resource"}}],"metadata":{"embargoenddate":{"value":""},"language":{"classid":"und","classname":"Undetermined","schemename":"dnet:languages","schemeid":"dnet:languages"},"author":[{"fullname":"Paolo Pileggi","rank":1},{"fullname":"Giuseppe Bianchi","rank":2}],"title":[{"qualifier":{"classid":"main title","classname":"main title","schemename":"dnet:dataCite_title","schemeid":"dnet:dataCite_title"},"value":"Traffic-centric modeling of future wireless internet access technologies"}],"resulttype":{"classid":"publication","classname":"publication","schemename":"dnet:result_typologies","schemeid":"dnet:result_typologies"},"dateofacceptance":{"value":"2011-09-06"}}},"collectedfrom":[{"value":"ACM Digital Library","key":"10|openaire____::02b55e4f52388520bfe11f959f836e68"}],"dateofcollection":"2015-01-20T00:00:00Z","type":50,"id":"50|acm_________::02a8fbd0aa341df6dbb8323f453091f8"}

View File

@ -1,37 +0,0 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"subEntityType" : "resulttype",
"subEntityValue" : "software",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_isAffiliatedWith", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fieldsCount" : [ "doi", "url" ], "params" : { } }
],
"sufficientConditions" : [
{ "name" : "exactMatch", "fieldsCount" : [ "doi", "resulttype", "url" ] }
],
"necessaryConditions" : [
{ "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] }
],
"model" : [
{ "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },
{ "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "url", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/instance/url" },
{ "name" : "resulttype", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "false", "path" : "result/metadata/resulttype/classid" }
],
"blacklists" : {
}
}
}

View File

@ -28,38 +28,14 @@ import java.util.stream.IntStream;
public abstract class AbstractProtoPaceTest extends OafTest {
/** Loads the dedup configuration from the classpath resource {@code /eu/dnetlib/pace/result.full.pace.conf}. */
protected DedupConfig getResultFullConf() {
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.full.pace.conf"));
}
/** Loads the dedup configuration from the classpath resource {@code /eu/dnetlib/pace/result.simple.pace.conf}. */
protected DedupConfig getResultSimpleConf() {
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.simple.pace.conf"));
}
/** Loads the dedup configuration from the classpath resource {@code /eu/dnetlib/pace/result.pace.conf}. */
protected DedupConfig getResultConf() {
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.pace.conf"));
}
protected DedupConfig getOrganizationSimpleConf() {
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
/** Loads the dedup configuration from the classpath resource {@code /eu/dnetlib/pace/config/organization.current.conf}. */
protected DedupConfig getOrganizationCurrentConf() {
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf"));
}
protected DedupConfig getOrganizationTestConf() {
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.test.conf"));
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.test.conf"));
}
/** Loads the dedup configuration from the classpath resource {@code /eu/dnetlib/pace/authors.test.pace.conf}. */
protected DedupConfig getAuthorsTestConf() {
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/authors.test.pace.conf"));
}
/** Loads the dedup configuration from the classpath resource {@code /eu/dnetlib/pace/result.authors.pace.conf}. */
protected DedupConfig getResultAuthorsConf() {
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.authors.pace.conf"));
}
/** Loads the dedup configuration from the classpath resource {@code /eu/dnetlib/pace/result.prod.pace.conf}. */
protected DedupConfig getResultProdConf() {
return DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.conf"));
}
/**
 * Builds a {@link MapDocument} for the entity carried by {@code oaf}.
 *
 * @param conf configuration whose {@code model()} is handed to the document builder
 * @param id identifier passed to the document builder
 * @param oaf wrapper whose {@code getEntity()} supplies the entity to convert
 * @return the document produced by {@code ProtoDocumentBuilder.newInstance}
 */
protected MapDocument author(final Config conf, final String id, final Oaf oaf) {
return ProtoDocumentBuilder.newInstance(id, oaf.getEntity(), conf.model());
}

View File

@ -0,0 +1,77 @@
package eu.dnetlib.pace;
import eu.dnetlib.Deduper;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.utils.Utility;
import eu.dnetlib.support.ConnectedComponent;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;
import java.net.URL;
/**
 * Manual, local-mode Spark run of the deduplication workflow over a sample organization dataset.
 *
 * <p>{@link #setup()} loads the dedup configuration and the input records from the test classpath;
 * {@link #dedupTest()} (currently {@code @Ignore}d) runs {@code Deduper.dedup} on them and prints
 * the resulting connected components. Nothing is asserted: the output is meant to be read by a human.
 */
public class DedupLocalTest {

    /** Classpath location of the dedup configuration used by this run. */
    private static final String CONFIG_RESOURCE = "/eu/dnetlib/pace/config/organization.strict.conf";

    /** Classpath location of the JSON-lines dataset to deduplicate. */
    private static final String DATASET_RESOURCE = "/eu/dnetlib/pace/examples/organization.to.fix.json";

    JavaSparkContext context;
    JavaRDD<String> entities;
    DedupConfig conf;

    /**
     * Loads the configuration, starts (or reuses) a local Spark session and reads the dataset as
     * an RDD of text lines.
     *
     * @throws IllegalStateException if the dataset resource is not on the classpath
     */
    @Before
    public void setup() {
        conf = DedupConfig.load(Utility.readFromClasspath(CONFIG_RESOURCE, DedupLocalTest.class));

        // Resolve the dataset before touching Spark, so a missing resource fails fast with a
        // readable message instead of a NullPointerException after the session was created.
        final URL dataset = getClass().getResource(DATASET_RESOURCE);
        if (dataset == null) {
            throw new IllegalStateException("dataset not found on the classpath: " + DATASET_RESOURCE);
        }

        final SparkSession spark = SparkSession
                .builder()
                .appName("Deduplication")
                .master("local[*]")
                .getOrCreate();
        context = new JavaSparkContext(spark.sparkContext());

        entities = context.textFile(dataset.getPath());
    }

    /**
     * Runs the deduplication and prints the elapsed wall-clock time (milliseconds) followed by
     * the statistics of {@link #printStatistics(JavaRDD)}.
     */
    @Ignore
    @Test
    public void dedupTest() {
        // currentTimeMillis() is a long; keeping it integral prints "1234" rather than "1234.0".
        final long startTime = System.currentTimeMillis();

        final JavaRDD<ConnectedComponent> ccs = Deduper.dedup(context, entities, conf);

        System.out.println("total time = " + (System.currentTimeMillis() - startTime));

        printStatistics(ccs);
    }

    /**
     * Prints every connected component holding more than one document, then every component
     * holding exactly one, and finally three counters: the number of single-document components,
     * the total number of documents inside multi-document components, and the number of
     * multi-document components.
     *
     * @param ccs connected components to report on
     */
    public static void printStatistics(JavaRDD<ConnectedComponent> ccs) {
        final JavaRDD<ConnectedComponent> connectedComponents = ccs.filter(cc -> cc.getDocs().size() > 1);
        final JavaRDD<ConnectedComponent> nonDeduplicated = ccs.filter(cc -> cc.getDocs().size() == 1);

        // print deduped
        connectedComponents.foreach(cc -> {
            System.out.println(cc);
        });

        // print nondeduped
        nonDeduplicated.foreach(cc -> {
            System.out.println(cc);
        });

        System.out.println("Non duplicates: " + nonDeduplicated.count());
        System.out.println("Duplicates: " + connectedComponents.flatMap(cc -> cc.getDocs().iterator()).count());
        System.out.println("Connected Components: " + connectedComponents.count());
    }
}

View File

@ -1,41 +0,0 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.AbstractProtoPaceTest;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.FieldListImpl;
import eu.dnetlib.pace.model.FieldValueImpl;
import eu.dnetlib.pace.model.MapDocument;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.Before;
import org.junit.Test;
/**
 * Smoke test for {@code BlacklistAwareClusteringCombiner.filterAndCombine}: builds one result
 * document with the full result configuration, alters its field map and logs what the combiner
 * returns. Nothing is asserted.
 */
public class BlacklistAwareClusteringCombinerTest extends AbstractProtoPaceTest {

    private static final Log log = LogFactory.getLog(BlacklistAwareClusteringCombinerTest.class);

    private Config config;

    /** Selects the full result configuration for the test below. */
    @Before
    public void setUp() {
        config = getResultFullConf();
    }

    /** Feeds a hand-built result document to the combiner and logs the outcome. */
    @Test
    public void testCombine() {
        final String title = "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission";
        final MapDocument doc = result(config, "A", title, "2013");

        final FieldListImpl extra = new FieldListImpl();
        final FieldValueImpl description = new FieldValueImpl(Type.String, "desc", "hello world description pipeline");
        extra.add(description);
        doc.getFieldMap().put("desc", extra);

        // NOTE(review): the very list just registered under "desc" is emptied and refilled here,
        // so (assuming the field map keeps the reference) "desc" presumably ends up holding the
        // "title" value below. Kept exactly as written; confirm whether this aliasing is intended.
        extra.clear();
        final FieldValueImpl extraTitle = new FieldValueImpl(Type.String, "title", "lorem ipsum cabalie qwerty");
        extra.add(extraTitle);

        final FieldListImpl titleField = (FieldListImpl) doc.getFieldMap().get("title");
        titleField.add(extra);

        log.info(BlacklistAwareClusteringCombiner.filterAndCombine(doc, config));
    }
}

View File

@ -1,7 +1,6 @@
package eu.dnetlib.pace.clustering;
import eu.dnetlib.pace.AbstractProtoPaceTest;
import eu.dnetlib.pace.clustering.ClusteringCombiner;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.FieldListImpl;
@ -20,20 +19,23 @@ public class ClusteringCombinerTest extends AbstractProtoPaceTest {
@Before
public void setUp() {
config = getResultFullConf();
config = getOrganizationTestConf();
}
@Test
public void testCombine() {
String title = "Dipping in Cygnus X-2 in a multi-wavelength campaign due to absorption of extended ADC emission";
MapDocument result = result(config, "A", title, "2013");
FieldListImpl fl = new FieldListImpl();
fl.add(new FieldValueImpl(Type.String, "desc", "lorem ipsum cabalie qwerty"));
final MapDocument organization = organization(config, "A", "University of Turin", "UNITO");
log.info("University of Turin");
log.info(ClusteringCombiner.combine(organization, config));
}
result.getFieldMap().put("desc", fl);
log.info(title);
log.info(ClusteringCombiner.combine(result, config));
/**
 * Builds a sample organization document and logs its legal name followed by the output of
 * {@code BlacklistAwareClusteringCombiner.filterAndCombine}. Nothing is asserted.
 */
@Test
public void testCombineBlacklistAware() {
    final String legalName = "University of Turin";
    final MapDocument sample = organization(config, "A", legalName, "UNITO");

    log.info(legalName);
    log.info(BlacklistAwareClusteringCombiner.filterAndCombine(sample, config));
}
}

View File

@ -1,360 +0,0 @@
//package eu.dnetlib.pace.distance;
//
//import com.google.common.collect.Lists;
//import com.google.common.collect.Sets;
//import com.googlecode.protobuf.format.JsonFormat;
//import eu.dnetlib.data.proto.OafProtos;
//import eu.dnetlib.pace.AbstractProtoPaceTest;
//import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner;
//import eu.dnetlib.pace.config.Config;
//import eu.dnetlib.pace.config.DedupConfig;
//import eu.dnetlib.pace.distance.eval.ScoreResult;
//import eu.dnetlib.pace.model.MapDocument;
//import eu.dnetlib.pace.model.ProtoDocumentBuilder;
//import org.apache.commons.logging.Log;
//import org.apache.commons.logging.LogFactory;
//import org.junit.Ignore;
//import org.junit.Test;
//
//import java.io.IOException;
//import java.util.List;
//import java.util.Set;
//
//import static org.junit.Assert.assertFalse;
//import static org.junit.Assert.assertTrue;
//
//public class DetectorTest extends AbstractProtoPaceTest {
//
// private static final Log log = LogFactory.getLog(DetectorTest.class);
//
// @Test
// public void testDistanceResultSimple() {
// final Config config = getResultSimpleConf();
// final MapDocument resA = result(config, "A", "Recent results from CDF");
// final MapDocument resB = result(config, "B", "Recent results from CDF");
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// final double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// assertTrue(d == 1.0);
// }
//
// @Test
// public void testDistanceResultSimpleMissingDates() {
// final Config config = getResultSimpleConf();
// final MapDocument resA = result(config, "A", "Recent results from BES");
// final MapDocument resB = result(config, "A", "Recent results from CES");
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// final double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// assertTrue(d > 0.97);
// }
//
// @Test
// public void testDistanceResultInvalidDate() {
// final Config config = getResultConf();
// final MapDocument resA = result(config, "A", "title title title 6BESR", "2013-01-05");
// final MapDocument resB = result(config, "B", "title title title 6BESR", "qwerty");
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// final double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// assertTrue(d == 1.0);
// }
//
// @Ignore
// @Test
// public void testDistanceResultMissingOneDate() {
// final Config config = getResultConf();
// final MapDocument resA = result(config, "A", "title title title 6BESR", null);
// final MapDocument resB = result(config, "B", "title title title 6CLER", "2012-02");
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// assertTrue((d > 0.9) && (d < 1.0));
// }
//
// @Ignore
// @Test
// public void testDistanceResult() {
// final Config config = getResultConf();
// final MapDocument resA = result(config, "A", "title title title BES", "");
// final MapDocument resB = result(config, "B", "title title title CLEO");
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// assertTrue((d > 0.9) && (d < 1.0));
// }
//
// @Ignore
// @Test
// public void testDistanceResultMissingTwoDate() {
// final Config config = getResultConf();
// final MapDocument resA = result(config, "A", "bellaciao");
// final MapDocument resB = result(config, "B", "bellocioa");
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// assertTrue((d > 0.9) && (d < 1.0));
// }
//
// @Ignore
// @Test
// public void testDistanceOrganizationIgnoreMissing() {
// final Config config = getOrganizationSimpleConf();
// final MapDocument orgA = organization(config, "A", "CONSIGLIO NAZIONALE DELLE RICERCHE");
// final MapDocument orgB = organization(config, "B", "CONSIGLIO NAZIONALE DELLE RICERCHE", "CNR");
// final ScoreResult sr = new PaceDocumentDistance().between(orgA, orgB, config);
// final double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// assertTrue(d > 0.99);
// }
//
//
// @Test
// public void testDistanceOrganizations() {
// final Config config = getOrganizationTestConf();
// final MapDocument orgA = organization(config, "A", "UNIVERSITA DEGLI STUDI DI VERONA");
// final MapDocument orgB = organization(config, "B", "UNIVERSITY OF GENOVA");
// final ScoreResult sr = new PaceDocumentDistance().between(orgA, orgB, config);
// final double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// }
//
//
// @Test
// public void testDistanceResultCase1() {
// final Config config = getResultConf();
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003");
// final MapDocument resB = result(config, "B", "Search for the Standard Model Higgs Boson", "2003");
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// System.out.println("d = " + d);
// assertTrue((d >= 0.9) && (d <= 1.0));
// }
//
// @Test
// public void testDistanceResultCaseDoiMatch1() {
// final Config config = getResultConf();
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs boson", "2003", "10.1594/PANGAEA.726855");
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", "10.1594/PANGAEA.726855");
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// assertTrue("exact DOIs will produce an exact match", d == 1.0);
// }
//
// @Test
// public void testDistanceResultCaseDoiMatch2() {
// final Config config = getResultConf();
// final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1594/PANGAEA.726855");
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2005", "10.1594/PANGAEA.726855");
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// assertTrue("exact DOIs will produce an exact match, regardless of different titles or publication years", d == 1.0);
// }
//
// @Test
// public void testDistanceResultCaseDoiMatch3() {
// final Config config = getResultConf();
// final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
// final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003");
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// assertTrue("a missing DOI will cause the comparison to continue with the following necessaryConditions", d == 1.0);
// }
//
// @Test
// public void testDistanceResultCaseDoiMatch4() {
// final Config config = getResultConf();
// final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
// final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2005");
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// assertTrue("a missing DOI, comparison continues with the following necessaryConditions, different publication years will drop the score to 0", d == 0.0);
// }
//
// @Test
// public void testDistanceResultCaseDoiMatch5() {
// final Config config = getResultConf();
// final MapDocument resA = result(config, "A", "Search for the Standard Model Higgs Boson", "2003", "10.1016/j.jmb.2010.12.020");
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003");
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// assertTrue("a missing DOI, comparison continues with the following necessaryConditions", (d > 0.9) && (d < 1.0));
// }
//
// @Test
// public void testDistanceResultCaseDoiMatch6() {
// final Config config = getResultConf();
// final MapDocument resA = result(config, "A", "Conference proceedings on X. Appendix", "2003", "10.1016/j.jmb.2010.12.024");
// final MapDocument resB = result(config, "B", "Conference proceedings on X. Appendix", "2003", "anotherDifferentDOI");
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// assertTrue("different DOIs will NOT drop the score to 0, then evaluate other fields", d == 1.0);
// }
//
// @Test
// public void testDistanceResultCaseDoiMatch7() {
// final Config config = getResultConf();
// final MapDocument resA = result(config, "A", "Adrenal Insufficiency asd asd", "1951", Lists.newArrayList("PMC2037944", "axdsds"));
// final MapDocument resB = result(config, "B", "Adrenal Insufficiency", "1951", "PMC2037944");
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// assertTrue("different DOIs will drop the score to 0, regardless of the other fields", d > 0.89 & d < 1);
// }
//
// // http://dx.doi.org/10.1594/PANGAEA.726855 doi:10.1594/PANGAEA.726855
// @Test
// public void testDistanceResultCaseAuthor1() {
// final Config config = getResultAuthorsConf();
// final List<String> authorsA = Lists.newArrayList("a", "b", "c", "d");
// final List<String> authorsB = Lists.newArrayList("a", "b", "c");
// final List<String> pid = Lists.newArrayList();
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// final double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// assertTrue(d == 0.0);
// }
//
// @Test
// public void testDistanceResultCaseAuthor2() {
// final Config config = getResultAuthorsConf();
// final List<String> authorsA = Lists.newArrayList("a", "b", "c");
// final List<String> authorsB = Lists.newArrayList("a", "b", "c");
// final List<String> pid = Lists.newArrayList();
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// final double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// assertTrue(d == 1.0);
// }
//
// @Test
// public void testDistanceResultCaseAuthor3() {
// final Config config = getResultAuthorsConf();
// final List<String> authorsA = Lists.newArrayList("Bardi, A.", "Manghi, P.", "Artini, M.");
// final List<String> authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
// final List<String> pid = Lists.newArrayList();
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// assertTrue((d > 0.9) && (d < 1.0));
// }
//
// @Test
// public void testDistanceResultCaseAuthor4() {
// final Config config = getResultAuthorsConf();
// final List<String> authorsA = Lists.newArrayList("Bardi, Alessia", "Manghi, Paolo", "Artini, Michele", "a");
// final List<String> authorsB = Lists.newArrayList("Bardi Alessia", "Manghi Paolo", "Artini Michele");
// final List<String> pid = Lists.newArrayList();
// final MapDocument resA = result(config, "A", "Search the Standard Model Higgs Boson", "2003", pid, authorsA);
// final MapDocument resB = result(config, "B", "Search the Standard Model Higgs Boson", "2003", pid, authorsB);
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// final double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// // assertTrue(d.getScore() == 0.0);
// }
//
// @Test
// public void testDistanceResultNoPidsConf() {
// final Config config = getResultFullConf();
// final MapDocument resA =
// result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010");
// final MapDocument resB =
// result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010");
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// final double s = sr.getScore();
// log.info(sr.toString());
// log.info(String.format(" s ---> %s", s));
// // assertTrue(d.getScore() == 0.0);
// }
//
// @Test
// public void testDistanceResultPidsConf() {
// final Config config = getResultFullConf();
// final List<String> authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
// final List<String> authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");
// final List<String> pidA = Lists.newArrayList("10.1186/1752-1947-4-299", "a", "b");
// final MapDocument resA =
// result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
// pidA, authorsA);
// final List<String> pidB = Lists.newArrayList("c", "a", "10.1186/1752-1947-4-299", "d");
// final MapDocument resB =
// result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reportsX", "2010",
// pidB, authorsB);
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// final double s = sr.getScore();
// log.info(sr.toString());
// log.info(String.format(" s ---> %s", s));
// // assertTrue(d.getScore() == 0.0);
// }
//
// @Test
// public void testDistanceResultFullConf() {
// final Config config = getResultFullConf();
// final List<String> authorsA = Lists.newArrayList("Nagarajan Pranesh", "Guy Vautier", "Punyanganie de Silva");
// final List<String> authorsB = Lists.newArrayList("Pranesh Nagarajan", "Vautier Guy", "de Silva Punyanganie");
// final MapDocument resA =
// result(config, "A", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
// "10.1186/1752-1947-4-299", authorsA);
// final MapDocument resB =
// result(config, "B", "Presentations of perforated colonic pathology in patients with polymyalgia rheumatica: two case reports", "2010",
// "10.1186/1752-1947-4-299", authorsB);
// final ScoreResult sr = new PaceDocumentDistance().between(resA, resB, config);
// final double d = sr.getScore();
// log.info(String.format(" d ---> %s", d));
// // assertTrue(d.getScore() == 0.0);
// }
//
// @Ignore
// @Test
// public void testDistance() throws IOException {
// final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/result.prod.pace.json"));
// final MapDocument crossref = asMapDocument(conf, "/eu/dnetlib/pace/crossref.json");
// final MapDocument alicante = asMapDocument(conf, "/eu/dnetlib/pace/alicante.json");
// final ScoreResult result = new PaceDocumentDistance().between(crossref, alicante, conf);
// log.info("score = " + result);
// }
//
// @Ignore
// @Test
// public void testDistanceOrgs() throws IOException {
// final DedupConfig conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/organization.pace.conf"));
// final MapDocument orgA = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization1.json"));
// final MapDocument orgB = asMapDocument(conf, readFromClasspath("/eu/dnetlib/pace/organization2.json"));
// Set<String> keysA = getGroupingKeys(conf, orgA);
// Set<String> keysB = getGroupingKeys(conf, orgB);
// assertFalse(String.format("A: %s\nB: %s", keysA, keysB), Sets.intersection(keysA, keysB).isEmpty());
// log.info("clustering keys A = " + getGroupingKeys(conf, orgA));
// log.info("clustering keys B = " + getGroupingKeys(conf, orgB));
// final ScoreResult result = new PaceDocumentDistance().between(orgA, orgB, conf);
// log.info("score = " + result);
// log.info("compare = " + result.getScore());
// }
//
// private Set<String> getGroupingKeys(DedupConfig conf, MapDocument doc) {
// return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf));
// }
//
// private MapDocument asMapDocument(DedupConfig conf, final String json) {
// OafProtos.OafEntity.Builder b = OafProtos.OafEntity.newBuilder();
// try {
// JsonFormat.merge(json, b);
// } catch (JsonFormat.ParseException e) {
// throw new IllegalArgumentException(e);
// }
// return ProtoDocumentBuilder.newInstance(b.getId(), b.build(), conf.getPace().getModel());
//
// }
//}

View File

@ -21,9 +21,9 @@ public class ProtoDocumentBuilderTest extends AbstractProtoPaceTest {
final String id = "12345";
final Config config = getResultFullConf();
final Config config = getOrganizationTestConf();
final MapDocument document = ProtoDocumentBuilder.newInstance(id, getResult(id), config.model());
final MapDocument document = ProtoDocumentBuilder.newInstance(id, getOrganization(id), config.model());
assertFalse(document.fieldNames().isEmpty());
assertFalse(Iterables.isEmpty(document.fields()));

View File

@ -1,121 +0,0 @@
{
"dateoftransformation": "2018-08-07T06:48:42.668Z",
"originalId": [
"oai:rua.ua.es:10045/34236"
],
"oaiprovenance": {
"originDescription": {
"metadataNamespace": "http://www.openarchives.org/OAI/2.0/oai_dc/",
"altered": true,
"baseURL": "http://rua.ua.es/dspace-oai/request",
"datestamp": "2016-04-28T11:28:35Z",
"harvestDate": "2018-06-14T13:53:42.185Z",
"identifier": "oai:rua.ua.es:10045/34236"
}
},
"result": {
"instance": [
{
"hostedby": {
"value": "Repositorio Institucional de la Universidad de Alicante",
"key": "10|opendoar____::e820a45f1dfc7b95282d10b6087e11c0"
},
"url": [
"http://hdl.handle.net/10045/34236"
],
"dateofacceptance": {
"value": "2013-11-27"
},
"collectedfrom": {
"value": "Repositorio Institucional de la Universidad de Alicante",
"key": "10|opendoar____::e820a45f1dfc7b95282d10b6087e11c0"
},
"accessright": {
"classid": "OPEN",
"classname": "Open Access",
"schemename": "dnet:access_modes",
"schemeid": "dnet:access_modes"
},
"instancetype": {
"classid": "0010",
"classname": "Lecture",
"schemename": "dnet:publication_resource",
"schemeid": "dnet:publication_resource"
}
}
],
"metadata": {
"language": {
"classid": "eng",
"classname": "English",
"schemename": "dnet:languages",
"schemeid": "dnet:languages"
},
"title": [
{
"qualifier": {
"classid": "main title",
"classname": "main title",
"schemename": "dnet:dataCite_title",
"schemeid": "dnet:dataCite_title"
},
"value": "Henry James (1843-1916)"
}
],
"journal": {
"name": ""
},
"author": [
{
"fullname": "Gómez Reus, Teresa",
"surname": "Gómez Reus",
"name": "Teresa",
"rank": 1
}
],
"resulttype": {
"classid": "other",
"classname": "other",
"schemename": "dnet:result_typologies",
"schemeid": "dnet:result_typologies"
},
"dateofacceptance": {
"value": "2013-11-27"
},
"contributor": [
{
"value": "Universidad de Alicante. Departamento de Filología Inglesa"
}
],
"subject": [
{
"qualifier": {
"classid": "keyword",
"classname": "keyword",
"schemename": "dnet:result_subject",
"schemeid": "dnet:result_subject"
},
"value": "James, Henry"
},
{
"qualifier": {
"classid": "keyword",
"classname": "keyword",
"schemename": "dnet:result_subject",
"schemeid": "dnet:result_subject"
},
"value": "Filología Inglesa"
}
]
}
},
"collectedfrom": [
{
"value": "Repositorio Institucional de la Universidad de Alicante",
"key": "10|opendoar____::e820a45f1dfc7b95282d10b6087e11c0"
}
],
"dateofcollection": "2018-06-14T13:53:42.185Z",
"type": 50,
"id": "50|od_______935::2b908ad38030168759c568f49af50784"
}

View File

@ -16,24 +16,24 @@
],
"necessaryConditions": [],
"decisionTree": {
"start": {"fieldsCount": [{"field":"pubID", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"NO_MATCH", "negative":"layer2", "undefined": "layer2", "ignoreMissing": "false"},
"layer2": {"fieldsCount": [{"field":"orcid", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"ORCID_MATCH", "negative":"NO_MATCH", "undefined": "layer3", "ignoreMissing": "false"},
"layer3": {"fieldsCount": [{"field":"firstname", "comparator":"similar", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"layer4", "negative":"NO_MATCH", "undefined": "layer4", "ignoreMissing": "false"},
"layer4": {"fieldsCount": [{"field":"coauthors", "comparator":"coauthorsMatch", "weight":1.0, "params":{"minCoauthors":6, "maxCoauthors": 200}}], "threshold":5.0, "aggregation": "SUM", "positive":"COAUTHORS_MATCH", "negative":"NO_MATCH", "undefined": "layer5", "ignoreMissing": "false"},
"layer5": {"fieldsCount": [{"field":"area", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"layer6", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"},
"layer6": {"fieldsCount": [{"field":"topics", "comparator":"topicsMatch", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"TOPICS_MATCH", "negative":"NO_MATCH", "undefined": "NO_MATCH", "ignoreMissing": "false"}
"start": {"fieldsCount": [{"field":"pubID", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"NO_MATCH", "negative":"layer2", "undefined": "layer2", "countIfUndefined": "false"},
"layer2": {"fieldsCount": [{"field":"orcid", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"ORCID_MATCH", "negative":"NO_MATCH", "undefined": "layer3", "countIfUndefined": "false"},
"layer3": {"fieldsCount": [{"field":"firstname", "comparator":"similar", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"layer4", "negative":"NO_MATCH", "undefined": "layer4", "countIfUndefined": "false"},
"layer4": {"fieldsCount": [{"field":"coauthors", "comparator":"coauthorsMatch", "weight":1.0, "params":{"minCoauthors":6, "maxCoauthors": 200}}], "threshold":5.0, "aggregation": "SUM", "positive":"COAUTHORS_MATCH", "negative":"NO_MATCH", "undefined": "layer5", "countIfUndefined": "false"},
"layer5": {"fieldsCount": [{"field":"area", "comparator":"exactMatch", "weight":1.0, "params":{}}], "threshold":1.0, "aggregation": "SUM", "positive":"layer6", "negative":"NO_MATCH", "undefined": "NO_MATCH", "countIfUndefined": "false"},
"layer6": {"fieldsCount": [{"field":"topics", "comparator":"topicsMatch", "weight":1.0, "params":{}}], "threshold":0.7, "aggregation": "SUM", "positive":"TOPICS_MATCH", "negative":"NO_MATCH", "undefined": "NO_MATCH", "countIfUndefined": "false"}
},
"model": [
{"name": "fullname", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/fullname"},
{"name": "firstname", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/firstname"},
{"name": "lastname", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/lastname"},
{"name": "coauthors", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/coauthors"},
{"name": "orcid", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/orcid"},
{"name": "topics", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/topics"},
{"name": "pubID", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/pubID"},
{"name": "pubDOI", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/pubDOI"},
{"name": "rank", "algo": "Null", "type": "Int", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/rank"},
{"name": "area", "algo": "Null", "type": "String", "weight": "0", "ignoreMissing": "false", "path": "person/metadata/area"}
{"name": "fullname", "algo": "Null", "type": "String", "weight": "0", "countIfUndefined": "false", "path": "person/metadata/fullname"},
{"name": "firstname", "algo": "Null", "type": "String", "weight": "0", "countIfUndefined": "false", "path": "person/metadata/firstname"},
{"name": "lastname", "algo": "Null", "type": "String", "weight": "0", "countIfUndefined": "false", "path": "person/metadata/lastname"},
{"name": "coauthors", "algo": "Null", "type": "String", "weight": "0", "countIfUndefined": "false", "path": "person/metadata/coauthors"},
{"name": "orcid", "algo": "Null", "type": "String", "weight": "0", "countIfUndefined": "false", "path": "person/metadata/orcid"},
{"name": "topics", "algo": "Null", "type": "String", "weight": "0", "countIfUndefined": "false", "path": "person/metadata/topics"},
{"name": "pubID", "algo": "Null", "type": "String", "weight": "0", "countIfUndefined": "false", "path": "person/metadata/pubID"},
{"name": "pubDOI", "algo": "Null", "type": "String", "weight": "0", "countIfUndefined": "false", "path": "person/metadata/pubDOI"},
{"name": "rank", "algo": "Null", "type": "Int", "weight": "0", "countIfUndefined": "false", "path": "person/metadata/rank"},
{"name": "area", "algo": "Null", "type": "String", "weight": "0", "countIfUndefined": "false", "path": "person/metadata/area"}
],
"blacklists": {}
}

View File

@ -8,46 +8,98 @@
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true"
"includeChildren" : "true",
"maxIterations": "20"
},
"pace" : {
"clustering" : [
<<<<<<< HEAD
{ "name" : "sortedngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } }
=======
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
>>>>>>> origin/master
],
"sufficientConditions" : [
{ "name" : "exactMatch", "fieldsCount" : [ "gridid" ] }
],
<<<<<<< HEAD
"necessaryConditions" : [
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] },
{ "name" : "DomainExactMatch", "fieldsCount" : [ "websiteurl" ] }
=======
"conditions" : [
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] },
{ "name" : "exactMatch", "fields" : [ "country" ] }
>>>>>>> origin/master
],
"decisionTree" : {
"start": {
"fields": [
{
"field": "gridid",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1,
"aggregation": "SC",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "layer2",
"ignoreUndefined": "false"
},
"layer2": {
"fields": [
{
"field": "websiteurl",
"comparator": "domainExactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
},
{
"field": "country",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1,
"aggregation": "NC",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"layer3": {
"fields": [
{
"field": "legalname",
"comparator": "jaroWinklerNormalizedName",
"weight": 0.9,
"countIfUndefined": "true",
"params": {
"windowSize": 4,
"threshold": 0.7
}
},
{
"field": "legalshortname",
"comparator": "jaroWinklerNormalizedName",
"weight": 0.1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.9,
"aggregation": "W_MEAN",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},
"model" : [
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" },
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} },
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
{ "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"},
{ "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" },
{ "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"}
],
"blacklists" : {
"legalname" : []
},
"synonyms": {
"key::1": ["university","università","universita","università studi","universita studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],
"key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
@ -97,58 +149,58 @@
"key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
"key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
"key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
"key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik"],
"key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri"],
"key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus"],
"key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia"],
"key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik"],
"key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych"],
"key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne"],
"key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna"],
"key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri"],
"key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline"],
"key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu"],
"key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu"],
"key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid"],
"key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus"],
"key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi"],
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia"],
"key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus"],
"key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik"],
"key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline"],
"key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria"],
"key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia"],
"key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek"],
"key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia"],
"key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa"],
"key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline"],
"key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika"],
"key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus"],
"key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus"],
"key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi"],
"key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia"],
"key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline"],
"key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti"],
"key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline"],
"key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon"],
"key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus"],
"key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos"],
"key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia"],
"key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur"],
"key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika"],
"key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel"],
"key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused"],
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud"],
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria"],
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika"],
"key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika"],
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria"],
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia"],
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-"],
"key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia"],
"key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia"],
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia"],
"key::102": ["informatics","informatica","informática","informática","informatica"],
"key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""],
"key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""],
"key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""],
"key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""],
"key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""],
"key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""],
"key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""],
"key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""],
"key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""],
"key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""],
"key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""],
"key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""],
"key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""],
"key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""],
"key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""],
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""],
"key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""],
"key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""],
"key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""],
"key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""],
"key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""],
"key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""],
"key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""],
"key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""],
"key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline",""],
"key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""],
"key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""],
"key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""],
"key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""],
"key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""],
"key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""],
"key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""],
"key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""],
"key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""],
"key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""],
"key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""],
"key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""],
"key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""],
"key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""],
"key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""],
"key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""],
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
"key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
"key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""],
"key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""],
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""],
"key::102": ["informatics","informatica","informática","informática","informatica",""],
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"]
}

View File

@ -0,0 +1,208 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "organization",
"orderField" : "legalname",
"queueMaxSize" : "2000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true",
"maxIterations": "20"
},
"pace" : {
"clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
],
"decisionTree" : {
"start": {
"fields": [
{
"field": "gridid",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1,
"aggregation": "SC",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "layer2",
"ignoreUndefined": "false"
},
"layer2": {
"fields": [
{
"field": "websiteurl",
"comparator": "domainExactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
},
{
"field": "country",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "true",
"params": {}
}
],
"threshold": 1,
"aggregation": "NC",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"layer3": {
"fields": [
{
"field": "legalname",
"comparator": "jaroWinklerNormalizedName",
"weight": 0.9,
"countIfUndefined": "true",
"params": {
"windowSize": 4,
"threshold": 0.7
}
},
{
"field": "legalshortname",
"comparator": "jaroWinklerNormalizedName",
"weight": 0.1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.9,
"aggregation": "W_MEAN",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},
"model" : [
{ "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"},
{ "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" },
{ "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"}
],
"blacklists" : {
"legalname" : []
},
"synonyms": {
"key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
"key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],
"key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],
"key::7": ["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"],
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
"key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
"key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
"key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
"key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],
"key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],
"key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],
"key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],
"key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],
"key::19": ["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"],
"key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],
"key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],
"key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],
"key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],
"key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],
"key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],
"key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],
"key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],
"key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],
"key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],
"key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],
"key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],
"key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"],
"key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],
"key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],
"key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],
"key::36": ["authority","autorità","autorité","авторитет","autoriteit"],
"key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],
"key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],
"key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"],
"key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],
"key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],
"key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],
"key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],
"key::44": ["academic","accademico","académique","universitaire","акадеческий academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],
"key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],
"key::46": ["division","divisione","division","отделение","divisie","τμήμα"],
"key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"],
"key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
"key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
"key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
"key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""],
"key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""],
"key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""],
"key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""],
"key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""],
"key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""],
"key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""],
"key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""],
"key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""],
"key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""],
"key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""],
"key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""],
"key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""],
"key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""],
"key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""],
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""],
"key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""],
"key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""],
"key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""],
"key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""],
"key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""],
"key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""],
"key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""],
"key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""],
"key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline",""],
"key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""],
"key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""],
"key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""],
"key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""],
"key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""],
"key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""],
"key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""],
"key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""],
"key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""],
"key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""],
"key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""],
"key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""],
"key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""],
"key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""],
"key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""],
"key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""],
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
"key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
"key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""],
"key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""],
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""],
"key::102": ["informatics","informatica","informática","informática","informatica",""],
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"]
}
}
}

View File

@ -8,42 +8,33 @@
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true"
"includeChildren" : "true",
"maxIterations": "20"
},
"pace" : {
"clustering" : [
<<<<<<< HEAD
{ "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : "1", "len" : "3" } }
],
"necessaryConditions" : [
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] }
=======
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
],
"strictConditions" : [
{ "name" : "exactMatch", "fields" : [ "gridid" ] }
],
"conditions" : [
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] },
{ "name" : "exactMatch", "fields" : [ "country" ] }
>>>>>>> origin/master
],
"decisionTree" : {
"start": {"fields": [{"field":"gridid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"},
"layer2": {"fields": [{"field":"websiteurl", "comparator":"domainExactMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"country", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"},
"layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"}
},
"model" : [
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" },
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} },
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
{ "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"},
{ "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" },
{ "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"}
],
"blacklists" : {
"legalname" : []
"legalname" : ["University of Turin"]
},
"synonyms": {
"key::1": ["university","università","universita","università studi","universita studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],
"key::1": ["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
@ -93,58 +84,58 @@
"key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
"key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
"key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
"key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik"],
"key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri"],
"key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus"],
"key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia"],
"key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik"],
"key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych"],
"key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne"],
"key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna"],
"key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri"],
"key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline"],
"key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu"],
"key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu"],
"key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid"],
"key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus"],
"key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi"],
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia"],
"key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus"],
"key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik"],
"key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline"],
"key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria"],
"key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia"],
"key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek"],
"key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia"],
"key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa"],
"key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline"],
"key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika"],
"key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus"],
"key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus"],
"key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi"],
"key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia"],
"key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline"],
"key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti"],
"key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline"],
"key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon"],
"key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus"],
"key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos"],
"key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia"],
"key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur"],
"key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika"],
"key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel"],
"key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused"],
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud"],
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria"],
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika"],
"key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika"],
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria"],
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia"],
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-"],
"key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia"],
"key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia"],
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia"],
"key::102": ["informatics","informatica","informática","informática","informatica"],
"key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""],
"key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""],
"key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""],
"key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""],
"key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""],
"key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""],
"key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""],
"key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""],
"key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""],
"key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""],
"key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""],
"key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""],
"key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""],
"key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""],
"key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""],
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""],
"key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""],
"key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""],
"key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""],
"key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärztlich","veterinair","veeartsenijkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""],
"key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""],
"key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""],
"key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""],
"key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""],
"key::75": ["theological","teologia","teologico","teológico","teológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","teológiai","teološki","teoloogia","usuteadus","teoloogiline",""],
"key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""],
"key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""],
"key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""],
"key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""],
"key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""],
"key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""],
"key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""],
"key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""],
"key::84": ["communication","comunicazione","comunicación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""],
"key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""],
"key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""],
"key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δερματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""],
"key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""],
"key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""],
"key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""],
"key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""],
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
"key::95": ["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszichiátria","psihiatrija","psühhiaatria",""],
"key::97": ["psychology","psicologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszichológia","psihologija","psühholoogia",""],
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
"key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""],
"key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""],
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""],
"key::102": ["informatics","informatica","informática","informática","informatica",""],
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"]
}

View File

@ -27,11 +27,11 @@
{ "name" : "exactMatch", "fieldsCount" : ["resulttype"] }
],
"model" : [
{ "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },
{ "name" : "title", "algo" : "LevensteinTitleIgnoreVersion", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value", "length" : 250, "size" : 5 },
{ "name" : "url", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/instance/url" },
{ "name" : "resulttype", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "false", "path" : "result/metadata/resulttype/classid" },
{ "name" : "documentationUrl", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "false", "path" : "result/metadata/documentationUrl/value" }
{ "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "countIfUndefined" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },
{ "name" : "title", "algo" : "LevensteinTitleIgnoreVersion", "type" : "String", "weight" : "1.0", "countIfUndefined" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value", "length" : 250, "size" : 5 },
{ "name" : "url", "algo" : "Null", "type" : "String", "weight" : "0.0", "countIfUndefined" : "true", "path" : "result/instance/url" },
{ "name" : "resulttype", "algo" : "Null", "type" : "String", "weight" : "0.0", "countIfUndefined" : "false", "path" : "result/metadata/resulttype/classid" },
{ "name" : "documentationUrl", "algo" : "Null", "type" : "String", "weight" : "0.0", "countIfUndefined" : "false", "path" : "result/metadata/documentationUrl/value" }
],
"blacklists" : {

View File

@ -1,78 +0,0 @@
{
"pid": [
{
"qualifier": {
"classid": "doi",
"classname": "doi",
"schemename": "dnet:pid_types",
"schemeid": "dnet:pid_types"
},
"value": "10.1002/9781444393675.ch6"
}
],
"result": {
"instance": [
{
"url": [
"http://dx.doi.org/10.1002/9781444393675.ch6"
],
"collectedfrom": {
"value": "CrossRef",
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"
},
"hostedby": {
"value": "Unknown Repository",
"key": "10|openaire____::55045bd2a65019fd8e6741a755395c8c"
},
"accessright": {
"classid": "CLOSED",
"classname": "Closed Access",
"schemename": "dnet:access_modes",
"schemeid": "dnet:access_modes"
},
"instancetype": {
"classid": "0013",
"classname": "Part of book or chapter of book",
"schemename": "dnet:publication_resource",
"schemeid": "dnet:publication_resource"
}
}
],
"metadata": {
"title": [
{
"qualifier": {
"classid": "main title",
"classname": "main title",
"schemename": "dnet:dataCite_title",
"schemeid": "dnet:dataCite_title"
},
"value": "Henry James (1843-1916)"
}
],
"resulttype": {
"classid": "publication",
"classname": "publication",
"schemename": "dnet:result_typologies",
"schemeid": "dnet:result_typologies"
}
}
},
"collectedfrom": [
{
"value": "Microsoft Academic Graph",
"key": "10|openaire____::5f532a3fc4f1ea403f37070f59a7a53a"
},
{
"value": "CrossRef",
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"
},
{
"value": "UnpayWall",
"key": "10|openaire____::8ac8380272269217cb09a928c8caa993"
}
],
"dateofcollection": "2018-08-07 12:24:48Z",
"type": 50,
"id": "50|crossref____::0000002a9885b7ec89b7b9d8ff3331a0"
}

View File

@ -0,0 +1,45 @@
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000047537"],"pid":[{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.456952.9"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"SBPz"},"websiteurl":{"value":"http://www.sbpz.org.br/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Brazilian Society of Protozoology"},{"value":"Sociedade Brasileira de Protozoologia"}],"legalname":{"value":"Brazilian Society of Protozoology"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::4e6bbbff9bb96265728ee81d0b760a6a"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000061195"],"pid":[{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"501100002363"},{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.470801.a"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"SBI"},"websiteurl":{"value":"http://www.infectologia.org.br/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Sociedade Brasileira Infectologia"}],"legalname":{"value":"Sociedade Brasileira Infectologia"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::147a341962a63d5202a380e6dd543423"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Reumatologia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Reumatologia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Reumatologia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::0019ba7a22c5bc733c3206bde28ff568"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000063373"],"pid":[{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.472996.7"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"SBEM"},"websiteurl":{"value":"http://www.sbemdf.com/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Sociedade Brasileira de Educação Matemática"}],"legalname":{"value":"Sociedade Brasileira de Educação Matemática"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::112eda27634b246dfc673db9ee2370e0"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000047456"],"pid":[{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.456871.9"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"Sociedade Brasileira de Catálise"},"websiteurl":{"value":"http://www.sbcat.org/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"SBCat"},{"value":"Sociedade Brasileira de Catálise"}],"legalname":{"value":"Sociedade Brasileira de Catálise"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::118cd54c6764d090109192e07e8187f7"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Física"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Física"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Física"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::0aa0dbb4e641521d10e3d93ab335b7af"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Urologia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Urologia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Urologia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::1f0b63554653f1eefc97f17ebe5e8c2b"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Oftalmologia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Oftalmologia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Oftalmologia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::2211118adf702ff5b7080a86b998d5d4"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Química"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Química"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Química"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::26485473d67ce2365939195c8182eca8"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Zootecnia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Zootecnia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Zootecnia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::2c0d0814ec3a30dedfe012f7cbd719b7"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Farmacognosia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Farmacognosia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Farmacognosia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::42327426aab0ba4234a1d5d742cd30fc"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Pneumologia_e_Tisiologia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Pneumologia e Tisiologia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Pneumologia e Tisiologia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::43d3cfd26c4136750bd75d184971a358"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Nefrologia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Nefrologia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Nefrologia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::4c6426d05e90a3ecb65ab861dc559464"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Computação"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Computação"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Computação"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::4ed8701d66ab67fc9914eef54c602625"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Virologia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Virologia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Virologia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::57d056348d91c00a6b5b3384b960c02b"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Fonoaudiologia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Fonoaudiologia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Fonoaudiologia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::5abc3db7f05732b9fb034510fa00e01f"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Ciência_do_Solo"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Ciência do Solo"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Ciência do Solo"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::699ed9ecc727c90e9321e43e495e03ee"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000047474"],"pid":[{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.456889.e"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"SEB"},"websiteurl":{"value":"http://www.seb.org.br/english"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Entomological Society of Brazil"},{"value":"Sociedade Brasileira de Entomologia"}],"legalname":{"value":"Entomological Society of Brazil"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::e5afffc163e999604fb7d4b783e2356b"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000047480"],"pid":[{"qualifier":{"classid":"ISNI","classname":"ISNI","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"0000 0001 2194 1537"},{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.456895.3"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"SBF"},"websiteurl":{"value":"http://www.sbfisica.org.br/v1/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Sociedade Brasileira de Física"}],"legalname":{"value":"Sociedade Brasileira de Física"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::e309c5e37b7891444a464725b67258b8"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_para_o_Estudo_da_Dor"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira para o Estudo da Dor"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira para o Estudo da Dor"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::84ba1a581f8294bc89c748a8f5b443fd"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000063369"],"pid":[{"qualifier":{"classid":"ISNI","classname":"ISNI","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"0000 0000 9730 9282"},{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.472992.3"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"SBC"},"websiteurl":{"value":"http://www.sbc.org.br/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Brazilian Computer Society"},{"value":"Sociedade Brasileira de Computação"}],"legalname":{"value":"Brazilian Computer Society"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::de81d18b690dc0e08dd69f87ced49482"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000047421"],"pid":[{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.456835.9"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"SBPR"},"websiteurl":{"value":"http://www.sbpr.org.br/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Brazilian Radiation Protection Society"},{"value":"Sociedade Brasileira de Proteção Radiológica"}],"legalname":{"value":"Brazilian Radiation Protection Society"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::de18fb186ecc5ad820c65e5c22ac301d"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Anestesiologia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Anestesiologia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Anestesiologia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::9b00d9a80bae6c64921e37a4ee1c2dcd"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Meteorologia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Meteorologia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Meteorologia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::9b23dc9de16b3d1b1d8c457ac9d879aa"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Conselho_Brasileiro_de_Oftalmologia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Conselho Brasileiro de Oftalmologia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Conselho Brasileiro de Oftalmologia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::a8515c8050b9aa32f27df728f647a3a1"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Sociologia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Sociologia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Sociologia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::a9b4f7f1ff369a3b72aca05df024daae"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Matemática"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Matemática"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Matemática"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::ab799243ddcda20d45158e39d6644675"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Ictiologia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Ictiologia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Ictiologia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::af40a2664d0c65c5b9f4f9c3d8b7a361"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Cartografia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Cartografia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Cartografia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::b20fe3c74f4ce1ea9fe51b7430904641"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000047451"],"pid":[{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.456866.f"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"SBBC"},"websiteurl":{"value":"http://www.sbbc.org.br/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Sociedade Brasileira de Biologia Celular"}],"legalname":{"value":"Sociedade Brasileira de Biologia Celular"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::131a553780dbaac8a5a9d2ed72184a4e"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Carcinologia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Carcinologia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Carcinologia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::df7f2b8fbbc54d72427c126180282b5c"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Educação_Matemática"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Educação Matemática"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Educação Matemática"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::ec10c30a33588ad4884e042a4ea76a4a"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Genética"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Genética"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Genética"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::eeccef7ac84b2b6cb9228b9871306e6b"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Nematologia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Nematologia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Nematologia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::ef00ed15b58761c0e9dd60a28b361d6c"}
{"dateoftransformation":"2018-09-19","originalId":["doajarticles::Sociedade_Brasileira_de_Zoologia"],"collectedfrom":[{"value":"DOAJ-Articles","key":"10|driver______::bee53aa31dc2cbb538c10c2b65fa5824"}],"organization":{"metadata":{"eclegalbody":{"value":"false"},"eclegalperson":{"value":"false"},"ecinternationalorganization":{"value":"false"},"legalshortname":{"value":"Sociedade Brasileira de Zoologia"},"ecresearchorganization":{"value":"false"},"ecnonprofit":{"value":"false"},"ecenterprise":{"value":"false"},"ecnutscode":{"value":"false"},"ecinternationalorganizationeurinterests":{"value":"false"},"legalname":{"value":"Sociedade Brasileira de Zoologia"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"echighereducation":{"value":"false"},"ecsmevalidated":{"value":"false"}}},"dateofcollection":"2018-09-19","type":20,"id":"20|doajarticles::fbb36fba9e188642c6e0fe2edf92fcab"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000048938"],"pid":[{"qualifier":{"classid":"ISNI","classname":"ISNI","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"0000 0004 0370 1590"},{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.458384.6"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"Sociedade Brasileira de Diabetes"},"websiteurl":{"value":"http://www.diabetes2015.com.br/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Sociedade Brasileira de Diabetes"}],"legalname":{"value":"Sociedade Brasileira de Diabetes"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::c2f008bfb4ecb4f247d4dd58bf7b0b4a"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000047463"],"pid":[{"qualifier":{"classid":"ISNI","classname":"ISNI","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"0000 0000 9597 369X"},{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.456878.0"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"SBE"},"websiteurl":{"value":"http://www.sbe.org.br/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Sociedade Brasileira de Econometria"}],"legalname":{"value":"Sociedade Brasileira de Econometria"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::b0f3e2d9fb020fb2fb5337e000b60950"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000080000"],"pid":[{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.489804.9"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"SOBED"},"websiteurl":{"value":"http://www.sobed.org.br/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Sociedade Brasileira de Endoscopia Digestiva"}],"legalname":{"value":"Sociedade Brasileira de Endoscopia Digestiva"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::a9e8946464a21a7d541c8cb144ac582c"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000063384"],"pid":[{"qualifier":{"classid":"ISNI","classname":"ISNI","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"0000 0000 8647 9462"},{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.473008.8"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"SBGf"},"websiteurl":{"value":"http://sbgfisica.org/portal/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Sociedade Brasileira de Geofísica"}],"legalname":{"value":"Sociedade Brasileira de Geofísica"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::94d58c101e635f4e6453e46447281b5f"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000061191"],"pid":[{"qualifier":{"classid":"ISNI","classname":"ISNI","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"0000 0001 0222 4495"},{"qualifier":{"classid":"FundRef","classname":"FundRef","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"501100002362"},{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.470798.5"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"SBA"},"websiteurl":{"value":"http://www.sba.com.br/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Sociedade Brasileira de Anestesiologia"}],"legalname":{"value":"Sociedade Brasileira de Anestesiologia"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::81e37135a2dfd2a93f8bce3801105e36"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000047481"],"pid":[{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.456896.0"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"SBF"},"websiteurl":{"value":"http://www.sbfa.org.br/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Brazilian Society of Speech"},{"value":"Sociedade Brasileira de Fonoaudiologia"}],"legalname":{"value":"Brazilian Society of Speech"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::80b06f7983db1f27beaf68b029e42dc2"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000063393"],"pid":[{"qualifier":{"classid":"ISNI","classname":"ISNI","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"0000 0000 9117 1497"},{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.473017.6"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"SBC"},"websiteurl":{"value":"http://www.cardiol.br/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Sociedade Brasileira de Cardiologia"}],"legalname":{"value":"Sociedade Brasileira de Cardiologia"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::797007b2c21cdc82c37bd264ce64831a"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000047543"],"pid":[{"qualifier":{"classid":"ISNI","classname":"ISNI","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"0000 0001 0941 9199"},{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.456958.3"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"SBQ"},"websiteurl":{"value":"http://www.sbq.org.br/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Sociedade Brasileira de Química"}],"legalname":{"value":"Sociedade Brasileira de Química"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::44f57e7e51c863a35896c4ff6f869c4e"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000047491"],"pid":[{"qualifier":{"classid":"ISNI","classname":"ISNI","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"0000 0000 9175 5388"},{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.456906.8"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"SBG"},"websiteurl":{"value":"http://www.sbg.org.br/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Sociedade Brasileira de Genética"}],"legalname":{"value":"Sociedade Brasileira de Genética"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::6e3e6400575199feb050c0e990fbb972"}
{"dateoftransformation":"2019-10-14 08:59:35.295767","originalId":["openorgs____::0000047545"],"pid":[{"qualifier":{"classid":"grid.ac","classname":"grid.ac","schemename":"dnet:pid_types","schemeid":"dnet:pid_types"},"value":"grid.456960.8"}],"collectedfrom":[{"value":"OpenOrgs Database","key":"10|openaire____::0362fcdb3076765d9c0041ad331553e8"}],"organization":{"metadata":{"legalshortname":{"value":"SBTOx"},"websiteurl":{"value":"http://www.sbtox.org.br/"},"country":{"classid":"BR","classname":"Brazil","schemename":"dnet:countries","schemeid":"dnet:countries"},"alternativeNames":[{"value":"Sociedade Brasileira de Toxicologia"}],"legalname":{"value":"Sociedade Brasileira de Toxicologia"}}},"dateofcollection":"","type":20,"id":"20|openorgs____::6e030efb3783fa52ef2f58dec07dd12c"}

View File

@ -1,34 +0,0 @@
{
"wf" : {
"threshold" : "0.85",
"dedupRun" : "001",
"entityType" : "organization",
"orderField" : "legalname",
"queueMaxSize" : "20000",
"groupMaxSize" : "20",
"slidingWindowSize" : "400",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "ngramLen" : "3" } },
{ "name" : "suffixprefix", "fieldsCount" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "immutablefieldvalue", "fieldsCount" : [ "country" ], "params" : { } },
{ "name" : "spacetrimmingfieldvalue", "fieldsCount" : [ "legalshortname" ], "params" : { "randomLength" : "5" } },
{ "name" : "urlclustering", "fieldsCount" : [ "websiteurl" ], "params" : { } }
],
"necessaryConditions" : [
{ "name" : "exactMatch", "fieldsCount" : [ "country" ] },
{ "name" : "mustBeDifferent", "fieldsCount" : [ "gridid" ] }
],
"model" : [
{ "name" : "legalname", "algo" : "LevensteinTitle", "type" : "String", "weight" : "0.2", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value" },
{ "name" : "legalshortname", "algo" : "LevensteinTitle", "type" : "String", "weight" : "0.2", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
{ "name" : "websiteurl", "algo" : "urlMatcher", "type" : "URL", "weight" : "0.6", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "organization/metadata/country/classid" },
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
],
"blacklists" : { }
}
}

File diff suppressed because it is too large Load Diff

View File

@ -1,26 +0,0 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result" ],
"includeChildren" : "true"
},
"pace" : {
"necessaryConditions" : [
{ "name" : "sizeMatch", "fieldsCount" : [ "authors" ] },
{ "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] }
],
"model" : [
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "0.5", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "authors", "algo" : "SortedLevel2JaroWinkler", "type" : "String", "weight" : "0.5", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" }
],
"blacklists" : { },
"synonyms" : { }
}
}

View File

@ -1,53 +0,0 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "acronyms", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
{ "name" : "ngrampairs", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }
],
"necessaryConditions" : [
{ "name" : "yearMatch", "fieldsCount" : [ "dateofacceptance" ] },
{ "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] },
{ "name" : "sizeMatch", "fieldsCount" : [ "authors" ] } ,
{ "name" : "pidMatch", "fieldsCount" : [ "pid" ] }
],
"model" : [
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } ,
{ "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" }
],
"blacklists" : {
"title" : [
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*"
] },
"synonyms" : {
}
}
}

View File

@ -1,30 +0,0 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result" ],
"includeChildren" : "true"
},
"pace" : {
"sufficientConditions" : [
{ "name" : "pidMatch", "fieldsCount" : [ "pid" ] }
],
"necessaryConditions" : [
{ "name" : "yearMatch", "fieldsCount" : [ "dateofacceptance" ] },
{ "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] }
],
"model" : [
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
{ "name" : "title", "algo" : "Level2Levenstein", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" }
],
"blacklists" : { },
"synonyms" : { }
}
}

View File

@ -1,273 +0,0 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "4000",
"groupMaxSize" : "40",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result", "personResult_authorship_hasAuthor", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments" ],
"includeChildren" : "true",
"maxChildren" : "40"
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fieldsCount" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fieldsCount" : [ "doi" ], "params" : { } }
],
"sufficientConditions" : [
{ "name" : "pidMatch", "fieldsCount" : [ "pid" ] }
],
"necessaryConditions" : [
{ "name" : "titleVersionMatch", "fieldsCount" : [ "title" ] },
{ "name" : "sizeMatch", "fieldsCount" : [ "authors" ] }
],
"model" : [
{ "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
{ "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" }
],
"blacklists" : {
"title" : [
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
"^Problems with perinatal pathology\\.?$",
"(?i)^Cases? of Puerperal Convulsions$",
"(?i)^Operative Gyna?ecology$",
"(?i)^Mind the gap\\!?\\:?$",
"^Chronic fatigue syndrome\\.?$",
"^Cartas? ao editor Letters? to the Editor$",
"^Note from the Editor$",
"^Anesthesia Abstract$",
"^Annual report$",
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
"(?i)^Graph and Table of Infectious Diseases?$",
"^Presentation$",
"(?i)^Reviews and Information on Publications$",
"(?i)^PUBLIC HEALTH SERVICES?$",
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
"(?i)^Adrese autora$",
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
"(?i)^Acknowledgement to Referees$",
"(?i)^Behçet's disease\\.?$",
"(?i)^Isolation and identification of restriction endonuclease.*$",
"(?i)^CEREBROVASCULAR DISEASES?.?$",
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
"^Event management$",
"(?i)^Breakfast and Crohn's disease.*\\.?$",
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
"^Gushi hakubutsugaku$",
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
"^Intestinal spirocha?etosis$",
"^Treatment of Rodent Ulcer$",
"(?i)^\\W*Cloud Computing\\W*$",
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
"^Free Communications, Poster Presentations: Session [A-F]$",
"^“The Historical Aspects? of Quackery\\.?”$",
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
"(?i)^Case Report$",
"^Boletín Informativo$",
"(?i)^Glioblastoma Multiforme$",
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
"^Zaměstnanecké výhody$",
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
"(?i)^Carotid body tumours?\\.?$",
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
"^Avant-propos$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
"^Viñetas de Cortázar$",
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
"^Aus der AGMB$",
"^Znanstveno-stručni prilozi$",
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
"^Finanční analýza podniku$",
"^Financial analysis( of business)?$",
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
"^Jikken nihon shūshinsho$",
"(?i)^CORONER('|s)(s|') INQUESTS$",
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
"(?i)^Consultants' contract(s)?$",
"(?i)^Upute autorima$",
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
"^Joshi shin kokubun$",
"^Kōtō shōgaku dokuhon nōson'yō$",
"^Jinjō shōgaku shōka$",
"^Shōgaku shūjichō$",
"^Nihon joshi dokuhon$",
"^Joshi shin dokuhon$",
"^Chūtō kanbun dokuhon$",
"^Wabun dokuhon$",
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
"(?i)^cardiac rehabilitation$",
"(?i)^Analytical summary$",
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
"^Prikazi i osvrti$",
"^Rodinný dům s provozovnou$",
"^Family house with an establishment$",
"^Shinsei chūtō shin kokugun$",
"^Pulmonary alveolar proteinosis(\\.?)$",
"^Shinshū kanbun$",
"^Viñeta(s?) de Rodríguez$",
"(?i)^RUBRIKA UREDNIKA$",
"^A Matching Model of the Academic Publication Market$",
"^Yōgaku kōyō$",
"^Internetový marketing$",
"^Internet marketing$",
"^Chūtō kokugo dokuhon$",
"^Kokugo dokuhon$",
"^Antibiotic Cover for Dental Extraction(s?)$",
"^Strategie podniku$",
"^Strategy of an Enterprise$",
"(?i)^respiratory disease(s?)(\\.?)$",
"^Award(s?) for Gallantry in Civil Defence$",
"^Podniková kultura$",
"^Corporate Culture$",
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
"^Pracovní motivace$",
"^Work Motivation$",
"^Kaitei kōtō jogaku dokuhon$",
"^Konsolidovaná účetní závěrka$",
"^Consolidated Financial Statements$",
"(?i)^intracranial tumour(s?)$",
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
"^The level of motivation process as a leadership$",
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
"(?i)^news and events$",
"(?i)^NOVOSTI I DOGAĐAJI$",
"^Sansū no gakushū$",
"^Posouzení informačního systému firmy a návrh změn$",
"^Information System Assessment and Proposal for ICT Modification$",
"^Stresové zatížení pracovníků ve vybrané profesi$",
"^Stress load in a specific job$",
"^Sunday: Poster Sessions, Pt.*$",
"^Monday: Poster Sessions, Pt.*$",
"^Wednesday: Poster Sessions, Pt.*",
"^Tuesday: Poster Sessions, Pt.*$",
"^Analýza reklamy$",
"^Analysis of advertising$",
"^Shōgaku shūshinsho$",
"^Shōgaku sansū$",
"^Shintei joshi kokubun$",
"^Taishō joshi kokubun dokuhon$",
"^Joshi kokubun$",
"^Účetní uzávěrka a účetní závěrka v ČR$",
"(?i)^The \"?Causes\"? of Cancer$",
"^Normas para la publicación de artículos$",
"^Editor('|s)(s|') [Rr]eply$",
"^Editor(|s)(s|) letter$",
"^Redaktoriaus žodis$",
"^DISCUSSION ON THE PRECEDING PAPER$",
"^Kōtō shōgaku shūshinsho jidōyō$",
"^Shōgaku nihon rekishi$",
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
"^Préface$",
"^Occupational [Hh]ealth [Ss]ervices.$",
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
"^Účetní závěrka ve vybraném podniku.*$",
"^Financial statements in selected company$",
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
"^Pseudomyxoma peritonei$",
"^Kazalo autora$",
"(?i)^uvodna riječ$",
"^Motivace jako způsob vedení lidí$",
"^Motivation as a leadership$",
"^Polyfunkční dům$",
"^Multi\\-funkcional building$",
"^Podnikatelský plán$",
"(?i)^Podnikatelský záměr$",
"(?i)^Business Plan$",
"^Oceňování nemovitostí$",
"^Marketingová komunikace$",
"^Marketing communication$",
"^Sumario Analítico$",
"^Riječ uredništva$",
"^Savjetovanja i priredbe$",
"^Índice$",
"^(Starobosanski nadpisi).*$",
"^Vzdělávání pracovníků v organizaci$",
"^Staff training in organization$",
"^(Life Histories of North American Geometridae).*$",
"^Strategická analýza podniku$",
"^Strategic Analysis of an Enterprise$",
"^Sadržaj$",
"^Upute suradnicima$",
"^Rodinný dům$",
"(?i)^Fami(l)?ly house$",
"^Upute autorima$",
"^Strategic Analysis$",
"^Finanční analýza vybraného podniku$",
"^Finanční analýza$",
"^Riječ urednika$",
"(?i)^Content(s?)$",
"(?i)^Inhalt$",
"^Jinjō shōgaku shūshinsho jidōyō$",
"(?i)^Index$",
"^Chūgaku kokubun kyōkasho$",
"^Retrato de una mujer$",
"^Retrato de un hombre$",
"^Kōtō shōgaku dokuhon$",
"^Shotōka kokugo$",
"^Shōgaku dokuhon$",
"^Jinjō shōgaku kokugo dokuhon$",
"^Shinsei kokugo dokuhon$",
"^Teikoku dokuhon$",
"^Instructions to Authors$",
"^KİTAP TAHLİLİ$",
"^PRZEGLĄD PIŚMIENNICTWA$",
"(?i)^Presentación$",
"^İçindekiler$",
"(?i)^Tabl?e of contents$",
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
"^Editorial( Board)?$",
"(?i)^Editorial \\(English\\)$",
"^Editörden$",
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$"
]
}
}
}

View File

@ -1,275 +0,0 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "4000",
"groupMaxSize" : "40",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments" ],
"includeChildren" : "true",
"maxChildren" : "40"
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
],
"strictConditions" : [
{ "name" : "pidMatch", "fields" : [ "pid" ] }
],
"conditions" : [
{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
{ "name" : "sizeMatch", "fields" : [ "authors" ] }
],
"model" : [
{ "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value" },
{ "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid", "overrideMatch" : "true" },
{ "name" : "title", "algo" : "LevensteinTitle", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "authors", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/author/fullname" }
],
"blacklists" : {
"title" : [
"^Inside Front Cover$",
"(?i)^Poster presentations$",
"^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$",
"^Problems with perinatal pathology\\.?$",
"(?i)^Cases? of Puerperal Convulsions$",
"(?i)^Operative Gyna?ecology$",
"(?i)^Mind the gap\\!?\\:?$",
"^Chronic fatigue syndrome\\.?$",
"^Cartas? ao editor Letters? to the Editor$",
"^Note from the Editor$",
"^Anesthesia Abstract$",
"^Annual report$",
"(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$",
"(?i)^Graph and Table of Infectious Diseases?$",
"^Presentation$",
"(?i)^Reviews and Information on Publications$",
"(?i)^PUBLIC HEALTH SERVICES?$",
"(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$",
"(?i)^Adrese autora$",
"(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$",
"(?i)^Acknowledgement to Referees$",
"(?i)^Behçet's disease\\.?$",
"(?i)^Isolation and identification of restriction endonuclease.*$",
"(?i)^CEREBROVASCULAR DISEASES?.?$",
"(?i)^Screening for abdominal aortic aneurysms?\\.?$",
"^Event management$",
"(?i)^Breakfast and Crohn's disease.*\\.?$",
"^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$",
"(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$",
"^Gushi hakubutsugaku$",
"^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$",
"^Intestinal spirocha?etosis$",
"^Treatment of Rodent Ulcer$",
"(?i)^\\W*Cloud Computing\\W*$",
"^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$",
"^Free Communications, Poster Presentations: Session [A-F]$",
"^“The Historical Aspects? of Quackery\\.?”$",
"^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$",
"^P(er|re)-Mile Premiums for Auto Insurance\\.?$",
"(?i)^Case Report$",
"^Boletín Informativo$",
"(?i)^Glioblastoma Multiforme$",
"(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$",
"^Zaměstnanecké výhody$",
"(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$",
"(?i)^Carotid body tumours?\\.?$",
"(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$",
"^Avant-propos$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$",
"(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$",
"(?i)^PUBLIC HEALTH VERSUS THE STATE$",
"^Viñetas de Cortázar$",
"(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$",
"(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$",
"(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$",
"(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$",
"(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$",
"^Aus der AGMB$",
"^Znanstveno-stručni prilozi$",
"(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$",
"(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$",
"(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$",
"^Finanční analýza podniku$",
"^Financial analysis( of business)?$",
"(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$",
"^Jikken nihon shūshinsho$",
"(?i)^CORONER('|s)(s|') INQUESTS$",
"(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$",
"(?i)^Consultants' contract(s)?$",
"(?i)^Upute autorima$",
"(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$",
"^Joshi shin kokubun$",
"^Kōtō shōgaku dokuhon nōson'yō$",
"^Jinjō shōgaku shōka$",
"^Shōgaku shūjichō$",
"^Nihon joshi dokuhon$",
"^Joshi shin dokuhon$",
"^Chūtō kanbun dokuhon$",
"^Wabun dokuhon$",
"(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$",
"(?i)^cardiac rehabilitation$",
"(?i)^Analytical summary$",
"^Thesaurus resolutionum Sacrae Congregationis Concilii$",
"(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$",
"^Prikazi i osvrti$",
"^Rodinný dům s provozovnou$",
"^Family house with an establishment$",
"^Shinsei chūtō shin kokugun$",
"^Pulmonary alveolar proteinosis(\\.?)$",
"^Shinshū kanbun$",
"^Viñeta(s?) de Rodríguez$",
"(?i)^RUBRIKA UREDNIKA$",
"^A Matching Model of the Academic Publication Market$",
"^Yōgaku kōyō$",
"^Internetový marketing$",
"^Internet marketing$",
"^Chūtō kokugo dokuhon$",
"^Kokugo dokuhon$",
"^Antibiotic Cover for Dental Extraction(s?)$",
"^Strategie podniku$",
"^Strategy of an Enterprise$",
"(?i)^respiratory disease(s?)(\\.?)$",
"^Award(s?) for Gallantry in Civil Defence$",
"^Podniková kultura$",
"^Corporate Culture$",
"^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$",
"^Pracovní motivace$",
"^Work Motivation$",
"^Kaitei kōtō jogaku dokuhon$",
"^Konsolidovaná účetní závěrka$",
"^Consolidated Financial Statements$",
"(?i)^intracranial tumour(s?)$",
"^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$",
"^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$",
"^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$",
"^\\[Funciones auxiliares de la música en Radio París,.*\\]$",
"^Úroveň motivačního procesu jako způsobu vedení lidí$",
"^The level of motivation process as a leadership$",
"^Pay-beds in N(\\.?)H(\\.?)S(\\.?) Hospitals$",
"(?i)^news and events$",
"(?i)^NOVOSTI I DOGAĐAJI$",
"^Sansū no gakushū$",
"^Posouzení informačního systému firmy a návrh změn$",
"^Information System Assessment and Proposal for ICT Modification$",
"^Stresové zatížení pracovníků ve vybrané profesi$",
"^Stress load in a specific job$",
"^Sunday: Poster Sessions, Pt.*$",
"^Monday: Poster Sessions, Pt.*$",
"^Wednesday: Poster Sessions, Pt.*",
"^Tuesday: Poster Sessions, Pt.*$",
"^Analýza reklamy$",
"^Analysis of advertising$",
"^Shōgaku shūshinsho$",
"^Shōgaku sansū$",
"^Shintei joshi kokubun$",
"^Taishō joshi kokubun dokuhon$",
"^Joshi kokubun$",
"^Účetní uzávěrka a účetní závěrka v ČR$",
"(?i)^The \"?Causes\"? of Cancer$",
"^Normas para la publicación de artículos$",
"^Editor('|s)(s|') [Rr]eply$",
"^Editor(|s)(s|) letter$",
"^Redaktoriaus žodis$",
"^DISCUSSION ON THE PRECEDING PAPER$",
"^Kōtō shōgaku shūshinsho jidōyō$",
"^Shōgaku nihon rekishi$",
"^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$",
"^Préface$",
"^Occupational [Hh]ealth [Ss]ervices.$",
"^In Memoriam Professor Toshiyuki TAKESHIMA$",
"^Účetní závěrka ve vybraném podniku.*$",
"^Financial statements in selected company$",
"^Abdominal [Aa]ortic [Aa]neurysms.*$",
"^Pseudomyxoma peritonei$",
"^Kazalo autora$",
"(?i)^uvodna riječ$",
"^Motivace jako způsob vedení lidí$",
"^Motivation as a leadership$",
"^Polyfunkční dům$",
"^Multi\\-funkcional building$",
"^Podnikatelský plán$",
"(?i)^Podnikatelský záměr$",
"(?i)^Business Plan$",
"^Oceňování nemovitostí$",
"^Marketingová komunikace$",
"^Marketing communication$",
"^Sumario Analítico$",
"^Riječ uredništva$",
"^Savjetovanja i priredbe$",
"^Índice$",
"^(Starobosanski nadpisi).*$",
"^Vzdělávání pracovníků v organizaci$",
"^Staff training in organization$",
"^(Life Histories of North American Geometridae).*$",
"^Strategická analýza podniku$",
"^Strategic Analysis of an Enterprise$",
"^Sadržaj$",
"^Upute suradnicima$",
"^Rodinný dům$",
"(?i)^Fami(l)?ly house$",
"^Upute autorima$",
"^Strategic Analysis$",
"^Finanční analýza vybraného podniku$",
"^Finanční analýza$",
"^Riječ urednika$",
"(?i)^Content(s?)$",
"(?i)^Inhalt$",
"^Jinjō shōgaku shūshinsho jidōyō$",
"(?i)^Index$",
"^Chūgaku kokubun kyōkasho$",
"^Retrato de una mujer$",
"^Retrato de un hombre$",
"^Kōtō shōgaku dokuhon$",
"^Shotōka kokugo$",
"^Shōgaku dokuhon$",
"^Jinjō shōgaku kokugo dokuhon$",
"^Shinsei kokugo dokuhon$",
"^Teikoku dokuhon$",
"^Instructions to Authors$",
"^KİTAP TAHLİLİ$",
"^PRZEGLĄD PIŚMIENNICTWA$",
"(?i)^Presentación$",
"^İçindekiler$",
"(?i)^Tabl?e of contents$",
"^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$",
"^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*",
"^Editorial( Board)?$",
"(?i)^Editorial \\(English\\)$",
"^Editörden$",
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*",
"(?i)^.*authors[']? reply\\.?$",
"(?i)^.*authors[']? response\\.?$"
]
}
}
}

View File

@ -1,22 +0,0 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result" ],
"includeChildren" : "true"
},
"pace" : {
"necessaryConditions" : [ ],
"model" : [
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" }
],
"blacklists" : { } ,
"synonyms" : { }
}
}

View File

@ -20,15 +20,17 @@ import java.util.*;
import java.util.stream.Collectors;
/**
* Set of common functions
* Set of common functions for the framework
*
* @author claudio
*
*/
public abstract class AbstractPaceFunctions {
//city map to be used when translating the city names into codes
private static Map<String,String> cityMap = AbstractPaceFunctions.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
//list of stopwords in different languages
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
protected static Set<String> stopwords_de = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_de.txt");
protected static Set<String> stopwords_es = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_es.txt");
@ -36,15 +38,14 @@ public abstract class AbstractPaceFunctions {
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
//blacklist of ngrams: to avoid generic keys
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
private static final String special_from = "İə";
private static final String special_to = "Ie";
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
//doi prefix for normalization
public final String DOI_PREFIX = "(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)";
protected final static FieldList EMPTY_FIELD = new FieldListImpl();
@ -54,8 +55,7 @@ public abstract class AbstractPaceFunctions {
}
protected String cleanup(final String s) {
final String ss = fixSpecial(s); //TODO is there something implemented to replace strange symbols with latin letters?
final String s0 = ss.toLowerCase();
final String s0 = s.toLowerCase();
final String s1 = fixAliases(s0);
final String s2 = nfd(s1);
final String s3 = s2.replaceAll("&ndash;", " ");
@ -63,15 +63,12 @@ public abstract class AbstractPaceFunctions {
final String s5 = s4.replaceAll("&quot;", " ");
final String s6 = s5.replaceAll("&minus;", " ");
final String s7 = s6.replaceAll("([0-9]+)", " $1 ");
final String s8 = s7.replaceAll("[^\\p{ASCII}]|\\p{Punct}", " ");
final String s9 = s8.replaceAll("\\n", " ");
final String s10 = s9.replaceAll("(?m)\\s+", " ");
final String s11 = s10.trim();
return s11;
}
protected String finalCleanup(final String s) {
return s.toLowerCase();
final String s8 = s7.replaceAll("[^\\p{ASCII}]", "");
final String s9 = s8.replaceAll("[\\p{Punct}]", " ");
final String s10 = s9.replaceAll("\\n", " ");
final String s11 = s10.replaceAll("(?m)\\s+", " ");
final String s12 = s11.trim();
return s12;
}
protected boolean checkNumbers(final String a, final String b) {
@ -98,16 +95,6 @@ public abstract class AbstractPaceFunctions {
return s.replaceAll("\\D", "");
}
//sometimes the toLowerCase() produces error, this is meant to prevent them by replacing special character before the lowercase function
/**
 * Substitutes every character of {@code s} that occurs in {@code special_from}
 * with the character found at the same index in {@code special_to}; all other
 * characters are copied unchanged.
 *
 * @param s the string to process
 * @return a new string of the same length with the special characters replaced
 */
protected static String fixSpecial(final String s) {
    final int length = s.length();
    final StringBuilder out = new StringBuilder(length);
    for (int pos = 0; pos < length; pos++) {
        final char current = s.charAt(pos);
        // position of the character in the "from" alphabet, -1 when it is not special
        final int aliasIndex = special_from.indexOf(current);
        if (aliasIndex >= 0) {
            out.append(special_to.charAt(aliasIndex));
        } else {
            out.append(current);
        }
    }
    return out.toString();
}
protected static String fixAliases(final String s) {
final StringBuilder sb = new StringBuilder();
for (final char ch : Lists.charactersOf(s)) {
@ -134,20 +121,19 @@ public abstract class AbstractPaceFunctions {
return s != null;
}
// ///////////////////////
protected String normalize(final String s) {
return nfd(s).toLowerCase()
return nfd(s)
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input strings
.replaceAll("(\\W)+", " ")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", " ")
.replaceAll("[^ \\w]+", "")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
private String nfd(final String s) {
public String nfd(final String s) {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
@ -186,8 +172,6 @@ public abstract class AbstractPaceFunctions {
return newset;
}
// ////////////////////
public static Set<String> loadFromClasspath(final String classpath) {
final Set<String> h = Sets.newHashSet();
try {
@ -217,17 +201,6 @@ public abstract class AbstractPaceFunctions {
return m;
}
//translate the string: replace the keywords with the code
/**
 * Rewrites {@code s1} token by token: each whitespace-separated token that is a
 * key of {@code translationMap} is replaced by its mapped value, every other
 * token is kept as it is. The resulting tokens are joined with single spaces
 * and the final string is trimmed.
 *
 * @param s1             the string to translate
 * @param translationMap map from token to replacement code
 * @return the translated string
 */
public String translate(String s1, Map<String, String> translationMap){
    final StringJoiner translated = new StringJoiner(" ");
    final StringTokenizer tokenizer = new StringTokenizer(s1);
    while (tokenizer.hasMoreTokens()) {
        final String word = tokenizer.nextToken();
        translated.add(translationMap.getOrDefault(word, word));
    }
    // trim() is kept because a mapped value may itself start or end with whitespace
    return translated.toString().trim();
}
public String removeKeywords(String s, Set<String> keywords) {
s = " " + s + " ";
@ -238,7 +211,6 @@ public abstract class AbstractPaceFunctions {
return s.trim();
}
public double keywordsCompare(Set<String> s1, Set<String> s2, Map<String, String> translationMap){
Set<String> k1 = keywordsToCodes(s1, translationMap);
@ -252,23 +224,6 @@ public abstract class AbstractPaceFunctions {
return (double)CollectionUtils.intersection(k1,k2).size()/(double)longer;
}
//returns true if at least 1 city is in common
//returns true if no cities are contained in names
//returns false if only one of the two names contains a city
/**
 * Decides whether two sets of city names are compatible, after converting
 * both sets to codes with {@code citiesToCodes}.
 *
 * @param s1 cities found in the first name
 * @param s2 cities found in the second name
 * @return {@code true} when neither side yields a city code, or when the two
 *         sides share at least one code; {@code false} when only one side
 *         yields codes or when the codes are disjoint
 */
public boolean sameCity(Set<String> s1, Set<String> s2){
    final Set<String> codes1 = citiesToCodes(s1);
    final Set<String> codes2 = citiesToCodes(s2);

    // at least one side has no city: compatible only if both sides have none
    if (codes1.isEmpty() || codes2.isEmpty()) {
        return codes1.isEmpty() && codes2.isEmpty();
    }

    return !CollectionUtils.intersection(codes1, codes2).isEmpty();
}
//convert the set of keywords to codes
public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
@ -294,7 +249,7 @@ public abstract class AbstractPaceFunctions {
return pid.toLowerCase().replaceAll(DOI_PREFIX, "");
}
//get the list of codes into the input string
//get the list of keywords into the input string
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize){
String s = s1;
@ -311,10 +266,10 @@ public abstract class AbstractPaceFunctions {
while (length != 0) {
for (int i = 0; i<=tokens.size()-length; i++){
String candidate = Joiner.on(" ").join(tokens.subList(i, i + length));
String candidate = concat(tokens.subList(i, i + length));
if (translationMap.containsKey(candidate)) {
codes.add(candidate);
s = s.replace(candidate, "");
s = s.replace(candidate, "").trim();
}
}

View File

@ -45,6 +45,7 @@ public class DedupConfig implements Config, Serializable {
defaults.put("slidingWindowSize", "200");
defaults.put("rootBuilder", "result");
defaults.put("includeChildren", "true");
defaults.put("maxIterations", "20");
}
public DedupConfig() {}

View File

@ -1,7 +1,7 @@
package eu.dnetlib.pace.config;
import com.google.common.collect.Maps;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.model.ClusteringDef;
import eu.dnetlib.pace.model.FieldDef;
import eu.dnetlib.pace.tree.support.TreeNodeDef;
@ -9,11 +9,10 @@ import eu.dnetlib.pace.util.PaceResolver;
import org.codehaus.jackson.annotate.JsonIgnore;
import java.io.Serializable;
import java.text.Normalizer;
import java.util.List;
import java.util.Map;
public class PaceConfig implements Serializable {
public class PaceConfig extends AbstractPaceFunctions implements Serializable {
private List<FieldDef> model;
@ -46,7 +45,7 @@ public class PaceConfig implements Serializable {
for (String key : synonyms.keySet()) {
for (String term : synonyms.get(key)){
translationMap.put(
Normalizer.normalize(term.toLowerCase(), Normalizer.Form.NFD),
normalize(term.toLowerCase()),
key);
}
}

View File

@ -76,6 +76,12 @@ public class WfConfig implements Serializable {
/** Maximum number of allowed children. */
private int maxChildren = MAX_CHILDREN;
/** Default maximum number of iterations. */
private final static int MAX_ITERATIONS = 20;
/** Maximum number of iterations */
private int maxIterations = MAX_ITERATIONS;
public WfConfig() {}
/**
@ -104,7 +110,7 @@ public class WfConfig implements Serializable {
*/
public WfConfig(final String entityType, final String orderField, final List<String> rootBuilder, final String dedupRun,
final double threshold,
final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren) {
final Set<String> skipList, final int queueMaxSize, final int groupMaxSize, final int slidingWindowSize, final boolean includeChildren, final int maxIterations) {
super();
this.entityType = entityType;
this.orderField = orderField;
@ -116,6 +122,7 @@ public class WfConfig implements Serializable {
this.groupMaxSize = groupMaxSize;
this.slidingWindowSize = slidingWindowSize;
this.includeChildren = includeChildren;
this.maxIterations = maxIterations;
}
/**
@ -245,6 +252,15 @@ public class WfConfig implements Serializable {
this.maxChildren = maxChildren;
}
/**
 * Returns the maximum number of iterations configured for this workflow.
 *
 * @return the current value of {@code maxIterations}
 */
public int getMaxIterations() {
return maxIterations;
}
/**
 * Sets the maximum number of iterations.
 *
 * @param maxIterations the new maximum number of iterations
 * @return this configuration, so that setter calls can be chained
 */
public WfConfig setMaxIterations(int maxIterations) {
this.maxIterations = maxIterations;
return this;
}
/*
* (non-Javadoc)
*

View File

@ -5,6 +5,7 @@ import eu.dnetlib.pace.tree.support.AbstractComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
import eu.dnetlib.pace.config.Config;
import org.apache.commons.collections.CollectionUtils;
import java.util.Map;
import java.util.Set;
@ -44,7 +45,7 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
Set<String> cities1 = getCities(ca, params.getOrDefault("windowSize", 4).intValue());
Set<String> cities2 = getCities(cb, params.getOrDefault("windowSize", 4).intValue());
if (sameCity(cities1,cities2)) {
if (checkCities(cities1,cities2)) {
if (keywordsCompare(keywords1, keywords2, conf.translationMap())>params.getOrDefault("threshold", 0.5).doubleValue()) {
@ -64,6 +65,22 @@ public class JaroWinklerNormalizedName extends AbstractComparator {
return 0.0;
}
//returns true if at least 1 city is in common
//returns true if no cities are contained in names
//returns false if only one of the two names contains a city
/**
 * Decides whether two sets of city names are compatible, after converting
 * both sets to codes with {@code citiesToCodes}.
 *
 * @param s1 cities found in the first name
 * @param s2 cities found in the second name
 * @return {@code true} when neither side yields a city code, or when the two
 *         sides share at least one code; {@code false} when only one side
 *         yields codes or when the codes are disjoint
 */
public boolean checkCities(Set<String> s1, Set<String> s2){
    final Set<String> codes1 = citiesToCodes(s1);
    final Set<String> codes2 = citiesToCodes(s2);

    // at least one side has no city: compatible only if both sides have none
    if (codes1.isEmpty() || codes2.isEmpty()) {
        return codes1.isEmpty() && codes2.isEmpty();
    }

    return !CollectionUtils.intersection(codes1, codes2).isEmpty();
}
@Override
public double getWeight() {
return super.weight;

View File

@ -31,10 +31,7 @@ public class Level2JaroWinklerTitle extends AbstractComparator {
if (check) return 0.5;
final String cca = finalCleanup(ca);
final String ccb = finalCleanup(cb);
return ssalgo.score(cca, ccb);
return ssalgo.score(ca, cb);
}
@Override

View File

@ -36,10 +36,7 @@ public class LevensteinTitle extends AbstractComparator {
if (check) return 0.5;
final String cca = finalCleanup(ca);
final String ccb = finalCleanup(cb);
return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
return normalize(ssalgo.score(ca, cb), ca.length(), cb.length());
}
private double normalize(final double score, final int la, final int lb) {

View File

@ -37,10 +37,7 @@ public class LevensteinTitleIgnoreVersion extends AbstractComparator {
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
final String cca = finalCleanup(ca);
final String ccb = finalCleanup(cb);
return normalize(ssalgo.score(cca, ccb), cca.length(), ccb.length());
return normalize(ssalgo.score(ca, cb), ca.length(), cb.length());
}
private double normalize(final double score, final int la, final int lb) {

View File

@ -4,11 +4,15 @@ import eu.dnetlib.pace.util.PaceException;
public enum AggType {
WEIGHTED_MEAN,
AVG,
W_MEAN, //weighted mean
AVG, //average
SUM,
MAX,
MIN;
MIN,
NC, //necessary condition
SC, //sufficient condition
AND,
OR;
public static AggType getEnum(String value) {

View File

@ -6,7 +6,7 @@ import eu.dnetlib.pace.model.Field;
public interface Comparator {
/*
* return : -1 -> can't decide (missing field)
* return : -1 -> can't decide (i.e. missing field)
* >0 -> similarity degree (depends on the algorithm)
* */
public double compare(Field a, Field b, Config conf);

View File

@ -14,25 +14,25 @@ public class FieldConf implements Serializable {
private double weight = 1.0; //weight for the field (to be used in the aggregation)
private Map<String,Number> params; //parameters
private boolean ignoreMissing;
private boolean countIfUndefined;
public boolean isIgnoreMissing() {
return ignoreMissing;
public boolean isCountIfUndefined() {
return countIfUndefined;
}
public void setIgnoreMissing(boolean ignoreMissing) {
this.ignoreMissing = ignoreMissing;
public void setCountIfUndefined(boolean countIfUndefined) {
this.countIfUndefined = countIfUndefined;
}
public FieldConf() {
}
public FieldConf(String field, String comparator, double weight, Map<String, Number> params, boolean ignoreMissing) {
public FieldConf(String field, String comparator, double weight, Map<String, Number> params, boolean countIfUndefined) {
this.field = field;
this.comparator = comparator;
this.weight = weight;
this.params = params;
this.ignoreMissing = ignoreMissing;
this.countIfUndefined = countIfUndefined;
}
public String getField() {

View File

@ -4,7 +4,6 @@ import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.config.PaceConfig;
import eu.dnetlib.pace.model.MapDocument;
import eu.dnetlib.pace.util.PaceException;
import org.codehaus.jackson.annotate.JsonIgnore;
import org.codehaus.jackson.map.ObjectMapper;
import java.io.IOException;
@ -22,16 +21,16 @@ public class TreeNodeDef implements Serializable {
private String negative;
private String undefined;
boolean ignoreMissing;
boolean ignoreUndefined;
public TreeNodeDef(List<FieldConf> fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreMissing) {
public TreeNodeDef(List<FieldConf> fields, AggType aggregation, double threshold, String positive, String negative, String undefined, boolean ignoreUndefined) {
this.fields = fields;
this.aggregation = aggregation;
this.threshold = threshold;
this.positive = positive;
this.negative = negative;
this.undefined = undefined;
this.ignoreMissing = ignoreMissing;
this.ignoreUndefined = ignoreUndefined;
}
public TreeNodeDef() {
@ -48,9 +47,9 @@ public class TreeNodeDef implements Serializable {
double result = comparator(fieldConf).compare(doc1.getFieldMap().get(fieldConf.getField()), doc2.getFieldMap().get(fieldConf.getField()), conf);
if (result == -1) { //if the field is missing
stats.incrementMissCount();
if (!fieldConf.isIgnoreMissing()) {
if (result == -1) { //if the comparison is undefined
stats.incrementUndefinedCount();
if (fieldConf.isCountIfUndefined()) { //if it must be taken into account, increment weights (i.e. the average would be lower)
stats.incrementWeightsSum(weight);
}
}
@ -117,12 +116,12 @@ public class TreeNodeDef implements Serializable {
this.undefined = undefined;
}
public boolean isIgnoreMissing() {
return ignoreMissing;
public boolean isIgnoreUndefined() {
return ignoreUndefined;
}
public void setIgnoreMissing(boolean ignoreMissing) {
this.ignoreMissing = ignoreMissing;
public void setIgnoreUndefined(boolean ignoreUndefined) {
this.ignoreUndefined = ignoreUndefined;
}
@Override

View File

@ -7,7 +7,7 @@ import java.io.Serializable;
public class TreeNodeStats implements Serializable {
private DescriptiveStatistics stats;
private int missCount = 0;
private int undefinedCount = 0; //counter for the number of undefined comparisons between the fields in the tree node
private int fieldsCount = 0;
private double weightsSum = 0.0;
@ -15,8 +15,8 @@ public class TreeNodeStats implements Serializable {
this.stats = new DescriptiveStatistics();
}
public TreeNodeStats(int missCount, int fieldsCount, double weightsSum) {
this.missCount = missCount;
public TreeNodeStats(int undefinedCount, int fieldsCount, double weightsSum) {
this.undefinedCount = undefinedCount;
this.fieldsCount = fieldsCount;
this.weightsSum = weightsSum;
}
@ -29,12 +29,12 @@ public class TreeNodeStats implements Serializable {
this.stats = stats;
}
public int getMissCount() {
return missCount;
public int getUndefinedCount() {
return undefinedCount;
}
public void setMissCount(int missCount) {
this.missCount = missCount;
public void setUndefinedCount(int undefinedCount) {
this.undefinedCount = undefinedCount;
}
public int getFieldsCount() {
@ -57,8 +57,8 @@ public class TreeNodeStats implements Serializable {
this.weightsSum += delta;
}
public void incrementMissCount(){
this.missCount += 1;
public void incrementUndefinedCount(){
this.undefinedCount += 1;
}
public void incrementScoresSum(double delta){
@ -72,11 +72,15 @@ public class TreeNodeStats implements Serializable {
return stats.getMean();
case SUM:
return stats.getSum();
case SC:
case OR:
case MAX:
return stats.getMax();
case NC:
case AND:
case MIN:
return stats.getMin();
case WEIGHTED_MEAN:
case W_MEAN:
return stats.getSum()/weightsSum;
default:
return 0.0;

View File

@ -40,9 +40,11 @@ public class TreeProcessor {
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
if (!currentNode.isIgnoreMissing() && stats.getMissCount()>0) {
//if ignoreUndefined=false the miss is considered as undefined
if (!currentNode.isIgnoreUndefined() && stats.getUndefinedCount()>0) {
current = currentNode.getUndefined();
}
//if ignoreUndefined=true the miss is ignored and the score computed anyway
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
current = currentNode.getPositive();
}

View File

@ -1,5 +1,6 @@
package eu.dnetlib.pace;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
import eu.dnetlib.pace.config.Type;
import eu.dnetlib.pace.model.Field;
import eu.dnetlib.pace.model.FieldListImpl;
@ -11,7 +12,7 @@ import java.io.StringWriter;
import java.util.List;
import java.util.stream.Collectors;
public abstract class AbstractPaceTest {
public abstract class AbstractPaceTest extends AbstractPaceFunctions {
protected String readFromClasspath(final String filename) {
final StringWriter sw = new StringWriter();

View File

@ -18,7 +18,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
@Before
public void setUp() throws Exception {
params = Maps.newHashMap();
conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", ClusteringFunctionTest.class));
conf = DedupConfig.load(AbstractPaceFunctions.readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf", ClusteringFunctionTest.class));
}
@Test

View File

@ -1,5 +1,6 @@
package eu.dnetlib.pace.common;
import org.junit.Assert;
import org.junit.Test;
import static junit.framework.Assert.assertEquals;
@ -7,6 +8,8 @@ import static junit.framework.Assert.assertTrue;
public class PaceFunctionTest extends AbstractPaceFunctions {
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
@Test
public void normalizePidTest(){
@ -14,7 +17,6 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
assertEquals("10.1109/tns.2015.2493347", normalizePid("10.1109/TNS.2015.2493347"));
assertEquals("10.0001/testdoi", normalizePid("http://dx.doi.org/10.0001/testDOI"));
assertEquals("10.0001/testdoi", normalizePid("https://dx.doi.org/10.0001/testDOI"));
}
@Test
@ -22,4 +24,35 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
assertEquals("universita politecnica marche", filterAllStopWords("universita politecnica delle marche"));
}
@Test
public void normalizeTest() {
// normalize() is expected to lower-case the input and reduce "ä" to a plain "a"
Assert.assertEquals("universitat", normalize("Universität"));
// NOTE(review): the result for this input is only printed, nothing is asserted about it
System.out.println(normalize("İstanbul Ticarət Universiteti"));
}
@Test
public void cleanupTest() {
// cleanup() is expected to lower-case the input and map "İ" and "ə" to the ASCII letters "i" and "e"
assertEquals("istanbul ticaret universiteti", cleanup("İstanbul Ticarət Universiteti"));
// NOTE(review): output for TEST_STRING is only printed for manual inspection, not asserted
System.out.println("cleaned up : " + cleanup(TEST_STRING));
}
@Test
public void testGetNumbers() {
// NOTE(review): smoke test only - prints getNumbers(TEST_STRING) and asserts nothing,
// so it fails only if the call throws
System.out.println("Numbers : " + getNumbers(TEST_STRING));
}
@Test
public void testRemoveSymbols() {
// NOTE(review): smoke test only - prints removeSymbols(TEST_STRING) and asserts nothing,
// so it fails only if the call throws
System.out.println("Without symbols: " + removeSymbols(TEST_STRING));
}
@Test
public void testFixAliases() {
// NOTE(review): smoke test only - prints fixAliases(TEST_STRING) and asserts nothing,
// so it fails only if the call throws
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
}
}

View File

@ -14,9 +14,8 @@ import java.util.Map;
import static junit.framework.Assert.assertEquals;
import static junit.framework.Assert.assertTrue;
public class DistanceAlgoTest extends AbstractPaceFunctions {
public class ComparatorTest extends AbstractPaceFunctions {
private final static String TEST_STRING = "Toshiba NB550D: è un netbook su piattaforma AMD Fusion⁽¹²⁾.";
private Map<String, Number> params;
private DedupConfig conf;
@ -24,7 +23,8 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
public void setup() {
params = new HashMap<>();
params.put("weight", 1.0);
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/org.curr.conf", DistanceAlgoTest.class));
conf = DedupConfig.load(readFromClasspath("/eu/dnetlib/pace/config/organization.current.conf", ComparatorTest.class));
}
@Test
@ -33,26 +33,6 @@ public class DistanceAlgoTest extends AbstractPaceFunctions {
System.out.println("utils = " + utils.cleanupForOrdering("University of Pisa"));
}
@Test
public void testGetNumbers() {
System.out.println("Numbers : " + getNumbers(TEST_STRING));
}
@Test
public void testRemoveSymbols() {
System.out.println("Without symbols: " + removeSymbols(TEST_STRING));
}
@Test
public void testFixAliases() {
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
}
@Test
public void testCleanup() {
System.out.println("cleaned up : " + cleanup(TEST_STRING));
}
@Test
public void testJaroWinklerNormalizedName() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);

View File

@ -3,6 +3,8 @@ package eu.dnetlib.pace.config;
import eu.dnetlib.pace.AbstractPaceTest;
import org.junit.Test;
import java.util.Map;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
@ -11,15 +13,10 @@ public class ConfigTest extends AbstractPaceTest {
@Test
public void dedupConfigSerializationTest() {
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("org.curr.conf"));
final DedupConfig cfgFromClasspath = DedupConfig.load(readFromClasspath("organization.current.conf"));
final String conf = cfgFromClasspath.toString();
// System.out.println("*****SERIALIZED*****");
// System.out.println(conf);
// System.out.println("*****FROM CLASSPATH*****");
// System.out.println(readFromClasspath("result.pace.conf.json"));
final DedupConfig cfgFromSerialization = DedupConfig.load(conf);
assertEquals(cfgFromClasspath.toString(), cfgFromSerialization.toString());
@ -27,29 +24,36 @@ public class ConfigTest extends AbstractPaceTest {
assertNotNull(cfgFromClasspath);
assertNotNull(cfgFromSerialization);
}
@Test
public void dedupConfigTest() {
DedupConfig load = DedupConfig.load(readFromClasspath("org.curr.conf"));
DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf"));
System.out.println(load.toString());
}
@Test
public void translationMapTest() {
public void initTranslationMapTest() {
DedupConfig load = DedupConfig.load(readFromClasspath("org.curr.conf"));
DedupConfig load = DedupConfig.load(readFromClasspath("organization.current.conf"));
Map<String, String> translationMap = load.translationMap();
System.out.println("translationMap = " + translationMap.size());
for (String key: translationMap.keySet()) {
if (translationMap.get(key).equals("key::1"))
System.out.println("key = " + key);
}
System.out.println("translationMap = " + load.getPace().translationMap().toString());
}
@Test
public void emptyTranslationMapTest() {
DedupConfig load = DedupConfig.load(readFromClasspath("org.test.conf"));
DedupConfig load = DedupConfig.load(readFromClasspath("organization.no_synonyms.conf"));
assertEquals(0, load.getPace().translationMap().keySet().size());
}

View File

@ -1,40 +0,0 @@
{
"wf" : {
"threshold" : "0.9",
"dedupRun" : "001",
"entityType" : "organization",
"orderField" : "legalname",
"queueMaxSize" : "2000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
],
"strictConditions" : [
{ "name" : "exactMatch", "fields" : [ "gridid" ] }
],
"conditions" : [
{ "name" : "DomainExactMatch", "fields" : [ "websiteurl" ] },
{ "name" : "exactMatch", "fields" : [ "country" ] }
],
"model" : [
{ "name" : "country", "algo" : "Null", "type" : "String", "weight" : "0", "ignoreMissing" : "false", "path" : "organization/metadata/country/classid" },
{ "name" : "legalshortname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.1", "ignoreMissing" : "true", "path" : "organization/metadata/legalshortname/value" },
{ "name" : "legalname", "algo" : "JaroWinklerNormalizedName", "type" : "String", "weight" : "0.9", "ignoreMissing" : "false", "path" : "organization/metadata/legalname/value", "params" : {"windowSize" : 4, "threshold" : 0.7} },
{ "name" : "websiteurl", "algo" : "Null", "type" : "URL", "weight" : "0", "ignoreMissing" : "true", "path" : "organization/metadata/websiteurl/value", "params" : { "host" : 0.5, "path" : 0.5 } },
{ "name" : "gridid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {grid}]/value" }
],
"blacklists" : {
"legalname" : []
},
"synonyms": {
}
}
}

View File

@ -18,9 +18,9 @@
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
],
"decisionTree" : {
"start": {"fields": [{"field":"gridid", "comparator":"exactMatch", "weight":1.0, "ignoreMissing":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreMissing": "true"},
"layer2": {"fields": [{"field":"websiteurl", "comparator":"domainExactMatch", "weight":1.0, "ignoreMissing":"true", "params" : {}}, {"field":"country", "comparator":"exactMatch", "weight":1.0, "ignoreMissing":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreMissing": "false"},
"layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "ignoreMissing":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "ignoreMissing":"true", "params":{}}], "threshold": 0.9, "aggregation": "WEIGHTED_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreMissing": "true"}
"start": {"fields": [{"field":"gridid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"},
"layer2": {"fields": [{"field":"websiteurl", "comparator":"domainExactMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"country", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"},
"layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"}
},
"model" : [
{ "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"},

View File

@ -0,0 +1,38 @@
{
"wf" : {
"threshold" : "0.9",
"dedupRun" : "001",
"entityType" : "organization",
"orderField" : "legalname",
"queueMaxSize" : "2000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
"rootBuilder" : [ "organization", "projectOrganization_participation_isParticipant", "datasourceOrganization_provision_isProvidedBy" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
],
"decisionTree" : {
"start": {"fields": [{"field":"gridid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"},
"layer2": {"fields": [{"field":"websiteurl", "comparator":"domainExactMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"country", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"},
"layer3": {"fields": [{"field":"legalname", "comparator":"jaroWinklerNormalizedName", "weight":0.9, "countIfUndefined":"false", "params":{"windowSize" : 4, "threshold" : 0.7}}, {"field":"legalshortname", "comparator":"jaroWinklerNormalizedName", "weight":0.1, "countIfUndefined":"true", "params":{}}], "threshold": 0.9, "aggregation": "W_MEAN", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "true"}
},
"model" : [
{ "name" : "country", "type" : "String", "path" : "organization/metadata/country/classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "organization/metadata/legalshortname/value"},
{ "name" : "legalname", "type" : "String", "path" : "organization/metadata/legalname/value" },
{ "name" : "websiteurl", "type" : "URL", "path" : "organization/metadata/websiteurl/value" },
{ "name" : "gridid", "type" : "String", "path" : "pid[qualifier#classid = {grid}]/value"}
],
"blacklists" : {
"legalname" : []
},
"synonyms": {
}
}
}

View File

@ -1,53 +0,0 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }
],
"strictConditions" : [
{ "name" : "exactMatch", "fields" : [ "pid" ] }
],
"conditions" : [
{ "name" : "yearMatch", "fields" : [ "dateofacceptance" ] },
{ "name" : "titleVersionMatch", "fields" : [ "title" ] },
{ "name" : "sizeMatch", "fields" : [ "authors" ] }
],
"model" : [
{ "name" : "pid", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "pid[qualifier#classid = {doi}]/value", "overrideMatch" : "true" },
{ "name" : "title", "algo" : "JaroWinkler", "type" : "String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "dateofacceptance", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/metadata/dateofacceptance/value" } ,
{ "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "result/author/metadata/fullname/value" }
],
"blacklists" : {
"title" : [
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*"
] }
}
}

View File

@ -0,0 +1,51 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "10",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "acronyms", "fields" : [ "title" ], "params" : { "max" : "1", "minLen" : "2", "maxLen" : "4"} },
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }
],
"decisionTree": {
"start": {"fields": [{"field":"pid", "comparator":"exactMatch", "weight":1.0, "countIfUndefined":"true", "params": {}}], "threshold":1.0, "aggregation": "MAX", "positive":"MATCH", "negative":"layer2", "undefined":"layer2", "ignoreUndefined": "true"},
"layer2": {"fields": [{"field":"dateofacceptance", "comparator":"yearMatch", "weight":1.0, "countIfUndefined":"true", "params" : {}}, {"field":"title", "comparator":"titleVersionMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}, {"field":"authors", "comparator":"sizeMatch", "weight":1.0, "countIfUndefined":"false", "params": {}}], "threshold":1.0, "aggregation": "MIN", "positive":"layer3", "negative":"NO_MATCH", "undefined":"layer3", "ignoreUndefined": "false"},
"layer3": {"fields": [{"field":"title", "comparator":"JaroWinkler", "weight":1.0, "countIfUndefined":"false", "params":{}}], "threshold": 0.99, "aggregation": "MAX", "positive":"MATCH", "negative":"NO_MATCH", "undefined":"NO_MATCH", "ignoreUndefined": "false"}
},
"model" : [
{ "name" : "pid", "type" : "String", "path" : "pid[qualifier#classid = {doi}]/value", "overrideMatch" : "true" },
{ "name" : "title", "type" : "String", "path" : "result/metadata/title[qualifier#classid = {main title}]/value" },
{ "name" : "dateofacceptance", "type" : "String", "path" : "result/metadata/dateofacceptance/value" } ,
{ "name" : "authors", "type" : "List", "path" : "result/author/metadata/fullname/value" }
],
"blacklists" : {
"title" : [
"^(Corpus Oral Dialectal \\(COD\\)\\.).*$",
"^(Kiri Karl Morgensternile).*$",
"^(\\[Eksliibris Aleksandr).*\\]$",
"^(\\[Eksliibris Aleksandr).*$",
"^(Eksliibris Aleksandr).*$",
"^(Kiri A\\. de Vignolles).*$",
"^(2 kirja Karl Morgensternile).*$",
"^(Pirita kloostri idaosa arheoloogilised).*$",
"^(Kiri tundmatule).*$",
"^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$",
"^(Eksliibris Nikolai Birukovile).*$",
"^(Eksliibris Nikolai Issakovile).*$",
"^(WHP Cruise Summary Information of section).*$",
"^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$",
"^(Measurement of the spin\\-dependent structure function).*"
] },
"synonyms": {}
}
}