diff --git a/dhp-workflows/dhp-dedup/pom.xml b/dhp-workflows/dhp-dedup/pom.xml index 0721af25d..691fbe6d5 100644 --- a/dhp-workflows/dhp-dedup/pom.xml +++ b/dhp-workflows/dhp-dedup/pom.xml @@ -65,6 +65,15 @@ com.arakelian java-jq + + dom4j + dom4j + + + jaxen + jaxen + + eu.dnetlib @@ -83,8 +92,6 @@ jackson-core - - diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java deleted file mode 100644 index 3a92a1558..000000000 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java +++ /dev/null @@ -1,79 +0,0 @@ -package eu.dnetlib.dedup; - -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; -import com.google.common.hash.HashFunction; -import com.google.common.hash.Hashing; -import eu.dnetlib.dedup.graph.ConnectedComponent; -import eu.dnetlib.dedup.graph.GraphProcessor; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.util.MapDocumentUtil; -import org.apache.commons.io.IOUtils; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.graphx.Edge; -import org.apache.spark.rdd.RDD; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SparkSession; -import scala.Tuple2; - -import java.util.ArrayList; -import java.util.List; - -public class SparkCreateConnectedComponent { - - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkCreateConnectedComponent.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String targetPath = parser.get("targetPath"); -// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - - final JavaPairRDD vertexes = sc.textFile(inputPath + "/" + entity) - .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) - .mapToPair((PairFunction) - s -> new Tuple2(getHashcode(s), s) - ); - - final Dataset similarityRelations = spark.read().load(DedupUtility.createSimRelPath(targetPath,entity)).as(Encoders.bean(Relation.class)); - final RDD> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())).rdd(); - final JavaRDD cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD(); - final Dataset mergeRelation = spark.createDataset(cc.filter(k->k.getDocIds().size()>1).flatMap((FlatMapFunction) c -> - c.getDocIds() - .stream() - .flatMap(id -> { - List 
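The dom4j and jaxen entries added to pom.xml above correspond to the usual Maven coordinates for those libraries; a conventional declaration (version omitted here on the assumption that it is managed by the parent pom) would read:

    <dependency>
        <groupId>dom4j</groupId>
        <artifactId>dom4j</artifactId>
    </dependency>
    <dependency>
        <groupId>jaxen</groupId>
        <artifactId>jaxen</artifactId>
    </dependency>

dom4j supplies the SAXReader/XPath API used by the reworked DedupUtility further down, and jaxen is the XPath engine dom4j delegates to at runtime.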
tmp = new ArrayList<>(); - Relation r = new Relation(); - r.setSource(c.getCcId()); - r.setTarget(id); - r.setRelClass("merges"); - tmp.add(r); - r = new Relation(); - r.setTarget(c.getCcId()); - r.setSource(id); - r.setRelClass("isMergedIn"); - tmp.add(r); - return tmp.stream(); - }).iterator()).rdd(), Encoders.bean(Relation.class)); - mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(targetPath,entity)); - } - - public static long getHashcode(final String id) { - return Hashing.murmur3_128().hashString(id).asLong(); - } -} diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java deleted file mode 100644 index db2306526..000000000 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java +++ /dev/null @@ -1,39 +0,0 @@ -package eu.dnetlib.dedup; - -import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.pace.config.DedupConfig; -import org.apache.commons.io.IOUtils; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.sql.SparkSession; - -public class SparkCreateDedupRecord { - - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedupRecord_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkCreateDedupRecord.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String sourcePath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String dedupPath = parser.get("dedupPath"); -// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - - final JavaRDD dedupRecord = DedupRecordFactory.createDedupRecord(sc, spark, DedupUtility.createMergeRelPath(dedupPath,entity), DedupUtility.createEntityPath(sourcePath,entity), OafEntityType.valueOf(entity), dedupConf); - dedupRecord.map(r-> { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(r); - }).saveAsTextFile(dedupPath+"/"+entity+"_dedup_record_json"); - - - } - -} diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java deleted file mode 100644 index 831e45daf..000000000 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java +++ /dev/null @@ -1,73 +0,0 @@ -package eu.dnetlib.dedup; - -import com.google.common.hash.Hashing; -import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Relation; -import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.MapDocument; -import eu.dnetlib.pace.util.MapDocumentUtil; -import org.apache.commons.io.IOUtils; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import 
org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SparkSession; -import scala.Tuple2; - -import java.util.List; - - -/** - * This Spark class creates similarity relations between entities, saving result - * - * param request: - * sourcePath - * entityType - * target Path - */ -public class SparkCreateSimRels { - - public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkCreateSimRels.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String targetPath = parser.get("targetPath"); -// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"))); - final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); - - final long total = sc.textFile(inputPath + "/" + entity).count(); - - JavaPairRDD mapDocument = sc.textFile(inputPath + "/" + entity) - .mapToPair(s->{ - MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf,s); - return new Tuple2<>(d.getIdentifier(), d);}); - - //create blocks for deduplication - JavaPairRDD> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf); -// JavaPairRDD> blocks = Deduper.createBlocks(sc, mapDocument, dedupConf); - - //create relations by comparing only elements in the same group - final JavaPairRDD dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf); -// final JavaPairRDD dedupRels = Deduper.computeRelations(sc, blocks, dedupConf); - - final JavaRDD isSimilarToRDD = dedupRels.map(simRel -> { - final Relation r = new Relation(); - r.setSource(simRel._1()); - r.setTarget(simRel._2()); - r.setRelClass("isSimilarTo"); - return r; - }); - - spark.createDataset(isSimilarToRDD.rdd(), Encoders.bean(Relation.class)).write().mode("overwrite").save( DedupUtility.createSimRelPath(targetPath,entity)); - - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DatePicker.java similarity index 99% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DatePicker.java index 73f178edc..bd5c1118e 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DatePicker.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DatePicker.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import eu.dnetlib.dhp.schema.oaf.Field; import org.apache.commons.lang.StringUtils; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java similarity index 98% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java index 5f81669e9..583e90ab9 100644 --- 
a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupRecordFactory.java @@ -1,11 +1,9 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.google.common.collect.Lists; import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.util.MapDocumentUtil; -import org.apache.commons.lang.NotImplementedException; -import org.apache.commons.lang.StringUtils; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; @@ -16,9 +14,6 @@ import org.codehaus.jackson.map.ObjectMapper; import scala.Tuple2; import java.util.Collection; -import java.util.Random; - -import static java.util.stream.Collectors.toMap; public class DedupRecordFactory { diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupUtility.java similarity index 74% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupUtility.java index 3bed74f86..3d505888a 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupUtility.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/DedupUtility.java @@ -1,35 +1,32 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import com.google.common.collect.Sets; import com.wcohen.ss.JaroWinkler; import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.utils.ISLookupClientFactory; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.clustering.BlacklistAwareClusteringCombiner; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.model.Person; import org.apache.commons.codec.binary.Hex; -import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; import org.apache.spark.SparkContext; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.util.LongAccumulator; +import org.dom4j.Document; +import org.dom4j.DocumentException; +import org.dom4j.Element; +import org.dom4j.io.SAXReader; import scala.Tuple2; -import java.io.IOException; -import java.io.StringWriter; -import java.nio.charset.StandardCharsets; +import java.io.StringReader; import java.security.MessageDigest; import java.text.Normalizer; import java.util.*; import java.util.stream.Collectors; -import java.util.stream.Stream; public class DedupUtility { private static final Double THRESHOLD = 0.95; @@ -54,38 +51,6 @@ public class DedupUtility { return accumulators; } - public static JavaRDD loadDataFromHDFS(String path, JavaSparkContext context) { - return context.textFile(path); - } - - public static void deleteIfExists(String path) throws IOException { - Configuration conf = new Configuration(); - FileSystem fileSystem = FileSystem.get(conf); - if (fileSystem.exists(new Path(path))) { - fileSystem.delete(new Path(path), true); - } - } - - public static DedupConfig loadConfigFromHDFS(String path) throws IOException { - - 
Configuration conf = new Configuration(); - FileSystem fileSystem = FileSystem.get(conf); - FSDataInputStream inputStream = new FSDataInputStream(fileSystem.open(new Path(path))); - - return DedupConfig.load(IOUtils.toString(inputStream, StandardCharsets.UTF_8.name())); - - } - - static String readFromClasspath(final String filename, final Class clazz) { - final StringWriter sw = new StringWriter(); - try { - IOUtils.copy(clazz.getResourceAsStream(filename), sw); - return sw.toString(); - } catch (final IOException e) { - throw new RuntimeException("cannot load resource from classpath: " + filename); - } - } - static Set getGroupingKeys(DedupConfig conf, MapDocument doc) { return Sets.newHashSet(BlacklistAwareClusteringCombiner.filterAndCombine(doc, conf)); } @@ -146,16 +111,20 @@ public class DedupUtility { }); } + public static String createDedupRecordPath(final String basePath, final String actionSetId, final String entityType) { + return String.format("%s/%s/%s_deduprecord", basePath, actionSetId, entityType); + } + public static String createEntityPath(final String basePath, final String entityType) { return String.format("%s/%s", basePath, entityType); } - public static String createSimRelPath(final String basePath, final String entityType) { - return String.format("%s/%s_simRel", basePath, entityType); + public static String createSimRelPath(final String basePath, final String actionSetId,final String entityType) { + return String.format("%s/%s/%s_simrel", basePath, actionSetId, entityType); } - public static String createMergeRelPath(final String basePath, final String entityType) { - return String.format("%s/%s_mergeRel", basePath, entityType); + public static String createMergeRelPath(final String basePath, final String actionSetId, final String entityType) { + return String.format("%s/%s/%s_mergerel", basePath, actionSetId, entityType); } private static Double sim(Author a, Author b) { @@ -216,4 +185,37 @@ public class DedupUtility { return false; return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue())); } + + public static List getConfigurations(String isLookUpUrl, String orchestrator) throws ISLookUpException, DocumentException { + final ISLookUpService isLookUpService = ISLookupClientFactory.getLookUpService(isLookUpUrl); + + final String xquery = String.format("/RESOURCE_PROFILE[.//DEDUPLICATION/ACTION_SET/@id = '%s']", orchestrator); + + String orchestratorProfile = isLookUpService.getResourceProfileByQuery(xquery); + + final Document doc = new SAXReader().read(new StringReader(orchestratorProfile)); + + final String actionSetId = doc.valueOf("//DEDUPLICATION/ACTION_SET/@id"); + final List configurations = new ArrayList<>(); + + for (final Object o : doc.selectNodes("//SCAN_SEQUENCE/SCAN")) { + configurations.add(loadConfig(isLookUpService, actionSetId, o)); + } + + return configurations; + + } + + private static DedupConfig loadConfig(final ISLookUpService isLookUpService, final String actionSetId, final Object o) + throws ISLookUpException { + final Element s = (Element) o; + final String configProfileId = s.attributeValue("id"); + final String conf = + isLookUpService.getResourceProfileByQuery(String.format( + "for $x in /RESOURCE_PROFILE[.//RESOURCE_IDENTIFIER/@value = '%s'] return $x//DEDUPLICATION/text()", + configProfileId)); + final DedupConfig dedupConfig = DedupConfig.load(conf); + dedupConfig.getWf().setConfigurationId(actionSetId); + return dedupConfig; + } } diff --git 
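The new getConfigurations/loadConfig pair replaces the old HDFS/classpath configuration loading: the orchestrator profile is fetched from the IS lookup service, each SCAN element is resolved to a dedup configuration profile, and the action set id is stamped on every DedupConfig. A minimal sketch of how a driver consumes these helpers together with the new actionSetId-aware path builders (lookup URL, action set id and paths are placeholders; the calling method would declare throws ISLookUpException, DocumentException):

    List<DedupConfig> configs = DedupUtility.getConfigurations(
            "http://services.example.org/is/services/isLookUp",   // isLookUpUrl (placeholder)
            "dedup-similarity-result-levenstein");                 // actionSetId, i.e. the orchestrator name

    for (DedupConfig dedupConf : configs) {
        String subEntity = dedupConf.getWf().getSubEntityValue();  // e.g. "publication"
        // layout: <workingPath>/<actionSetId>/<subEntity>_simrel and ..._mergerel
        String simRelPath   = DedupUtility.createSimRelPath("/tmp/dedup/working", "dedup-similarity-result-levenstein", subEntity);
        String mergeRelPath = DedupUtility.createMergeRelPath("/tmp/dedup/working", "dedup-similarity-result-levenstein", subEntity);
    }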
a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/Deduper.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/Deduper.java similarity index 99% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/Deduper.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/Deduper.java index 7206f892f..dda71fbcf 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/Deduper.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/Deduper.java @@ -1,7 +1,6 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.model.Field; import eu.dnetlib.pace.model.MapDocument; import eu.dnetlib.pace.util.BlockProcessor; import eu.dnetlib.pace.util.MapDocumentUtil; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/OafEntityType.java similarity index 83% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/OafEntityType.java index fb347ed51..66f0b3ce6 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/OafEntityType.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; public enum OafEntityType { diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateConnectedComponent.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateConnectedComponent.java new file mode 100644 index 000000000..75b1dd01c --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateConnectedComponent.java @@ -0,0 +1,100 @@ +package eu.dnetlib.dhp.dedup; + +import com.google.common.hash.Hashing; +import eu.dnetlib.dhp.dedup.graph.ConnectedComponent; +import eu.dnetlib.dhp.dedup.graph.GraphProcessor; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.graphx.Edge; +import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.dom4j.DocumentException; +import scala.Tuple2; + +import java.util.ArrayList; +import java.util.List; + +public class SparkCreateConnectedComponent { + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createCC_parameters.json"))); + parser.parseArgument(args); + + new SparkCreateConnectedComponent().run(parser); + } + + private void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException { + + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String 
isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + + try (SparkSession spark = getSparkSession(parser)) { + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) { + + final String entity = dedupConf.getWf().getEntityType(); + final String subEntity = dedupConf.getWf().getSubEntityValue(); + + final JavaPairRDD vertexes = sc.textFile(graphBasePath + "/" + subEntity) + .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) + .mapToPair((PairFunction) + s -> new Tuple2(getHashcode(s), s) + ); + + final Dataset similarityRelations = spark.read().load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)).as(Encoders.bean(Relation.class)); + final RDD> edgeRdd = similarityRelations.javaRDD().map(it -> new Edge<>(getHashcode(it.getSource()), getHashcode(it.getTarget()), it.getRelClass())).rdd(); + final JavaRDD cc = GraphProcessor.findCCs(vertexes.rdd(), edgeRdd, dedupConf.getWf().getMaxIterations()).toJavaRDD(); + final Dataset mergeRelation = spark.createDataset(cc.filter(k -> k.getDocIds().size() > 1).flatMap((FlatMapFunction) c -> + c.getDocIds() + .stream() + .flatMap(id -> { + List tmp = new ArrayList<>(); + Relation r = new Relation(); + r.setSource(c.getCcId()); + r.setTarget(id); + r.setRelClass("merges"); + tmp.add(r); + r = new Relation(); + r.setTarget(c.getCcId()); + r.setSource(id); + r.setRelClass("isMergedIn"); + tmp.add(r); + return tmp.stream(); + }).iterator()).rdd(), Encoders.bean(Relation.class)); + mergeRelation.write().mode("overwrite").save(DedupUtility.createMergeRelPath(workingPath, actionSetId, entity)); + } + } + } + + public static long getHashcode(final String id) { + return Hashing.murmur3_128().hashString(id).asLong(); + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession + .builder() + .appName(SparkCreateSimRels.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) + .enableHiveSupport() + .getOrCreate(); + } +} diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java new file mode 100644 index 000000000..51d0760e0 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateDedupRecord.java @@ -0,0 +1,62 @@ +package eu.dnetlib.dhp.dedup; + +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.pace.config.DedupConfig; +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.SparkSession; +import org.dom4j.DocumentException; + +public class SparkCreateDedupRecord { + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createDedupRecord_parameters.json"))); + parser.parseArgument(args); + + new SparkCreateDedupRecord().run(parser); + } + + private void run(ArgumentApplicationParser parser) throws ISLookUpException, 
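SparkCreateConnectedComponent identifies GraphX vertices by hashing the entity identifier with the call wrapped in getHashcode above. A small self-contained check of that mapping, mirroring the project's own call and compiling against the Guava version this module already depends on (the sample ids are the ones used in SparkCreateDedupTest):

    import com.google.common.hash.Hashing;

    public class VertexIdCheck {
        // same hashing as SparkCreateConnectedComponent.getHashcode(...):
        // 128-bit murmur3, truncated to the first 64 bits for use as a GraphX vertex id
        static long vertexId(final String id) {
            return Hashing.murmur3_128().hashString(id).asLong();
        }

        public static void main(String[] args) {
            System.out.println(vertexId("20|grid________::6031f94bef015a37783268ec1e75f17f"));
            System.out.println(vertexId("20|nsf_________::b12be9edf414df8ee66b4c52a2d8da46"));
            // a (very unlikely) collision between two distinct ids would silently
            // join their connected components, i.e. merge unrelated records
        }
    }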
DocumentException { + + final String graphBasePath = parser.get("graphBasePath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + final String workingPath = parser.get("workingPath"); + + try (SparkSession spark = getSparkSession(parser)) { + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) { + String subEntity = dedupConf.getWf().getSubEntityValue(); + + final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity); + final String entityPath = DedupUtility.createEntityPath(graphBasePath, subEntity); + final OafEntityType entityType = OafEntityType.valueOf(subEntity); + final JavaRDD dedupRecord = + DedupRecordFactory.createDedupRecord(sc, spark, mergeRelPath, entityPath, entityType, dedupConf); + dedupRecord.map(r -> { + ObjectMapper mapper = new ObjectMapper(); + return mapper.writeValueAsString(r); + }).saveAsTextFile(DedupUtility.createDedupRecordPath(workingPath, actionSetId, subEntity)); + } + } + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession + .builder() + .appName(SparkCreateDedupRecord.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) + .enableHiveSupport() + .getOrCreate(); + } +} + diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java new file mode 100644 index 000000000..0fc72db1e --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkCreateSimRels.java @@ -0,0 +1,134 @@ +package eu.dnetlib.dhp.dedup; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.hadoop.mapred.SequenceFileOutputFormat; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.dom4j.DocumentException; +import scala.Tuple2; + +import java.io.Serializable; +import java.util.List; + +public class SparkCreateSimRels implements Serializable { + + private static final Log log = LogFactory.getLog(SparkCreateSimRels.class); + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/createSimRels_parameters.json"))); + parser.parseArgument(args); + + new SparkCreateSimRels().run(parser); + } + + private void run(ArgumentApplicationParser parser) throws ISLookUpException, DocumentException { + + //read oozie parameters + final String 
graphBasePath = parser.get("graphBasePath"); + final String isLookUpUrl = parser.get("isLookUpUrl"); + final String actionSetId = parser.get("actionSetId"); + final String workingPath = parser.get("workingPath"); + + System.out.println(String.format("graphBasePath: '%s'", graphBasePath)); + System.out.println(String.format("isLookUpUrl: '%s'", isLookUpUrl)); + System.out.println(String.format("actionSetId: '%s'", actionSetId)); + System.out.println(String.format("workingPath: '%s'", workingPath)); + + try (SparkSession spark = getSparkSession(parser)) { + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + //for each dedup configuration + for (DedupConfig dedupConf: DedupUtility.getConfigurations(isLookUpUrl, actionSetId)) { + final String entity = dedupConf.getWf().getEntityType(); + final String subEntity = dedupConf.getWf().getSubEntityValue(); + + JavaPairRDD mapDocument = sc.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) + .mapToPair(s -> { + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(dedupConf, s); + return new Tuple2<>(d.getIdentifier(), d); + }); + + //create blocks for deduplication + JavaPairRDD> blocks = Deduper.createsortedBlocks(sc, mapDocument, dedupConf); + + //create relations by comparing only elements in the same group + final JavaPairRDD dedupRels = Deduper.computeRelations2(sc, blocks, dedupConf); + + JavaRDD relationsRDD = dedupRels.map(r -> createSimRel(r._1(), r._2(), entity)); + + //save the simrel in the workingdir + spark.createDataset(relationsRDD.rdd(), Encoders.bean(Relation.class)) + .write() + .mode("overwrite") + .save(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)); + } + } + } + + /** + * Utility method used to create an atomic action from a Relation object + * @param relation input relation + * @return A tuple2 with [id, json serialization of the atomic action] + * @throws JsonProcessingException + */ + public Tuple2 createSequenceFileRow(Relation relation) throws JsonProcessingException { + + ObjectMapper mapper = new ObjectMapper(); + + String id = relation.getSource() + "@" + relation.getRelClass() + "@" + relation.getTarget(); + AtomicAction aa = new AtomicAction<>(Relation.class, relation); + + return new Tuple2<>( + new Text(id), + new Text(mapper.writeValueAsString(aa)) + ); + } + + public Relation createSimRel(String source, String target, String entity){ + final Relation r = new Relation(); + r.setSource(source); + r.setTarget(target); + + switch(entity){ + case "result": + r.setRelClass("resultResult_dedupSimilarity_isSimilarTo"); + break; + case "organization": + r.setRelClass("organizationOrganization_dedupSimilarity_isSimilarTo"); + break; + default: + r.setRelClass("isSimilarTo"); + break; + } + return r; + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession + .builder() + .appName(SparkCreateSimRels.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) + .getOrCreate(); + } + +} diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkPropagateRelation.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkPropagateRelation.java new file mode 100644 index 000000000..5c7be2817 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkPropagateRelation.java @@ -0,0 +1,169 @@ +package eu.dnetlib.dhp.dedup; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import 
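createSequenceFileRow is defined but not invoked in the run() shown above (the similarity relations are persisted as a Spark Dataset instead). A hedged sketch of how it could also export the same relations as an action-set sequence file, reusing the Text, SequenceFileOutputFormat and GzipCodec imports already present in the class (output path layout is a placeholder, not project convention):

    // inside run(), right after relationsRDD has been computed
    JavaPairRDD<Text, Text> actionSetRows =
            relationsRDD.mapToPair(this::createSequenceFileRow);

    actionSetRows.saveAsHadoopFile(
            workingPath + "/" + actionSetId + "/" + subEntity + "_simrel_actionset",  // placeholder path
            Text.class,
            Text.class,
            SequenceFileOutputFormat.class,
            GzipCodec.class);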
com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.Oaf; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.Optional; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import scala.Tuple2; + +import java.io.IOException; + +public class SparkPropagateRelation { + + enum FieldType { + SOURCE, + TARGET + } + + final static String SOURCEJSONPATH = "$.source"; + final static String TARGETJSONPATH = "$.target"; + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkPropagateRelation.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/propagateRelation_parameters.json"))); + parser.parseArgument(args); + + new SparkPropagateRelation().run(parser); + } + + public void run(ArgumentApplicationParser parser) { + + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String dedupGraphPath = parser.get("dedupGraphPath"); + + try (SparkSession spark = getSparkSession(parser)) { + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + final Dataset mergeRels = spark.read().load(DedupUtility.createMergeRelPath(workingPath, "*", "*")).as(Encoders.bean(Relation.class)); + + final JavaPairRDD mergedIds = mergeRels + .where("relClass == 'merges'") + .select(mergeRels.col("source"), mergeRels.col("target")) + .distinct() + .toJavaRDD() + .mapToPair((PairFunction) r -> new Tuple2<>(r.getString(1), r.getString(0))); + + JavaRDD relations = sc.textFile(DedupUtility.createEntityPath(graphBasePath, "relation")); + + JavaRDD newRels = relations.mapToPair( + (PairFunction) s -> + new Tuple2<>(MapDocumentUtil.getJPathString(SOURCEJSONPATH, s), s)) + .leftOuterJoin(mergedIds) + .map((Function>>, String>) v1 -> { + if (v1._2()._2().isPresent()) { + return replaceField(v1._2()._1(), v1._2()._2().get(), FieldType.SOURCE); + } + return v1._2()._1(); + }) + .mapToPair( + (PairFunction) s -> + new Tuple2<>(MapDocumentUtil.getJPathString(TARGETJSONPATH, s), s)) + .leftOuterJoin(mergedIds) + .map((Function>>, String>) v1 -> { + if (v1._2()._2().isPresent()) { + return replaceField(v1._2()._1(), v1._2()._2().get(), FieldType.TARGET); + } + return v1._2()._1(); + }).filter(SparkPropagateRelation::containsDedup) + .repartition(500); + + //update deleted by inference + relations = relations.mapToPair( + (PairFunction) s -> + new Tuple2<>(MapDocumentUtil.getJPathString(SOURCEJSONPATH, s), s)) + .leftOuterJoin(mergedIds) + .map((Function>>, String>) v1 -> { + if (v1._2()._2().isPresent()) { + return updateDeletedByInference(v1._2()._1(), Relation.class); + } + return v1._2()._1(); + }) + .mapToPair( + (PairFunction) s -> + new Tuple2<>(MapDocumentUtil.getJPathString(TARGETJSONPATH, s), s)) + .leftOuterJoin(mergedIds) + .map((Function>>, String>) v1 -> { + if 
(v1._2()._2().isPresent()) { + return updateDeletedByInference(v1._2()._1(), Relation.class); + } + return v1._2()._1(); + }) + .repartition(500); + + newRels.union(relations).repartition(1000) + .saveAsTextFile(DedupUtility.createEntityPath(dedupGraphPath, "relation"), GzipCodec.class); + } + } + + private static boolean containsDedup(final String json) { + final String source = MapDocumentUtil.getJPathString(SOURCEJSONPATH, json); + final String target = MapDocumentUtil.getJPathString(TARGETJSONPATH, json); + + return source.toLowerCase().contains("dedup") || target.toLowerCase().contains("dedup"); + } + + private static String replaceField(final String json, final String id, final FieldType type) { + ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + try { + Relation relation = mapper.readValue(json, Relation.class); + if (relation.getDataInfo() == null) + relation.setDataInfo(new DataInfo()); + relation.getDataInfo().setDeletedbyinference(false); + switch (type) { + case SOURCE: + relation.setSource(id); + return mapper.writeValueAsString(relation); + case TARGET: + relation.setTarget(id); + return mapper.writeValueAsString(relation); + default: + throw new IllegalArgumentException(""); + } + } catch (IOException e) { + throw new RuntimeException("unable to deserialize json relation: " + json, e); + } + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession + .builder() + .appName(SparkPropagateRelation.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) + .enableHiveSupport() + .getOrCreate(); + } + + private static String updateDeletedByInference(final String json, final Class clazz) { + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + try { + Oaf entity = mapper.readValue(json, clazz); + if (entity.getDataInfo()== null) + entity.setDataInfo(new DataInfo()); + entity.getDataInfo().setDeletedbyinference(true); + return mapper.writeValueAsString(entity); + } catch (IOException e) { + throw new RuntimeException("Unable to convert json", e); + } + } +} diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkReporter.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkReporter.java similarity index 97% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkReporter.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkReporter.java index 165a10b25..c83a66e70 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkReporter.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkReporter.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import eu.dnetlib.pace.util.Reporter; import org.apache.commons.logging.Log; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java new file mode 100644 index 000000000..b8b41d217 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/SparkUpdateEntity.java @@ -0,0 +1,141 @@ +package eu.dnetlib.dhp.dedup; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.*; 
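A concrete illustration of the endpoint rewriting performed by replaceField (identifiers are made up): when mergedIds maps a duplicate id to its representative, the joined relation JSON gets that endpoint replaced and deletedbyinference forced to false, so the rewritten relation stays visible in the dedup graph:

    // hypothetical ids: "50|dup_______::1" was merged into representative "50|dedup_____::abc"
    String before = "{\"source\":\"50|dup_______::1\",\"target\":\"50|other_____::2\",\"relClass\":\"cites\"}";
    String rewritten = replaceField(before, "50|dedup_____::abc", FieldType.SOURCE);
    // rewritten now carries source = "50|dedup_____::abc", the original target,
    // and dataInfo.deletedbyinference = false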
+import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.*; +import org.apache.hadoop.io.compress.GzipCodec; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import scala.Tuple2; + +import java.io.IOException; +import java.io.Serializable; + +public class SparkUpdateEntity implements Serializable { + + final String IDJSONPATH = "$.id"; + + public static void main(String[] args) throws Exception { + final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkUpdateEntity.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/updateEntity_parameters.json"))); + parser.parseArgument(args); + + new SparkUpdateEntity().run(parser); + } + + public boolean mergeRelExists(String basePath, String entity) throws IOException { + + boolean result = false; + + FileSystem fileSystem = FileSystem.get(new Configuration()); + + FileStatus[] fileStatuses = fileSystem.listStatus(new Path(basePath)); + + for (FileStatus fs : fileStatuses) { + if (fs.isDirectory()) + if (fileSystem.exists(new Path(DedupUtility.createMergeRelPath(basePath, fs.getPath().getName(), entity)))) + result = true; + } + + return result; + } + + public void run(ArgumentApplicationParser parser) throws IOException { + + final String graphBasePath = parser.get("graphBasePath"); + final String workingPath = parser.get("workingPath"); + final String dedupGraphPath = parser.get("dedupGraphPath"); + + try (SparkSession spark = getSparkSession(parser)) { + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + //for each entity + for (OafEntityType entity: OafEntityType.values()) { + + JavaRDD sourceEntity = sc.textFile(DedupUtility.createEntityPath(graphBasePath, entity.toString())); + + if (mergeRelExists(workingPath, entity.toString())) { + + final Dataset rel = spark.read().load(DedupUtility.createMergeRelPath(workingPath, "*", entity.toString())).as(Encoders.bean(Relation.class)); + + final JavaPairRDD mergedIds = rel + .where("relClass == 'merges'") + .select(rel.col("target")) + .distinct() + .toJavaRDD() + .mapToPair((PairFunction) r -> new Tuple2<>(r.getString(0), "d")); + + final JavaRDD dedupEntity = sc.textFile(DedupUtility.createDedupRecordPath(workingPath, "*", entity.toString())); + + JavaPairRDD entitiesWithId = sourceEntity.mapToPair((PairFunction) s -> new Tuple2<>(MapDocumentUtil.getJPathString(IDJSONPATH, s), s)); + + JavaRDD map = entitiesWithId.leftOuterJoin(mergedIds).map(k -> k._2()._2().isPresent() ? 
updateDeletedByInference(k._2()._1(), getOafClass(entity)) : k._2()._1()); + sourceEntity = map.union(dedupEntity); + + } + + sourceEntity.saveAsTextFile(dedupGraphPath + "/" + entity, GzipCodec.class); + + } + } + } + + public Class getOafClass(OafEntityType className) { + switch (className.toString()) { + case "publication": + return Publication.class; + case "dataset": + return eu.dnetlib.dhp.schema.oaf.Dataset.class; + case "datasource": + return Datasource.class; + case "software": + return Software.class; + case "organization": + return Organization.class; + case "otherresearchproduct": + return OtherResearchProduct.class; + case "project": + return Project.class; + default: + throw new IllegalArgumentException("Illegal type " + className); + } + } + + private static String updateDeletedByInference(final String json, final Class clazz) { + final ObjectMapper mapper = new ObjectMapper(); + mapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + try { + Oaf entity = mapper.readValue(json, clazz); + if (entity.getDataInfo()== null) + entity.setDataInfo(new DataInfo()); + entity.getDataInfo().setDeletedbyinference(true); + return mapper.writeValueAsString(entity); + } catch (IOException e) { + throw new RuntimeException("Unable to convert json", e); + } + } + + private static SparkSession getSparkSession(ArgumentApplicationParser parser) { + SparkConf conf = new SparkConf(); + + return SparkSession + .builder() + .appName(SparkUpdateEntity.class.getSimpleName()) + .master(parser.get("master")) + .config(conf) + .enableHiveSupport() + .getOrCreate(); + } +} diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/ConnectedComponent.java similarity index 95% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/ConnectedComponent.java index 27a61c02d..dd1a370c5 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/ConnectedComponent.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/ConnectedComponent.java @@ -1,7 +1,7 @@ -package eu.dnetlib.dedup.graph; +package eu.dnetlib.dhp.dedup.graph; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dedup.DedupUtility; +import eu.dnetlib.dhp.dedup.DedupUtility; import eu.dnetlib.pace.util.PaceException; import org.apache.commons.lang.StringUtils; import org.codehaus.jackson.annotate.JsonIgnore; diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/GraphProcessor.scala similarity index 96% rename from dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala rename to dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/GraphProcessor.scala index 38c695152..80b0b9ef4 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dhp/dedup/graph/GraphProcessor.scala @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup.graph +package eu.dnetlib.dhp.dedup.graph import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml 
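Taken together, the path helpers give SparkUpdateEntity and SparkPropagateRelation a predictable working-directory layout to glob over with the "*" wildcard (directory names below are illustrative, derived from the String.format patterns in DedupUtility):

    <workingPath>/
        <actionSetId>/                      e.g. dedup-similarity-result-levenstein/
            publication_simrel              written by SparkCreateSimRels
            publication_mergerel            written by SparkCreateConnectedComponent
            publication_deduprecord         written by SparkCreateDedupRecord

mergeRelExists() walks the first level of this tree and reports whether any action set produced a <entity>_mergerel directory for the entity being processed.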
b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/config-default.xml similarity index 62% rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml rename to dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/config-default.xml index fcab9dd00..2e0ed9aee 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/config-default.xml @@ -15,12 +15,4 @@ oozie.action.sharelib.for.spark spark2 - - hive_metastore_uris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 - - - hive_db_name - openaire - \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml new file mode 100644 index 000000000..d481a6cfb --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/consistency/oozie_app/workflow.xml @@ -0,0 +1,105 @@ + + + + graphBasePath + the raw graph base path + + + workingPath + path of the working directory + + + dedupGraphPath + path of the dedup graph + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + yarn + cluster + Update Entity + eu.dnetlib.dhp.dedup.SparkUpdateEntity + dhp-dedup-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn + --i${graphBasePath} + --w${workingPath} + --o${dedupGraphPath} + + + + + + + + + + + yarn + cluster + Update Relations + eu.dnetlib.dhp.dedup.SparkPropagateRelation + dhp-dedup-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn + --i${graphBasePath} + --o${dedupGraphPath} + --w${workingPath} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createCC_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createCC_parameters.json new file mode 100644 index 000000000..42ef2b78e --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createCC_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true + }, + { + "paramName": "asi", + "paramLongName": "actionSetId", + "paramDescription": "action set identifier (name of the orchestrator)", + 
"paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "graphBasePath", + "paramDescription": "the base path of the raw graph", + "paramRequired": true + }, + { + "paramName": "la", + "paramLongName": "isLookUpUrl", + "paramDescription": "the url for the lookup service", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "path for the working directory", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createDedupRecord_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createDedupRecord_parameters.json new file mode 100644 index 000000000..f7bf5e518 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createDedupRecord_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "graphBasePath", + "paramDescription": "the base path of raw graph", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "the working directory path", + "paramRequired": true + }, + { + "paramName": "la", + "paramLongName": "isLookUpUrl", + "paramDescription": "the url of the lookup service", + "paramRequired": true + }, + { + "paramName": "asi", + "paramLongName": "actionSetId", + "paramDescription": "the id of the actionset (orchestrator)", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json new file mode 100644 index 000000000..8cffa86dc --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/createSimRels_parameters.json @@ -0,0 +1,32 @@ +[ + { + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true + }, + { + "paramName": "la", + "paramLongName": "isLookUpUrl", + "paramDescription": "address for the LookUp", + "paramRequired": true + }, + { + "paramName": "asi", + "paramLongName": "actionSetId", + "paramDescription": "action set identifier (name of the orchestrator)", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "graphBasePath", + "paramDescription": "the base path of the raw graph", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "path of the working directory", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json deleted file mode 100644 index 8ba8515d0..000000000 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/dedup_parameters.json +++ /dev/null @@ -1,33 +0,0 @@ -[ - { - "paramName": "mt", - "paramLongName": "master", - "paramDescription": "should be local or yarn", - "paramRequired": true - }, - { - "paramName": "s", - "paramLongName": "sourcePath", - "paramDescription": "the path of the sequential file to read", - "paramRequired": true - }, - { - "paramName": "e", - "paramLongName": "entity", - "paramDescription": "the type of entity to be deduped", - "paramRequired": true - }, - { - 
"paramName": "c", - "paramLongName": "dedupConf", - "paramDescription": "dedup configuration to be used", - "compressed": true, - "paramRequired": true - }, - { - "paramName": "t", - "paramLongName": "targetPath", - "paramDescription": "target path to save dedup result", - "paramRequired": true - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml deleted file mode 100644 index 5a00a5967..000000000 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/workflow.xml +++ /dev/null @@ -1,126 +0,0 @@ - - - - sourcePath - the source path - - - entity - the entity that should be processed - - - dedupConf - the dedup Configuration - - - targetPath - the target path - - - sparkDriverMemory - memory for driver process - - - sparkExecutorMemory - memory for individual executor - - - sparkExecutorCores - number of cores used by single executor - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - Create Similarity Relations - eu.dnetlib.dedup.SparkCreateSimRels - dhp-dedup-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} --conf - spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf - spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf - spark.sql.warehouse.dir="/user/hive/warehouse" - - -mtyarn-cluster - --sourcePath${sourcePath} - --targetPath${targetPath} - --entity${entity} - --dedupConf${dedupConf} - - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - Create Connected Components - eu.dnetlib.dedup.SparkCreateConnectedComponent - dhp-dedup-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} --conf - spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf - spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf - spark.sql.warehouse.dir="/user/hive/warehouse" - - -mtyarn-cluster - --sourcePath${sourcePath} - --targetPath${targetPath} - --entity${entity} - --dedupConf${dedupConf} - - - - - - - - ${jobTracker} - ${nameNode} - yarn-cluster - cluster - Create Dedup Record - eu.dnetlib.dedup.SparkCreateDedupRecord - dhp-dedup-${projectVersion}.jar - --executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} --conf - spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf - spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" --conf - spark.sql.warehouse.dir="/user/hive/warehouse" - - -mtyarn-cluster - --sourcePath${sourcePath} - --dedupPath${dedupPath} - --entity${entity} - --dedupConf${dedupConf} - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/propagateRelation_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/propagateRelation_parameters.json new file mode 100644 index 000000000..721a783e1 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/propagateRelation_parameters.json @@ -0,0 +1,26 @@ +[ + { + "paramName": "mt", + 
"paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true + }, + { + "paramName": "i", + "paramLongName": "graphBasePath", + "paramDescription": "the base path of raw graph", + "paramRequired": true + }, + { + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "the working directory path", + "paramRequired": true + }, + { + "paramName": "o", + "paramLongName": "dedupGraphPath", + "paramDescription": "the path of the dedup graph", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/config-default.xml new file mode 100644 index 000000000..2e0ed9aee --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/config-default.xml @@ -0,0 +1,18 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + oozie.action.sharelib.for.spark + spark2 + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml new file mode 100644 index 000000000..e2c7f425b --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/scan/oozie_app/workflow.xml @@ -0,0 +1,138 @@ + + + + graphBasePath + the raw graph base path + + + isLookUpUrl + the address of the lookUp service + + + actionSetId + id of the actionSet + + + workingPath + path for the working directory + + + sparkDriverMemory + memory for driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + yarn + cluster + Create Similarity Relations + eu.dnetlib.dhp.dedup.SparkCreateSimRels + dhp-dedup-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + + -mtyarn + --i${graphBasePath} + --la${isLookUpUrl} + --asi${actionSetId} + --w${workingPath} + + + + + + + + + + + yarn + cluster + Create Merge Relations + eu.dnetlib.dhp.dedup.SparkCreateConnectedComponent + dhp-dedup-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn + --i${graphBasePath} + --w${workingPath} + --la${isLookUpUrl} + --asi${actionSetId} + + + + + + + + + + + yarn + cluster + Create Dedup Record + eu.dnetlib.dhp.dedup.SparkCreateDedupRecord + dhp-dedup-${projectVersion}.jar + + --executor-memory ${sparkExecutorMemory} + --executor-cores ${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf 
spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" + --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener" + --conf spark.sql.warehouse.dir="/user/hive/warehouse" + + -mtyarn + --i${graphBasePath} + --w${workingPath} + --la${isLookUpUrl} + --asi${actionSetId} + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/updateEntity_parameters.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/updateEntity_parameters.json new file mode 100644 index 000000000..06b67f732 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/updateEntity_parameters.json @@ -0,0 +1,26 @@ +[ +{ + "paramName": "mt", + "paramLongName": "master", + "paramDescription": "should be local or yarn", + "paramRequired": true +}, +{ + "paramName": "i", + "paramLongName": "graphBasePath", + "paramDescription": "the base path of raw graph", + "paramRequired": true +}, +{ + "paramName": "w", + "paramLongName": "workingPath", + "paramDescription": "the working directory path", + "paramRequired": true +}, +{ + "paramName": "o", + "paramLongName": "dedupGraphPath", + "paramDescription": "the path of the dedup graph", + "paramRequired": true + } +] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/jpath/JsonPathTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/jpath/JsonPathTest.java deleted file mode 100644 index 7a63cfe24..000000000 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/jpath/JsonPathTest.java +++ /dev/null @@ -1,31 +0,0 @@ -package eu.dnetlib.dedup.jpath; - -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.jayway.jsonpath.JsonPath; -import org.apache.commons.io.IOUtils; -import org.junit.Test; -import java.util.List; -import java.util.Map; - -public class JsonPathTest { - - @Test - public void testJPath () throws Exception { - final String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/sample.json")); - List> pid = JsonPath.read(json, "$.pid[*]"); -// System.out.println(json); - - pid.forEach(it -> { - try { - System.out.println(new ObjectMapper().writeValueAsString(it)); - } catch (JsonProcessingException e) { - e.printStackTrace(); - } - }); - - - - - } -} diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/MergeAuthorTest.java similarity index 97% rename from dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java rename to dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/MergeAuthorTest.java index 817f2075c..e8bfd08fd 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/MergeAuthorTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/MergeAuthorTest.java @@ -1,4 +1,4 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; import eu.dnetlib.dhp.schema.oaf.Publication; import org.apache.commons.io.IOUtils; diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java similarity index 69% rename from dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java rename to dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java index 
f0d9547cf..8f1e3b0ae 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/SparkCreateDedupTest.java @@ -1,19 +1,16 @@ -package eu.dnetlib.dedup; +package eu.dnetlib.dhp.dedup; -import com.fasterxml.jackson.databind.ObjectMapper; import com.google.common.hash.HashFunction; import com.google.common.hash.Hashing; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.schema.oaf.Publication; -import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.*; import org.junit.Before; import org.junit.Ignore; import org.junit.Test; -import java.io.File; import java.io.IOException; -import java.util.List; public class SparkCreateDedupTest { @@ -22,19 +19,20 @@ public class SparkCreateDedupTest { @Before public void setUp() throws IOException { - configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json")); - +// configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json")); + configuration = ""; } @Test @Ignore public void createSimRelsTest() throws Exception { - SparkCreateSimRels.main(new String[] { + SparkCreateSimRels.main(new String[]{ "-mt", "local[*]", - "-s", "/Users/miconis/dumps", - "-e", entity, - "-c", ArgumentApplicationParser.compressArgument(configuration), - "-t", "/tmp/dedup", + "-i", "/Users/miconis/dumps", + "-o", "/tmp/dedup/rawset_test", + "-asi", "dedup-similarity-result-levenstein", + "-la", "lookupurl", + "-w", "workingPath" }); } @@ -42,7 +40,7 @@ public class SparkCreateDedupTest { @Ignore public void createCCTest() throws Exception { - SparkCreateConnectedComponent.main(new String[] { + SparkCreateConnectedComponent.main(new String[]{ "-mt", "local[*]", "-s", "/Users/miconis/dumps", "-e", entity, @@ -54,7 +52,7 @@ public class SparkCreateDedupTest { @Test @Ignore public void dedupRecordTest() throws Exception { - SparkCreateDedupRecord.main(new String[] { + SparkCreateDedupRecord.main(new String[]{ "-mt", "local[*]", "-s", "/Users/miconis/dumps", "-e", entity, @@ -64,23 +62,22 @@ public class SparkCreateDedupTest { } @Test + @Ignore public void printConfiguration() throws Exception { System.out.println(ArgumentApplicationParser.compressArgument(configuration)); } @Test + @Ignore public void testHashCode() { final String s1 = "20|grid________::6031f94bef015a37783268ec1e75f17f"; final String s2 = "20|nsf_________::b12be9edf414df8ee66b4c52a2d8da46"; final HashFunction hashFunction = Hashing.murmur3_128(); - System.out.println( s1.hashCode()); + System.out.println(s1.hashCode()); System.out.println(hashFunction.hashString(s1).asLong()); - System.out.println( s2.hashCode()); + System.out.println(s2.hashCode()); System.out.println(hashFunction.hashString(s2).asLong()); - } - - } diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java new file mode 100644 index 000000000..b3e8d5656 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dhp/dedup/jpath/JsonPathTest.java @@ -0,0 +1,295 @@ +package eu.dnetlib.dhp.dedup.jpath; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.jayway.jsonpath.JsonPath; +import eu.dnetlib.pace.config.DedupConfig; +import 
eu.dnetlib.pace.model.MapDocument; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.io.IOUtils; +import org.junit.Test; +import java.util.List; +import java.util.Map; + +public class JsonPathTest { + + String json = "{\t\"dataInfo\":{\t\t\"invisible\":false,\t\t\"inferred\":false,\t\t\"deletedbyinference\":false,\t\t\"trust\":\"0.810000002384185791\",\t\t\"inferenceprovenance\":\"\",\t\t\"provenanceaction\":{\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t}\t},\t\"lastupdatetimestamp\":1584960968152,\t\"id\":\"20|corda__h2020::9faf23721249f26ac2c16eb857ea1fb9\",\t\"originalId\":[\t\t\"corda__h2020::927957582\"\t],\t\"collectedfrom\":[\t\t{\t\t\t\"key\":\"openaire____::corda_h2020\",\t\t\t\"value\":\"CORDA - COmmon Research DAta Warehouse - Horizon 2020\",\t\t\t\"dataInfo\":null\t\t}\t],\t\"pid\":[\t],\t\"dateofcollection\":\"2016-06-05\",\t\"dateoftransformation\":\"2019-11-19\",\t\"extraInfo\":[\t],\t\"oaiprovenance\":null,\t\"legalshortname\":{\t\t\"value\":\"Comentor AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"legalname\":{\t\t\"value\":\"Comentor AB\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"alternativeNames\":[\t],\t\"websiteurl\":{\t\t\"value\":\"http://www.comentor.se\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"logourl\":null,\t\"eclegalbody\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"eclegalperson\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:proven
ance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnonprofit\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecresearchorganization\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"echighereducation\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganizationeurinterests\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecinternationalorganization\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecenterprise\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecsmevalidated\":{\t\t\"value\":\"true\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"ecnu
tscode\":{\t\t\"value\":\"false\",\t\t\"dataInfo\":{\t\t\t\"invisible\":false,\t\t\t\"inferred\":false,\t\t\t\"deletedbyinference\":false,\t\t\t\"trust\":\"0.810000002384185791\",\t\t\t\"inferenceprovenance\":\"\",\t\t\t\"provenanceaction\":{\t\t\t\t\"classid\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"classname\":\"sysimport:crosswalk:entityregistry\",\t\t\t\t\"schemeid\":\"dnet:provenance_actions\",\t\t\t\t\"schemename\":\"dnet:provenance_actions\"\t\t\t}\t\t}\t},\t\"country\":null}"; + DedupConfig conf = DedupConfig.load("{\n" + + " \"wf\" : {\n" + + " \"threshold\" : \"0.99\",\n" + + " \"dedupRun\" : \"001\",\n" + + " \"entityType\" : \"organization\",\n" + + " \"subEntityValue\": \"organization\",\n" + + " \"orderField\" : \"legalname\",\n" + + " \"queueMaxSize\" : \"2000\",\n" + + " \"groupMaxSize\" : \"50\",\n" + + " \"slidingWindowSize\" : \"200\",\n" + + " \"idPath\":\"$.id\",\n" + + " \"rootBuilder\" : [ \"organization\", \"projectOrganization_participation_isParticipant\", \"datasourceOrganization_provision_isProvidedBy\" ],\n" + + " \"includeChildren\" : \"true\",\n" + + " \"maxIterations\": \"20\"\n" + + " },\n" + + " \"pace\" : {\n" + + " \"clustering\" : [\n" + + " { \"name\" : \"sortedngrampairs\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 2, \"ngramLen\" : \"3\"} },\n" + + " { \"name\" : \"suffixprefix\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\" : 1, \"len\" : \"3\" } },\n" + + " { \"name\" : \"urlclustering\", \"fields\" : [ \"websiteurl\" ], \"params\" : { } },\n" + + " { \"name\" : \"keywordsclustering\", \"fields\" : [ \"legalname\" ], \"params\" : { \"max\": 2, \"windowSize\": 4} }\n" + + " ],\n" + + " \"decisionTree\" : {\n" + + " \"start\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"gridid\",\n" + + " \"comparator\": \"exactMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {}\n" + + " }\n" + + " ],\n" + + " \"threshold\": 1,\n" + + " \"aggregation\": \"AVG\",\n" + + " \"positive\": \"MATCH\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer2\",\n" + + " \"ignoreUndefined\": \"false\"\n" + + " },\n" + + " \"layer2\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"websiteurl\",\n" + + " \"comparator\": \"domainExactMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"country\",\n" + + " \"comparator\": \"exactMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"numbersMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " },\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"romansMatch\",\n" + + " \"weight\": 1,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {}\n" + + " }\n" + + " ],\n" + + " \"threshold\": 1,\n" + + " \"aggregation\": \"AND\",\n" + + " \"positive\": \"layer3\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer3\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " },\n" + + " \"layer3\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"cityMatch\",\n" + + " \"weight\": 1.0,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " \"windowSize\": \"4\"\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.1,\n" + + " \"aggregation\": 
\"AVG\",\n" + + " \"positive\": \"layer4\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"NO_MATCH\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " },\n" + + " \"layer4\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"keywordMatch\",\n" + + " \"weight\": 1.0,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " \"windowSize\": \"4\"\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.7,\n" + + " \"aggregation\": \"AVG\",\n" + + " \"positive\": \"layer5\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"layer5\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " },\n" + + " \"layer5\": {\n" + + " \"fields\": [\n" + + " {\n" + + " \"field\": \"legalname\",\n" + + " \"comparator\": \"jaroWinklerNormalizedName\",\n" + + " \"weight\": 0.9,\n" + + " \"countIfUndefined\": \"true\",\n" + + " \"params\": {\n" + + " \"windowSize\": \"4\"\n" + + " }\n" + + " },\n" + + " {\n" + + " \"field\": \"legalshortname\",\n" + + " \"comparator\": \"jaroWinklerNormalizedName\",\n" + + " \"weight\": 0.1,\n" + + " \"countIfUndefined\": \"false\",\n" + + " \"params\": {\n" + + " \"windowSize\": 4\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"threshold\": 0.9,\n" + + " \"aggregation\": \"W_MEAN\",\n" + + " \"positive\": \"MATCH\",\n" + + " \"negative\": \"NO_MATCH\",\n" + + " \"undefined\": \"NO_MATCH\",\n" + + " \"ignoreUndefined\": \"true\"\n" + + " }\n" + + " },\n" + + " \"model\" : [\n" + + " { \"name\" : \"country\", \"type\" : \"String\", \"path\" : \"$.country.classid\"},\n" + + " { \"name\" : \"legalshortname\", \"type\" : \"String\", \"path\" : \"$.legalshortname.value\"},\n" + + " { \"name\" : \"legalname\", \"type\" : \"String\", \"path\" : \"$.legalname.value\" },\n" + + " { \"name\" : \"websiteurl\", \"type\" : \"URL\", \"path\" : \"$.websiteurl.value\" },\n" + + " { \"name\" : \"gridid\", \"type\" : \"String\", \"path\" : \"$.pid[?(@.qualifier.classid =='grid')].value\"},\n" + + " { \"name\" : \"originalId\", \"type\" : \"String\", \"path\" : \"$.id\" }\n" + + " ],\n" + + " \"blacklists\" : {\n" + + " \"legalname\" : []\n" + + " },\n" + + " \"synonyms\": {\n" + + " \"key::1\": [\"university\",\"università\", \"universitas\", \"università studi\",\"universitario\",\"universitaria\",\"université\", \"universite\", \"universitaire\",\"universitaires\",\"universidad\",\"universitade\",\"Universität\",\"universitaet\",\"Uniwersytet\",\"университет\",\"universiteit\",\"πανεπιστήμιο\",\"universitesi\",\"universiteti\", \"universiti\"],\n" + + " \"key::2\": [\"studies\",\"studi\",\"études\",\"estudios\",\"estudos\",\"Studien\",\"studia\",\"исследования\",\"studies\",\"σπουδές\"],\n" + + " \"key::3\": [\"advanced\",\"superiore\",\"supérieur\",\"supérieure\",\"supérieurs\",\"supérieures\",\"avancado\",\"avancados\",\"fortgeschrittene\",\"fortgeschritten\",\"zaawansowany\",\"передовой\",\"gevorderd\",\"gevorderde\",\"προχωρημένος\",\"προχωρημένη\",\"προχωρημένο\",\"προχωρημένες\",\"προχωρημένα\",\"wyzsza\"],\n" + + " \"key::4\": [\"institute\",\"istituto\",\"institut\",\"instituto\",\"instituto\",\"Institut\",\"instytut\",\"институт\",\"instituut\",\"ινστιτούτο\"],\n" + + " \"key::5\": [\"hospital\",\"ospedale\",\"hôpital\",\"hospital\",\"hospital\",\"Krankenhaus\",\"szpital\",\"больница\",\"ziekenhuis\",\"νοσοκομείο\"],\n" + + " \"key::6\": 
[\"research\",\"ricerca\",\"recherche\",\"investigacion\",\"pesquisa\",\"Forschung\",\"badania\",\"исследования\",\"onderzoek\",\"έρευνα\",\"erevna\",\"erevnas\"],\n" + + " \"key::7\": [\"college\",\"collegio\",\"colegio\",\"faculdade\",\"Hochschule\",\"Szkoła Wyższa\",\"Высшая школа\",\"κολλέγιο\"],\n" + + " \"key::8\": [\"foundation\",\"fondazione\",\"fondation\",\"fundación\",\"fundação\",\"Stiftung\",\"Fundacja\",\"фонд\",\"stichting\",\"ίδρυμα\",\"idryma\"],\n" + + " \"key::9\": [\"center\",\"centro\",\"centre\",\"centro\",\"centro\",\"zentrum\",\"centrum\",\"центр\",\"centrum\",\"κέντρο\"],\n" + + " \"key::10\": [\"national\",\"nazionale\",\"national\",\"nationale\",\"nationaux\",\"nationales\",\"nacional\",\"nacional\",\"national\",\"krajowy\",\"национальный\",\"nationaal\",\"nationale\",\"εθνικό\"],\n" + + " \"key::11\": [\"association\",\"associazione\",\"association\",\"asociación\",\"associação\",\"Verein\",\"verband\",\"stowarzyszenie\",\"ассоциация\",\"associatie\"],\n" + + " \"key::12\": [\"society\",\"societa\",\"société\",\"sociedad\",\"sociedade\",\"gesellschaft\",\"społeczeństwo\",\"общество\",\"maatschappij\",\"κοινωνία\"],\n" + + " \"key::13\": [\"international\",\"internazionale\",\"international\",\"internacional\",\"internacional\",\"international\",\"międzynarodowy\",\"Международный\",\"internationaal\",\"internationale\",\"διεθνής\",\"διεθνή\",\"διεθνές\"],\n" + + " \"key::14\": [\"community\",\"comunita\",\"communauté\",\"comunidad\",\"comunidade\",\"Gemeinschaft\",\"społeczność\",\"сообщество\",\"gemeenschap\",\"κοινότητα\"],\n" + + " \"key::15\": [\"school\",\"scuola\",\"école\",\"escuela\",\"escola\",\"schule\",\"Szkoła\",\"школа\",\"school\",\"σχολείο\"],\n" + + " \"key::16\": [\"education\",\"educazione\",\"éducation\",\"educacion\",\"Educação\",\"Bildung\",\"Edukacja\",\"образование\",\"opleiding\",\"εκπαίδευση\"],\n" + + " \"key::17\": [\"academy\",\"accademia\",\"académie\",\"academia\",\"academia\",\"Akademie\",\"akademie\",\"академия\",\"academie\",\"ακαδημία\"],\n" + + " \"key::18\": [\"public\",\"pubblico\",\"public\",\"publique\",\"publics\",\"publiques\",\"publico\",\"publico\",\"Öffentlichkeit\",\"publiczny\",\"публичный\",\"publiek\",\"publieke\",\"δημόσιος\",\"δημόσια\",\"δημόσιο\"],\n" + + " \"key::19\": [\"museum\",\"museo\",\"musée\",\"mueso\",\"museu\",\"museum\",\"muzeum\",\"музей\",\"museum\",\"μουσείο\"],\n" + + " \"key::20\": [\"group\",\"gruppo\",\"groupe\",\"grupo\",\"grupo\",\"gruppe\",\"grupa\",\"группа\",\"groep\",\"ομάδα\",\"όμιλος\"],\n" + + " \"key::21\": [\"department\",\"dipartimento\",\"département\",\"departamento\",\"departamento\",\"abteilung\",\"departament\",\"отдел\",\"afdeling\",\"τμήμα\"],\n" + + " \"key::22\": [\"council\",\"consiglio\",\"conseil\",\"Consejo\",\"conselho\",\"gesellschaft\",\"rada\",\"совет\",\"raad\",\"συμβούλιο\"],\n" + + " \"key::23\": [\"library\",\"biblioteca\",\"bibliothèque\",\"biblioteca\",\"biblioteca\",\"Bibliothek\",\"biblioteka\",\"библиотека\",\"bibliotheek\",\"βιβλιοθήκη\"],\n" + + " \"key::24\": [\"ministry\",\"ministero\",\"ministère\",\"ministerio\",\"ministério\",\"Ministerium\",\"ministerstwo\",\"министерство\",\"ministerie\",\"υπουργείο\"],\n" + + " \"key::25\": [\"services\",\"servizi\",\"services\",\"servicios\",\"Serviços\",\"Dienstleistungen\",\"usługi\",\"услуги\",\"diensten\",\"υπηρεσίες\"],\n" + + " \"key::26\": 
[\"central\",\"centrale\",\"central\",\"centrale\",\"centrales\",\"central\",\"central\",\"zentral\",\"centralny\",\"цетральный\",\"centraal\",\"κεντρικός\",\"κεντρική\",\"κεντρικό\",\"κεντρικά\"],\n" + + " \"key::27\": [\"general\",\"generale\",\"général\",\"générale\",\"généraux\",\"générales\",\"general\",\"geral\",\"general\",\"Allgemeines\",\"general\",\"общий\",\"algemeen\",\"algemene\",\"γενικός\",\"γενική\",\"γενικό\",\"γενικά\"],\n" + + " \"key::28\": [\"applied\",\"applicati\",\"appliqué\",\"appliquée\",\"appliqués\",\"appliquées\",\"aplicado\",\"aplicada\",\"angewendet\",\"stosowany\",\"прикладной\",\"toegepast\",\"toegepaste\",\"εφαρμοσμένος\",\"εφαρμοσμένη\",\"εφαρμοσμένο\",\"εφαρμοσμένα\"],\n" + + " \"key::29\": [\"european\",\"europee\",\"europea\",\"européen\",\"européenne\",\"européens\",\"européennes\",\"europeo\",\"europeu\",\"europäisch\",\"europejski\",\"европейский\",\"Europees\",\"Europese\",\"ευρωπαϊκός\",\"ευρωπαϊκή\",\"ευρωπαϊκό\",\"ευρωπαϊκά\"],\n" + + " \"key::30\": [\"agency\",\"agenzia\",\"agence\",\"agencia\",\"agencia\",\"agentur\",\"agencja\",\"агенция\",\"agentschap\",\"πρακτορείο\"],\n" + + " \"key::31\": [\"laboratory\",\"laboratorio\",\"laboratoire\",\"laboratorio\",\"laboratorio\",\"labor\",\"laboratorium\",\"лаборатория\",\"laboratorium\",\"εργαστήριο\"],\n" + + " \"key::32\": [\"industry\",\"industria\",\"industrie\",\"индустрия\",\"industrie\",\"βιομηχανία\"],\n" + + " \"key::33\": [\"industrial\",\"industriale\",\"industriel\",\"industrielle\",\"industriels\",\"industrielles\",\"индустриальный\",\"industrieel\",\"βιομηχανικός\",\"βιομηχανική\",\"βιομηχανικό\",\"βιομηχανικά\",\"βιομηχανικές\"],\n" + + " \"key::34\": [\"consortium\",\"consorzio\",\"consortium\",\"консорциум\",\"consortium\",\"κοινοπραξία\"],\n" + + " \"key::35\": [\"organization\",\"organizzazione\",\"organisation\",\"organización\",\"organização\",\"organizacja\",\"организация\",\"organisatie\",\"οργανισμός\"],\n" + + " \"key::36\": [\"authority\",\"autorità\",\"autorité\",\"авторитет\",\"autoriteit\"],\n" + + " \"key::37\": [\"federation\",\"federazione\",\"fédération\",\"федерация\",\"federatie\",\"ομοσπονδία\"],\n" + + " \"key::38\": [\"observatory\",\"osservatorio\",\"observatoire\",\"обсерватория\",\"observatorium\",\"αστεροσκοπείο\"],\n" + + " \"key::39\": [\"bureau\",\"ufficio\",\"bureau\",\"офис\",\"bureau\",\"γραφείο\"],\n" + + " \"key::40\": [\"company\",\"impresa\",\"compagnie\",\"société\",\"компания\",\"bedrijf\",\"εταιρία\"],\n" + + " \"key::41\": [\"polytechnic\",\"politecnico\",\"polytechnique\",\"политехника\",\"polytechnisch\",\"πολυτεχνείο\",\"universita politecnica\",\"polytechnic university\",\"universidad politecnica\",\"universitat politecnica\",\"politechnika\",\"politechniki\",\"university technology\",\"university science technology\"],\n" + + " \"key::42\": [\"coalition\",\"coalizione\",\"coalition\",\"коалиция\",\"coalitie\",\"συνασπισμός\"],\n" + + " \"key::43\": [\"initiative\",\"iniziativa\",\"initiative\",\"инициатива\",\"initiatief\",\"πρωτοβουλία\"],\n" + + " \"key::44\": [\"academic\",\"accademico\",\"académique\",\"universitaire\",\"акадеческий academisch\",\"ακαδημαϊκός\",\"ακαδημαϊκή\",\"ακαδημαϊκό\",\"ακαδημαϊκές\",\"ακαδημαϊκοί\"],\n" + + " \"key::45\": [\"institution\",\"istituzione\",\"institution\",\"институциональный\",\"instelling\",\"ινστιτούτο\"],\n" + + " \"key::46\": [\"division\",\"divisione\",\"division\",\"отделение\",\"divisie\",\"τμήμα\"],\n" + + " \"key::47\": 
[\"committee\",\"comitato\",\"comité\",\"комитет\",\"commissie\",\"επιτροπή\"],\n" + + " \"key::48\": [\"promotion\",\"promozione\",\"продвижение\",\"proothisis\",\"forderung\"],\n" + + " \"key::49\": [\"medical\",\"medicine\",\"clinical\",\"medicina\",\"clinici\",\"médico\",\"medicina\",\"clínica\",\"médico\",\"medicina\",\"clínica\",\"medizinisch\",\"Medizin\",\"klinisch\",\"medisch\",\"geneeskunde\",\"klinisch\",\"ιατρικός\",\"ιατρική\",\"ιατρικό\",\"ιατρικά\",\"κλινικός\",\"κλινική\",\"κλινικό\",\"κλινικά\",\"tıbbi\",\"tıp\",\"klinik\",\"orvosi\",\"orvostudomány\",\"klinikai\",\"zdravniški\",\"medicinski\",\"klinični\",\"meditsiini\",\"kliinik\",\"kliiniline\"],\n" + + " \"key::50\": [\"technology\",\"technological\",\"tecnologia\",\"tecnologie\",\"tecnología\",\"tecnológico\",\"tecnologia\",\"tecnológico\",\"Technologie\",\"technologisch\",\"technologie\",\"technologisch\",\"τεχνολογία\",\"τεχνολογικός\",\"τεχνολογική\",\"τεχνολογικό\",\"teknoloji\",\"teknolojik\",\"technológia\",\"technológiai\",\"tehnologija\",\"tehnološki\",\"tehnoloogia\",\"tehnoloogiline\",\"technologii\",\"technical\",\"texniki\",\"teknik\"],\n" + + " \"key::51\": [\"science\",\"scientific\",\"scienza\",\"scientifiche\",\"scienze\",\"ciencia\",\"científico\",\"ciência\",\"científico\",\"Wissenschaft\",\"wissenschaftlich\",\"wetenschap\",\"wetenschappelijk\",\"επιστήμη\",\"επιστημονικός\",\"επιστημονική\",\"επιστημονικό\",\"επιστημονικά\",\"bilim\",\"bilimsel\",\"tudomány\",\"tudományos\",\"znanost\",\"znanstveni\",\"teadus\",\"teaduslik\",\"\"],\n" + + " \"key::52\": [\"engineering\",\"ingegneria\",\"ingeniería\",\"engenharia\",\"Ingenieurwissenschaft\",\"ingenieurswetenschappen\",\"bouwkunde\",\"μηχανικός\",\"μηχανική\",\"μηχανικό\",\"mühendislik\",\"mérnöki\",\"Inženirstvo\",\"inseneeria\",\"inseneri\",\"\"],\n" + + " \"key::53\": [\"management\",\"gestione\",\"gestionale\",\"gestionali\",\"gestión\",\"administración\",\"gestão\",\"administração\",\"Verwaltung\",\"management\",\"διαχείριση\",\"yönetim\",\"menedzsment\",\"vodstvo\",\"upravljanje\",\"management\",\"juhtkond\",\"juhtimine\",\"haldus\",\"\"],\n" + + " \"key::54\": [\"energy\",\"energia\",\"energía\",\"energia\",\"Energie\",\"energie\",\"ενέργεια\",\"enerji\",\"energia\",\"energija\",\"energia\",\"\"],\n" + + " \"key::55\": [\"agricultural\",\"agriculture\",\"agricoltura\",\"agricole\",\"agrícola\",\"agricultura\",\"agrícola\",\"agricultura\",\"landwirtschaftlich\",\"Landwirtschaft\",\"landbouwkundig\",\"landbouw\",\"αγροτικός\",\"αγροτική\",\"αγροτικό\",\"γεωργικός\",\"γεωργική\",\"γεωργικό\",\"γεωργία\",\"tarımsal\",\"tarım\",\"mezőgazdasági\",\"mezőgazdaság\",\"poljedelski\",\"poljedelstvo\",\"põllumajandus\",\"põllumajanduslik\",\"\"],\n" + + " \"key::56\": [\"information\",\"informazione\",\"información\",\"informação\",\"Information\",\"informatie\",\"πληροφορία\",\"bilgi\",\"információ\",\"informacija\",\"informatsioon\",\"informatycznych\",\"\"],\n" + + " \"key::57\": [\"social\",\"sociali\",\"social\",\"social\",\"Sozial\",\"sociaal\",\"maatschappelijk\",\"κοινωνικός\",\"κοινωνική\",\"κοινωνικό\",\"κοινωνικά\",\"sosyal\",\"szociális\",\"družbeni\",\"sotsiaal\",\"sotsiaalne\",\"\"],\n" + + " \"key::58\": [\"environmental\",\"ambiente\",\"medioambiental\",\"ambiente\",\"medioambiente\",\"meioambiente\",\"Umwelt\",\"milieu\",\"milieuwetenschap\",\"milieukunde\",\"περιβαλλοντικός\",\"περιβαλλοντική\",\"περιβαλλοντικό\",\"περιβαλλοντικά\",\"çevre\",\"környezeti\",\"okoliški\",\"keskonna\",\"\"],\n" + + " \"key::59\": 
[\"business\",\"economia\",\"economiche\",\"economica\",\"negocio\",\"empresa\",\"negócio\",\"Unternehmen\",\"bedrijf\",\"bedrijfskunde\",\"επιχείρηση\",\"iş\",\"üzleti\",\"posel\",\"ettevõte/äri\",\"\"],\n" + + " \"key::60\": [\"pharmaceuticals\",\"pharmacy\",\"farmacia\",\"farmaceutica\",\"farmacéutica\",\"farmacia\",\"farmacêutica\",\"farmácia\",\"Pharmazeutika\",\"Arzneimittelkunde\",\"farmaceutica\",\"geneesmiddelen\",\"apotheek\",\"φαρμακευτικός\",\"φαρμακευτική\",\"φαρμακευτικό\",\"φαρμακευτικά\",\"φαρμακείο\",\"ilaç\",\"eczane\",\"gyógyszerészeti\",\"gyógyszertár\",\"farmacevtika\",\"lekarništvo\",\"farmaatsia\",\"farmatseutiline\",\"\"],\n" + + " \"key::61\": [\"healthcare\",\"health services\",\"salute\",\"atenciónmédica\",\"cuidadodelasalud\",\"cuidadoscomasaúde\",\"Gesundheitswesen\",\"gezondheidszorg\",\"ιατροφαρμακευτικήπερίθαλψη\",\"sağlıkhizmeti\",\"egészségügy\",\"zdravstvo\",\"tervishoid\",\"tervishoiu\",\"\"],\n" + + " \"key::62\": [\"history\",\"storia\",\"historia\",\"história\",\"Geschichte\",\"geschiedenis\",\"geschiedkunde\",\"ιστορία\",\"tarih\",\"történelem\",\"zgodovina\",\"ajalugu\",\"\"],\n" + + " \"key::63\": [\"materials\",\"materiali\",\"materia\",\"materiales\",\"materiais\",\"materialen\",\"υλικά\",\"τεκμήρια\",\"malzemeler\",\"anyagok\",\"materiali\",\"materjalid\",\"vahendid\",\"\"],\n" + + " \"key::64\": [\"economics\",\"economia\",\"economiche\",\"economica\",\"economía\",\"economia\",\"Wirtschaft\",\"economie\",\"οικονομικά\",\"οικονομικέςεπιστήμες\",\"ekonomi\",\"közgazdaságtan\",\"gospodarstvo\",\"ekonomija\",\"majanduslik\",\"majandus\",\"\"],\n" + + " \"key::65\": [\"therapeutics\",\"terapeutica\",\"terapéutica\",\"terapêutica\",\"therapie\",\"θεραπευτική\",\"tedavibilimi\",\"gyógykezelés\",\"terapevtika\",\"terapeutiline\",\"ravi\",\"\"],\n" + + " \"key::66\": [\"oncology\",\"oncologia\",\"oncologico\",\"oncología\",\"oncologia\",\"Onkologie\",\"oncologie\",\"ογκολογία\",\"onkoloji\",\"onkológia\",\"onkologija\",\"onkoloogia\",\"\"],\n" + + " \"key::67\": [\"natural\",\"naturali\",\"naturale\",\"natural\",\"natural\",\"natürlich\",\"natuurlijk\",\"φυσικός\",\"φυσική\",\"φυσικό\",\"φυσικά\",\"doğal\",\"természetes\",\"naraven\",\"loodus\",\"\"],\n" + + " \"key::68\": [\"educational\",\"educazione\",\"pedagogia\",\"educacional\",\"educativo\",\"educacional\",\"pädagogisch\",\"educatief\",\"εκπαιδευτικός\",\"εκπαιδευτική\",\"εκπαιδευτικό\",\"εκπαιδευτικά\",\"eğitimsel\",\"oktatási\",\"izobraževalen\",\"haridus\",\"hariduslik\",\"\"],\n" + + " \"key::69\": [\"biomedical\",\"biomedica\",\"biomédico\",\"biomédico\",\"biomedizinisch\",\"biomedisch\",\"βιοιατρικός\",\"βιοιατρική\",\"βιοιατρικό\",\"βιοιατρικά\",\"biyomedikal\",\"orvosbiológiai\",\"biomedicinski\",\"biomeditsiiniline\",\"\"],\n" + + " \"key::70\": [\"veterinary\",\"veterinaria\",\"veterinarie\",\"veterinaria\",\"veterinária\",\"tierärtzlich\",\"veterinair\",\"veeartsenijlkunde\",\"κτηνιατρικός\",\"κτηνιατρική\",\"κτηνιατρικό\",\"κτηνιατρικά\",\"veteriner\",\"állatorvosi\",\"veterinar\",\"veterinarski\",\"veterinaaria\",\"\"],\n" + + " \"key::71\": [\"chemistry\",\"chimica\",\"química\",\"química\",\"Chemie\",\"chemie\",\"scheikunde\",\"χημεία\",\"kimya\",\"kémia\",\"kemija\",\"keemia\",\"\"],\n" + + " \"key::72\": [\"security\",\"sicurezza\",\"seguridad\",\"segurança\",\"Sicherheit\",\"veiligheid\",\"ασφάλεια\",\"güvenlik\",\"biztonsági\",\"varnost\",\"turvalisus\",\"julgeolek\",\"\"],\n" + + " \"key::73\": 
[\"biotechnology\",\"biotecnologia\",\"biotecnologie\",\"biotecnología\",\"biotecnologia\",\"Biotechnologie\",\"biotechnologie\",\"βιοτεχνολογία\",\"biyoteknoloji\",\"biotechnológia\",\"biotehnologija\",\"biotehnoloogia\",\"\"],\n" + + " \"key::74\": [\"military\",\"militare\",\"militari\",\"militar\",\"militar\",\"Militär\",\"militair\",\"leger\",\"στρατιωτικός\",\"στρατιωτική\",\"στρατιωτικό\",\"στρατιωτικά\",\"askeri\",\"katonai\",\"vojaški\",\"vojni\",\"militaar\",\"wojskowa\",\"\"],\n" + + " \"key::75\": [\"theological\",\"teologia\",\"teologico\",\"teológico\",\"tecnológica\",\"theologisch\",\"theologisch\",\"θεολογικός\",\"θεολογική\",\"θεολογικό\",\"θεολογικά\",\"teolojik\",\"technológiai\",\"teološki\",\"teoloogia\",\"usuteadus\",\"teoloogiline\",\"\"],\n" + + " \"key::76\": [\"electronics\",\"elettronica\",\"electrónica\",\"eletrônicos\",\"Elektronik\",\"elektronica\",\"ηλεκτρονική\",\"elektronik\",\"elektronika\",\"elektronika\",\"elektroonika\",\"\"],\n" + + " \"key::77\": [\"forestry\",\"forestale\",\"forestali\",\"silvicultura\",\"forestal\",\"floresta\",\"Forstwirtschaft\",\"bosbouw\",\"δασοκομία\",\"δασολογία\",\"ormancılık\",\"erdészet\",\"gozdarstvo\",\"metsandus\",\"\"],\n" + + " \"key::78\": [\"maritime\",\"marittima\",\"marittime\",\"marittimo\",\"marítimo\",\"marítimo\",\"maritiem\",\"ναυτικός\",\"ναυτική\",\"ναυτικό\",\"ναυτικά\",\"ναυτιλιακός\",\"ναυτιλιακή\",\"ναυτιλιακό\",\"ναυτιλιακά\",\"θαλάσσιος\",\"θαλάσσια\",\"θαλάσσιο\",\"denizcilik\",\"tengeri\",\"morski\",\"mere\",\"merendus\",\"\"],\n" + + " \"key::79\": [\"sports\",\"sport\",\"deportes\",\"esportes\",\"Sport\",\"sport\",\"sportwetenschappen\",\"άθληση\",\"γυμναστικήδραστηριότητα\",\"spor\",\"sport\",\"šport\",\"sport\",\"spordi\",\"\"],\n" + + " \"key::80\": [\"surgery\",\"chirurgia\",\"chirurgiche\",\"cirugía\",\"cirurgia\",\"Chirurgie\",\"chirurgie\",\"heelkunde\",\"εγχείρηση\",\"επέμβαση\",\"χειρουργικήεπέμβαση\",\"cerrahi\",\"sebészet\",\"kirurgija\",\"kirurgia\",\"\"],\n" + + " \"key::81\": [\"cultural\",\"culturale\",\"culturali\",\"cultura\",\"cultural\",\"cultural\",\"kulturell\",\"cultureel\",\"πολιτιστικός\",\"πολιτιστική\",\"πολιτιστικό\",\"πολιτισμικός\",\"πολιτισμική\",\"πολιτισμικό\",\"kültürel\",\"kultúrális\",\"kulturni\",\"kultuuri\",\"kultuuriline\",\"\"],\n" + + " \"key::82\": [\"computerscience\",\"informatica\",\"ordenador\",\"computadora\",\"informática\",\"computación\",\"cienciasdelacomputación\",\"ciênciadacomputação\",\"Computer\",\"computer\",\"υπολογιστής\",\"ηλεκτρονικόςυπολογιστής\",\"bilgisayar\",\"számítógép\",\"računalnik\",\"arvuti\",\"\"],\n" + + " \"key::83\": [\"finance\",\"financial\",\"finanza\",\"finanziarie\",\"finanza\",\"financiero\",\"finanças\",\"financeiro\",\"Finanzen\",\"finanziell\",\"financiën\",\"financieel\",\"χρηματοοικονομικά\",\"χρηματοδότηση\",\"finanse\",\"finansal\",\"pénzügy\",\"pénzügyi\",\"finance\",\"finančni\",\"finants\",\"finantsiline\",\"\"],\n" + + " \"key::84\": [\"communication\",\"comunicazione\",\"comuniciación\",\"comunicação\",\"Kommunikation\",\"communication\",\"επικοινωνία\",\"iletişim\",\"kommunikáció\",\"komuniciranje\",\"kommunikatsioon\",\"\"],\n" + + " \"key::85\": [\"justice\",\"giustizia\",\"justicia\",\"justiça\",\"Recht\",\"Justiz\",\"justitie\",\"gerechtigheid\",\"δικαιοσύνη\",\"υπουργείοδικαιοσύνης\",\"δίκαιο\",\"adalet\",\"igazságügy\",\"pravo\",\"õigus\",\"\"],\n" + + " \"key::86\": 
[\"aerospace\",\"aerospaziale\",\"aerospaziali\",\"aeroespacio\",\"aeroespaço\",\"Luftfahrt\",\"luchtvaart\",\"ruimtevaart\",\"αεροπορικός\",\"αεροπορική\",\"αεροπορικό\",\"αεροναυπηγικός\",\"αεροναυπηγική\",\"αεροναυπηγικό\",\"αεροναυπηγικά\",\"havacılıkveuzay\",\"légtér\",\"zrakoplovstvo\",\"atmosfäär\",\"kosmos\",\"\"],\n" + + " \"key::87\": [\"dermatology\",\"dermatologia\",\"dermatología\",\"dermatologia\",\"Dermatologie\",\"dermatologie\",\"δρματολογία\",\"dermatoloji\",\"bőrgyógyászat\",\"dermatológia\",\"dermatologija\",\"dermatoloogia\",\"\"],\n" + + " \"key::88\": [\"architecture\",\"architettura\",\"arquitectura\",\"arquitetura\",\"Architektur\",\"architectuur\",\"αρχιτεκτονική\",\"mimarlık\",\"építészet\",\"arhitektura\",\"arhitektuur\",\"\"],\n" + + " \"key::89\": [\"mathematics\",\"matematica\",\"matematiche\",\"matemáticas\",\"matemáticas\",\"Mathematik\",\"wiskunde\",\"mathematica\",\"μαθηματικά\",\"matematik\",\"matematika\",\"matematika\",\"matemaatika\",\"\"],\n" + + " \"key::90\": [\"language\",\"lingue\",\"linguistica\",\"linguistiche\",\"lenguaje\",\"idioma\",\"língua\",\"idioma\",\"Sprache\",\"taal\",\"taalkunde\",\"γλώσσα\",\"dil\",\"nyelv\",\"jezik\",\"keel\",\"\"],\n" + + " \"key::91\": [\"neuroscience\",\"neuroscienza\",\"neurociencia\",\"neurociência\",\"Neurowissenschaft\",\"neurowetenschappen\",\"νευροεπιστήμη\",\"nörobilim\",\"idegtudomány\",\"nevroznanost\",\"neuroteadused\",\"\"],\n" + + " \"key::92\": [\"automation\",\"automazione\",\"automatización\",\"automação\",\"Automatisierung\",\"automatisering\",\"αυτοματοποίηση\",\"otomasyon\",\"automatizálás\",\"avtomatizacija\",\"automatiseeritud\",\"\"],\n" + + " \"key::93\": [\"pediatric\",\"pediatria\",\"pediatriche\",\"pediatrico\",\"pediátrico\",\"pediatría\",\"pediátrico\",\"pediatria\",\"pädiatrisch\",\"pediatrische\",\"παιδιατρική\",\"pediatrik\",\"gyermekgyógyászat\",\"pediatrija\",\"pediaatria\",\"\"],\n" + + " \"key::94\": [\"photonics\",\"fotonica\",\"fotoniche\",\"fotónica\",\"fotônica\",\"Photonik\",\"fotonica\",\"φωτονική\",\"fotonik\",\"fotonika\",\"fotonika\",\"fotoonika\",\"\"],\n" + + " \"key::95\": [\"mechanics\", \"mechanical\", \"meccanica\",\"meccaniche\",\"mecánica\",\"mecânica\",\"Mechanik\",\"Maschinenbau\",\"mechanica\",\"werktuigkunde\",\"μηχανικής\",\"mekanik\",\"gépészet\",\"mehanika\",\"mehaanika\",\"\"],\n" + + " \"key::96\": [\"psychiatrics\",\"psichiatria\",\"psichiatrica\",\"psichiatriche\",\"psiquiatría\",\"psiquiatria\",\"Psychiatrie\",\"psychiatrie\",\"ψυχιατρική\",\"psikiyatrik\",\"pszihiátria\",\"psihiatrija\",\"psühhaatria\",\"\"],\n" + + " \"key::97\": [\"psychology\",\"fisiologia\",\"psicología\",\"psicologia\",\"Psychologie\",\"psychologie\",\"ψυχολογία\",\"psikoloji\",\"pszihológia\",\"psihologija\",\"psühholoogia\",\"\"],\n" + + " \"key::98\": [\"automotive\",\"industriaautomobilistica\",\"industriadelautomóvil\",\"automotriz\",\"industriaautomotriz\",\"automotivo\",\"Automobilindustrie\",\"autoindustrie\",\"αυτοκίνητος\",\"αυτοκίνητη\",\"αυτοκίνητο\",\"αυτοκινούμενος\",\"αυτοκινούμενη\",\"αυτοκινούμενο\",\"αυτοκινητιστικός\",\"αυτοκινητιστική\",\"αυτοκινητιστικό\",\"otomotiv\",\"autóipari\",\"samogiben\",\"avtomobilskaindustrija\",\"auto-\",\"\"],\n" + + " \"key::99\": [\"neurology\",\"neurologia\",\"neurologiche\",\"neurología\",\"neurologia\",\"Neurologie\",\"neurologie\",\"zenuwleer\",\"νευρολογία\",\"nöroloji\",\"neurológia\",\"ideggyógyászat\",\"nevrologija\",\"neuroloogia\",\"\"],\n" + + " \"key::100\": 
[\"geology\",\"geologia\",\"geologiche\",\"geología\",\"geologia\",\"Geologie\",\"geologie\",\"aardkunde\",\"γεωλογία\",\"jeoloji\",\"geológia\",\"földtudomány\",\"geologija\",\"geoloogia\",\"\"],\n" + + " \"key::101\": [\"microbiology\",\"microbiologia\",\"micro-biologia\",\"microbiologiche\",\"microbiología\",\"microbiologia\",\"Mikrobiologie\",\"microbiologie\",\"μικροβιολογία\",\"mikrobiyoloji\",\"mikrobiológia\",\"mikrobiologija\",\"mikrobioloogia\",\"\"],\n" + + " \"key::102\": [\"informatics\",\"informatica\",\"informática\",\"informática\",\"informatica\",\"\"],\n" + + " \"key::103\": [\"forschungsgemeinschaft\",\"comunita ricerca\",\"research community\",\"research foundation\",\"research association\"],\n" + + " \"key::104\": [\"commerce\",\"ticaret\",\"ticarət\",\"commercio\",\"trade\",\"handel\",\"comercio\"],\n" + + " \"key::105\" : [\"state\", \"stato\", \"etade\", \"estado\", \"statale\", \"etat\", \"zustand\", \"estado\"],\n" + + " \"key::106\" : [\"seminary\", \"seminario\", \"seminaire\", \"seminar\"],\n" + + " \"key::107\" : [\"agricultural forestry\", \"af\", \"a f\"],\n" + + " \"key::108\" : [\"agricultural mechanical\", \"am\", \"a m\"],\n" + + " \"key::109\" : [\"catholic\", \"catholique\", \"katholische\", \"catolica\", \"cattolica\", \"catolico\"]\n" + + " }\n" + + " }\n" + + "}"); + + @Test + public void testJPath () throws Exception { + + MapDocument d = MapDocumentUtil.asMapDocumentWithJPath(conf, json); + + System.out.println("d = " + d); + + } +} diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json deleted file mode 100644 index 3e861fb71..000000000 --- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json +++ /dev/null @@ -1,280 +0,0 @@ -{ - "wf" : { - "threshold" : "0.99", - "dedupRun" : "001", - "entityType" : "result", - "subEntityType" : "resulttype", - "subEntityValue" : "publication", - "orderField" : "title", - "queueMaxSize" : "2000", - "groupMaxSize" : "100", - "maxChildren" : "100", - "idPath": "$.id", - "slidingWindowSize" : "200", - "rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_isAffiliatedWith", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ], - "includeChildren" : "true" - }, - "pace" : { - "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, - { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } - ], - "strictConditions" : [ - { "name" : "pidMatch", "fields" : [ "pid" ] } - ], - "conditions" : [ - { "name" : "titleVersionMatch", "fields" : [ "title" ] }, - { "name" : "sizeMatch", "fields" : [ "authors" ] } - ], - "model" : [ - { "name" : "doi", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.pid[?(@.qualifier.classid ==\"doi\")].value" }, - { "name" : "pid", "algo" : "Null", "type" : "JSON", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.pid", "overrideMatch" : "true" }, - { "name" : "title", "algo" : "LevensteinTitle", "type" : 
"String", "weight" : "1.0", "ignoreMissing" : "false", "path" : "$.title[?(@.qualifier.classid ==\"main title\")].value", "length" : 250, "size" : 5 }, - { "name" : "authors", "algo" : "Null", "type" : "List", "weight" : "0.0", "ignoreMissing" : "true", "path" : "$.author[*].fullname", "size" : 200 }, - { "name" : "resulttype", "algo" : "Null", "type" : "String", "weight" : "0.0", "ignoreMissing" : "false", "path" : "$.resulttype.classid" } - ], - "synonyms": {}, - "blacklists" : { - "title" : [ - "^Inside Front Cover$", - "(?i)^Poster presentations$", - "^THE ASSOCIATION AND THE GENERAL MEDICAL COUNCIL$", - "^Problems with perinatal pathology\\.?$", - "(?i)^Cases? of Puerperal Convulsions$", - "(?i)^Operative Gyna?ecology$", - "(?i)^Mind the gap\\!?\\:?$", - "^Chronic fatigue syndrome\\.?$", - "^Cartas? ao editor Letters? to the Editor$", - "^Note from the Editor$", - "^Anesthesia Abstract$", - - "^Annual report$", - "(?i)^“?THE RADICAL PREVENTION OF VENEREAL DISEASE\\.?”?$", - "(?i)^Graph and Table of Infectious Diseases?$", - "^Presentation$", - "(?i)^Reviews and Information on Publications$", - "(?i)^PUBLIC HEALTH SERVICES?$", - "(?i)^COMBINED TEXT-?BOOK OF OBSTETRICS AND GYN(Æ|ae)COLOGY$", - "(?i)^Adrese autora$", - "(?i)^Systematic Part .*\\. Catalogus Fossilium Austriae, Band 2: Echinoidea neogenica$", - "(?i)^Acknowledgement to Referees$", - "(?i)^Behçet's disease\\.?$", - "(?i)^Isolation and identification of restriction endonuclease.*$", - "(?i)^CEREBROVASCULAR DISEASES?.?$", - "(?i)^Screening for abdominal aortic aneurysms?\\.?$", - "^Event management$", - "(?i)^Breakfast and Crohn's disease.*\\.?$", - "^Cálculo de concentraciones en disoluciones acuosas. Ejercicio interactivo\\..*\\.$", - "(?i)^Genetic and functional analyses of SHANK2 mutations suggest a multiple hit model of Autism spectrum disorders?\\.?$", - "^Gushi hakubutsugaku$", - - "^Starobosanski nadpisi u Bosni i Hercegovini \\(.*\\)$", - "^Intestinal spirocha?etosis$", - "^Treatment of Rodent Ulcer$", - "(?i)^\\W*Cloud Computing\\W*$", - "^Compendio mathematico : en que se contienen todas las materias mas principales de las Ciencias que tratan de la cantidad$", - "^Free Communications, Poster Presentations: Session [A-F]$", - - "^“The Historical Aspects? of Quackery\\.?”$", - "^A designated centre for people with disabilities operated by St John of God Community Services (Limited|Ltd), Louth$", - "^P(er|re)-Mile Premiums for Auto Insurance\\.?$", - "(?i)^Case Report$", - "^Boletín Informativo$", - "(?i)^Glioblastoma Multiforme$", - "(?i)^Nuevos táxones animales descritos en la península Ibérica y Macaronesia desde 1994 \\(.*\\)$", - "^Zaměstnanecké výhody$", - "(?i)^The Economics of Terrorism and Counter-Terrorism: A Survey \\(Part .*\\)$", - "(?i)^Carotid body tumours?\\.?$", - "(?i)^\\[Españoles en Francia : La condición Emigrante.*\\]$", - "^Avant-propos$", - "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Head(s)? and Capital(s)?$", - "(?i)^St\\. Patrick's Cathedral, Dublin, County Dublin - Bases?$", - "(?i)^PUBLIC HEALTH VERSUS THE STATE$", - "^Viñetas de Cortázar$", - "(?i)^Search for heavy neutrinos and W(\\[|_|\\(|_\\{|-)?R(\\]|\\)|\\})? 
bosons with right-handed couplings in a left-right symmetric model in pp collisions at.*TeV(\\.)?$", - "(?i)^Measurement of the pseudorapidity and centrality dependence of the transverse energy density in Pb(-?)Pb collisions at.*tev(\\.?)$", - "(?i)^Search for resonances decaying into top-quark pairs using fully hadronic decays in pp collisions with ATLAS at.*TeV$", - "(?i)^Search for neutral minimal supersymmetric standard model Higgs bosons decaying to tau pairs in pp collisions at.*tev$", - - "(?i)^Relatório de Estágio (de|em) Angiologia e Cirurgia Vascular$", - "^Aus der AGMB$", - - "^Znanstveno-stručni prilozi$", - "(?i)^Zhodnocení finanční situace podniku a návrhy na zlepšení$", - "(?i)^Evaluation of the Financial Situation in the Firm and Proposals to its Improvement$", - "(?i)^Hodnocení finanční situace podniku a návrhy na její zlepšení$", - "^Finanční analýza podniku$", - "^Financial analysis( of business)?$", - "(?i)^Textbook of Gyn(a)?(Æ)?(e)?cology$", - "^Jikken nihon shūshinsho$", - "(?i)^CORONER('|s)(s|') INQUESTS$", - "(?i)^(Μελέτη παραγόντων )?risk management( για ανάπτυξη και εφαρμογή ενός πληροφοριακού συστήματος| και ανάπτυξη συστήματος)?$", - "(?i)^Consultants' contract(s)?$", - "(?i)^Upute autorima$", - "(?i)^Bijdrage tot de Kennis van den Godsdienst der Dajaks van Lan(d|f)ak en Tajan$", - "^Joshi shin kokubun$", - "^Kōtō shōgaku dokuhon nōson'yō$", - "^Jinjō shōgaku shōka$", - "^Shōgaku shūjichō$", - "^Nihon joshi dokuhon$", - "^Joshi shin dokuhon$", - "^Chūtō kanbun dokuhon$", - "^Wabun dokuhon$", - "(?i)^(Analysis of economy selected village or town|Rozbor hospodaření vybrané obce či města)$", - "(?i)^cardiac rehabilitation$", - "(?i)^Analytical summary$", - "^Thesaurus resolutionum Sacrae Congregationis Concilii$", - "(?i)^Sumario analítico(\\s{1})?(Analitic summary)?$", - "^Prikazi i osvrti$", - "^Rodinný dům s provozovnou$", - "^Family house with an establishment$", - "^Shinsei chūtō shin kokugun$", - "^Pulmonary alveolar proteinosis(\\.?)$", - "^Shinshū kanbun$", - "^Viñeta(s?) de Rodríguez$", - "(?i)^RUBRIKA UREDNIKA$", - "^A Matching Model of the Academic Publication Market$", - "^Yōgaku kōyō$", - - "^Internetový marketing$", - "^Internet marketing$", - "^Chūtō kokugo dokuhon$", - "^Kokugo dokuhon$", - "^Antibiotic Cover for Dental Extraction(s?)$", - "^Strategie podniku$", - "^Strategy of an Enterprise$", - "(?i)^respiratory disease(s?)(\\.?)$", - "^Award(s?) for Gallantry in Civil Defence$", - "^Podniková kultura$", - "^Corporate Culture$", - "^Severe hyponatraemia in hospital inpatient(s?)(\\.?)$", - "^Pracovní motivace$", - "^Work Motivation$", - "^Kaitei kōtō jogaku dokuhon$", - "^Konsolidovaná účetní závěrka$", - "^Consolidated Financial Statements$", - "(?i)^intracranial tumour(s?)$", - "^Climate Change Mitigation Options and Directed Technical Change: A Decentralized Equilibrium Analysis$", - "^\\[CERVECERIAS MAHOU(\\.|\\:) INTERIOR\\] \\[Material gráfico\\]$", - "^Housing Market Dynamics(\\:|\\.) On the Contribution of Income Shocks and Credit Constraint(s?)$", - "^\\[Funciones auxiliares de la música en Radio París,.*\\]$", - "^Úroveň motivačního procesu jako způsobu vedení lidí$", - "^The level of motivation process as a leadership$", - "^Pay-beds in N(\\.?)H(\\.?)S(\\.?) 
Hospitals$", - "(?i)^news and events$", - "(?i)^NOVOSTI I DOGAĐAJI$", - "^Sansū no gakushū$", - "^Posouzení informačního systému firmy a návrh změn$", - "^Information System Assessment and Proposal for ICT Modification$", - "^Stresové zatížení pracovníků ve vybrané profesi$", - "^Stress load in a specific job$", - - "^Sunday: Poster Sessions, Pt.*$", - "^Monday: Poster Sessions, Pt.*$", - "^Wednesday: Poster Sessions, Pt.*", - "^Tuesday: Poster Sessions, Pt.*$", - - "^Analýza reklamy$", - "^Analysis of advertising$", - - "^Shōgaku shūshinsho$", - "^Shōgaku sansū$", - "^Shintei joshi kokubun$", - "^Taishō joshi kokubun dokuhon$", - "^Joshi kokubun$", - - "^Účetní uzávěrka a účetní závěrka v ČR$", - "(?i)^The \"?Causes\"? of Cancer$", - "^Normas para la publicación de artículos$", - "^Editor('|s)(s|') [Rr]eply$", - "^Editor(’|s)(s|’) letter$", - "^Redaktoriaus žodis$", - "^DISCUSSION ON THE PRECEDING PAPER$", - "^Kōtō shōgaku shūshinsho jidōyō$", - "^Shōgaku nihon rekishi$", - "^(Theory of the flow of action currents in isolated myelinated nerve fibers).*$", - "^Préface$", - "^Occupational [Hh]ealth [Ss]ervices.$", - "^In Memoriam Professor Toshiyuki TAKESHIMA$", - "^Účetní závěrka ve vybraném podniku.*$", - "^Financial statements in selected company$", - "^Abdominal [Aa]ortic [Aa]neurysms.*$", - "^Pseudomyxoma peritonei$", - "^Kazalo autora$", - - "(?i)^uvodna riječ$", - "^Motivace jako způsob vedení lidí$", - "^Motivation as a leadership$", - "^Polyfunkční dům$", - "^Multi\\-funkcional building$", - "^Podnikatelský plán$", - "(?i)^Podnikatelský záměr$", - "(?i)^Business Plan$", - "^Oceňování nemovitostí$", - "^Marketingová komunikace$", - "^Marketing communication$", - "^Sumario Analítico$", - "^Riječ uredništva$", - "^Savjetovanja i priredbe$", - "^Índice$", - "^(Starobosanski nadpisi).*$", - "^Vzdělávání pracovníků v organizaci$", - "^Staff training in organization$", - "^(Life Histories of North American Geometridae).*$", - "^Strategická analýza podniku$", - "^Strategic Analysis of an Enterprise$", - "^Sadržaj$", - "^Upute suradnicima$", - "^Rodinný dům$", - "(?i)^Fami(l)?ly house$", - "^Upute autorima$", - "^Strategic Analysis$", - "^Finanční analýza vybraného podniku$", - "^Finanční analýza$", - "^Riječ urednika$", - "(?i)^Content(s?)$", - "(?i)^Inhalt$", - "^Jinjō shōgaku shūshinsho jidōyō$", - "(?i)^Index$", - "^Chūgaku kokubun kyōkasho$", - "^Retrato de una mujer$", - "^Retrato de un hombre$", - "^Kōtō shōgaku dokuhon$", - "^Shotōka kokugo$", - "^Shōgaku dokuhon$", - "^Jinjō shōgaku kokugo dokuhon$", - "^Shinsei kokugo dokuhon$", - "^Teikoku dokuhon$", - "^Instructions to Authors$", - "^KİTAP TAHLİLİ$", - "^PRZEGLĄD PIŚMIENNICTWA$", - "(?i)^Presentación$", - "^İçindekiler$", - "(?i)^Tabl?e of contents$", - "^(CODICE DEL BEATO DE LOS REYES FERNANDO I Y SANCHA).*$", - "^(\\[MADRID\\. BIBL\\. NAC\\. N.*KING FERDINAND I.*FROM SAN ISIDORO DE LEON\\. FACUNDUS SCRIPSIT DATED.*\\]).*", - "^Editorial( Board)?$", - "(?i)^Editorial \\(English\\)$", - "^Editörden$", - "^(Corpus Oral Dialectal \\(COD\\)\\.).*$", - "^(Kiri Karl Morgensternile).*$", - "^(\\[Eksliibris Aleksandr).*\\]$", - "^(\\[Eksliibris Aleksandr).*$", - "^(Eksliibris Aleksandr).*$", - "^(Kiri A\\. 
de Vignolles).*$", - "^(2 kirja Karl Morgensternile).*$", - "^(Pirita kloostri idaosa arheoloogilised).*$", - "^(Kiri tundmatule).*$", - "^(Kiri Jenaer Allgemeine Literaturzeitung toimetusele).*$", - "^(Eksliibris Nikolai Birukovile).*$", - "^(Eksliibris Nikolai Issakovile).*$", - "^(WHP Cruise Summary Information of section).*$", - "^(Measurement of the top quark\\-pair production cross section with ATLAS in pp collisions at).*$", - "^(Measurement of the spin\\-dependent structure function).*", - "(?i)^.*authors['’′]? reply\\.?$", - "(?i)^.*authors['’′]? response\\.?$" - ] - } - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json similarity index 97% rename from dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json index 2d0905562..726f2b899 100644 --- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json +++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json @@ -3,6 +3,7 @@ "threshold" : "0.99", "dedupRun" : "001", "entityType" : "organization", + "subEntityValue": "organization", "orderField" : "legalname", "queueMaxSize" : "2000", "groupMaxSize" : "50", @@ -87,8 +88,8 @@ } } ], - "threshold": 0.7, - "aggregation": "W_MEAN", + "threshold": 0.1, + "aggregation": "AVG", "positive": "layer4", "negative": "NO_MATCH", "undefined": "NO_MATCH", @@ -106,7 +107,7 @@ } } ], - "threshold": 0.9, + "threshold": 0.7, "aggregation": "AVG", "positive": "layer5", "negative": "NO_MATCH", @@ -129,7 +130,9 @@ "comparator": "jaroWinklerNormalizedName", "weight": 0.1, "countIfUndefined": "false", - "params": {} + "params": { + "windowSize": 4 + } } ], "threshold": 0.9, @@ -145,14 +148,14 @@ { "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"}, { "name" : "legalname", "type" : "String", "path" : "$.legalname.value" }, { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" }, - { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid.ac')].value"}, + { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"}, { "name" : "originalId", "type" : "String", "path" : "$.id" } ], "blacklists" : { "legalname" : [] }, "synonyms": { - "key::1": ["university","università","università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"], + "key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"], "key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"], "key::3": 
["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"], "key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"], diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json similarity index 96% rename from dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json index 6ca0ecd53..d471ccb89 100644 --- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub_dt.curr.conf.json +++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json @@ -28,34 +28,10 @@ "idPath": "$.id" }, "pace": { - "clustering": [ - { - "name": "ngrampairs", - "fields": [ - "title" - ], - "params": { - "max": "1", - "ngramLen": "3" - } - }, - { - "name": "suffixprefix", - "fields": [ - "title" - ], - "params": { - "max": "1", - "len": "3" - } - }, - { - "name": "lowercase", - "fields": [ - "doi" - ], - "params": {} - } + "clustering" : [ + { "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} }, + { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, + { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } ], "decisionTree": { "start": { diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/sample.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/sample.json similarity index 100% rename from dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/sample.json rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/conf/sample.json diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/json/authors_merge.json similarity index 100% rename from dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/json/authors_merge.json rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dhp/dedup/json/authors_merge.json diff --git a/dhp-workflows/dhp-graph-provision/job-override.properties b/dhp-workflows/dhp-graph-provision/job-override.properties index 68816c224..8230dfc18 100644 --- a/dhp-workflows/dhp-graph-provision/job-override.properties +++ b/dhp-workflows/dhp-graph-provision/job-override.properties @@ -1,12 +1,14 @@ -sparkDriverMemory=10G -sparkExecutorMemory=15G +sparkExecutorCoresForJoining=1 +sparkDriverMemoryForJoining=10G +sparkExecutorMemoryForJoining=15G +sparkExecutorCoresForIndexing=64 +sparkDriverMemoryForIndexing=3G +sparkExecutorMemoryForIndexing=2G #isLookupUrl=http://services.openaire.eu:8280/is/services/isLookUp isLookupUrl=http://beta.services.openaire.eu:8280/is/services/isLookUp?wsdl sourcePath=/tmp/db_openaireplus_services.export_dhp.2020.02.03 outputPath=/tmp/openaire_provision format=TMF batchSize=2000 -sparkExecutorCoresForJoining=128 -sparkExecutorCoresForIndexing=64 reuseRecords=false otherDsTypeId=scholarcomminfra, infospace, pubsrepository::mock, entityregistry, entityregistry::projects, entityregistry::repositories, websource \ No newline 
diff --git a/pom.xml b/pom.xml
index f47d49ea7..1ae078128 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,6 +1,6 @@
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> 4.0.0 eu.dnetlib.dhp
@@ -101,12 +101,12 @@
org.apache.hadoop hadoop-common - ${dhp.hadoop.version} - provided - - - org.apache.hadoop - hadoop-client + ${dhp.hadoop.version} + provided + + + org.apache.hadoop + hadoop-client ${dhp.hadoop.version} provided
@@ -174,11 +174,11 @@
provided - - net.sf.saxon - Saxon-HE - 9.9.1-6 - + + net.sf.saxon + Saxon-HE + 9.9.1-6 + dom4j
@@ -199,56 +199,56 @@
- com.mycila.xmltool - xmltool - 3.3 - + com.mycila.xmltool + xmltool + 3.3 + - - org.apache.solr - solr-solrj - 7.5.0 - - - * - * - - - - - com.lucidworks.spark - spark-solr - 3.6.0 - - - * - * - - - + + org.apache.solr + solr-solrj + 7.5.0 + + + * + * + + + + + com.lucidworks.spark + spark-solr + 3.6.0 + + + * + * + + + - - org.apache.httpcomponents - httpclient - 4.5.3 - - - org.apache.httpcomponents - httpmime - 4.5.3 - - - org.noggit - noggit - 0.8 - - - org.apache.zookeeper - zookeeper - 3.4.11 - + + org.apache.httpcomponents + httpclient + 4.5.3 + + + org.apache.httpcomponents + httpmime + 4.5.3 + + + org.noggit + noggit + 0.8 + + + org.apache.zookeeper + zookeeper + 3.4.11 + - + net.schmizz sshj 0.10.0
@@ -290,17 +290,17 @@
dnet-pace-core 4.0.0 - - eu.dnetlib - cnr-rmi-api - [2.0.0,3.0.0) - + + eu.dnetlib + cnr-rmi-api + [2.0.0,3.0.0) + - - org.apache.cxf - cxf-rt-transports-http - 3.1.5 - + + org.apache.cxf + cxf-rt-transports-http + 3.1.5 + javax.persistence javax.persistence-api
@@ -308,36 +308,36 @@
provided - - com.rabbitmq - amqp-client - 5.6.0 - - - com.jayway.jsonpath - json-path - 2.4.0 - - - com.arakelian - java-jq - 0.10.1 - - - edu.cmu - secondstring - 1.0.0 - - - org.mongodb - mongo-java-driver - ${mongodb.driver.version} - - - org.antlr - stringtemplate - 4.0 - + + com.rabbitmq + amqp-client + 5.6.0 + + + com.jayway.jsonpath + json-path + 2.4.0 + + + com.arakelian + java-jq + 0.10.1 + + + edu.cmu + secondstring + 1.0.0 + + + org.mongodb + mongo-java-driver + ${mongodb.driver.version} + + + org.antlr + stringtemplate + 4.0 + org.apache.oozie
@@ -509,4 +509,3 @@
3.4.2 -