From 4b66b471a4a77ed34f7fb4fa097586cc2a7be921 Mon Sep 17 00:00:00 2001 From: miconis Date: Tue, 10 Dec 2019 14:57:16 +0100 Subject: [PATCH] implementation of the sorting by trust mechanism and the merge of oaf entities --- .../eu/dnetlib/dhp/schema/oaf/Context.java | 19 + .../java/eu/dnetlib/dhp/schema/oaf/Field.java | 21 + .../eu/dnetlib/dhp/schema/oaf/KeyValue.java | 23 + .../eu/dnetlib/dhp/schema/oaf/OafEntity.java | 31 +- .../dnetlib/dhp/schema/oaf/Publication.java | 12 + .../eu/dnetlib/dhp/schema/oaf/Qualifier.java | 28 + .../eu/dnetlib/dhp/schema/oaf/Result.java | 171 +- .../dhp/schema/oaf/StructuredProperty.java | 24 + .../eu/dnetlib/dhp/schema/oaf/MergeTest.java | 89 + .../eu/dnetlib/dhp/transform/ext_simple.xsl | 2 +- .../resources/eu/dnetlib/dhp/transform/tr.xml | 12 +- .../eu/dnetlib/dedup/DedupRecordFactory.java | 169 ++ .../java/eu/dnetlib/dedup/OafComparator.java | 15 + .../java/eu/dnetlib/dedup/OafEntityType.java | 13 + .../main/java/eu/dnetlib/dedup/OafKey.java | 31 + .../java/eu/dnetlib/dedup/OafPartitioner.java | 59 + .../dedup/SparkCreateConnectedComponent.java | 4 +- .../dnetlib/dedup/SparkCreateDedupRecord.java | 82 +- .../eu/dnetlib/dedup/SparkCreateSimRels.java | 3 +- .../dnetlib/dedup/graph/GraphProcessor.scala | 4 +- .../dnetlib/dhp/dedup/conf/org.curr.conf.json | 1745 ----------------- .../dhp/dedup/oozie_app/config-default.xml | 14 +- .../dnetlib/dedup/SparkCreateDedupTest.java | 33 +- .../eu/dnetlib/dedup/conf/org.curr.conf.json | 10 +- .../dnetlib}/dedup/conf/org.curr.conf2.json | 0 .../eu/dnetlib}/dedup/conf/pub.curr.conf.json | 0 .../dhp/distcp/oozie_app/config-default.xml | 8 +- .../dnetlib/dhp/distcp/oozie_app/workflow.xml | 4 +- .../dhp/graph/oozie_app/config-default.xml | 12 +- dhp-workflows/docs/oozie-installer.markdown | 4 +- dhp-workflows/pom.xml | 4 +- 31 files changed, 769 insertions(+), 1877 deletions(-) create mode 100644 dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java create mode 100644 dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java create mode 100644 dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafComparator.java create mode 100644 dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java create mode 100644 dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafKey.java create mode 100644 dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafPartitioner.java delete mode 100644 dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json rename dhp-workflows/dhp-dedup/src/{main/resources/eu/dnetlib/dhp => test/resources/eu/dnetlib}/dedup/conf/org.curr.conf2.json (100%) rename dhp-workflows/dhp-dedup/src/{main/resources/eu/dnetlib/dhp => test/resources/eu/dnetlib}/dedup/conf/pub.curr.conf.json (100%) diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java index 64e23088e..8f46a0e23 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Context.java @@ -23,4 +23,23 @@ public class Context implements Serializable { public void setDataInfo(List dataInfo) { this.dataInfo = dataInfo; } + + @Override + public int hashCode() { + return id.hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + + Context other = (Context) obj; + + return id.equals(other.getId()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java index 8834900c9..b645d275f 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Field.java @@ -23,4 +23,25 @@ public class Field implements Serializable { public void setDataInfo(DataInfo dataInfo) { this.dataInfo = dataInfo; } + + @Override + public int hashCode(){ + return getValue().hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + + Field other = (Field) obj; + + return getValue().equals(other.getValue()); + } + + } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java index 1c4c7e6ef..fd394e188 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/KeyValue.java @@ -33,4 +33,27 @@ public class KeyValue implements Serializable { public void setDataInfo(DataInfo dataInfo) { this.dataInfo = dataInfo; } + + public String toComparableString() { + return String.format("%s::%s", key != null ? key.toLowerCase() : "", value != null ? value.toLowerCase() : ""); + } + + @Override + public int hashCode() { + return toComparableString().hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + + KeyValue other = (KeyValue) obj; + + return toComparableString().equals(other.toComparableString()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java index 791667b46..955ddfd01 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/OafEntity.java @@ -1,7 +1,8 @@ package eu.dnetlib.dhp.schema.oaf; import java.io.Serializable; -import java.util.List; +import java.util.*; +import java.util.stream.Collectors; public abstract class OafEntity extends Oaf implements Serializable { @@ -84,4 +85,32 @@ public abstract class OafEntity extends Oaf implements Serializable { public void setOaiprovenance(OAIProvenance oaiprovenance) { this.oaiprovenance = oaiprovenance; } + + public void mergeFrom(OafEntity e) { + + if (e == null) + return; + + originalId = mergeLists(originalId, e.getOriginalId()); + + collectedfrom = mergeLists(collectedfrom, e.getCollectedfrom()); + + pid = mergeLists(pid, e.getPid()); + + dateofcollection = e.getDateofcollection(); + + dateoftransformation = e.getDateoftransformation(); + + extraInfo = mergeLists(extraInfo, e.getExtraInfo()); + + oaiprovenance = e.getOaiprovenance(); + + } + + protected List mergeLists(final List... lists) { + + return Arrays.stream(lists).filter(Objects::nonNull).flatMap(List::stream).distinct().collect(Collectors.toList()); + } + + } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java index 9ca9cd3d6..181062f32 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Publication.java @@ -14,4 +14,16 @@ public class Publication extends Result implements Serializable { public void setJournal(Journal journal) { this.journal = journal; } + + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); + + Publication p = (Publication) e; + + if (p.getJournal() != null) + journal = p.getJournal(); + } + + } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java index 60889535d..9c52d7310 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Qualifier.java @@ -40,4 +40,32 @@ public class Qualifier implements Serializable { public void setSchemename(String schemename) { this.schemename = schemename; } + + public String toComparableString() { + return String.format("%s::%s::%s::%s", + classid != null ? classid : "", + classname != null ? classname : "", + schemeid != null ? schemeid : "", + schemename != null ? schemename : ""); + } + + @Override + public int hashCode() { + return toComparableString().hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + + Qualifier other = (Qualifier) obj; + + return toComparableString() + .equals(other.toComparableString()); + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java index 7b08e71c2..0e34d8ba6 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/Result.java @@ -1,7 +1,11 @@ package eu.dnetlib.dhp.schema.oaf; +import org.apache.commons.lang3.StringUtils; + import java.io.Serializable; import java.util.List; +import java.util.Map; +import java.util.Objects; public abstract class Result extends OafEntity implements Serializable { @@ -12,35 +16,35 @@ public abstract class Result extends OafEntity implements Serializable { // common fields private Qualifier language; - + private List country; private List subject; - + private List title; - + private List relevantdate; private List> description; - + private Field dateofacceptance; - + private Field publisher; - + private Field embargoenddate; - + private List> source; - + private List> fulltext; // remove candidate - + private List> format; - + private List> contributor; - + private Qualifier resourcetype; - + private List> coverage; - + private Field refereed; //peer-review status private List context; @@ -240,4 +244,145 @@ public abstract class Result extends OafEntity implements Serializable { this.processingchargecurrency = processingchargecurrency; return this; } + + @Override + public void mergeFrom(OafEntity e) { + super.mergeFrom(e); + + Result r = (Result) e; + + mergeAuthors(r.getAuthor()); + + //TODO mergeFrom is used only for create Dedup Records since the creation of these two fields requires more complex functions (maybe they will be filled in an external function) +// if (author == null) +// author = r.getAuthor(); //authors will be replaced because they could be too much +// dateofacceptance = r.getDateofacceptance(); +// instance = mergeLists(instance, r.getInstance()); + + if (r.getResulttype() != null) + resulttype = r.getResulttype(); + + if (r.getLanguage() != null) + language = r.getLanguage(); + + country = mergeLists(country, r.getCountry()); + + subject = mergeLists(subject, r.getSubject()); + + title = mergeLists(title, r.getTitle()); + + relevantdate = mergeLists(relevantdate, r.getRelevantdate()); + + description = mergeLists(description, r.getDescription()); + + if (r.getPublisher() != null) + publisher = r.getPublisher(); + + if (r.getEmbargoenddate() != null) + embargoenddate = r.getEmbargoenddate(); + + source = mergeLists(source, r.getSource()); + + fulltext = mergeLists(fulltext, r.getFulltext()); + + format = mergeLists(format, r.getFormat()); + + contributor = mergeLists(contributor, r.getContributor()); + + if (r.getResourcetype() != null) + resourcetype = r.getResourcetype(); + + coverage = mergeLists(coverage, r.getCoverage()); + + if (r.getRefereed() != null) + refereed = r.getRefereed(); + + context = mergeLists(context, r.getContext()); + + if (r.getProcessingchargeamount() != null) + processingchargeamount = r.getProcessingchargeamount(); + + if (r.getProcessingchargecurrency() != null) + processingchargecurrency = r.getProcessingchargecurrency(); + + externalReference = mergeLists(externalReference, r.getExternalReference()); + + } + + public void mergeAuthors(List authors){ + int c1 = countAuthorsPids(author); + int c2 = countAuthorsPids(authors); + int s1 = authorsSize(author); + int s2 = authorsSize(authors); + + + //if both have no authors with pids and authors is bigger than author + if (c1 == 0 && c2 == 0 && author.size() authors){ + if (authors == null) + return -1; + + return (int) authors.stream().map(this::extractAuthorPid).filter(Objects::nonNull).filter(StringUtils::isNotBlank).count(); + } + + public int authorsSize(List authors){ + if (authors == null) + return 0; + return authors.size(); + } + + public String extractAuthorPid(Author a){ + + if(a == null || a.getPid() == null || a.getPid().size() == 0) + return null; + + StringBuilder mainPid = new StringBuilder(); + + a.getPid().forEach(pid ->{ + if (pid.getQualifier().getClassid().equalsIgnoreCase("orcid")) { + mainPid.setLength(0); + mainPid.append(pid.getValue()); + } + else { + if(mainPid.length() == 0) + mainPid.append(pid.getValue()); + } + }); + + return mainPid.toString(); + + } } diff --git a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java index 79ebdd7f9..ea2370c7a 100644 --- a/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java +++ b/dhp-schemas/src/main/java/eu/dnetlib/dhp/schema/oaf/StructuredProperty.java @@ -33,4 +33,28 @@ public class StructuredProperty implements Serializable { public void setDataInfo(DataInfo dataInfo) { this.dataInfo = dataInfo; } + + public String toComparableString(){ + return String.format("%s::%s", value != null ? value.toLowerCase() : "", qualifier != null ? qualifier.toComparableString().toLowerCase() : ""); + } + + @Override + public int hashCode() { + return toComparableString().hashCode(); + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (obj == null) + return false; + if (getClass() != obj.getClass()) + return false; + + StructuredProperty other = (StructuredProperty) obj; + + return toComparableString() + .equals(other.toComparableString()); + } } diff --git a/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java new file mode 100644 index 000000000..e487ddcba --- /dev/null +++ b/dhp-schemas/src/test/java/eu/dnetlib/dhp/schema/oaf/MergeTest.java @@ -0,0 +1,89 @@ +package eu.dnetlib.dhp.schema.oaf; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class MergeTest { + + OafEntity oaf; + + @Before + public void setUp() { + oaf = new Publication(); + } + + @Test + public void mergeListsTest() { + + //string list merge test + List a = Arrays.asList("a", "b", "c", "e"); + List b = Arrays.asList("a", "b", "c", "d"); + List c = null; + + System.out.println("merge result 1 = " + oaf.mergeLists(a, b)); + + System.out.println("merge result 2 = " + oaf.mergeLists(a, c)); + + System.out.println("merge result 3 = " + oaf.mergeLists(c, c)); + } + + @Test + public void mergePublicationCollectedFromTest() { + + Publication a = new Publication(); + Publication b = new Publication(); + + a.setCollectedfrom(Arrays.asList(setKV("a", "open"), setKV("b", "closed"))); + b.setCollectedfrom(Arrays.asList(setKV("A", "open"), setKV("b", "Open"))); + + a.mergeFrom(b); + + Assert.assertNotNull(a.getCollectedfrom()); + Assert.assertEquals(3, a.getCollectedfrom().size()); + + } + + @Test + public void mergePublicationSubjectTest() { + + Publication a = new Publication(); + Publication b = new Publication(); + + a.setSubject(Arrays.asList(setSP("a", "open", "classe"), setSP("b", "open", "classe"))); + b.setSubject(Arrays.asList(setSP("A", "open", "classe"), setSP("c", "open", "classe"))); + + a.mergeFrom(b); + + Assert.assertNotNull(a.getSubject()); + Assert.assertEquals(3, a.getSubject().size()); + + } + + private KeyValue setKV(final String key, final String value) { + + KeyValue k = new KeyValue(); + + k.setKey(key); + k.setValue(value); + + return k; + } + + private StructuredProperty setSP(final String value, final String schema, final String classname) { + StructuredProperty s = new StructuredProperty(); + s.setValue(value); + Qualifier q = new Qualifier(); + q.setClassname(classname); + q.setClassid(classname); + q.setSchemename(schema); + q.setSchemeid(schema); + s.setQualifier(q); + return s; + } +} diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl index cef50aa95..5f5ed5a3b 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/ext_simple.xsl @@ -9,7 +9,7 @@ - + diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml index a9eae8576..ef6d9f7ac 100644 --- a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/transform/tr.xml @@ -1,11 +1,11 @@
- - - - - + + + + +
@@ -24,7 +24,7 @@ - + diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java new file mode 100644 index 000000000..371e80349 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/DedupRecordFactory.java @@ -0,0 +1,169 @@ +package eu.dnetlib.dedup; + +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.pace.config.DedupConfig; +import eu.dnetlib.pace.util.MapDocumentUtil; +import org.apache.commons.lang.NotImplementedException; +import org.apache.commons.lang.StringUtils; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.sql.Encoders; +import org.apache.spark.sql.SparkSession; +import org.codehaus.jackson.map.ObjectMapper; +import scala.Tuple2; + +import java.io.IOException; +import java.util.Collection; +import java.util.List; +import java.util.Random; + +import static java.util.stream.Collectors.toMap; + +public class DedupRecordFactory { + + public JavaRDD createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf){ + + // + final JavaPairRDD inputJsonEntities = sc.textFile(entitiesInputPath) + .mapToPair((PairFunction) it-> + new Tuple2(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it) + ); + + //: source is the dedup_id, target is the id of the mergedIn + JavaPairRDD mergeRels = spark + .read().load(mergeRelsInputPath).as(Encoders.bean(Relation.class)) + .where("relClass=='merges'") + .javaRDD() + .mapToPair( + (PairFunction)r-> + new Tuple2(r.getTarget(), r.getSource()) + ); + + // + final JavaPairRDD joinResult = mergeRels.join(inputJsonEntities).mapToPair((PairFunction>, String, String>) Tuple2::_2); + + JavaPairRDD keyJson = joinResult.mapToPair((PairFunction, OafKey, String>) json -> { + + String idValue = json._1(); + + String trust = MapDocumentUtil.getJPathString("$.dataInfo.trust", json._2()); + + //TODO remember to replace this with the actual trust retrieving + if (StringUtils.isBlank(trust)) { + Random generator = new Random(); + int number = generator.nextInt(20); + double result = (number / 100.0) + 0.80; + trust = "" + result; + } + + return new Tuple2(new OafKey(idValue, trust), json._2()); + }); + + OafComparator c = new OafComparator(); + // + JavaPairRDD> sortedJoinResult = keyJson.repartitionAndSortWithinPartitions(new OafPartitioner(keyJson.getNumPartitions()), c) + .mapToPair((PairFunction, String, String>) t -> new Tuple2(t._1().getDedupId(), t._2())) + .groupByKey(); + + + switch(entityType){ + case Publication: + return sortedJoinResult.map(this::publicationMerger); + case Dataset: + return sortedJoinResult.map(this::datasetMerger); + case Project: + return sortedJoinResult.map(this::projectMerger); + case Software: + return sortedJoinResult.map(this::softwareMerger); + case Datasource: + return sortedJoinResult.map(this::datasourceMerger); + case Organization: + return sortedJoinResult.map(this::organizationMerger); + case OtherResearchProduct: + return sortedJoinResult.map(this::otherresearchproductMerger); + default: + return null; + } + + } + + private Publication publicationMerger(Tuple2> e){ + + Publication p = new Publication(); //the result of the merge, to be returned at the end + + p.setId(e._1()); + + final ObjectMapper mapper = new ObjectMapper(); + + final Collection dateofacceptance = Lists.newArrayList(); + final Collection> authors = Lists.newArrayList(); + final Collection> instances = Lists.newArrayList(); + + StringBuilder trust = new StringBuilder("0.0"); + + e._2().forEach(pub -> { + try { + Publication publication = mapper.readValue(pub, Publication.class); + + final String currentTrust = publication.getDataInfo().getTrust(); + if (!currentTrust.equals("1.0")) { + trust.setLength(0); + trust.append(currentTrust); + } + + p.mergeFrom(publication); + + //add to the list if they are not null + if (publication.getDateofacceptance() != null) + dateofacceptance.add(publication.getDateofacceptance().getValue()); + if (publication.getAuthor() != null) + authors.add(publication.getAuthor()); + if (publication.getInstance() != null) + instances.add(publication.getInstance()); + + } catch (Exception exc){} + + }); + + p.setAuthor(null); //TODO create a single list of authors to put in the final publication + + + return p; + } + + private Dataset datasetMerger(Tuple2> e){ + + throw new NotImplementedException(); + } + + private Project projectMerger(Tuple2> e){ + + throw new NotImplementedException(); + } + + private Software softwareMerger(Tuple2> e){ + + throw new NotImplementedException(); + } + + private Datasource datasourceMerger(Tuple2> e){ + + throw new NotImplementedException(); + } + + private Organization organizationMerger(Tuple2> e){ + + throw new NotImplementedException(); + } + + private OtherResearchProduct otherresearchproductMerger(Tuple2> e){ + + throw new NotImplementedException(); + } + +} diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafComparator.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafComparator.java new file mode 100644 index 000000000..2ab78db7c --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafComparator.java @@ -0,0 +1,15 @@ +package eu.dnetlib.dedup; +import com.google.common.collect.ComparisonChain; +import java.io.Serializable; +import java.util.Comparator; + +public class OafComparator implements Comparator, Serializable { + + @Override + public int compare(OafKey a, OafKey b) { + return ComparisonChain.start() + .compare(a.getDedupId(), b.getDedupId()) + .compare(a.getTrust(), b.getTrust()) + .result(); + } +} \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java new file mode 100644 index 000000000..4ff2fa873 --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafEntityType.java @@ -0,0 +1,13 @@ +package eu.dnetlib.dedup; + +public enum OafEntityType { + + Datasource, + Organization, + Project, + Dataset, + OtherResearchProduct, + Software, + Publication + +} diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafKey.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafKey.java new file mode 100644 index 000000000..f66b0457e --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafKey.java @@ -0,0 +1,31 @@ +package eu.dnetlib.dedup; + +import java.io.Serializable; +public class OafKey implements Serializable { + + private String dedupId; + private String trust; + + public OafKey(String dedupId, String trust) { + this.dedupId = dedupId; + this.trust = trust; + } + public OafKey() { + } + public String getDedupId() { + return dedupId; + } + public void setDedupId(String dedupId) { + this.dedupId = dedupId; + } + public String getTrust() { + return trust; + } + public void setTrust(String trust) { + this.trust = trust; + } + @Override + public String toString(){ + return String.format("%s->%d", dedupId,trust); + } +} diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafPartitioner.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafPartitioner.java new file mode 100644 index 000000000..20885fd0b --- /dev/null +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/OafPartitioner.java @@ -0,0 +1,59 @@ +package eu.dnetlib.dedup; + +import org.apache.spark.Partitioner; + +import java.io.Serializable; + +public class OafPartitioner extends Partitioner implements Serializable { + + private final int numPartitions; + + public OafPartitioner(int partitions) { + assert (partitions > 0); + this.numPartitions = partitions; + } + + @Override + public int numPartitions() { + return numPartitions; + } + + @Override + public int getPartition(Object key) { + if (key instanceof OafKey) { + @SuppressWarnings("unchecked") + OafKey item = (OafKey) key; + return Math.abs(item.getDedupId().hashCode() % numPartitions); + } else { + throw new IllegalArgumentException("Unexpected Key"); + } + } + + @Override + public int hashCode() { + final int prime = 31; + int result = 1; + result = prime * result + numPartitions; + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (!(obj instanceof OafPartitioner)) { + return false; + } + // + OafPartitioner other = (OafPartitioner) obj; + if (numPartitions != other.numPartitions) { + return false; + } + // + return true; + } +} diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java index 182bb374a..eacf3d479 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateConnectedComponent.java @@ -37,8 +37,8 @@ public class SparkCreateConnectedComponent { final String inputPath = parser.get("sourcePath"); final String entity = parser.get("entity"); final String targetPath = parser.get("targetPath"); - final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); - +// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateConnectedComponent.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); + final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); final JavaPairRDD vertexes = sc.textFile(inputPath + "/" + entity) .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java index 6d6165b18..56bdc20f1 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateDedupRecord.java @@ -3,6 +3,7 @@ package eu.dnetlib.dedup; import com.google.common.collect.ComparisonChain; import com.google.common.collect.Lists; import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.util.MapDocumentUtil; @@ -17,6 +18,7 @@ import org.apache.spark.api.java.function.PairFunction; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Encoders; import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.types.StructType; import scala.Tuple2; import java.util.ArrayList; @@ -27,26 +29,28 @@ import java.util.List; public class SparkCreateDedupRecord { public static void main(String[] args) throws Exception { - final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json"))); - parser.parseArgument(args); - final SparkSession spark = SparkSession - .builder() - .appName(SparkCreateDedupRecord.class.getSimpleName()) - .master(parser.get("master")) - .getOrCreate(); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - final String inputPath = parser.get("sourcePath"); - final String entity = parser.get("entity"); - final String targetPath = parser.get("targetPath"); - final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); - - final JavaPairRDD inputJsonEntities = sc.textFile(inputPath + "/" + entity) - .mapToPair((PairFunction)it-> - new Tuple2(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it) - ); - +// final ArgumentApplicationParser parser = new ArgumentApplicationParser(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/dedup_parameters.json"))); +// parser.parseArgument(args); +// final SparkSession spark = SparkSession +// .builder() +// .appName(SparkCreateDedupRecord.class.getSimpleName()) +// .master(parser.get("master")) +// .getOrCreate(); +// +// final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); +// final String inputPath = parser.get("sourcePath"); +// final String entity = parser.get("entity"); +// final String targetPath = parser.get("targetPath"); +//// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateDedupRecord.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); +// final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); +// +// // +// final JavaPairRDD inputJsonEntities = sc.textFile(inputPath + "/" + entity) +// .mapToPair((PairFunction)it-> +// new Tuple2(MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), it),it) +// ); +// //: source is the dedup_id, target is the id of the mergedIn // JavaPairRDD mergeRels = spark // .read().load(targetPath + "/" + entity+"_mergeRels").as(Encoders.bean(Relation.class)) // .where("relClass=='merges'") @@ -56,46 +60,12 @@ public class SparkCreateDedupRecord { // new Tuple2(r.getTarget(), r.getSource()) // ); // -// +// // // final JavaPairRDD p = mergeRels.join(inputJsonEntities).mapToPair((PairFunction>, String, String>) Tuple2::_2); -// -// Comparator c = new Comparator() { -// @Override -// public int compare(String s, String t1) { -// return 0; -// } -// }; -// final JavaPairRDD stringStringJavaPairRDD = p.repartitionAndSortWithinPartitions(p.partitioner().get(), c); + StructType schema = Encoders.bean(Publication.class).schema(); -// List inputValues = Arrays.asList( -// new Foo("k",5), -// new Foo("a",1), -// new Foo("a",30), -// new Foo("a",18), -// new Foo("a",22), -// new Foo("b",22), -// new Foo("c",5), -// new Foo("a",5), -// new Foo("s",1), -// new Foo("h",4) -// ); -// -// -// final JavaPairRDD fooFighters = sc.parallelize(inputValues).mapToPair((PairFunction) i -> new Tuple2(i, i)); -// -// -// FooComparator c = new FooComparator(); -// final List>> result = -// fooFighters.repartitionAndSortWithinPartitions(new FooPartitioner(fooFighters.getNumPartitions()), c) -// .mapToPair((PairFunction, String, Foo>) t-> new Tuple2(t._1().getValue(), t._2()) ) -// .groupByKey() -// .mapValues((Function, List>) Lists::newArrayList) -// .collect(); -// -// -// System.out.println(result); - + System.out.println(schema); } } diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java index a7aa18f89..2ae85baf3 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/SparkCreateSimRels.java @@ -44,7 +44,8 @@ public class SparkCreateSimRels { final String inputPath = parser.get("sourcePath"); final String entity = parser.get("entity"); final String targetPath = parser.get("targetPath"); - final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); +// final DedupConfig dedupConf = DedupConfig.load(IOUtils.toString(SparkCreateSimRels.class.getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json"))); + final DedupConfig dedupConf = DedupConfig.load(parser.get("dedupConf")); final long total = sc.textFile(inputPath + "/" + entity).count(); diff --git a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala index ab2be9170..38c695152 100644 --- a/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala +++ b/dhp-workflows/dhp-dedup/src/main/java/eu/dnetlib/dedup/graph/GraphProcessor.scala @@ -1,7 +1,5 @@ package eu.dnetlib.dedup.graph - -import eu.dnetlib.pace.model.MapDocument import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD @@ -25,7 +23,7 @@ object GraphProcessor { } val connectedComponents = joinResult.groupByKey() .map[ConnectedComponent](cc => asConnectedComponent(cc)) - (connectedComponents) + connectedComponents } diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json deleted file mode 100644 index a878d2419..000000000 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json +++ /dev/null @@ -1,1745 +0,0 @@ -{ - "wf": { - "threshold": "0.9", - "dedupRun": "001", - "entityType": "organization", - "orderField": "legalname", - "queueMaxSize": "2000", - "groupMaxSize": "50", - "slidingWindowSize": "200", - "idPath": ".id", - "rootBuilder": [ - "organization", - "projectOrganization_participation_isParticipant", - "datasourceOrganization_provision_isProvidedBy" - ], - "includeChildren": "true" - }, - "pace": { - "clustering": [ - { - "name": "sortedngrampairs", - "fields": [ - "legalname" - ], - "params": { - "max": 2, - "ngramLen": "3" - } - }, - { - "name": "suffixprefix", - "fields": [ - "legalname" - ], - "params": { - "max": 1, - "len": "3" - } - }, - { - "name": "urlclustering", - "fields": [ - "websiteurl" - ], - "params": {} - }, - { - "name": "keywordsclustering", - "fields": [ - "legalname" - ], - "params": { - "max": 2, - "windowSize": 4 - } - } - ], - "strictConditions": [ - { - "name": "exactMatch", - "fields": [ - "gridid" - ] - } - ], - "conditions": [ - { - "name": "DomainExactMatch", - "fields": [ - "websiteurl" - ] - }, - { - "name": "exactMatch", - "fields": [ - "country" - ] - } - ], - "model": [ - { - "name": "country", - "algo": "Null", - "type": "String", - "weight": "0", - "ignoreMissing": "false", - "path": ".country.classid" - }, - { - "name": "legalshortname", - "algo": "JaroWinklerNormalizedName", - "type": "String", - "weight": "0.1", - "ignoreMissing": "true", - "path": ".legalshortname.value" - }, - { - "name": "legalname", - "algo": "JaroWinklerNormalizedName", - "type": "String", - "weight": "0.9", - "ignoreMissing": "false", - "path": ".legalname.value", - "params": { - "windowSize": 4, - "threshold": 0.7 - } - }, - { - "name": "websiteurl", - "algo": "Null", - "type": "URL", - "weight": "0", - "ignoreMissing": "true", - "path": ".websiteurl.value", - "params": { - "host": 0.5, - "path": 0.5 - } - } - ], - "blacklists": { - "legalname": [] - }, - "synonyms": { - "key::1": [ - "university", - "università", - "università studi", - "universitario", - "universitaria", - "université", - "universitaire", - "universitaires", - "universidad", - "universitade", - "Universität", - "universitaet", - "Uniwersytet", - "университет", - "universiteit", - "πανεπιστήμιο", - "universitesi", - "universiteti" - ], - "key::2": [ - "studies", - "studi", - "études", - "estudios", - "estudos", - "Studien", - "studia", - "исследования", - "studies", - "σπουδές" - ], - "key::3": [ - "advanced", - "superiore", - "supérieur", - "supérieure", - "supérieurs", - "supérieures", - "avancado", - "avancados", - "fortgeschrittene", - "fortgeschritten", - "zaawansowany", - "передовой", - "gevorderd", - "gevorderde", - "προχωρημένος", - "προχωρημένη", - "προχωρημένο", - "προχωρημένες", - "προχωρημένα", - "wyzsza" - ], - "key::4": [ - "institute", - "istituto", - "institut", - "instituto", - "instituto", - "Institut", - "instytut", - "институт", - "instituut", - "ινστιτούτο" - ], - "key::5": [ - "hospital", - "ospedale", - "hôpital", - "hospital", - "hospital", - "Krankenhaus", - "szpital", - "больница", - "ziekenhuis", - "νοσοκομείο" - ], - "key::6": [ - "research", - "ricerca", - "recherche", - "investigacion", - "pesquisa", - "Forschung", - "badania", - "исследования", - "onderzoek", - "έρευνα", - "erevna", - "erevnas" - ], - "key::7": [ - "college", - "collegio", - "université", - "colegio", - "faculdade", - "Hochschule", - "Szkoła Wyższa", - "Высшая школа", - "universiteit", - "κολλέγιο" - ], - "key::8": [ - "foundation", - "fondazione", - "fondation", - "fundación", - "fundação", - "Stiftung", - "Fundacja", - "фонд", - "stichting", - "ίδρυμα", - "idryma" - ], - "key::9": [ - "center", - "centro", - "centre", - "centro", - "centro", - "zentrum", - "centrum", - "центр", - "centrum", - "κέντρο" - ], - "key::10": [ - "national", - "nazionale", - "national", - "nationale", - "nationaux", - "nationales", - "nacional", - "nacional", - "national", - "krajowy", - "национальный", - "nationaal", - "nationale", - "εθνικό" - ], - "key::11": [ - "association", - "associazione", - "association", - "asociación", - "associação", - "Verein", - "verband", - "stowarzyszenie", - "ассоциация", - "associatie" - ], - "key::12": [ - "society", - "societa", - "société", - "sociedad", - "sociedade", - "gesellschaft", - "społeczeństwo", - "общество", - "maatschappij", - "κοινωνία" - ], - "key::13": [ - "international", - "internazionale", - "international", - "internacional", - "internacional", - "international", - "międzynarodowy", - "Международный", - "internationaal", - "internationale", - "διεθνής", - "διεθνή", - "διεθνές" - ], - "key::14": [ - "community", - "comunita", - "communauté", - "comunidad", - "comunidade", - "Gemeinschaft", - "społeczność", - "сообщество", - "gemeenschap", - "κοινότητα" - ], - "key::15": [ - "school", - "scuola", - "école", - "escuela", - "escola", - "schule", - "Szkoła", - "школа", - "school", - "σχολείο" - ], - "key::16": [ - "education", - "educazione", - "éducation", - "educacion", - "Educação", - "Bildung", - "Edukacja", - "образование", - "opleiding", - "εκπαίδευση" - ], - "key::17": [ - "academy", - "accademia", - "académie", - "academia", - "academia", - "Akademie", - "akademie", - "академия", - "academie", - "ακαδημία" - ], - "key::18": [ - "public", - "pubblico", - "public", - "publique", - "publics", - "publiques", - "publico", - "publico", - "Öffentlichkeit", - "publiczny", - "публичный", - "publiek", - "publieke", - "δημόσιος", - "δημόσια", - "δημόσιο" - ], - "key::19": [ - "museum", - "museo", - "musée", - "mueso", - "museu", - "museum", - "muzeum", - "музей", - "museum", - "μουσείο" - ], - "key::20": [ - "group", - "gruppo", - "groupe", - "grupo", - "grupo", - "gruppe", - "grupa", - "группа", - "groep", - "ομάδα", - "όμιλος" - ], - "key::21": [ - "department", - "dipartimento", - "département", - "departamento", - "departamento", - "abteilung", - "departament", - "отдел", - "afdeling", - "τμήμα" - ], - "key::22": [ - "council", - "consiglio", - "conseil", - "Consejo", - "conselho", - "gesellschaft", - "rada", - "совет", - "raad", - "συμβούλιο" - ], - "key::23": [ - "library", - "biblioteca", - "bibliothèque", - "biblioteca", - "biblioteca", - "Bibliothek", - "biblioteka", - "библиотека", - "bibliotheek", - "βιβλιοθήκη" - ], - "key::24": [ - "ministry", - "ministero", - "ministère", - "ministerio", - "ministério", - "Ministerium", - "ministerstwo", - "министерство", - "ministerie", - "υπουργείο" - ], - "key::25": [ - "services", - "servizi", - "services", - "servicios", - "Serviços", - "Dienstleistungen", - "usługi", - "услуги", - "diensten", - "υπηρεσίες" - ], - "key::26": [ - "central", - "centrale", - "central", - "centrale", - "centrales", - "central", - "central", - "zentral", - "centralny", - "цетральный", - "centraal", - "κεντρικός", - "κεντρική", - "κεντρικό", - "κεντρικά" - ], - "key::27": [ - "general", - "generale", - "général", - "générale", - "généraux", - "générales", - "general", - "geral", - "general", - "Allgemeines", - "general", - "общий", - "algemeen", - "algemene", - "γενικός", - "γενική", - "γενικό", - "γενικά" - ], - "key::28": [ - "applied", - "applicati", - "appliqué", - "appliquée", - "appliqués", - "appliquées", - "aplicado", - "aplicada", - "angewendet", - "stosowany", - "прикладной", - "toegepast", - "toegepaste", - "εφαρμοσμένος", - "εφαρμοσμένη", - "εφαρμοσμένο", - "εφαρμοσμένα" - ], - "key::29": [ - "european", - "europee", - "europea", - "européen", - "européenne", - "européens", - "européennes", - "europeo", - "europeu", - "europäisch", - "europejski", - "европейский", - "Europees", - "Europese", - "ευρωπαϊκός", - "ευρωπαϊκή", - "ευρωπαϊκό", - "ευρωπαϊκά" - ], - "key::30": [ - "agency", - "agenzia", - "agence", - "agencia", - "agencia", - "agentur", - "agencja", - "агенция", - "agentschap", - "πρακτορείο" - ], - "key::31": [ - "laboratory", - "laboratorio", - "laboratoire", - "laboratorio", - "laboratorio", - "labor", - "laboratorium", - "лаборатория", - "laboratorium", - "εργαστήριο" - ], - "key::32": [ - "industry", - "industria", - "industrie", - "индустрия", - "industrie", - "βιομηχανία" - ], - "key::33": [ - "industrial", - "industriale", - "industriel", - "industrielle", - "industriels", - "industrielles", - "индустриальный", - "industrieel", - "βιομηχανικός", - "βιομηχανική", - "βιομηχανικό", - "βιομηχανικά", - "βιομηχανικές" - ], - "key::34": [ - "consortium", - "consorzio", - "consortium", - "консорциум", - "consortium", - "κοινοπραξία" - ], - "key::35": [ - "organization", - "organizzazione", - "organisation", - "organización", - "organização", - "organizacja", - "организация", - "organisatie", - "οργανισμός" - ], - "key::36": [ - "authority", - "autorità", - "autorité", - "авторитет", - "autoriteit" - ], - "key::37": [ - "federation", - "federazione", - "fédération", - "федерация", - "federatie", - "ομοσπονδία" - ], - "key::38": [ - "observatory", - "osservatorio", - "observatoire", - "обсерватория", - "observatorium", - "αστεροσκοπείο" - ], - "key::39": [ - "bureau", - "ufficio", - "bureau", - "офис", - "bureau", - "γραφείο" - ], - "key::40": [ - "company", - "impresa", - "compagnie", - "société", - "компания", - "bedrijf", - "εταιρία" - ], - "key::41": [ - "polytechnic", - "politecnico", - "polytechnique", - "политехника", - "polytechnisch", - "πολυτεχνείο", - "universita politecnica", - "polytechnic university", - "universidad politecnica", - "universitat politecnica", - "politechnika", - "politechniki", - "university technology", - "university science technology" - ], - "key::42": [ - "coalition", - "coalizione", - "coalition", - "коалиция", - "coalitie", - "συνασπισμός" - ], - "key::43": [ - "initiative", - "iniziativa", - "initiative", - "инициатива", - "initiatief", - "πρωτοβουλία" - ], - "key::44": [ - "academic", - "accademico", - "académique", - "universitaire", - "акадеческий academisch", - "ακαδημαϊκός", - "ακαδημαϊκή", - "ακαδημαϊκό", - "ακαδημαϊκές", - "ακαδημαϊκοί" - ], - "key::45": [ - "institution", - "istituzione", - "institution", - "институциональный", - "instelling", - "ινστιτούτο" - ], - "key::46": [ - "division", - "divisione", - "division", - "отделение", - "divisie", - "τμήμα" - ], - "key::47": [ - "committee", - "comitato", - "comité", - "комитет", - "commissie", - "επιτροπή" - ], - "key::48": [ - "promotion", - "promozione", - "продвижение", - "proothisis", - "forderung" - ], - "key::49": [ - "medical", - "medicine", - "clinical", - "medicina", - "clinici", - "médico", - "medicina", - "clínica", - "médico", - "medicina", - "clínica", - "medizinisch", - "Medizin", - "klinisch", - "medisch", - "geneeskunde", - "klinisch", - "ιατρικός", - "ιατρική", - "ιατρικό", - "ιατρικά", - "κλινικός", - "κλινική", - "κλινικό", - "κλινικά", - "tıbbi", - "tıp", - "klinik", - "orvosi", - "orvostudomány", - "klinikai", - "zdravniški", - "medicinski", - "klinični", - "meditsiini", - "kliinik", - "kliiniline" - ], - "key::50": [ - "technology", - "technological", - "tecnologia", - "tecnologie", - "tecnología", - "tecnológico", - "tecnologia", - "tecnológico", - "Technologie", - "technologisch", - "technologie", - "technologisch", - "τεχνολογία", - "τεχνολογικός", - "τεχνολογική", - "τεχνολογικό", - "teknoloji", - "teknolojik", - "technológia", - "technológiai", - "tehnologija", - "tehnološki", - "tehnoloogia", - "tehnoloogiline", - "technologii", - "technical", - "texniki", - "teknik" - ], - "key::51": [ - "science", - "scientific", - "scienza", - "scientifiche", - "scienze", - "ciencia", - "científico", - "ciência", - "científico", - "Wissenschaft", - "wissenschaftlich", - "wetenschap", - "wetenschappelijk", - "επιστήμη", - "επιστημονικός", - "επιστημονική", - "επιστημονικό", - "επιστημονικά", - "bilim", - "bilimsel", - "tudomány", - "tudományos", - "znanost", - "znanstveni", - "teadus", - "teaduslik", - "" - ], - "key::52": [ - "engineering", - "ingegneria", - "ingeniería", - "engenharia", - "Ingenieurwissenschaft", - "ingenieurswetenschappen", - "bouwkunde", - "μηχανικός", - "μηχανική", - "μηχανικό", - "mühendislik", - "mérnöki", - "Inženirstvo", - "inseneeria", - "inseneri", - "" - ], - "key::53": [ - "management", - "gestione", - "gestionale", - "gestionali", - "gestión", - "administración", - "gestão", - "administração", - "Verwaltung", - "management", - "διαχείριση", - "yönetim", - "menedzsment", - "vodstvo", - "upravljanje", - "management", - "juhtkond", - "juhtimine", - "haldus", - "" - ], - "key::54": [ - "energy", - "energia", - "energía", - "energia", - "Energie", - "energie", - "ενέργεια", - "enerji", - "energia", - "energija", - "energia", - "" - ], - "key::55": [ - "agricultural", - "agriculture", - "agricoltura", - "agricole", - "agrícola", - "agricultura", - "agrícola", - "agricultura", - "landwirtschaftlich", - "Landwirtschaft", - "landbouwkundig", - "landbouw", - "αγροτικός", - "αγροτική", - "αγροτικό", - "γεωργικός", - "γεωργική", - "γεωργικό", - "γεωργία", - "tarımsal", - "tarım", - "mezőgazdasági", - "mezőgazdaság", - "poljedelski", - "poljedelstvo", - "põllumajandus", - "põllumajanduslik", - "" - ], - "key::56": [ - "information", - "informazione", - "información", - "informação", - "Information", - "informatie", - "πληροφορία", - "bilgi", - "információ", - "informacija", - "informatsioon", - "informatycznych", - "" - ], - "key::57": [ - "social", - "sociali", - "social", - "social", - "Sozial", - "sociaal", - "maatschappelijk", - "κοινωνικός", - "κοινωνική", - "κοινωνικό", - "κοινωνικά", - "sosyal", - "szociális", - "družbeni", - "sotsiaal", - "sotsiaalne", - "" - ], - "key::58": [ - "environmental", - "ambiente", - "medioambiental", - "ambiente", - "medioambiente", - "meioambiente", - "Umwelt", - "milieu", - "milieuwetenschap", - "milieukunde", - "περιβαλλοντικός", - "περιβαλλοντική", - "περιβαλλοντικό", - "περιβαλλοντικά", - "çevre", - "környezeti", - "okoliški", - "keskonna", - "" - ], - "key::59": [ - "business", - "economia", - "economiche", - "economica", - "negocio", - "empresa", - "negócio", - "Unternehmen", - "bedrijf", - "bedrijfskunde", - "επιχείρηση", - "iş", - "üzleti", - "posel", - "ettevõte/äri", - "" - ], - "key::60": [ - "pharmaceuticals", - "pharmacy", - "farmacia", - "farmaceutica", - "farmacéutica", - "farmacia", - "farmacêutica", - "farmácia", - "Pharmazeutika", - "Arzneimittelkunde", - "farmaceutica", - "geneesmiddelen", - "apotheek", - "φαρμακευτικός", - "φαρμακευτική", - "φαρμακευτικό", - "φαρμακευτικά", - "φαρμακείο", - "ilaç", - "eczane", - "gyógyszerészeti", - "gyógyszertár", - "farmacevtika", - "lekarništvo", - "farmaatsia", - "farmatseutiline", - "" - ], - "key::61": [ - "healthcare", - "health services", - "salute", - "atenciónmédica", - "cuidadodelasalud", - "cuidadoscomasaúde", - "Gesundheitswesen", - "gezondheidszorg", - "ιατροφαρμακευτικήπερίθαλψη", - "sağlıkhizmeti", - "egészségügy", - "zdravstvo", - "tervishoid", - "tervishoiu", - "" - ], - "key::62": [ - "history", - "storia", - "historia", - "história", - "Geschichte", - "geschiedenis", - "geschiedkunde", - "ιστορία", - "tarih", - "történelem", - "zgodovina", - "ajalugu", - "" - ], - "key::63": [ - "materials", - "materiali", - "materia", - "materiales", - "materiais", - "materialen", - "υλικά", - "τεκμήρια", - "malzemeler", - "anyagok", - "materiali", - "materjalid", - "vahendid", - "" - ], - "key::64": [ - "economics", - "economia", - "economiche", - "economica", - "economía", - "economia", - "Wirtschaft", - "economie", - "οικονομικά", - "οικονομικέςεπιστήμες", - "ekonomi", - "közgazdaságtan", - "gospodarstvo", - "ekonomija", - "majanduslik", - "majandus", - "" - ], - "key::65": [ - "therapeutics", - "terapeutica", - "terapéutica", - "terapêutica", - "therapie", - "θεραπευτική", - "tedavibilimi", - "gyógykezelés", - "terapevtika", - "terapeutiline", - "ravi", - "" - ], - "key::66": [ - "oncology", - "oncologia", - "oncologico", - "oncología", - "oncologia", - "Onkologie", - "oncologie", - "ογκολογία", - "onkoloji", - "onkológia", - "onkologija", - "onkoloogia", - "" - ], - "key::67": [ - "natural", - "naturali", - "naturale", - "natural", - "natural", - "natürlich", - "natuurlijk", - "φυσικός", - "φυσική", - "φυσικό", - "φυσικά", - "doğal", - "természetes", - "naraven", - "loodus", - "" - ], - "key::68": [ - "educational", - "educazione", - "pedagogia", - "educacional", - "educativo", - "educacional", - "pädagogisch", - "educatief", - "εκπαιδευτικός", - "εκπαιδευτική", - "εκπαιδευτικό", - "εκπαιδευτικά", - "eğitimsel", - "oktatási", - "izobraževalen", - "haridus", - "hariduslik", - "" - ], - "key::69": [ - "biomedical", - "biomedica", - "biomédico", - "biomédico", - "biomedizinisch", - "biomedisch", - "βιοιατρικός", - "βιοιατρική", - "βιοιατρικό", - "βιοιατρικά", - "biyomedikal", - "orvosbiológiai", - "biomedicinski", - "biomeditsiiniline", - "" - ], - "key::70": [ - "veterinary", - "veterinaria", - "veterinarie", - "veterinaria", - "veterinária", - "tierärtzlich", - "veterinair", - "veeartsenijlkunde", - "κτηνιατρικός", - "κτηνιατρική", - "κτηνιατρικό", - "κτηνιατρικά", - "veteriner", - "állatorvosi", - "veterinar", - "veterinarski", - "veterinaaria", - "" - ], - "key::71": [ - "chemistry", - "chimica", - "química", - "química", - "Chemie", - "chemie", - "scheikunde", - "χημεία", - "kimya", - "kémia", - "kemija", - "keemia", - "" - ], - "key::72": [ - "security", - "sicurezza", - "seguridad", - "segurança", - "Sicherheit", - "veiligheid", - "ασφάλεια", - "güvenlik", - "biztonsági", - "varnost", - "turvalisus", - "julgeolek", - "" - ], - "key::73": [ - "biotechnology", - "biotecnologia", - "biotecnologie", - "biotecnología", - "biotecnologia", - "Biotechnologie", - "biotechnologie", - "βιοτεχνολογία", - "biyoteknoloji", - "biotechnológia", - "biotehnologija", - "biotehnoloogia", - "" - ], - "key::74": [ - "military", - "militare", - "militari", - "militar", - "militar", - "Militär", - "militair", - "leger", - "στρατιωτικός", - "στρατιωτική", - "στρατιωτικό", - "στρατιωτικά", - "askeri", - "katonai", - "vojaški", - "vojni", - "militaar", - "wojskowa", - "" - ], - "key::75": [ - "theological", - "teologia", - "teologico", - "teológico", - "tecnológica", - "theologisch", - "theologisch", - "θεολογικός", - "θεολογική", - "θεολογικό", - "θεολογικά", - "teolojik", - "technológiai", - "teološki", - "teoloogia", - "usuteadus", - "teoloogiline", - "" - ], - "key::76": [ - "electronics", - "elettronica", - "electrónica", - "eletrônicos", - "Elektronik", - "elektronica", - "ηλεκτρονική", - "elektronik", - "elektronika", - "elektronika", - "elektroonika", - "" - ], - "key::77": [ - "forestry", - "forestale", - "forestali", - "silvicultura", - "forestal", - "floresta", - "Forstwirtschaft", - "bosbouw", - "δασοκομία", - "δασολογία", - "ormancılık", - "erdészet", - "gozdarstvo", - "metsandus", - "" - ], - "key::78": [ - "maritime", - "marittima", - "marittime", - "marittimo", - "marítimo", - "marítimo", - "maritiem", - "ναυτικός", - "ναυτική", - "ναυτικό", - "ναυτικά", - "ναυτιλιακός", - "ναυτιλιακή", - "ναυτιλιακό", - "ναυτιλιακά", - "θαλάσσιος", - "θαλάσσια", - "θαλάσσιο", - "denizcilik", - "tengeri", - "morski", - "mere", - "merendus", - "" - ], - "key::79": [ - "sports", - "sport", - "deportes", - "esportes", - "Sport", - "sport", - "sportwetenschappen", - "άθληση", - "γυμναστικήδραστηριότητα", - "spor", - "sport", - "šport", - "sport", - "spordi", - "" - ], - "key::80": [ - "surgery", - "chirurgia", - "chirurgiche", - "cirugía", - "cirurgia", - "Chirurgie", - "chirurgie", - "heelkunde", - "εγχείρηση", - "επέμβαση", - "χειρουργικήεπέμβαση", - "cerrahi", - "sebészet", - "kirurgija", - "kirurgia", - "" - ], - "key::81": [ - "cultural", - "culturale", - "culturali", - "cultura", - "cultural", - "cultural", - "kulturell", - "cultureel", - "πολιτιστικός", - "πολιτιστική", - "πολιτιστικό", - "πολιτισμικός", - "πολιτισμική", - "πολιτισμικό", - "kültürel", - "kultúrális", - "kulturni", - "kultuuri", - "kultuuriline", - "" - ], - "key::82": [ - "computerscience", - "informatica", - "ordenador", - "computadora", - "informática", - "computación", - "cienciasdelacomputación", - "ciênciadacomputação", - "Computer", - "computer", - "υπολογιστής", - "ηλεκτρονικόςυπολογιστής", - "bilgisayar", - "számítógép", - "računalnik", - "arvuti", - "" - ], - "key::83": [ - "finance", - "financial", - "finanza", - "finanziarie", - "finanza", - "financiero", - "finanças", - "financeiro", - "Finanzen", - "finanziell", - "financiën", - "financieel", - "χρηματοοικονομικά", - "χρηματοδότηση", - "finanse", - "finansal", - "pénzügy", - "pénzügyi", - "finance", - "finančni", - "finants", - "finantsiline", - "" - ], - "key::84": [ - "communication", - "comunicazione", - "comuniciación", - "comunicação", - "Kommunikation", - "communication", - "επικοινωνία", - "iletişim", - "kommunikáció", - "komuniciranje", - "kommunikatsioon", - "" - ], - "key::85": [ - "justice", - "giustizia", - "justicia", - "justiça", - "Recht", - "Justiz", - "justitie", - "gerechtigheid", - "δικαιοσύνη", - "υπουργείοδικαιοσύνης", - "δίκαιο", - "adalet", - "igazságügy", - "pravo", - "õigus", - "" - ], - "key::86": [ - "aerospace", - "aerospaziale", - "aerospaziali", - "aeroespacio", - "aeroespaço", - "Luftfahrt", - "luchtvaart", - "ruimtevaart", - "αεροπορικός", - "αεροπορική", - "αεροπορικό", - "αεροναυπηγικός", - "αεροναυπηγική", - "αεροναυπηγικό", - "αεροναυπηγικά", - "havacılıkveuzay", - "légtér", - "zrakoplovstvo", - "atmosfäär", - "kosmos", - "" - ], - "key::87": [ - "dermatology", - "dermatologia", - "dermatología", - "dermatologia", - "Dermatologie", - "dermatologie", - "δρματολογία", - "dermatoloji", - "bőrgyógyászat", - "dermatológia", - "dermatologija", - "dermatoloogia", - "" - ], - "key::88": [ - "architecture", - "architettura", - "arquitectura", - "arquitetura", - "Architektur", - "architectuur", - "αρχιτεκτονική", - "mimarlık", - "építészet", - "arhitektura", - "arhitektuur", - "" - ], - "key::89": [ - "mathematics", - "matematica", - "matematiche", - "matemáticas", - "matemáticas", - "Mathematik", - "wiskunde", - "mathematica", - "μαθηματικά", - "matematik", - "matematika", - "matematika", - "matemaatika", - "" - ], - "key::90": [ - "language", - "lingue", - "linguistica", - "linguistiche", - "lenguaje", - "idioma", - "língua", - "idioma", - "Sprache", - "taal", - "taalkunde", - "γλώσσα", - "dil", - "nyelv", - "jezik", - "keel", - "" - ], - "key::91": [ - "neuroscience", - "neuroscienza", - "neurociencia", - "neurociência", - "Neurowissenschaft", - "neurowetenschappen", - "νευροεπιστήμη", - "nörobilim", - "idegtudomány", - "nevroznanost", - "neuroteadused", - "" - ], - "key::92": [ - "automation", - "automazione", - "automatización", - "automação", - "Automatisierung", - "automatisering", - "αυτοματοποίηση", - "otomasyon", - "automatizálás", - "avtomatizacija", - "automatiseeritud", - "" - ], - "key::93": [ - "pediatric", - "pediatria", - "pediatriche", - "pediatrico", - "pediátrico", - "pediatría", - "pediátrico", - "pediatria", - "pädiatrisch", - "pediatrische", - "παιδιατρική", - "pediatrik", - "gyermekgyógyászat", - "pediatrija", - "pediaatria", - "" - ], - "key::94": [ - "photonics", - "fotonica", - "fotoniche", - "fotónica", - "fotônica", - "Photonik", - "fotonica", - "φωτονική", - "fotonik", - "fotonika", - "fotonika", - "fotoonika", - "" - ], - "key::95": [ - "mechanics", - "meccanica", - "meccaniche", - "mecánica", - "mecânica", - "Mechanik", - "Maschinenbau", - "mechanica", - "werktuigkunde", - "μηχανικής", - "mekanik", - "gépészet", - "mehanika", - "mehaanika", - "" - ], - "key::96": [ - "psychiatrics", - "psichiatria", - "psichiatrica", - "psichiatriche", - "psiquiatría", - "psiquiatria", - "Psychiatrie", - "psychiatrie", - "ψυχιατρική", - "psikiyatrik", - "pszihiátria", - "psihiatrija", - "psühhaatria", - "" - ], - "key::97": [ - "psychology", - "fisiologia", - "psicología", - "psicologia", - "Psychologie", - "psychologie", - "ψυχολογία", - "psikoloji", - "pszihológia", - "psihologija", - "psühholoogia", - "" - ], - "key::98": [ - "automotive", - "industriaautomobilistica", - "industriadelautomóvil", - "automotriz", - "industriaautomotriz", - "automotivo", - "Automobilindustrie", - "autoindustrie", - "αυτοκίνητος", - "αυτοκίνητη", - "αυτοκίνητο", - "αυτοκινούμενος", - "αυτοκινούμενη", - "αυτοκινούμενο", - "αυτοκινητιστικός", - "αυτοκινητιστική", - "αυτοκινητιστικό", - "otomotiv", - "autóipari", - "samogiben", - "avtomobilskaindustrija", - "auto-", - "" - ], - "key::99": [ - "neurology", - "neurologia", - "neurologiche", - "neurología", - "neurologia", - "Neurologie", - "neurologie", - "zenuwleer", - "νευρολογία", - "nöroloji", - "neurológia", - "ideggyógyászat", - "nevrologija", - "neuroloogia", - "" - ], - "key::100": [ - "geology", - "geologia", - "geologiche", - "geología", - "geologia", - "Geologie", - "geologie", - "aardkunde", - "γεωλογία", - "jeoloji", - "geológia", - "földtudomány", - "geologija", - "geoloogia", - "" - ], - "key::101": [ - "microbiology", - "microbiologia", - "micro-biologia", - "microbiologiche", - "microbiología", - "microbiologia", - "Mikrobiologie", - "microbiologie", - "μικροβιολογία", - "mikrobiyoloji", - "mikrobiológia", - "mikrobiologija", - "mikrobioloogia", - "" - ], - "key::102": [ - "informatics", - "informatica", - "informática", - "informática", - "informatica", - "" - ], - "key::103": [ - "forschungsgemeinschaft", - "comunita ricerca", - "research community", - "research foundation", - "research association" - ], - "key::104": [ - "commerce", - "ticaret", - "ticarət", - "commercio", - "trade", - "handel", - "comercio" - ] - } - } -} \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml index ddea85590..e654bbbb6 100644 --- a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/oozie_app/config-default.xml @@ -1,31 +1,31 @@ jobTracker - yarnRM + yarnRM nameNode - hdfs://nameservice1 + hdfs://nameservice1 oozie.use.system.libpath - true + true oozie.action.sharelib.for.spark - spark2 + spark2 hive_metastore_uris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 hive_db_name - openaire + openaire dedupConf - {"wf":{"threshold":"0.9","dedupRun":"001","entityType":"organization","orderField":"legalname","queueMaxSize":"2000","groupMaxSize":"50","slidingWindowSize":"200","idPath":".id","rootBuilder":["organization","projectOrganization_participation_isParticipant","datasourceOrganization_provision_isProvidedBy"],"includeChildren":"true"},"pace":{"clustering":[{"name":"sortedngrampairs","fields":["legalname"],"params":{"max":2,"ngramLen":"3"}},{"name":"suffixprefix","fields":["legalname"],"params":{"max":1,"len":"3"}},{"name":"urlclustering","fields":["websiteurl"],"params":{}},{"name":"keywordsclustering","fields":["legalname"],"params":{"max":2,"windowSize":4}}],"strictConditions":[{"name":"exactMatch","fields":["gridid"]}],"conditions":[{"name":"DomainExactMatch","fields":["websiteurl"]},{"name":"exactMatch","fields":["country"]}],"model":[{"name":"country","algo":"Null","type":"String","weight":"0","ignoreMissing":"false","path":".country.classid"},{"name":"legalshortname","algo":"JaroWinklerNormalizedName","type":"String","weight":"0.1","ignoreMissing":"true","path":".legalshortname.value"},{"name":"legalname","algo":"JaroWinklerNormalizedName","type":"String","weight":"0.9","ignoreMissing":"false","path":".legalname.value","params":{"windowSize":4,"threshold":0.7}},{"name":"websiteurl","algo":"Null","type":"URL","weight":"0","ignoreMissing":"true","path":".websiteurl.value","params":{"host":0.5,"path":0.5}},{"name":"gridid","algo":"Null","type":"String","weight":"0.0","ignoreMissing":"true","path":".pid[] | select(.qualifier.classid==\"grid\") | .value"}],"blacklists":{"legalname":[]},"synonyms":{"key::1":["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],"key::2":["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],"key::3":["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],"key::4":["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],"key::5":["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],"key::6":["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],"key::7":["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"],"key::8":["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],"key::9":["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],"key::10":["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],"key::11":["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],"key::12":["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],"key::13":["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],"key::14":["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],"key::15":["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],"key::16":["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],"key::17":["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],"key::18":["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],"key::19":["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"],"key::20":["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],"key::21":["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],"key::22":["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],"key::23":["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],"key::24":["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],"key::25":["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],"key::26":["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],"key::27":["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],"key::28":["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],"key::29":["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],"key::30":["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],"key::31":["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],"key::32":["industry","industria","industrie","индустрия","industrie","βιομηχανία"],"key::33":["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],"key::34":["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],"key::35":["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],"key::36":["authority","autorità","autorité","авторитет","autoriteit"],"key::37":["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],"key::38":["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],"key::39":["bureau","ufficio","bureau","офис","bureau","γραφείο"],"key::40":["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],"key::41":["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],"key::42":["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],"key::43":["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],"key::44":["academic","accademico","académique","universitaire","акадеческий academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],"key::45":["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],"key::46":["division","divisione","division","отделение","divisie","τμήμα"],"key::47":["committee","comitato","comité","комитет","commissie","επιτροπή"],"key::48":["promotion","promozione","продвижение","proothisis","forderung"],"key::49":["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],"key::50":["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],"key::51":["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""],"key::52":["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""],"key::53":["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""],"key::54":["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""],"key::55":["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""],"key::56":["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""],"key::57":["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""],"key::58":["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""],"key::59":["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""],"key::60":["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""],"key::61":["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""],"key::62":["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""],"key::63":["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""],"key::64":["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""],"key::65":["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""],"key::66":["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""],"key::67":["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""],"key::68":["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""],"key::69":["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""],"key::70":["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""],"key::71":["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""],"key::72":["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""],"key::73":["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""],"key::74":["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""],"key::75":["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline",""],"key::76":["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""],"key::77":["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""],"key::78":["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""],"key::79":["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""],"key::80":["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""],"key::81":["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""],"key::82":["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""],"key::83":["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""],"key::84":["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""],"key::85":["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""],"key::86":["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""],"key::87":["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""],"key::88":["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""],"key::89":["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""],"key::90":["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""],"key::91":["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""],"key::92":["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],"key::93":["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],"key::94":["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],"key::95":["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],"key::96":["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],"key::97":["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],"key::98":["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],"key::99":["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""],"key::100":["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""],"key::101":["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""],"key::102":["informatics","informatica","informática","informática","informatica",""],"key::103":["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],"key::104":["commerce","ticaret","ticarət","commercio","trade","handel","comercio"]}}} + {"wf":{"threshold":"0.9","dedupRun":"001","entityType":"organization","orderField":"legalname","queueMaxSize":"2000","groupMaxSize":"50","slidingWindowSize":"200","idPath":".id","rootBuilder":["organization","projectOrganization_participation_isParticipant","datasourceOrganization_provision_isProvidedBy"],"includeChildren":"true"},"pace":{"clustering":[{"name":"sortedngrampairs","fields":["legalname"],"params":{"max":2,"ngramLen":"3"}},{"name":"suffixprefix","fields":["legalname"],"params":{"max":1,"len":"3"}},{"name":"urlclustering","fields":["websiteurl"],"params":{}},{"name":"keywordsclustering","fields":["legalname"],"params":{"max":2,"windowSize":4}}],"strictConditions":[{"name":"exactMatch","fields":["gridid"]}],"conditions":[{"name":"DomainExactMatch","fields":["websiteurl"]},{"name":"exactMatch","fields":["country"]}],"model":[{"name":"country","algo":"Null","type":"String","weight":"0","ignoreMissing":"false","path":".country.classid"},{"name":"legalshortname","algo":"JaroWinklerNormalizedName","type":"String","weight":"0.1","ignoreMissing":"true","path":".legalshortname.dedupId"},{"name":"legalname","algo":"JaroWinklerNormalizedName","type":"String","weight":"0.9","ignoreMissing":"false","path":".legalname.dedupId","params":{"windowSize":4,"threshold":0.7}},{"name":"websiteurl","algo":"Null","type":"URL","weight":"0","ignoreMissing":"true","path":".websiteurl.dedupId","params":{"host":0.5,"path":0.5}},{"name":"gridid","algo":"Null","type":"String","weight":"0.0","ignoreMissing":"true","path":".pid[] | select(.qualifier.classid==\"grid\") | .dedupId"}],"blacklists":{"legalname":[]},"synonyms":{"key::1":["university","università","università studi","universitario","universitaria","université","universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti"],"key::2":["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],"key::3":["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],"key::4":["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],"key::5":["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],"key::6":["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],"key::7":["college","collegio","université","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","universiteit","κολλέγιο"],"key::8":["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],"key::9":["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],"key::10":["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],"key::11":["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],"key::12":["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],"key::13":["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],"key::14":["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],"key::15":["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],"key::16":["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],"key::17":["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],"key::18":["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],"key::19":["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"],"key::20":["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],"key::21":["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],"key::22":["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],"key::23":["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],"key::24":["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],"key::25":["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],"key::26":["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],"key::27":["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],"key::28":["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],"key::29":["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],"key::30":["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],"key::31":["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],"key::32":["industry","industria","industrie","индустрия","industrie","βιομηχανία"],"key::33":["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],"key::34":["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],"key::35":["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],"key::36":["authority","autorità","autorité","авторитет","autoriteit"],"key::37":["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],"key::38":["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],"key::39":["bureau","ufficio","bureau","офис","bureau","γραφείο"],"key::40":["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],"key::41":["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],"key::42":["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],"key::43":["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],"key::44":["academic","accademico","académique","universitaire","акадеческий academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],"key::45":["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],"key::46":["division","divisione","division","отделение","divisie","τμήμα"],"key::47":["committee","comitato","comité","комитет","commissie","επιτροπή"],"key::48":["promotion","promozione","продвижение","proothisis","forderung"],"key::49":["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],"key::50":["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],"key::51":["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""],"key::52":["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""],"key::53":["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""],"key::54":["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""],"key::55":["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""],"key::56":["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""],"key::57":["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""],"key::58":["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""],"key::59":["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""],"key::60":["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""],"key::61":["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""],"key::62":["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""],"key::63":["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""],"key::64":["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""],"key::65":["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""],"key::66":["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""],"key::67":["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""],"key::68":["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""],"key::69":["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""],"key::70":["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""],"key::71":["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""],"key::72":["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""],"key::73":["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""],"key::74":["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""],"key::75":["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline",""],"key::76":["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""],"key::77":["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""],"key::78":["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""],"key::79":["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""],"key::80":["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""],"key::81":["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""],"key::82":["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""],"key::83":["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""],"key::84":["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""],"key::85":["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""],"key::86":["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""],"key::87":["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""],"key::88":["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""],"key::89":["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""],"key::90":["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""],"key::91":["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""],"key::92":["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],"key::93":["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],"key::94":["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],"key::95":["mechanics","meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],"key::96":["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],"key::97":["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],"key::98":["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],"key::99":["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""],"key::100":["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""],"key::101":["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""],"key::102":["informatics","informatica","informática","informática","informatica",""],"key::103":["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],"key::104":["commerce","ticaret","ticarət","commercio","trade","handel","comercio"]}}} \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java index b16b45ef1..f35baa9f8 100644 --- a/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java +++ b/dhp-workflows/dhp-dedup/src/test/java/eu/dnetlib/dedup/SparkCreateDedupTest.java @@ -1,5 +1,7 @@ package eu.dnetlib.dedup; +import com.fasterxml.jackson.databind.ObjectMapper; +import eu.dnetlib.dhp.schema.oaf.Publication; import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; import org.junit.Before; @@ -8,36 +10,37 @@ import org.junit.Test; import java.io.File; import java.io.IOException; +import java.util.List; public class SparkCreateDedupTest { - + String configuration; @Before public void setUp() throws IOException { - FileUtils.deleteDirectory(new File("/tmp/pub_dedup_vertex")); - FileUtils.deleteDirectory(new File("/tmp/pub_dedup_rels")); + configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/pub.curr.conf.json")); + } - - @Test @Ignore - public void dedupTest() throws Exception { - final String configuration = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dedup/conf/org.curr.conf.json")); - - + public void createSimRelsTest() throws Exception { SparkCreateSimRels.main(new String[] { "-mt", "local[*]", - "-s", "/home/sandro/betadump", + "-s", "/Users/miconis/dumps", "-e", "publication", "-c", configuration, "-t", "/tmp/dedup", }); + } + + @Test + @Ignore + public void createCCTest() throws Exception { SparkCreateConnectedComponent.main(new String[] { "-mt", "local[*]", - "-s", "/home/sandro/betadump", + "-s", "/Users/miconis/dumps", "-e", "publication", "-c", configuration, "-t", "/tmp/dedup", @@ -49,14 +52,10 @@ public class SparkCreateDedupTest { public void dedupRecordTest() throws Exception { SparkCreateDedupRecord.main(new String[] { "-mt", "local[*]", - "-s", "/home/sandro/betadump", + "-s", "/Users/miconis/dumps", "-e", "publication", - "-c", "configuration", + "-c", configuration, "-t", "/tmp/dedup", }); } - - - - } diff --git a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json index 55c1d2066..a878d2419 100644 --- a/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json +++ b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf.json @@ -7,7 +7,7 @@ "queueMaxSize": "2000", "groupMaxSize": "50", "slidingWindowSize": "200", - "idPath": "$.id", + "idPath": ".id", "rootBuilder": [ "organization", "projectOrganization_participation_isParticipant", @@ -117,14 +117,6 @@ "host": 0.5, "path": 0.5 } - }, - { - "name": "gridid", - "algo": "Null", - "type": "String", - "weight": "0.0", - "ignoreMissing": "true", - "path": ".pid[] | select(.qualifier.classid==\"grid\") | .value" } ], "blacklists": { diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf2.json similarity index 100% rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/conf/org.curr.conf2.json rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/org.curr.conf2.json diff --git a/dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json b/dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json similarity index 100% rename from dhp-workflows/dhp-dedup/src/main/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json rename to dhp-workflows/dhp-dedup/src/test/resources/eu/dnetlib/dedup/conf/pub.curr.conf.json diff --git a/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/config-default.xml b/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/config-default.xml index 905fb9984..292ec14c0 100644 --- a/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/config-default.xml @@ -1,18 +1,18 @@ jobTracker - yarnRM + yarnRM nameNode - hdfs://nameservice1 + hdfs://nameservice1 sourceNN - webhdfs://namenode2.hadoop.dm.openaire.eu:50071 + webhdfs://namenode2.hadoop.dm.openaire.eu:50071 oozie.use.system.libpath - true + true \ No newline at end of file diff --git a/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/workflow.xml b/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/workflow.xml index 91b97332b..5fe802118 100644 --- a/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-distcp/src/main/resources/eu/dnetlib/dhp/distcp/oozie_app/workflow.xml @@ -14,12 +14,12 @@ hbase_dump_distcp_memory_mb - 6144 + 6144 memory for distcp action copying InfoSpace dump from remote cluster hbase_dump_distcp_num_maps - 1 + 1 maximum number of simultaneous copies of InfoSpace dump from remote location diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml index fcab9dd00..abac9bba2 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/graph/oozie_app/config-default.xml @@ -1,26 +1,26 @@ jobTracker - yarnRM + yarnRM nameNode - hdfs://nameservice1 + hdfs://nameservice1 oozie.use.system.libpath - true + true oozie.action.sharelib.for.spark - spark2 + spark2 hive_metastore_uris - thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 hive_db_name - openaire + openaire \ No newline at end of file diff --git a/dhp-workflows/docs/oozie-installer.markdown b/dhp-workflows/docs/oozie-installer.markdown index b9486ad5e..90360ec35 100644 --- a/dhp-workflows/docs/oozie-installer.markdown +++ b/dhp-workflows/docs/oozie-installer.markdown @@ -54,7 +54,7 @@ Properties overriding order is the following: 2. `~/.dhp/application.properties` defined properties 3. `${workflow.source.dir}/job.properties` 4. `job-override.properties` (located in the project root dir) -5. `maven -Dparam=value` +5. `maven -Dparam=dedupId` where the maven `-Dparam` property is overriding all the other ones. @@ -73,7 +73,7 @@ Workflow definition requirements This property can be set using maven `-D` switch. -`[oozie_app]` is the default directory name however it can be set to any value as soon as `oozieAppDir` property is provided with directory name as value. +`[oozie_app]` is the default directory name however it can be set to any dedupId as soon as `oozieAppDir` property is provided with directory name as dedupId. Subworkflows are supported as well and subworkflow directories should be nested within `[oozie_app]` directory. diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index bd2ca9704..65227a782 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -73,7 +73,7 @@ attach-test-resources - + provided @@ -326,7 +326,7 @@ - +