diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java index 3da8eb4900..e971ec5bb0 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/AbstractClusteringFunction.java @@ -14,9 +14,9 @@ import eu.dnetlib.pace.config.Config; public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction { - protected Map params; + protected Map params; - public AbstractClusteringFunction(final Map params) { + public AbstractClusteringFunction(final Map params) { this.params = params; } @@ -27,7 +27,7 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i return fields .stream() .filter(f -> !f.isEmpty()) - .map(this::normalize) + .map(s -> normalize(s)) .map(s -> filterAllStopWords(s)) .map(s -> doApply(conf, s)) .map(c -> filterBlacklisted(c, ngramBlacklist)) @@ -36,11 +36,24 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i .collect(Collectors.toCollection(HashSet::new)); } - public Map getParams() { + public Map getParams() { return params; } protected Integer param(String name) { - return params.get(name); + Object val = params.get(name); + if (val == null) + return null; + if (val instanceof Number) { + return ((Number) val).intValue(); + } + return Integer.parseInt(val.toString()); + } + + protected int paramOrDefault(String name, int i) { + Integer res = param(name); + if (res == null) + res = i; + return res; } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java index 9072fbb4b2..b5db27106d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java +++ 
b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Acronyms.java @@ -13,7 +13,7 @@ import eu.dnetlib.pace.config.Config; @ClusteringClass("acronyms") public class Acronyms extends AbstractClusteringFunction { - public Acronyms(Map params) { + public Acronyms(Map params) { super(params); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java index 8b78524182..269de867d4 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ClusteringFunction.java @@ -11,6 +11,6 @@ public interface ClusteringFunction { public Collection apply(Config config, List fields); - public Map getParams(); + public Map getParams(); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java index bc8844aee0..cbfcde266c 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/ImmutableFieldValue.java @@ -12,7 +12,7 @@ import eu.dnetlib.pace.config.Config; @ClusteringClass("immutablefieldvalue") public class ImmutableFieldValue extends AbstractClusteringFunction { - public ImmutableFieldValue(final Map params) { + public ImmutableFieldValue(final Map params) { super(params); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/JSONListClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/JSONListClustering.java new file mode 100644 index 0000000000..e00092bd0c --- /dev/null +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/JSONListClustering.java @@ -0,0 +1,69 @@ + +package eu.dnetlib.pace.clustering; + +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import 
java.util.stream.Collectors; + +import org.apache.commons.lang3.StringUtils; + +import com.jayway.jsonpath.Configuration; +import com.jayway.jsonpath.DocumentContext; +import com.jayway.jsonpath.JsonPath; +import com.jayway.jsonpath.Option; + +import eu.dnetlib.pace.common.AbstractPaceFunctions; +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.util.MapDocumentUtil; + +@ClusteringClass("jsonlistclustering") +public class JSONListClustering extends AbstractPaceFunctions implements ClusteringFunction { + + private Map params; + + public JSONListClustering(Map params) { + this.params = params; + } + + @Override + public Map getParams() { + return params; + } + + @Override + public Collection apply(Config conf, List fields) { + return fields + .stream() + .filter(f -> !f.isEmpty()) + .map(s -> doApply(conf, s)) + .filter(StringUtils::isNotBlank) + .collect(Collectors.toCollection(HashSet::new)); + } + + private String doApply(Config conf, String json) { + StringBuilder st = new StringBuilder(); // accumulates the value read for each "jpath" parameter, + // every value followed by a single space + final DocumentContext documentContext = JsonPath + .using(Configuration.defaultConfiguration().addOptions(Option.SUPPRESS_EXCEPTIONS)) + .parse(json); + + // for each parameter whose name contains "jpath": read its path from the document, a missing value counts as "" + for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) { + String path = params.get(key).toString(); + String value = MapDocumentUtil.getJPathString(path, documentContext); + if (value == null || value.isEmpty()) + value = ""; + st.append(value); + st.append(" "); + } + // drop the trailing separator; the builder is empty when no "jpath" parameter exists, so never go below zero + st.setLength(Math.max(0, st.length() - 1)); + // nothing but whitespace was collected: fall back to the constant key "1" + if (StringUtils.isBlank(st)) { + return "1"; + } + return st.toString(); + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java index 38299adb43..fdd8d1fb12 100644 ---
a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/KeywordsClustering.java @@ -11,7 +11,7 @@ import eu.dnetlib.pace.config.Config; @ClusteringClass("keywordsclustering") public class KeywordsClustering extends AbstractClusteringFunction { - public KeywordsClustering(Map params) { + public KeywordsClustering(Map params) { super(params); } @@ -19,8 +19,8 @@ public class KeywordsClustering extends AbstractClusteringFunction { protected Collection doApply(final Config conf, String s) { // takes city codes and keywords codes without duplicates - Set keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4)); - Set cities = getCities(s, params.getOrDefault("windowSize", 4)); + Set keywords = getKeywords(s, conf.translationMap(), paramOrDefault("windowSize", 4)); + Set cities = getCities(s, paramOrDefault("windowSize", 4)); // list of combination to return as result final Collection combinations = new LinkedHashSet(); @@ -28,7 +28,7 @@ public class KeywordsClustering extends AbstractClusteringFunction { for (String keyword : keywordsToCodes(keywords, conf.translationMap())) { for (String city : citiesToCodes(cities)) { combinations.add(keyword + "-" + city); - if (combinations.size() >= params.getOrDefault("max", 2)) { + if (combinations.size() >= paramOrDefault("max", 2)) { return combinations; } } @@ -42,8 +42,8 @@ public class KeywordsClustering extends AbstractClusteringFunction { return fields .stream() .filter(f -> !f.isEmpty()) - .map(this::cleanup) - .map(this::normalize) + .map(KeywordsClustering::cleanup) + .map(KeywordsClustering::normalize) .map(s -> filterAllStopWords(s)) .map(s -> doApply(conf, s)) .map(c -> filterBlacklisted(c, ngramBlacklist)) diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java index 
5a385961a6..9692f57624 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LastNameFirstInitial.java @@ -16,7 +16,7 @@ public class LastNameFirstInitial extends AbstractClusteringFunction { private boolean DEFAULT_AGGRESSIVE = true; - public LastNameFirstInitial(final Map params) { + public LastNameFirstInitial(final Map params) { super(params); } @@ -25,7 +25,7 @@ public class LastNameFirstInitial extends AbstractClusteringFunction { return fields .stream() .filter(f -> !f.isEmpty()) - .map(this::normalize) + .map(LastNameFirstInitial::normalize) .map(s -> doApply(conf, s)) .map(c -> filterBlacklisted(c, ngramBlacklist)) .flatMap(c -> c.stream()) @@ -33,8 +33,7 @@ public class LastNameFirstInitial extends AbstractClusteringFunction { .collect(Collectors.toCollection(HashSet::new)); } - @Override - protected String normalize(final String s) { + public static String normalize(final String s) { return fixAliases(transliterate(nfd(unicodeNormalization(s)))) // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input // strings diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java index a3a6c48819..807f41dd59 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/LowercaseClustering.java @@ -15,7 +15,7 @@ import eu.dnetlib.pace.config.Config; @ClusteringClass("lowercase") public class LowercaseClustering extends AbstractClusteringFunction { - public LowercaseClustering(final Map params) { + public LowercaseClustering(final Map params) { super(params); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java 
b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java index aa06aa408e..bcc9667a8b 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NgramPairs.java @@ -12,11 +12,11 @@ import eu.dnetlib.pace.config.Config; @ClusteringClass("ngrampairs") public class NgramPairs extends Ngrams { - public NgramPairs(Map params) { + public NgramPairs(Map params) { super(params, false); } - public NgramPairs(Map params, boolean sorted) { + public NgramPairs(Map params, boolean sorted) { super(params, sorted); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java index 96c305a16a..7b862c729b 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/Ngrams.java @@ -10,11 +10,11 @@ public class Ngrams extends AbstractClusteringFunction { private final boolean sorted; - public Ngrams(Map params) { + public Ngrams(Map params) { this(params, false); } - public Ngrams(Map params, boolean sorted) { + public Ngrams(Map params, boolean sorted) { super(params); this.sorted = sorted; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NumAuthorsTitleSuffixPrefixChain.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NumAuthorsTitleSuffixPrefixChain.java new file mode 100644 index 0000000000..f1d1e17b90 --- /dev/null +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/NumAuthorsTitleSuffixPrefixChain.java @@ -0,0 +1,113 @@ + +package eu.dnetlib.pace.clustering; + +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +import com.google.common.base.Splitter; +import com.google.common.collect.Sets; + +import eu.dnetlib.pace.config.Config; + +@ClusteringClass("numAuthorsTitleSuffixPrefixChain") +public class 
NumAuthorsTitleSuffixPrefixChain extends AbstractClusteringFunction { + + public NumAuthorsTitleSuffixPrefixChain(Map params) { + super(params); + } + + @Override + public Collection apply(Config conf, List fields) { + + try { + int num_authors = fields.isEmpty() ? 0 : Math.min(Integer.parseInt(fields.get(0)), 21); // SIZE threshold is 20, +1 + + if (num_authors > 0) { + return super.apply(conf, fields.subList(1, fields.size())) + .stream() + .map(s -> num_authors + "-" + s) + .collect(Collectors.toList()); + } + } catch (NumberFormatException e) { + // missing, null or non-numeric authors count: no clustering key is produced + } + + return Collections.emptyList(); + } + // NOTE: the "mod" parameter is mandatory here: param() returns null when it is absent, which fails on unboxing to int + @Override + protected Collection doApply(Config conf, String s) { + return suffixPrefixChain(cleanup(s), param("mod")); + } + + private Collection suffixPrefixChain(String s, int mod) { + // create the list of words from the string (remove short words) + List wordsList = Arrays + .stream(s.split(" ")) + .filter(si -> si.length() > 3) + .collect(Collectors.toList()); + + final int words = wordsList.size(); + + // create the prefix: number of retained words divided by mod (integer division), + // followed by a dash + String prefix = words / mod + "-"; + + return doSuffixPrefixChain(wordsList, prefix); + + } + + private Collection doSuffixPrefixChain(List wordsList, String prefix) { + + Set set = Sets.newLinkedHashSet(); + switch (wordsList.size()) { + case 0: + break; + case 1: + set.add(wordsList.get(0)); + break; + case 2: + set + .add( + prefix + + suffix(wordsList.get(0), 3) + + prefix(wordsList.get(1), 3)); + + set + .add( + prefix + + prefix(wordsList.get(0), 3) + + suffix(wordsList.get(1), 3)); + + break; + default: + set + .add( + prefix + + suffix(wordsList.get(0), 3) + + prefix(wordsList.get(1), 3) + + suffix(wordsList.get(2), 3)); + + set + .add( + prefix + + prefix(wordsList.get(0), 3) + + suffix(wordsList.get(1), 3) + + prefix(wordsList.get(2), 3)); + break; + } + + return set; + + } + + private String suffix(String s, int len) { + return s.substring(s.length()
- len); + } + + private String prefix(String s, int len) { + return s.substring(0, len); + } + +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java index b4a04ce65f..91b51bebbd 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonClustering.java @@ -17,11 +17,11 @@ import eu.dnetlib.pace.model.Person; @ClusteringClass("personClustering") public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction { - private Map params; + private Map params; private static final int MAX_TOKENS = 5; - public PersonClustering(final Map params) { + public PersonClustering(final Map params) { this.params = params; } @@ -77,7 +77,7 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin // } @Override - public Map getParams() { + public Map getParams() { return params; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java index a3d58a9be3..09a112c37f 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/PersonHash.java @@ -15,7 +15,7 @@ public class PersonHash extends AbstractClusteringFunction { private boolean DEFAULT_AGGRESSIVE = false; - public PersonHash(final Map params) { + public PersonHash(final Map params) { super(params); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java index 2aab926da4..3733dfc742 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java +++ 
b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/RandomClusteringFunction.java @@ -8,7 +8,7 @@ import eu.dnetlib.pace.config.Config; public class RandomClusteringFunction extends AbstractClusteringFunction { - public RandomClusteringFunction(Map params) { + public RandomClusteringFunction(Map params) { super(params); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java index b085ae26d0..ca1b4189b3 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SortedNgramPairs.java @@ -1,7 +1,10 @@ package eu.dnetlib.pace.clustering; -import java.util.*; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Map; import com.google.common.base.Joiner; import com.google.common.base.Splitter; @@ -12,7 +15,7 @@ import eu.dnetlib.pace.config.Config; @ClusteringClass("sortedngrampairs") public class SortedNgramPairs extends NgramPairs { - public SortedNgramPairs(Map params) { + public SortedNgramPairs(Map params) { super(params, false); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java index 392aecc794..048380f7ed 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SpaceTrimmingFieldValue.java @@ -15,7 +15,7 @@ import eu.dnetlib.pace.config.Config; @ClusteringClass("spacetrimmingfieldvalue") public class SpaceTrimmingFieldValue extends AbstractClusteringFunction { - public SpaceTrimmingFieldValue(final Map params) { + public SpaceTrimmingFieldValue(final Map params) { super(params); } @@ -25,7 +25,7 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction { res 
.add( - StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength")) + StringUtils.isBlank(s) ? RandomStringUtils.random(param("randomLength")) : s.toLowerCase().replaceAll("\\s+", "")); return res; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java index 2a1c023a96..b6921e9f1a 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/SuffixPrefix.java @@ -12,7 +12,7 @@ import eu.dnetlib.pace.config.Config; @ClusteringClass("suffixprefix") public class SuffixPrefix extends AbstractClusteringFunction { - public SuffixPrefix(Map params) { + public SuffixPrefix(Map params) { super(params); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java index 5b267ad106..34f41085b4 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/UrlClustering.java @@ -15,12 +15,17 @@ import eu.dnetlib.pace.config.Config; @ClusteringClass("urlclustering") public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction { - protected Map params; + protected Map params; - public UrlClustering(final Map params) { + public UrlClustering(final Map params) { this.params = params; } + @Override + public Map getParams() { + return params; + } + @Override public Collection apply(final Config conf, List fields) { try { @@ -35,11 +40,6 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu } } - @Override - public Map getParams() { - return null; - } - private URL asUrl(String value) { try { return new URL(value); diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java 
b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java index c8e02f8f03..22351cf8ff 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsStatsSuffixPrefixChain.java @@ -11,7 +11,7 @@ import eu.dnetlib.pace.config.Config; @ClusteringClass("wordsStatsSuffixPrefixChain") public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction { - public WordsStatsSuffixPrefixChain(Map params) { + public WordsStatsSuffixPrefixChain(Map params) { super(params); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java index e606590a53..f9fef376bf 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/clustering/WordsSuffixPrefix.java @@ -12,7 +12,7 @@ import eu.dnetlib.pace.config.Config; @ClusteringClass("wordssuffixprefix") public class WordsSuffixPrefix extends AbstractClusteringFunction { - public WordsSuffixPrefix(Map params) { + public WordsSuffixPrefix(Map params) { super(params); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java index b440686ded..ba7639adad 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/common/AbstractPaceFunctions.java @@ -16,7 +16,6 @@ import org.apache.commons.lang3.StringUtils; import com.google.common.base.Joiner; import com.google.common.base.Splitter; import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; import com.google.common.collect.Sets; import com.ibm.icu.text.Transliterator; @@ -27,7 +26,7 @@ import eu.dnetlib.pace.clustering.NGramUtils; * * 
@author claudio */ -public abstract class AbstractPaceFunctions { +public class AbstractPaceFunctions { // city map to be used when translating the city names into codes private static Map cityMap = AbstractPaceFunctions @@ -62,11 +61,14 @@ public abstract class AbstractPaceFunctions { private static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})"); - protected String concat(final List l) { + private static Pattern romanNumberPattern = Pattern + .compile("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$"); + + protected static String concat(final List l) { return Joiner.on(" ").skipNulls().join(l); } - protected String cleanup(final String s) { + public static String cleanup(final String s) { final String s1 = HTML_REGEX.matcher(s).replaceAll(""); final String s2 = unicodeNormalization(s1.toLowerCase()); final String s3 = nfd(s2); @@ -82,7 +84,7 @@ public abstract class AbstractPaceFunctions { return s12; } - protected String fixXML(final String a) { + protected static String fixXML(final String a) { return a .replaceAll("–", " ") @@ -91,7 +93,7 @@ public abstract class AbstractPaceFunctions { .replaceAll("−", " "); } - protected boolean checkNumbers(final String a, final String b) { + protected static boolean checkNumbers(final String a, final String b) { final String numbersA = getNumbers(a); final String numbersB = getNumbers(b); final String romansA = getRomans(a); @@ -99,7 +101,7 @@ public abstract class AbstractPaceFunctions { return !numbersA.equals(numbersB) || !romansA.equals(romansB); } - protected String getRomans(final String s) { + protected static String getRomans(final String s) { final StringBuilder sb = new StringBuilder(); for (final String t : s.split(" ")) { sb.append(isRoman(t) ? 
t : ""); @@ -107,13 +109,12 @@ public abstract class AbstractPaceFunctions { return sb.toString(); } - protected boolean isRoman(final String s) { - return s - .replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop") - .equals("qwertyuiop"); + protected static boolean isRoman(final String s) { + Matcher m = romanNumberPattern.matcher(s); + return m.matches() && m.hitEnd(); } - protected String getNumbers(final String s) { + protected static String getNumbers(final String s) { final StringBuilder sb = new StringBuilder(); for (final String t : s.split(" ")) { sb.append(isNumber(t) ? t : ""); @@ -121,7 +122,7 @@ public abstract class AbstractPaceFunctions { return sb.toString(); } - public boolean isNumber(String strNum) { + public static boolean isNumber(String strNum) { if (strNum == null) { return false; } @@ -147,7 +148,7 @@ public abstract class AbstractPaceFunctions { } } - protected String removeSymbols(final String s) { + protected static String removeSymbols(final String s) { final StringBuilder sb = new StringBuilder(); s.chars().forEach(ch -> { @@ -157,11 +158,11 @@ public abstract class AbstractPaceFunctions { return sb.toString().replaceAll("\\s+", " "); } - protected boolean notNull(final String s) { + protected static boolean notNull(final String s) { return s != null; } - protected String normalize(final String s) { + public static String normalize(final String s) { return fixAliases(transliterate(nfd(unicodeNormalization(s)))) .toLowerCase() // do not compact the regexes in a single expression, would cause StackOverflowError in case of large input @@ -174,16 +175,16 @@ public abstract class AbstractPaceFunctions { .trim(); } - public String nfd(final String s) { + public static String nfd(final String s) { return Normalizer.normalize(s, Normalizer.Form.NFD); } - public String utf8(final String s) { + public static String utf8(final String s) { byte[] bytes = s.getBytes(StandardCharsets.UTF_8); return new String(bytes, 
StandardCharsets.UTF_8); } - public String unicodeNormalization(final String s) { + public static String unicodeNormalization(final String s) { Matcher m = hexUnicodePattern.matcher(s); StringBuffer buf = new StringBuffer(s.length()); @@ -195,7 +196,7 @@ public abstract class AbstractPaceFunctions { return buf.toString(); } - protected String filterStopWords(final String s, final Set stopwords) { + protected static String filterStopWords(final String s, final Set stopwords) { final StringTokenizer st = new StringTokenizer(s); final StringBuilder sb = new StringBuilder(); while (st.hasMoreTokens()) { @@ -208,7 +209,7 @@ public abstract class AbstractPaceFunctions { return sb.toString().trim(); } - public String filterAllStopWords(String s) { + public static String filterAllStopWords(String s) { s = filterStopWords(s, stopwords_en); s = filterStopWords(s, stopwords_de); @@ -221,7 +222,8 @@ public abstract class AbstractPaceFunctions { return s; } - protected Collection filterBlacklisted(final Collection set, final Set ngramBlacklist) { + protected static Collection filterBlacklisted(final Collection set, + final Set ngramBlacklist) { final Set newset = Sets.newLinkedHashSet(); for (final String s : set) { if (!ngramBlacklist.contains(s)) { @@ -268,7 +270,7 @@ public abstract class AbstractPaceFunctions { return m; } - public String removeKeywords(String s, Set keywords) { + public static String removeKeywords(String s, Set keywords) { s = " " + s + " "; for (String k : keywords) { @@ -278,39 +280,39 @@ public abstract class AbstractPaceFunctions { return s.trim(); } - public double commonElementsPercentage(Set s1, Set s2) { + public static double commonElementsPercentage(Set s1, Set s2) { double longer = Math.max(s1.size(), s2.size()); return (double) s1.stream().filter(s2::contains).count() / longer; } // convert the set of keywords to codes - public Set toCodes(Set keywords, Map translationMap) { + public static Set toCodes(Set keywords, Map translationMap) { 
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet()); } - public Set keywordsToCodes(Set keywords, Map translationMap) { + public static Set keywordsToCodes(Set keywords, Map translationMap) { return toCodes(keywords, translationMap); } - public Set citiesToCodes(Set keywords) { + public static Set citiesToCodes(Set keywords) { return toCodes(keywords, cityMap); } - protected String firstLC(final String s) { + protected static String firstLC(final String s) { return StringUtils.substring(s, 0, 1).toLowerCase(); } - protected Iterable tokens(final String s, final int maxTokens) { + protected static Iterable tokens(final String s, final int maxTokens) { return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens); } - public String normalizePid(String pid) { + public static String normalizePid(String pid) { return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll(""); } // get the list of keywords into the input string - public Set getKeywords(String s1, Map translationMap, int windowSize) { + public static Set getKeywords(String s1, Map translationMap, int windowSize) { String s = s1; @@ -340,7 +342,7 @@ public abstract class AbstractPaceFunctions { return codes; } - public Set getCities(String s1, int windowSize) { + public static Set getCities(String s1, int windowSize) { return getKeywords(s1, cityMap, windowSize); } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java index d9ad81d42b..5ede2c3804 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/ClusteringDef.java @@ -18,7 +18,7 @@ public class ClusteringDef implements Serializable { private List fields; - private Map params; + private Map params; public ClusteringDef() { } @@ -43,11 +43,11 @@ public class ClusteringDef implements Serializable { this.fields = 
fields; } - public Map getParams() { + public Map getParams() { return params; } - public void setParams(final Map params) { + public void setParams(final Map params) { this.params = params; } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java index f34545e6df..7ad9b74458 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/FieldDef.java @@ -2,6 +2,7 @@ package eu.dnetlib.pace.model; import java.io.Serializable; +import java.util.HashSet; import java.util.List; import com.fasterxml.jackson.core.JsonProcessingException; @@ -36,6 +37,16 @@ public class FieldDef implements Serializable { */ private int length = -1; + private HashSet filter; + + private boolean sorted; + + public boolean isSorted() { + return sorted; + } + + private String clean; + public FieldDef() { } @@ -91,6 +102,30 @@ public class FieldDef implements Serializable { this.path = path; } + public HashSet getFilter() { + return filter; + } + + public void setFilter(HashSet filter) { + this.filter = filter; + } + + public boolean getSorted() { + return sorted; + } + + public void setSorted(boolean sorted) { + this.sorted = sorted; + } + + public String getClean() { + return clean; + } + + public void setClean(String clean) { + this.clean = clean; + } + @Override public String toString() { try { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala index b3f56bcdbe..bc702b9e2d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkDeduper.scala @@ -5,9 +5,9 @@ import eu.dnetlib.pace.util.{BlockProcessor, SparkReporter} import org.apache.spark.SparkContext import org.apache.spark.sql.catalyst.expressions.Literal import 
org.apache.spark.sql.expressions._ -import org.apache.spark.sql.functions.{col, lit, udf} +import org.apache.spark.sql.functions.{col, desc, expr, lit, udf} import org.apache.spark.sql.types._ -import org.apache.spark.sql.{Column, Dataset, Row, functions} +import org.apache.spark.sql.{Column, Dataset, Row, SaveMode, functions} import java.util.function.Predicate import java.util.stream.Collectors @@ -80,6 +80,8 @@ case class SparkDeduper(conf: DedupConfig) extends Serializable { .withColumn("key", functions.explode(clusterValuesUDF(cd).apply(functions.array(inputColumns: _*)))) // Add position column having the position of the row within the set of rows having the same key value ordered by the sorting value .withColumn("position", functions.row_number().over(Window.partitionBy("key").orderBy(col(model.orderingFieldName), col(model.identifierFieldName)))) + // .withColumn("count", functions.max("position").over(Window.partitionBy("key").orderBy(col(model.orderingFieldName), col(model.identifierFieldName)).rowsBetween(Window.unboundedPreceding,Window.unboundedFollowing) )) + // .filter("count > 1") if (df_with_clustering_keys == null) df_with_clustering_keys = ds @@ -88,20 +90,44 @@ case class SparkDeduper(conf: DedupConfig) extends Serializable { } //TODO: analytics + /*df_with_clustering_keys.groupBy(col("clustering"), col("key")) + .agg(expr("max(count) AS size")) + .orderBy(desc("size")) + .show*/ val df_with_blocks = df_with_clustering_keys - // filter out rows with position exceeding the maxqueuesize parameter - .filter(col("position").leq(conf.getWf.getQueueMaxSize)) - .groupBy("clustering", "key") + // split the clustering block into smaller blocks of queuemaxsize + .groupBy(col("clustering"), col("key"), functions.floor(col("position").divide(lit(conf.getWf.getQueueMaxSize)))) .agg(functions.collect_set(functions.struct(model.schema.fieldNames.map(col): _*)).as("block")) .filter(functions.size(new Column("block")).gt(1)) + .union( + //adjacency blocks + 
df_with_clustering_keys + // filter out leading and trailing elements + .filter(col("position").gt(conf.getWf.getSlidingWindowSize/2)) + //.filter(col("position").lt(col("count").minus(conf.getWf.getSlidingWindowSize/2))) + // create small blocks of records on "the border" of maxqueuesize: getSlidingWindowSize/2 elements before and after + .filter( + col("position").mod(conf.getWf.getQueueMaxSize).lt(conf.getWf.getSlidingWindowSize/2) // slice of the start of block + || col("position").mod(conf.getWf.getQueueMaxSize).gt(conf.getWf.getQueueMaxSize - (conf.getWf.getSlidingWindowSize/2)) //slice of the end of the block + ) + .groupBy(col("clustering"), col("key"), functions.floor((col("position") + lit(conf.getWf.getSlidingWindowSize/2)).divide(lit(conf.getWf.getQueueMaxSize)))) + .agg(functions.collect_set(functions.struct(model.schema.fieldNames.map(col): _*)).as("block")) + .filter(functions.size(new Column("block")).gt(1)) + ) df_with_blocks } def clusterValuesUDF(cd: ClusteringDef) = { udf[mutable.WrappedArray[String], mutable.WrappedArray[Any]](values => { - values.flatMap(f => cd.clusteringFunction().apply(conf, Seq(f.toString).asJava).asScala) + val valueList = values.flatMap { + case a: mutable.WrappedArray[Any] => a.map(_.toString) + case s: Any => Seq(s.toString) + }.asJava; + + mutable.WrappedArray.make(cd.clusteringFunction().apply(conf, valueList).toArray()) + }) } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala index aa997c6e9f..aa04188dae 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/model/SparkModel.scala @@ -1,13 +1,16 @@ package eu.dnetlib.pace.model import com.jayway.jsonpath.{Configuration, JsonPath} +import eu.dnetlib.pace.common.AbstractPaceFunctions import eu.dnetlib.pace.config.{DedupConfig, Type} import eu.dnetlib.pace.util.MapDocumentUtil +import 
org.apache.commons.lang3.StringUtils import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType} import org.apache.spark.sql.{Dataset, Row} +import java.util.Locale import java.util.regex.Pattern import scala.collection.JavaConverters._ @@ -60,7 +63,7 @@ case class SparkModel(conf: DedupConfig) { values(identityFieldPosition) = MapDocumentUtil.getJPathString(conf.getWf.getIdPath, documentContext) schema.fieldNames.zipWithIndex.foldLeft(values) { - case ((res, (fname, index))) => { + case ((res, (fname, index))) => val fdef = conf.getPace.getModelMap.get(fname) if (fdef != null) { @@ -96,13 +99,52 @@ case class SparkModel(conf: DedupConfig) { case Type.DoubleArray => MapDocumentUtil.getJPathArray(fdef.getPath, json) } + + val filter = fdef.getFilter + + if (StringUtils.isNotBlank(fdef.getClean)) { + res(index) = res(index) match { + case x: Seq[String] => x.map(clean(_, fdef.getClean)).toSeq + case _ => clean(res(index).toString, fdef.getClean) + } + } + + if (filter != null && !filter.isEmpty) { + res(index) = res(index) match { + case x: String if filter.contains(x.toLowerCase(Locale.ROOT)) => null + case x: Seq[String] => x.filter(s => !filter.contains(s.toLowerCase(Locale.ROOT))).toSeq + case _ => res(index) + } + } + + if (fdef.getSorted) { + res(index) = res(index) match { + case x: Seq[String] => x.sorted.toSeq + case _ => res(index) + } + } } res - } } new GenericRowWithSchema(values, schema) } + + def clean(value: String, cleantype: String) : String = { + val res = cleantype match { + case "title" => AbstractPaceFunctions.cleanup(value) + case _ => value + } + +// if (!res.equals(AbstractPaceFunctions.normalize(value))) { +// println(res) +// println(AbstractPaceFunctions.normalize(value)) +// println() +// } + + res + } + } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java 
b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java index 5c6939e601..edad0ae2e7 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/AuthorsMatch.java @@ -23,7 +23,6 @@ public class AuthorsMatch extends AbstractListComparator { private String MODE; // full or surname private int SIZE_THRESHOLD; private String TYPE; // count or percentage - private int common; public AuthorsMatch(Map params) { super(params, new com.wcohen.ss.JaroWinkler()); @@ -35,7 +34,6 @@ public class AuthorsMatch extends AbstractListComparator { FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9")); SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20")); TYPE = params.getOrDefault("type", "percentage"); - common = 0; } protected AuthorsMatch(double w, AbstractStringDistance ssalgo) { @@ -44,22 +42,27 @@ public class AuthorsMatch extends AbstractListComparator { @Override public double compare(final List a, final List b, final Config conf) { - if (a.isEmpty() || b.isEmpty()) return -1; if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD) return 1.0; - List aList = a.stream().map(author -> new Person(author, false)).collect(Collectors.toList()); + int maxMiss = Integer.MAX_VALUE; List bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList()); - common = 0; + Double threshold = getDoubleParam("threshold"); + + if (threshold != null && threshold >= 0.0 && threshold <= 1.0 && a.size() == b.size()) { + maxMiss = (int) Math.floor((1 - threshold) * Math.max(a.size(), b.size())); + } + + int common = 0; // compare each element of List1 with each element of List2 - for (Person p1 : aList) + for (int i = 0; i < a.size(); i++) { + Person p1 = new Person(a.get(i), false); for (Person p2 : bList) { - // both persons are inaccurate if (!p1.isAccurate() && !p2.isAccurate()) { // compare just normalized fullnames @@ -118,11 +121,15 
@@ public class AuthorsMatch extends AbstractListComparator { } } - } + if (i - common > maxMiss) { + return 0.0; + } + } + // normalization factor to compute the score - int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common); + int normFactor = a.size() == b.size() ? a.size() : (a.size() + b.size() - common); if (TYPE.equals("percentage")) { return (double) common / normFactor; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java index 238cb16cec..34ebcf7a7d 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/InstanceTypeMatch.java @@ -25,6 +25,7 @@ public class InstanceTypeMatch extends AbstractListComparator { translationMap.put("Conference object", "*"); translationMap.put("Other literature type", "*"); translationMap.put("Unknown", "*"); + translationMap.put("UNKNOWN", "*"); // article types translationMap.put("Article", "Article"); @@ -76,5 +77,4 @@ public class InstanceTypeMatch extends AbstractListComparator { protected double normalize(final double d) { return d; } - } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java index 877cb95abd..e2ee062b56 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/LevensteinTitle.java @@ -3,6 +3,7 @@ package eu.dnetlib.pace.tree; import java.util.Map; +import org.apache.commons.lang3.StringUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; @@ -30,16 +31,25 @@ public class LevensteinTitle extends AbstractStringComparator { } @Override - public double distance(final String a, final String b, final Config conf) { - final String ca = cleanup(a); - final String cb 
= cleanup(b); - + public double distance(final String ca, final String cb, final Config conf) { final boolean check = checkNumbers(ca, cb); if (check) return 0.5; - return normalize(ssalgo.score(ca, cb), ca.length(), cb.length()); + Double threshold = getDoubleParam("threshold"); + + // reduce Levenshtein algo complexity when target threshold is known + if (threshold != null && threshold >= 0.0 && threshold <= 1.0) { + int maxdistance = (int) Math.floor((1 - threshold) * Math.max(ca.length(), cb.length())); + int score = StringUtils.getLevenshteinDistance(ca, cb, maxdistance); + if (score == -1) { + return 0; + } + return normalize(score, ca.length(), cb.length()); + } else { + return normalize(StringUtils.getLevenshteinDistance(ca, cb), ca.length(), cb.length()); + } } private double normalize(final double score, final int la, final int lb) { diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MaxLengthMatch.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MaxLengthMatch.java new file mode 100644 index 0000000000..8f525c6d50 --- /dev/null +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/MaxLengthMatch.java @@ -0,0 +1,29 @@ + +package eu.dnetlib.pace.tree; + +import java.util.Map; + +import eu.dnetlib.pace.config.Config; +import eu.dnetlib.pace.tree.support.AbstractStringComparator; +import eu.dnetlib.pace.tree.support.ComparatorClass; + +@ComparatorClass("maxLengthMatch") +public class MaxLengthMatch extends AbstractStringComparator { + + private final int limit; + + public MaxLengthMatch(Map params) { + super(params); + + limit = Integer.parseInt(params.getOrDefault("limit", "200")); + } + + @Override + public double compare(String a, String b, final Config conf) { + return a.length() < limit && b.length() < limit ? 
1.0 : -1.0; + } + + protected String toString(final Object object) { + return toFirstString(object); + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java index 8a957c5e32..cde73fd2b4 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/tree/support/AbstractComparator.java @@ -127,4 +127,14 @@ public abstract class AbstractComparator extends AbstractPaceFunctions implem return this.weight; } + public Double getDoubleParam(String name) { + String svalue = params.get(name); + + try { + return Double.parseDouble(svalue); + } catch (Throwable t) { + } + + return null; + } } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java index c2b0ddda7e..177ad73df7 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/BlockProcessor.java @@ -67,8 +67,10 @@ public class BlockProcessor { private void processRows(final List queue, final Reporter context) { - for (int pivotPos = 0; pivotPos < queue.size(); pivotPos++) { - final Row pivot = queue.get(pivotPos); + IncrementalConnectedComponents icc = new IncrementalConnectedComponents(queue.size()); + + for (int i = 0; i < queue.size(); i++) { + final Row pivot = queue.get(i); final String idPivot = pivot.getString(identifierFieldPos); // identifier final Object fieldsPivot = getJavaValue(pivot, orderFieldPos); @@ -76,9 +78,9 @@ public class BlockProcessor { final WfConfig wf = dedupConf.getWf(); if (fieldPivot != null) { - int i = 0; - for (int windowPos = pivotPos + 1; windowPos < queue.size(); windowPos++) { - final Row curr = queue.get(windowPos); + for (int j = icc.nextUnconnected(i, i + 1); j >= 0 + && j < queue.size(); j = 
icc.nextUnconnected(i, j + 1)) { + final Row curr = queue.get(j); final String idCurr = curr.getString(identifierFieldPos); // identifier if (mustSkip(idCurr)) { @@ -86,7 +88,7 @@ public class BlockProcessor { break; } - if (++i > wf.getSlidingWindowSize()) { + if (wf.getSlidingWindowSize() > 0 && (j - i) > wf.getSlidingWindowSize()) { break; } @@ -97,7 +99,9 @@ public class BlockProcessor { final TreeProcessor treeProcessor = new TreeProcessor(dedupConf); - emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context); + if (emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context)) { + icc.connect(i, j); + } } } } @@ -115,7 +119,8 @@ public class BlockProcessor { return null; } - private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) { + private boolean emitOutput(final boolean result, final String idPivot, final String idCurr, + final Reporter context) { if (result) { if (idPivot.compareTo(idCurr) <= 0) { @@ -127,6 +132,8 @@ public class BlockProcessor { } else { context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1); } + + return result; } private boolean mustSkip(final String idPivot) { @@ -142,5 +149,4 @@ public class BlockProcessor { context.emit(type, from, to); } - } diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/IncrementalConnectedComponents.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/IncrementalConnectedComponents.java new file mode 100644 index 0000000000..ed35239a85 --- /dev/null +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/IncrementalConnectedComponents.java @@ -0,0 +1,50 @@ + +package eu.dnetlib.pace.util; + +import java.util.BitSet; + +public class IncrementalConnectedComponents { + final private int size; + + final private BitSet[] indexes; + + IncrementalConnectedComponents(int size) { + this.size = size; + this.indexes = new BitSet[size]; + } + + public void connect(int i, int 
j) { + if (indexes[i] == null) { + if (indexes[j] == null) { + indexes[i] = new BitSet(size); + } else { + indexes[i] = indexes[j]; + } + } else { + if (indexes[j] != null && indexes[i] != indexes[j]) { + // merge adjacency lists for i and j + indexes[i].or(indexes[j]); + } + } + + indexes[i].set(i); + indexes[i].set(j); + indexes[j] = indexes[i]; + } + + public int nextUnconnected(int i, int j) { + if (indexes[i] == null) { + return j; + } + int result = indexes[i].nextClearBit(j); + + return (result >= size) ? -1 : result; + } + + public BitSet getConnections(int i) { + if (indexes[i] == null) { + return null; + } + return indexes[i]; + } +} diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java index 28244cb3b6..7dc3406633 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/MapDocumentUtil.java @@ -97,6 +97,8 @@ public class MapDocumentUtil { Object o = json.read(jsonPath); if (o instanceof String) return (String) o; + if (o instanceof Number) + return (String) o.toString(); if (o instanceof JSONArray && ((JSONArray) o).size() > 0) return (String) ((JSONArray) o).get(0); return ""; diff --git a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java index 252205c79c..746892f0cf 100644 --- a/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java +++ b/dhp-pace-core/src/main/java/eu/dnetlib/pace/util/PaceResolver.java @@ -40,7 +40,7 @@ public class PaceResolver implements Serializable { Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class) cl)); } - public ClusteringFunction getClusteringFunction(String name, Map params) throws PaceException { + public ClusteringFunction getClusteringFunction(String name, Map params) throws PaceException { try { return 
clusteringFunctions.get(name).getDeclaredConstructor(Map.class).newInstance(params); } catch (InstantiationException | IllegalAccessException | InvocationTargetException diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java index f9a1ea9e2a..80e349a3f7 100644 --- a/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/clustering/ClusteringFunctionTest.java @@ -15,7 +15,7 @@ import eu.dnetlib.pace.config.DedupConfig; public class ClusteringFunctionTest extends AbstractPaceTest { - private static Map params; + private static Map params; private static DedupConfig conf; @BeforeAll @@ -40,10 +40,10 @@ public class ClusteringFunctionTest extends AbstractPaceTest { @Test public void testNgram() { - params.put("ngramLen", 3); - params.put("max", 8); - params.put("maxPerToken", 2); - params.put("minNgramLen", 1); + params.put("ngramLen", "3"); + params.put("max", "8"); + params.put("maxPerToken", "2"); + params.put("minNgramLen", "1"); final ClusteringFunction ngram = new Ngrams(params); @@ -54,8 +54,8 @@ public class ClusteringFunctionTest extends AbstractPaceTest { @Test public void testNgramPairs() { - params.put("ngramLen", 3); - params.put("max", 2); + params.put("ngramLen", "3"); + params.put("max", "2"); final ClusteringFunction np = new NgramPairs(params); @@ -66,8 +66,8 @@ public class ClusteringFunctionTest extends AbstractPaceTest { @Test public void testSortedNgramPairs() { - params.put("ngramLen", 3); - params.put("max", 2); + params.put("ngramLen", "3"); + params.put("max", "2"); final ClusteringFunction np = new SortedNgramPairs(params); @@ -87,9 +87,9 @@ public class ClusteringFunctionTest extends AbstractPaceTest { @Test public void testAcronym() { - params.put("max", 4); - params.put("minLen", 1); - params.put("maxLen", 3); + params.put("max", "4"); + 
params.put("minLen", "1"); + params.put("maxLen", "3"); final ClusteringFunction acro = new Acronyms(params); @@ -100,8 +100,8 @@ public class ClusteringFunctionTest extends AbstractPaceTest { @Test public void testSuffixPrefix() { - params.put("len", 3); - params.put("max", 4); + params.put("len", "3"); + params.put("max", "4"); final ClusteringFunction sp = new SuffixPrefix(params); @@ -109,8 +109,8 @@ public class ClusteringFunctionTest extends AbstractPaceTest { System.out.println(s); System.out.println(sp.apply(conf, Lists.newArrayList(s))); - params.put("len", 3); - params.put("max", 1); + params.put("len", "3"); + params.put("max", "1"); System.out.println(sp.apply(conf, Lists.newArrayList("Framework for general-purpose deduplication"))); } @@ -118,8 +118,8 @@ public class ClusteringFunctionTest extends AbstractPaceTest { @Test public void testWordsSuffixPrefix() { - params.put("len", 3); - params.put("max", 4); + params.put("len", "3"); + params.put("max", "4"); final ClusteringFunction sp = new WordsSuffixPrefix(params); @@ -130,7 +130,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { @Test public void testWordsStatsSuffixPrefix() { - params.put("mod", 10); + params.put("mod", "10"); final ClusteringFunction sp = new WordsStatsSuffixPrefixChain(params); @@ -167,7 +167,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest { @Test public void testFieldValue() { - params.put("randomLength", 5); + params.put("randomLength", "5"); final ClusteringFunction sp = new SpaceTrimmingFieldValue(params); diff --git a/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/IncrementalConnectedComponentsTest.java b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/IncrementalConnectedComponentsTest.java new file mode 100644 index 0000000000..b0f105d7ce --- /dev/null +++ b/dhp-pace-core/src/test/java/eu/dnetlib/pace/util/IncrementalConnectedComponentsTest.java @@ -0,0 +1,40 @@ + +package eu.dnetlib.pace.util; + +import static 
org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; + +import org.junit.jupiter.api.Test; + +public class IncrementalConnectedComponentsTest { + + @Test + public void transitiveClosureTest() { + IncrementalConnectedComponents icc = new IncrementalConnectedComponents(10); + + icc.connect(0, 1); + icc.connect(0, 2); + icc.connect(0, 3); + + icc.connect(1, 2); + icc.connect(1, 4); + icc.connect(1, 5); + + icc.connect(6, 7); + icc.connect(6, 9); + + assertEquals(icc.getConnections(0).toString(), "{0, 1, 2, 3, 4, 5}"); + assertEquals(icc.getConnections(1).toString(), "{0, 1, 2, 3, 4, 5}"); + assertEquals(icc.getConnections(2).toString(), "{0, 1, 2, 3, 4, 5}"); + assertEquals(icc.getConnections(3).toString(), "{0, 1, 2, 3, 4, 5}"); + assertEquals(icc.getConnections(4).toString(), "{0, 1, 2, 3, 4, 5}"); + assertEquals(icc.getConnections(5).toString(), "{0, 1, 2, 3, 4, 5}"); + + assertEquals(icc.getConnections(6).toString(), "{6, 7, 9}"); + assertEquals(icc.getConnections(7).toString(), "{6, 7, 9}"); + assertEquals(icc.getConnections(9).toString(), "{6, 7, 9}"); + + assertNull(icc.getConnections(8)); + } + +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java index 68af3d6994..0af7bb6d01 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/AbstractSparkAction.java @@ -101,6 +101,10 @@ abstract class AbstractSparkAction implements Serializable { return SparkSession.builder().config(conf).getOrCreate(); } + protected static SparkSession getSparkWithHiveSession(SparkConf conf) { + return SparkSession.builder().enableHiveSupport().config(conf).getOrCreate(); + } + protected static void save(Dataset dataset, String outPath, SaveMode mode) { 
dataset.write().option("compression", "gzip").mode(mode).json(outPath); } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java index 60669106a7..4c12d1dc65 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/DedupRecordFactory.java @@ -1,128 +1,187 @@ package eu.dnetlib.dhp.oa.dedup; -import java.lang.reflect.InvocationTargetException; -import java.util.*; -import java.util.stream.Collectors; - -import org.apache.commons.beanutils.BeanUtils; -import org.apache.commons.lang3.StringUtils; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.MapGroupsFunction; -import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SparkSession; - -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import com.google.common.collect.Lists; - import eu.dnetlib.dhp.oa.dedup.model.Identifier; import eu.dnetlib.dhp.oa.merge.AuthorMerger; import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.DataInfo; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.dhp.schema.oaf.Result; +import org.apache.commons.beanutils.BeanUtils; +import org.apache.commons.lang3.StringUtils; +import org.apache.spark.api.java.function.FlatMapFunction; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.api.java.function.ReduceFunction; +import org.apache.spark.sql.*; import scala.Tuple2; +import scala.Tuple3; +import scala.collection.JavaConversions; + +import java.util.*; +import java.util.stream.Stream; public class 
DedupRecordFactory { + public static final class DedupRecordReduceState { + public final String dedupId; - protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() - .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + public final ArrayList aliases = new ArrayList<>(); - private DedupRecordFactory() { - } + public final HashSet acceptanceDate = new HashSet<>(); - public static Dataset createDedupRecord( - final SparkSession spark, - final DataInfo dataInfo, - final String mergeRelsInputPath, - final String entitiesInputPath, - final Class clazz) { + public OafEntity entity; - long ts = System.currentTimeMillis(); + public DedupRecordReduceState(String dedupId, String id, OafEntity entity) { + this.dedupId = dedupId; + this.entity = entity; + if (entity == null) { + aliases.add(id); + } else { + if (Result.class.isAssignableFrom(entity.getClass())) { + Result result = (Result) entity; + if (result.getDateofacceptance() != null && StringUtils.isNotBlank(result.getDateofacceptance().getValue())) { + acceptanceDate.add(result.getDateofacceptance().getValue()); + } + } + } + } - // - Dataset> entities = spark - .read() - .textFile(entitiesInputPath) - .map( - (MapFunction>) it -> { - T entity = OBJECT_MAPPER.readValue(it, clazz); - return new Tuple2<>(entity.getId(), entity); - }, - Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + public String getDedupId() { + return dedupId; + } + } + private static final int MAX_ACCEPTANCE_DATE = 20; - // : source is the dedup_id, target is the id of the mergedIn - Dataset> mergeRels = spark - .read() - .load(mergeRelsInputPath) - .as(Encoders.bean(Relation.class)) - .where("relClass == 'merges'") - .map( - (MapFunction>) r -> new Tuple2<>(r.getSource(), r.getTarget()), - Encoders.tuple(Encoders.STRING(), Encoders.STRING())); + private DedupRecordFactory() { + } - return mergeRels - .joinWith(entities, mergeRels.col("_2").equalTo(entities.col("_1")), "inner") - .map( - (MapFunction, 
Tuple2>, Tuple2>) value -> new Tuple2<>( - value._1()._1(), value._2()._2()), - Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))) - .groupByKey( - (MapFunction, String>) Tuple2::_1, Encoders.STRING()) - .mapGroups( - (MapGroupsFunction, T>) (key, - values) -> entityMerger(key, values, ts, dataInfo, clazz), - Encoders.bean(clazz)); - } + public static Dataset createDedupRecord( + final SparkSession spark, + final DataInfo dataInfo, + final String mergeRelsInputPath, + final String entitiesInputPath, + final Class clazz) { - public static T entityMerger( - String id, Iterator> entities, long ts, DataInfo dataInfo, Class clazz) - throws IllegalAccessException, InstantiationException, InvocationTargetException { + final long ts = System.currentTimeMillis(); + final Encoder beanEncoder = Encoders.bean(clazz); + final Encoder kryoEncoder = Encoders.kryo(clazz); - final Comparator> idComparator = new IdentifierComparator<>(); + // + Dataset entities = spark + .read() + .schema(Encoders.bean(clazz).schema()) + .json(entitiesInputPath) + .as(beanEncoder) + .map( + (MapFunction>) entity -> { + return new Tuple2<>(entity.getId(), entity); + }, + Encoders.tuple(Encoders.STRING(), kryoEncoder)) + .selectExpr("_1 AS id", "_2 AS kryoObject"); - final LinkedList entityList = Lists - .newArrayList(entities) - .stream() - .map(t -> Identifier.newInstance(t._2())) - .sorted(idComparator) - .map(Identifier::getEntity) - .collect(Collectors.toCollection(LinkedList::new)); + // : source is the dedup_id, target is the id of the mergedIn + Dataset mergeRels = spark + .read() + .load(mergeRelsInputPath) + .where("relClass == 'merges'") + .selectExpr("source as dedupId", "target as id"); - final T entity = clazz.newInstance(); - final T first = entityList.removeFirst(); + return mergeRels + .join(entities, JavaConversions.asScalaBuffer(Collections.singletonList("id")), "left") + .select("dedupId", "id", "kryoObject") + .as(Encoders.tuple(Encoders.STRING(), Encoders.STRING(), 
kryoEncoder)) + .map((MapFunction, DedupRecordReduceState>) t -> new DedupRecordReduceState(t._1(), t._2(), t._3()), Encoders.kryo(DedupRecordReduceState.class)) + .groupByKey((MapFunction) DedupRecordReduceState::getDedupId, Encoders.STRING()) + .reduceGroups( + (ReduceFunction) (t1, t2) -> { + if (t1.entity == null) { + t2.aliases.addAll(t1.aliases); + return t2; + } + if (t1.acceptanceDate.size() < MAX_ACCEPTANCE_DATE) { + t1.acceptanceDate.addAll(t2.acceptanceDate); + } + t1.aliases.addAll(t2.aliases); + t1.entity = reduceEntity(t1.entity, t2.entity); - BeanUtils.copyProperties(entity, first); + return t1; + } + ) + .flatMap + ((FlatMapFunction, OafEntity>) t -> { + String dedupId = t._1(); + DedupRecordReduceState agg = t._2(); - final List> authors = Lists.newArrayList(); + if (agg.acceptanceDate.size() >= MAX_ACCEPTANCE_DATE) { + return Collections.emptyIterator(); + } - entityList - .forEach( - duplicate -> { - entity.mergeFrom(duplicate); - if (ModelSupport.isSubClass(duplicate, Result.class)) { - Result r1 = (Result) duplicate; - Optional - .ofNullable(r1.getAuthor()) - .ifPresent(a -> authors.add(a)); - } - }); + return Stream.concat(Stream.of(agg.getDedupId()), agg.aliases.stream()) + .map(id -> { + try { + OafEntity res = (OafEntity) BeanUtils.cloneBean(agg.entity); + res.setId(id); + res.setDataInfo(dataInfo); + res.setLastupdatetimestamp(ts); + return res; + } catch (Exception e) { + throw new RuntimeException(e); + } + }).iterator(); + }, beanEncoder); + } - // set authors and date - if (ModelSupport.isSubClass(entity, Result.class)) { - Optional - .ofNullable(((Result) entity).getAuthor()) - .ifPresent(a -> authors.add(a)); + private static OafEntity reduceEntity(OafEntity entity, OafEntity duplicate) { - ((Result) entity).setAuthor(AuthorMerger.merge(authors)); + if (duplicate == null) { + return entity; } - entity.setId(id); - entity.setLastupdatetimestamp(ts); - entity.setDataInfo(dataInfo); + int compare = new IdentifierComparator<>() + 
.compare(Identifier.newInstance(entity), Identifier.newInstance(duplicate)); - return entity; - } + if (compare > 0) { + OafEntity swap = duplicate; + duplicate = entity; + entity = swap; + } + + entity.mergeFrom(duplicate); + + if (ModelSupport.isSubClass(duplicate, Result.class)) { + Result re = (Result) entity; + Result rd = (Result) duplicate; + + List> authors = new ArrayList<>(); + if (re.getAuthor() != null) { + authors.add(re.getAuthor()); + } + if (rd.getAuthor() != null) { + authors.add(rd.getAuthor()); + } + + re.setAuthor(AuthorMerger.merge(authors)); + } + + return entity; + } + + public static T entityMerger( + String id, Iterator> entities, long ts, DataInfo dataInfo, Class clazz) { + T base = entities.next()._2(); + + while (entities.hasNext()) { + T duplicate = entities.next()._2(); + if (duplicate != null) + base = (T) reduceEntity(base, duplicate); + } + + base.setId(id); + base.setDataInfo(dataInfo); + base.setLastupdatetimestamp(ts); + + return base; + } } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/IdGenerator.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/IdGenerator.java index 7e0d660622..1d3d4afdd8 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/IdGenerator.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/IdGenerator.java @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.oa.dedup; +import static eu.dnetlib.dhp.utils.DHPUtils.md5; import static org.apache.commons.lang3.StringUtils.substringAfter; import static org.apache.commons.lang3.StringUtils.substringBefore; @@ -14,33 +15,36 @@ import eu.dnetlib.dhp.schema.oaf.utils.PidType; public class IdGenerator implements Serializable { // pick the best pid from the list (consider date and pidtype) - public static String generate(List> pids, String defaultID) { + public static String generate(List pids, String defaultID) { if (pids == null || pids.isEmpty()) return 
defaultID; return generateId(pids); } - private static String generateId(List> pids) { - Identifier bp = pids + private static String generateId(List pids) { + Identifier bp = pids .stream() .min(Identifier::compareTo) .orElseThrow(() -> new IllegalStateException("unable to generate id")); - String prefix = substringBefore(bp.getOriginalID(), "|"); - String ns = substringBefore(substringAfter(bp.getOriginalID(), "|"), "::"); - String suffix = substringAfter(bp.getOriginalID(), "::"); + return generate(bp.getOriginalID()); + } + + public static String generate(String originalId) { + String prefix = substringBefore(originalId, "|"); + String ns = substringBefore(substringAfter(originalId, "|"), "::"); + String suffix = substringAfter(originalId, "::"); final String pidType = substringBefore(ns, "_"); if (PidType.isValid(pidType)) { return prefix + "|" + dedupify(ns) + "::" + suffix; } else { - return prefix + "|dedup_wf_001::" + suffix; + return prefix + "|dedup_wf_002::" + md5(originalId); // hash the whole originalId to avoid collisions } } private static String dedupify(String ns) { - StringBuilder prefix; if (PidType.valueOf(substringBefore(ns, "_")) == PidType.openorgs) { prefix = new StringBuilder(substringBefore(ns, "_")); @@ -53,5 +57,4 @@ public class IdGenerator implements Serializable { } return prefix.substring(0, 12); } - } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java index babbaaabd1..191870d3b0 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateMergeRels.java @@ -3,49 +3,47 @@ package eu.dnetlib.dhp.oa.dedup; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS; import static 
eu.dnetlib.dhp.schema.common.ModelConstants.PROVENANCE_DEDUP; +import static org.apache.spark.sql.functions.*; import java.io.IOException; -import java.util.*; -import java.util.stream.Collectors; +import java.time.LocalDate; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Optional; import org.apache.commons.io.IOUtils; +import org.apache.commons.lang3.StringUtils; import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaPairRDD; -import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.FlatMapFunction; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.api.java.function.MapGroupsFunction; -import org.apache.spark.graphx.Edge; -import org.apache.spark.rdd.RDD; +import org.apache.spark.sql.*; import org.apache.spark.sql.Dataset; -import org.apache.spark.sql.Encoders; -import org.apache.spark.sql.SaveMode; -import org.apache.spark.sql.SparkSession; +import org.apache.spark.sql.catalyst.encoders.RowEncoder; +import org.apache.spark.sql.expressions.UserDefinedFunction; +import org.apache.spark.sql.expressions.Window; +import org.apache.spark.sql.expressions.WindowSpec; +import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructType; import org.dom4j.DocumentException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.SAXException; -import com.google.common.collect.Lists; import com.google.common.hash.Hashing; +import com.kwartile.lib.cc.ConnectedComponent; import eu.dnetlib.dhp.application.ArgumentApplicationParser; -import eu.dnetlib.dhp.oa.dedup.graph.ConnectedComponent; -import eu.dnetlib.dhp.oa.dedup.graph.GraphProcessor; -import eu.dnetlib.dhp.oa.dedup.model.Identifier; import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.DataInfo; -import 
eu.dnetlib.dhp.schema.oaf.OafEntity; -import eu.dnetlib.dhp.schema.oaf.Qualifier; -import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.utils.PidType; import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.pace.config.DedupConfig; -import eu.dnetlib.pace.util.MapDocumentUtil; -import scala.Tuple2; +import scala.Tuple3; +import scala.collection.JavaConversions; public class SparkCreateMergeRels extends AbstractSparkAction { @@ -68,10 +66,12 @@ public class SparkCreateMergeRels extends AbstractSparkAction { log.info("isLookupUrl {}", isLookUpUrl); SparkConf conf = new SparkConf(); + conf.set("hive.metastore.uris", parser.get("hiveMetastoreUris")); conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + conf.registerKryoClasses(ModelSupport.getOafModelClasses()); - new SparkCreateMergeRels(parser, getSparkSession(conf)) + new SparkCreateMergeRels(parser, getSparkWithHiveSession(conf)) .run(ISLookupClientFactory.getLookUpService(isLookUpUrl)); } @@ -87,14 +87,15 @@ public class SparkCreateMergeRels extends AbstractSparkAction { .ofNullable(parser.get("cutConnectedComponent")) .map(Integer::valueOf) .orElse(0); + + final String pivotHistoryDatabase = parser.get("pivotHistoryDatabase"); + log.info("connected component cut: '{}'", cut); log.info("graphBasePath: '{}'", graphBasePath); log.info("isLookUpUrl: '{}'", isLookUpUrl); log.info("actionSetId: '{}'", actionSetId); log.info("workingPath: '{}'", workingPath); - final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { final String subEntity = dedupConf.getWf().getSubEntityValue(); final Class clazz = ModelSupport.entityTypes.get(EntityType.valueOf(subEntity)); @@ -106,113 +107,172 @@ public class 
SparkCreateMergeRels extends AbstractSparkAction { final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity); - // - JavaPairRDD vertexes = createVertexes(sc, graphBasePath, subEntity, dedupConf); - - final RDD> edgeRdd = spark + final Dataset simRels = spark .read() .load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)) - .as(Encoders.bean(Relation.class)) - .javaRDD() - .map(it -> new Edge<>(hash(it.getSource()), hash(it.getTarget()), it.getRelClass())) - .rdd(); + .select("source", "target"); - Dataset> rawMergeRels = spark - .createDataset( - GraphProcessor - .findCCs(vertexes.rdd(), edgeRdd, maxIterations, cut) - .toJavaRDD() - .filter(k -> k.getIds().size() > 1) - .flatMap(this::ccToRels) - .rdd(), - Encoders.tuple(Encoders.STRING(), Encoders.STRING())); + UserDefinedFunction hashUDF = functions + .udf( + (String s) -> hash(s), DataTypes.LongType); - Dataset> entities = spark + // + Dataset vertexIdMap = simRels + .selectExpr("source as id") + .union(simRels.selectExpr("target as id")) + .distinct() + .withColumn("vertexId", hashUDF.apply(functions.col("id"))); + + // transform simrels into pairs of numeric ids + final Dataset edges = spark .read() - .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) - .map( - (MapFunction>) it -> { - OafEntity entity = OBJECT_MAPPER.readValue(it, clazz); - return new Tuple2<>(entity.getId(), entity); - }, - Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz))); + .load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)) + .select("source", "target") + .withColumn("source", hashUDF.apply(functions.col("source"))) + .withColumn("target", hashUDF.apply(functions.col("target"))); - Dataset mergeRels = rawMergeRels - .joinWith(entities, rawMergeRels.col("_2").equalTo(entities.col("_1")), "inner") - // , - .map( - (MapFunction, Tuple2>, Tuple2>) value -> new Tuple2<>( - value._1()._1(), value._2()._2()), - Encoders.tuple(Encoders.STRING(), 
Encoders.kryo(clazz))) - // - .groupByKey( - (MapFunction, String>) Tuple2::_1, Encoders.STRING()) - .mapGroups( - (MapGroupsFunction, ConnectedComponent>) this::generateID, - Encoders.bean(ConnectedComponent.class)) - // + // resolve connected components + // ("vertexId", "groupId") + Dataset cliques = ConnectedComponent + .runOnPairs(edges, 50, spark); + + // transform "vertexId" back to its original string value + // groupId is kept numeric as its string value is not used + // ("id", "groupId") + Dataset rawMergeRels = cliques + .join(vertexIdMap, JavaConversions.asScalaBuffer(Collections.singletonList("vertexId")), "inner") + .drop("vertexId") + .distinct(); + + // empty dataframe if historydatabase is not used + Dataset pivotHistory = spark + .createDataset( + Collections.emptyList(), + RowEncoder + .apply(StructType.fromDDL("id STRING, lastUsage STRING"))); + + if (StringUtils.isNotBlank(pivotHistoryDatabase)) { + pivotHistory = spark + .read() + .table(pivotHistoryDatabase + "." + subEntity) + .selectExpr("id", "lastUsage"); + } + + // depending on resulttype collectefrom and dateofacceptance are evaluated differently + String collectedfromExpr = "false AS collectedfrom"; + String dateExpr = "'' AS date"; + + if (Result.class.isAssignableFrom(clazz)) { + if (Publication.class.isAssignableFrom(clazz)) { + collectedfromExpr = "array_contains(collectedfrom.key, '" + ModelConstants.CROSSREF_ID + + "') AS collectedfrom"; + } else if (eu.dnetlib.dhp.schema.oaf.Dataset.class.isAssignableFrom(clazz)) { + collectedfromExpr = "array_contains(collectedfrom.key, '" + ModelConstants.DATACITE_ID + + "') AS collectedfrom"; + } + + dateExpr = "dateofacceptance.value AS date"; + } + + // cap pidType at w3id as from there on they are considered equal + UserDefinedFunction mapPid = udf( + (String s) -> Math.min(PidType.tryValueOf(s).ordinal(), PidType.w3id.ordinal()), DataTypes.IntegerType); + + UserDefinedFunction validDate = udf((String date) -> { + if 
(StringUtils.isNotBlank(date) + && date.matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date)) { + return date; + } + return LocalDate.now().plusWeeks(1).toString(); + }, DataTypes.StringType); + + Dataset pivotingData = spark + .read() + .schema(Encoders.bean(clazz).schema()) + .json(DedupUtility.createEntityPath(graphBasePath, subEntity)) + .selectExpr( + "id", + "regexp_extract(id, '^\\\\d+\\\\|([^_]+).*::', 1) AS pidType", + collectedfromExpr, + dateExpr) + .withColumn("pidType", mapPid.apply(col("pidType"))) // ordinal of pid type + .withColumn("date", validDate.apply(col("date"))); + + // ordering to selected pivot id + WindowSpec w = Window + .partitionBy("groupId") + .orderBy( + col("lastUsage").desc_nulls_last(), + col("pidType").asc_nulls_last(), + col("collectedfrom").desc_nulls_last(), + col("date").asc_nulls_last(), + col("id").asc_nulls_last()); + + Dataset output = rawMergeRels + .join(pivotHistory, JavaConversions.asScalaBuffer(Collections.singletonList("id")), "full") + .join(pivotingData, JavaConversions.asScalaBuffer(Collections.singletonList("id")), "left") + .withColumn("pivot", functions.first("id").over(w)) + .withColumn("position", functions.row_number().over(w)) .flatMap( - (FlatMapFunction) cc -> ccToMergeRel(cc, dedupConf), - Encoders.bean(Relation.class)); + (FlatMapFunction>) (Row r) -> { + String id = r.getAs("id"); + String dedupId = IdGenerator.generate(id); - saveParquet(mergeRels, mergeRelPath, SaveMode.Overwrite); + String pivot = r.getAs("pivot"); + String pivotDedupId = IdGenerator.generate(pivot); + // filter out id == pivotDedupId + // those are caused by claim expressed on pivotDedupId + // information will be merged after creating deduprecord + if (id.equals(pivotDedupId)) { + return Collections.emptyIterator(); + } + + ArrayList> res = new ArrayList<>(); + + // singleton pivots have null groupId as they do not match rawMergeRels + if (r.isNullAt(r.fieldIndex("groupId"))) { + // the record is existing if it matches 
pivotingData + if (!r.isNullAt(r.fieldIndex("collectedfrom"))) { + // create relation with old dedup id + res.add(new Tuple3<>(id, dedupId, null)); + } + return res.iterator(); + } + + // this was a pivot in a previous graph but it has been merged into a new group with different + // pivot + if (!r.isNullAt(r.fieldIndex("lastUsage")) && !pivot.equals(id) && !dedupId.equals(pivotDedupId)) { + // materialize the previous dedup record as a merge relation with the new one + res.add(new Tuple3<>(dedupId, pivotDedupId, null)); + } + + // add merge relations + if (cut <=0 || r.getAs("position") <= cut) { + res.add(new Tuple3<>(id, pivotDedupId, pivot)); + } + + return res.iterator(); + }, Encoders.tuple(Encoders.STRING(), Encoders.STRING(), Encoders.STRING())) + .distinct() + .flatMap( + (FlatMapFunction, Relation>) (Tuple3 r) -> { + String id = r._1(); + String dedupId = r._2(); + String pivot = r._3(); + + ArrayList res = new ArrayList<>(); + res.add(rel(pivot, dedupId, id, ModelConstants.MERGES, dedupConf)); + res.add(rel(pivot, id, dedupId, ModelConstants.IS_MERGED_IN, dedupConf)); + + return res.iterator(); + }, Encoders.bean(Relation.class)); + + saveParquet(output, mergeRelPath, SaveMode.Overwrite); } } - private ConnectedComponent generateID(String key, Iterator> values) { - - List> identifiers = Lists - .newArrayList(values) - .stream() - .map(v -> Identifier.newInstance(v._2())) - .collect(Collectors.toList()); - - String rootID = IdGenerator.generate(identifiers, key); - - if (Objects.equals(rootID, key)) - throw new IllegalStateException("generated default ID: " + rootID); - - return new ConnectedComponent(rootID, - identifiers.stream().map(i -> i.getEntity().getId()).collect(Collectors.toSet())); - } - - private JavaPairRDD createVertexes(JavaSparkContext sc, String graphBasePath, String subEntity, - DedupConfig dedupConf) { - - return sc - .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) - .mapToPair(json -> { - String id = 
MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), json); - return new Tuple2<>(hash(id), id); - }); - } - - private Iterator> ccToRels(ConnectedComponent cc) { - return cc - .getIds() - .stream() - .map(id -> new Tuple2<>(cc.getCcId(), id)) - .iterator(); - } - - private Iterator ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf) { - return cc - .getIds() - .stream() - .flatMap( - id -> { - List tmp = new ArrayList<>(); - - tmp.add(rel(cc.getCcId(), id, ModelConstants.MERGES, dedupConf)); - tmp.add(rel(id, cc.getCcId(), ModelConstants.IS_MERGED_IN, dedupConf)); - - return tmp.stream(); - }) - .iterator(); - } - - private Relation rel(String source, String target, String relClass, DedupConfig dedupConf) { + private static Relation rel(String pivot, String source, String target, String relClass, DedupConfig dedupConf) { String entityType = dedupConf.getWf().getEntityType(); @@ -238,6 +298,14 @@ public class SparkCreateMergeRels extends AbstractSparkAction { // TODO calculate the trust value based on the similarity score of the elements in the CC r.setDataInfo(info); + + if (pivot != null) { + KeyValue pivotKV = new KeyValue(); + pivotKV.setKey("pivot"); + pivotKV.setValue(pivot); + + r.setProperties(Arrays.asList(pivotKV)); + } return r; } diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java index 65ad0c3278..60752a4574 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkWhitelistSimRels.java @@ -91,18 +91,12 @@ public class SparkWhitelistSimRels extends AbstractSparkAction { Dataset entities = spark .read() .textFile(DedupUtility.createEntityPath(graphBasePath, subEntity)) - .repartition(numPartitions) - .withColumn("id", functions.get_json_object(new 
Column("value"), dedupConf.getWf().getIdPath())); + .select(functions.get_json_object(new Column("value"), dedupConf.getWf().getIdPath()).as("id")) + .distinct(); - Dataset whiteListRels1 = whiteListRels - .join(entities, entities.col("id").equalTo(whiteListRels.col("from")), "inner") - .select("from", "to"); - - Dataset whiteListRels2 = whiteListRels1 - .join(entities, whiteListRels1.col("to").equalTo(entities.col("id")), "inner") - .select("from", "to"); - - Dataset whiteListSimRels = whiteListRels2 + Dataset whiteListSimRels = whiteListRels + .join(entities, entities.col("id").equalTo(whiteListRels.col("from")), "leftsemi") + .join(entities, functions.col("to").equalTo(entities.col("id")), "leftsemi") .map( (MapFunction) r -> DedupUtility .createSimRel(r.getString(0), r.getString(1), entity), diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java deleted file mode 100644 index 4a39a175d4..0000000000 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/ConnectedComponent.java +++ /dev/null @@ -1,100 +0,0 @@ - -package eu.dnetlib.dhp.oa.dedup.graph; - -import java.io.IOException; -import java.io.Serializable; -import java.util.Set; -import java.util.stream.Collectors; - -import org.apache.commons.lang3.StringUtils; -import org.codehaus.jackson.annotate.JsonIgnore; - -import com.fasterxml.jackson.databind.ObjectMapper; - -import eu.dnetlib.dhp.utils.DHPUtils; -import eu.dnetlib.pace.util.PaceException; - -public class ConnectedComponent implements Serializable { - - private String ccId; - private Set ids; - - private static final String CONNECTED_COMPONENT_ID_PREFIX = "connect_comp"; - - public ConnectedComponent(Set ids, final int cut) { - this.ids = ids; - - this.ccId = createDefaultID(); - - if (cut > 0 && ids.size() > cut) { - this.ids = ids - .stream() - .filter(id -> 
!ccId.equalsIgnoreCase(id)) - .limit(cut - 1) - .collect(Collectors.toSet()); -// this.ids.add(ccId); ?? - } - } - - public ConnectedComponent(String ccId, Set ids) { - this.ccId = ccId; - this.ids = ids; - } - - public String createDefaultID() { - if (ids.size() > 1) { - final String s = getMin(); - String prefix = s.split("\\|")[0]; - ccId = prefix + "|" + CONNECTED_COMPONENT_ID_PREFIX + "::" + DHPUtils.md5(s); - return ccId; - } else { - return ids.iterator().next(); - } - } - - @JsonIgnore - public String getMin() { - - final StringBuilder min = new StringBuilder(); - - ids - .forEach( - id -> { - if (StringUtils.isBlank(min.toString())) { - min.append(id); - } else { - if (min.toString().compareTo(id) > 0) { - min.setLength(0); - min.append(id); - } - } - }); - return min.toString(); - } - - @Override - public String toString() { - ObjectMapper mapper = new ObjectMapper(); - try { - return mapper.writeValueAsString(this); - } catch (IOException e) { - throw new PaceException("Failed to create Json: ", e); - } - } - - public Set getIds() { - return ids; - } - - public void setIds(Set ids) { - this.ids = ids; - } - - public String getCcId() { - return ccId; - } - - public void setCcId(String ccId) { - this.ccId = ccId; - } -} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/GraphProcessor.scala b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/GraphProcessor.scala deleted file mode 100644 index f4dd85d758..0000000000 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/graph/GraphProcessor.scala +++ /dev/null @@ -1,37 +0,0 @@ -package eu.dnetlib.dhp.oa.dedup.graph - -import org.apache.spark.graphx._ -import org.apache.spark.rdd.RDD - -import scala.collection.JavaConversions; - -object GraphProcessor { - - def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int, cut:Int): RDD[ConnectedComponent] = { - val graph: Graph[String, String] = 
Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby - val cc = graph.connectedComponents(maxIterations).vertices - - val joinResult = vertexes.leftOuterJoin(cc).map { - case (id, (openaireId, cc)) => { - if (cc.isEmpty) { - (id, openaireId) - } - else { - (cc.get, openaireId) - } - } - } - val connectedComponents = joinResult.groupByKey() - .map[ConnectedComponent](cc => asConnectedComponent(cc, cut)) - connectedComponents - } - - - - def asConnectedComponent(group: (VertexId, Iterable[String]), cut:Int): ConnectedComponent = { - val docs = group._2.toSet[String] - val connectedComponent = new ConnectedComponent(JavaConversions.setAsJavaSet[String](docs), cut); - connectedComponent - } - -} \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Identifier.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Identifier.java index 0cba4fc3ba..e03c3bf95e 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Identifier.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/model/Identifier.java @@ -3,21 +3,21 @@ package eu.dnetlib.dhp.oa.dedup.model; import java.io.Serializable; import java.text.SimpleDateFormat; -import java.util.*; -import java.util.stream.Collectors; +import java.time.LocalDate; +import java.util.Date; +import java.util.List; +import java.util.Objects; import org.apache.commons.lang3.StringUtils; -import com.google.common.collect.Sets; - import eu.dnetlib.dhp.oa.dedup.DatePicker; import eu.dnetlib.dhp.oa.dedup.IdentifierComparator; import eu.dnetlib.dhp.schema.common.EntityType; -import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; -import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; -import eu.dnetlib.dhp.schema.oaf.utils.PidComparator; +import 
eu.dnetlib.dhp.schema.oaf.Field; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.OafEntity; +import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.utils.PidType; public class Identifier implements Serializable, Comparable> { @@ -50,7 +50,7 @@ public class Identifier implements Serializable, Comparable if (Objects.nonNull(date)) { return date; } else { - String sDate = BASE_DATE; + String sDate = LocalDate.now().plusDays(1).toString(); if (ModelSupport.isSubClass(getEntity(), Result.class)) { Result result = (Result) getEntity(); if (isWellformed(result.getDateofacceptance())) { diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json index b1df08535e..4f9f4b0b52 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json @@ -28,5 +28,17 @@ "paramLongName": "workingPath", "paramDescription": "path for the working directory", "paramRequired": true + }, + { + "paramName":"h", + "paramLongName":"hiveMetastoreUris", + "paramDescription": "the hive metastore uris", + "paramRequired": true + }, + { + "paramName": "p", + "paramLongName": "pivotHistoryDatabase", + "paramDescription": "Pivot history database", + "paramRequired": false } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/config-default.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/config-default.xml index 2e0ed9aeea..cd29965e3d 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/config-default.xml +++ 
b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/config-default.xml @@ -15,4 +15,8 @@ oozie.action.sharelib.for.spark spark2 + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + \ No newline at end of file diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml index ba2270c8a8..49a331def9 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-dedup-openaire/src/main/resources/eu/dnetlib/dhp/oa/dedup/scan/oozie_app/workflow.xml @@ -188,6 +188,8 @@ --isLookUpUrl${isLookUpUrl} --actionSetId${actionSetId} --cutConnectedComponent${cutConnectedComponent} + --hiveMetastoreUris${hiveMetastoreUris} + --pivotHistoryDatabase${pivotHistoryDatabase} diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/scala/com/kwartile/lib/cc/ConnectedComponent.scala b/dhp-workflows/dhp-dedup-openaire/src/main/scala/com/kwartile/lib/cc/ConnectedComponent.scala new file mode 100644 index 0000000000..4c33622354 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/main/scala/com/kwartile/lib/cc/ConnectedComponent.scala @@ -0,0 +1,335 @@ +/** Copyright (c) 2017 Kwartile, Inc., http://www.kwartile.com + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the 
Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/** Map-reduce implementation of Connected Component + * Given lists of subgraphs, returns all the nodes that are connected. + */ + +package com.kwartile.lib.cc + +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{Dataset, Row, SparkSession} +import org.apache.spark.storage.StorageLevel + +import scala.annotation.tailrec +import scala.collection.mutable + +object ConnectedComponent extends Serializable { + + /** Applies Small Star operation on RDD of nodePairs + * + * @param nodePairs on which to apply Small Star operations + * @return new nodePairs after the operation and conncectivy change count + */ + private def smallStar(nodePairs: RDD[(Long, Long)]): (RDD[(Long, Long)], Long) = { + + /** generate RDD of (self, List(neighbors)) where self > neighbors + * E.g.: nodePairs (1, 4), (6, 1), (3, 2), (6, 5) + * will result into (4, List(1)), (6, List(1)), (3, List(2)), (6, List(5)) + */ + val neighbors = nodePairs.map(x => { + val (self, neighbor) = (x._1, x._2) + if (self > neighbor) + (self, neighbor) + else + (neighbor, self) + }) + + /** reduce on self to get list of all its neighbors. + * E.g: (4, List(1)), (6, List(1)), (3, List(2)), (6, List(5)) + * will result into (4, List(1)), (6, List(1, 5)), (3, List(2)) + * Note: + * (1) you may need to tweak number of partitions. + * (2) also, watch out for data skew. 
In that case, consider using rangePartitioner + */ + val empty = mutable.HashSet[Long]() + val allNeighbors = neighbors.aggregateByKey(empty)( + (lb, v) => lb += v, + (lb1, lb2) => lb1 ++ lb2 + ) + + /** Apply Small Star operation on (self, List(neighbor)) to get newNodePairs and count the change in connectivity + */ + + val newNodePairsWithChangeCount = allNeighbors + .map(x => { + val self = x._1 + val neighbors = x._2.toList + val minNode = argMin(self :: neighbors) + val newNodePairs = (self :: neighbors) + .map(neighbor => { + (neighbor, minNode) + }) + .filter(x => { + val neighbor = x._1 + val minNode = x._2 + (neighbor <= self && neighbor != minNode) || (self == neighbor) + }) + val uniqueNewNodePairs = newNodePairs.toSet.toList + + /** We count the change by taking a diff of the new node pairs with the old node pairs + */ + val connectivityChangeCount = (uniqueNewNodePairs diff neighbors.map((self, _))).length + (uniqueNewNodePairs, connectivityChangeCount) + }) + .persist(StorageLevel.MEMORY_AND_DISK_SER) + + /** Sum all the changeCounts + */ + val totalConnectivityCountChange = newNodePairsWithChangeCount + .mapPartitions(iter => { + val (v, l) = iter.toSeq.unzip + val sum = l.sum + Iterator(sum) + }) + .sum + .toLong + + val newNodePairs = newNodePairsWithChangeCount.map(x => x._1).flatMap(x => x) + newNodePairsWithChangeCount.unpersist(false) + (newNodePairs, totalConnectivityCountChange) + } + + /** Apply Large Star operation on a RDD of nodePairs + * + * @param nodePairs on which to apply Large Star operations + * @return new nodePairs after the operation and conncectivy change count + */ + private def largeStar(nodePairs: RDD[(Long, Long)]): (RDD[(Long, Long)], Long) = { + + /** generate RDD of (self, List(neighbors)) + * E.g.: nodePairs (1, 4), (6, 1), (3, 2), (6, 5) + * will result into (4, List(1)), (1, List(4)), (6, List(1)), (1, List(6)), (3, List(2)), (2, List(3)), (6, List(5)), (5, List(6)) + */ + + val neighbors = nodePairs.flatMap(x => { + 
val (self, neighbor) = (x._1, x._2) + if (self == neighbor) + List((self, neighbor)) + else + List((self, neighbor), (neighbor, self)) + }) + + /** reduce on self to get list of all its neighbors. + * E.g: (4, List(1)), (1, List(4)), (6, List(1)), (1, List(6)), (3, List(2)), (2, List(3)), (6, List(5)), (5, List(6)) + * will result into (4, List(1)), (1, List(4, 6)), (6, List(1, 5)), (3, List(2)), (2, List(3)), (5, List(6)) + * Note: + * (1) you may need to tweak number of partitions. + * (2) also, watch out for data skew. In that case, consider using rangePartitioner + */ + + val localAdd = (s: mutable.HashSet[Long], v: Long) => s += v + val partitionAdd = (s1: mutable.HashSet[Long], s2: mutable.HashSet[Long]) => s1 ++= s2 + val allNeighbors = + neighbors.aggregateByKey(mutable.HashSet.empty[Long] /*, rangePartitioner*/ )(localAdd, partitionAdd) + + /** Apply Large Star operation on (self, List(neighbor)) to get newNodePairs and count the change in connectivity + */ + + val newNodePairsWithChangeCount = allNeighbors + .map(x => { + val self = x._1 + val neighbors = x._2.toList + val minNode = argMin(self :: neighbors) + val newNodePairs = (self :: neighbors) + .map(neighbor => { + (neighbor, minNode) + }) + .filter(x => { + val neighbor = x._1 + val minNode = x._2 + neighbor > self || neighbor == minNode + }) + + val uniqueNewNodePairs = newNodePairs.toSet.toList + val connectivityChangeCount = (uniqueNewNodePairs diff neighbors.map((self, _))).length + (uniqueNewNodePairs, connectivityChangeCount) + }) + .persist(StorageLevel.MEMORY_AND_DISK_SER) + + val totalConnectivityCountChange = newNodePairsWithChangeCount + .mapPartitions(iter => { + val (v, l) = iter.toSeq.unzip + val sum = l.sum + Iterator(sum) + }) + .sum + .toLong + + /** Sum all the changeCounts + */ + val newNodePairs = newNodePairsWithChangeCount.map(x => x._1).flatMap(x => x) + newNodePairsWithChangeCount.unpersist(false) + (newNodePairs, totalConnectivityCountChange) + } + + private def 
argMin(nodes: List[Long]): Long = { + nodes.min(Ordering.by((node: Long) => node)) + } + + /** Build nodePairs given a list of nodes. A list of nodes represents a subgraph. + * + * @param nodes that are part of a subgraph + * @return nodePairs for a subgraph + */ + private def buildPairs(nodes: List[Long]): List[(Long, Long)] = { + buildPairs(nodes.head, nodes.tail, null.asInstanceOf[List[(Long, Long)]]) + } + + @tailrec + private def buildPairs(node: Long, neighbors: List[Long], partialPairs: List[(Long, Long)]): List[(Long, Long)] = { + if (neighbors.isEmpty) { + if (partialPairs != null) + List((node, node)) ::: partialPairs + else + List((node, node)) + } else if (neighbors.length == 1) { + val neighbor = neighbors(0) + if (node > neighbor) + if (partialPairs != null) List((node, neighbor)) ::: partialPairs else List((node, neighbor)) + else if (partialPairs != null) List((neighbor, node)) ::: partialPairs + else List((neighbor, node)) + } else { + val newPartialPairs = neighbors + .map(neighbor => { + if (node > neighbor) + List((node, neighbor)) + else + List((neighbor, node)) + }) + .flatMap(x => x) + + if (partialPairs != null) + buildPairs(neighbors.head, neighbors.tail, newPartialPairs ::: partialPairs) + else + buildPairs(neighbors.head, neighbors.tail, newPartialPairs) + } + } + + /** Implements alternatingAlgo. 
Converges when the changeCount is either 0 or does not change from the previous iteration + * + * @param nodePairs for a graph + * @param largeStarConnectivityChangeCount change count that resulted from the previous iteration + * @param smallStarConnectivityChangeCount change count that resulted from the previous iteration + * @param didConverge flag to indicate the alorigth converged + * @param currIterationCount counter to capture number of iterations + * @param maxIterationCount maximum number iterations to try before giving up + * @return RDD of nodePairs + */ + + @tailrec + private def alternatingAlgo( + nodePairs: RDD[(Long, Long)], + largeStarConnectivityChangeCount: Long, + smallStarConnectivityChangeCount: Long, + didConverge: Boolean, + currIterationCount: Int, + maxIterationCount: Int + ): (RDD[(Long, Long)], Boolean, Long) = { + + val iterationCount = currIterationCount + 1 + if (didConverge) + (nodePairs, true, currIterationCount) + else if (currIterationCount >= maxIterationCount) { + (nodePairs, false, currIterationCount) + } else { + + val (nodePairsLargeStar, currLargeStarConnectivityChangeCount) = largeStar(nodePairs) + val (nodePairsSmallStar, currSmallStarConnectivityChangeCount) = smallStar(nodePairsLargeStar) + + if ( + (currLargeStarConnectivityChangeCount == largeStarConnectivityChangeCount && + currSmallStarConnectivityChangeCount == smallStarConnectivityChangeCount) || + (currSmallStarConnectivityChangeCount == 0 && currLargeStarConnectivityChangeCount == 0) + ) { + alternatingAlgo( + nodePairsSmallStar, + currLargeStarConnectivityChangeCount, + currSmallStarConnectivityChangeCount, + true, + iterationCount, + maxIterationCount + ) + } else { + alternatingAlgo( + nodePairsSmallStar, + currLargeStarConnectivityChangeCount, + currSmallStarConnectivityChangeCount, + false, + iterationCount, + maxIterationCount + ) + } + } + } + + /** Driver function + * + * @param cliques list of nodes representing subgraphs (or cliques) + * @param 
maxIterationCount maximum number of iterations to try before giving up + * @return Connected Components as nodePairs where second member of the nodePair is the minimum node in the component, plus the convergence flag and the iteration count; the RDD is null when the algorithm did not converge + */ + def run(cliques: RDD[List[Long]], maxIterationCount: Int): (RDD[(Long, Long)], Boolean, Long) = { + + val nodePairs = cliques + .map(aClique => { + buildPairs(aClique) + }) + .flatMap(x => x) + + val (cc, didConverge, iterCount) = alternatingAlgo(nodePairs, 9999999L, 9999999L, false, 0, maxIterationCount) + + if (didConverge) { + (cc, didConverge, iterCount) + } else { + (null.asInstanceOf[RDD[(Long, Long)]], didConverge, iterCount) + } + } + + def runOnPairs(nodePairs: RDD[(Long, Long)], maxIterationCount: Int): (RDD[(Long, Long)], Boolean, Long) = { + val (cc, didConverge, iterCount) = alternatingAlgo(nodePairs, 9999999L, 9999999L, false, 0, maxIterationCount) + + if (didConverge) { + (cc, didConverge, iterCount) + } else { + (null.asInstanceOf[RDD[(Long, Long)]], didConverge, iterCount) + } + } + + def runOnPairs(nodePairs: Dataset[Row], maxIterationCount: Int)(implicit spark: SparkSession): Dataset[Row] = { + import spark.implicits._ + + val (cc, didConverge, iterCount) = alternatingAlgo( + nodePairs.map(e => (e.getLong(0), e.getLong(1))).rdd, + 9999999L, + 9999999L, + false, + 0, + maxIterationCount + ) + + if (didConverge) { + cc.toDF("vertexId", "groupId") + } else { + null.asInstanceOf[Dataset[Row]] + } + } + +} diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java index 6c4935637a..8b3480e60b 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java +++ b/dhp-workflows/dhp-dedup-openaire/src/test/java/eu/dnetlib/dhp/oa/dedup/SparkDedupTest.java @@ -41,9 +41,13 @@ import com.google.common.collect.Sets; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import 
eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import eu.dnetlib.dhp.schema.sx.OafUtils; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; +import scala.Tuple2; @ExtendWith(MockitoExtension.class) @TestMethodOrder(MethodOrderer.OrderAnnotation.class) @@ -97,6 +101,7 @@ public class SparkDedupTest implements Serializable { final SparkConf conf = new SparkConf(); conf.set("spark.sql.shuffle.partitions", "200"); + conf.set("spark.sql.warehouse.dir", testOutputBasePath + "/spark-warehouse"); spark = SparkSession .builder() .appName(SparkDedupTest.class.getSimpleName()) @@ -186,11 +191,11 @@ public class SparkDedupTest implements Serializable { System.out.println("ds_simrel = " + ds_simrel); System.out.println("orp_simrel = " + orp_simrel); - assertEquals(1538, orgs_simrel); - assertEquals(3523, pubs_simrel); - assertEquals(168, sw_simrel); - assertEquals(221, ds_simrel); - assertEquals(3392, orp_simrel); + assertEquals(751, orgs_simrel); + assertEquals(546, pubs_simrel); + assertEquals(113, sw_simrel); + assertEquals(148, ds_simrel); + assertEquals(280, orp_simrel); } @@ -235,10 +240,10 @@ public class SparkDedupTest implements Serializable { .count(); // entities simrels supposed to be equal to the number of previous step (no rels in whitelist) - assertEquals(1538, orgs_simrel); - assertEquals(3523, pubs_simrel); - assertEquals(221, ds_simrel); - assertEquals(3392, orp_simrel); + assertEquals(751, orgs_simrel); + assertEquals(546, pubs_simrel); + assertEquals(148, ds_simrel); + assertEquals(280, orp_simrel); // System.out.println("orgs_simrel = " + orgs_simrel); // System.out.println("pubs_simrel = " + pubs_simrel); // System.out.println("ds_simrel = " + ds_simrel); @@ -268,7 +273,7 @@ public class SparkDedupTest implements Serializable { && 
rel.getTarget().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[1])) .count() > 0); - assertEquals(170, sw_simrel.count()); + assertEquals(115, sw_simrel.count()); // System.out.println("sw_simrel = " + sw_simrel.count()); } @@ -292,7 +297,9 @@ public class SparkDedupTest implements Serializable { "-w", testOutputBasePath, "-cc", - "3" + "3", + "-h", + "" }); new SparkCreateMergeRels(parser, spark).run(isLookUpService); @@ -365,6 +372,113 @@ public class SparkDedupTest implements Serializable { .deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")); } + @Test + @Order(3) + void createMergeRelsWithPivotHistoryTest() throws Exception { + + ArgumentApplicationParser parser = new ArgumentApplicationParser( + classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")); + + spark.sql("CREATE DATABASE IF NOT EXISTS pivot_history_test"); + ModelSupport.oafTypes.keySet().forEach(entityType -> { + try { + spark + .read() + .json( + Paths + .get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/pivot_history").toURI()) + .toFile() + .getAbsolutePath()) + .write() + .mode("overwrite") + .saveAsTable("pivot_history_test." 
+ entityType); + } catch (URISyntaxException e) { + throw new RuntimeException(e); + } + }); + + parser + .parseArgument( + new String[] { + "-i", + testGraphBasePath, + "-asi", + testActionSetId, + "-la", + "lookupurl", + "-w", + testOutputBasePath, + "-h", + "", + "-pivotHistoryDatabase", + "pivot_history_test" + + }); + + new SparkCreateMergeRels(parser, spark).run(isLookUpService); + + long orgs_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel") + .count(); + final Dataset pubs = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel") + .as(Encoders.bean(Relation.class)); + long sw_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel") + .count(); + long ds_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel") + .count(); + + long orp_mergerel = spark + .read() + .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel") + .count(); + + final List merges = pubs + .filter("source == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'") + .collectAsList(); + assertEquals(3, merges.size()); + Set dups = Sets + .newHashSet( + "50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73", + "50|doi_________::d5021b53204e4fdeab6ff5d5bc468032", + "50|arXiv_______::c93aeb433eb90ed7a86e29be00791b7c"); + merges.forEach(r -> { + assertEquals(ModelConstants.RESULT_RESULT, r.getRelType()); + assertEquals(ModelConstants.DEDUP, r.getSubRelType()); + assertEquals(ModelConstants.MERGES, r.getRelClass()); + assertTrue(dups.contains(r.getTarget())); + }); + + final List mergedIn = pubs + .filter("target == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'") + .collectAsList(); + assertEquals(3, mergedIn.size()); + mergedIn.forEach(r -> { + assertEquals(ModelConstants.RESULT_RESULT, r.getRelType()); + assertEquals(ModelConstants.DEDUP, r.getSubRelType()); + 
assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass()); + assertTrue(dups.contains(r.getSource())); + }); + + assertEquals(1268, orgs_mergerel); + assertEquals(1112, pubs.count()); + assertEquals(292, sw_mergerel); + assertEquals(476, ds_mergerel); + assertEquals(742, orp_mergerel); +// System.out.println("orgs_mergerel = " + orgs_mergerel); +// System.out.println("pubs_mergerel = " + pubs_mergerel); +// System.out.println("sw_mergerel = " + sw_mergerel); +// System.out.println("ds_mergerel = " + ds_mergerel); +// System.out.println("orp_mergerel = " + orp_mergerel); + + } + @Test @Order(4) void createMergeRelsTest() throws Exception { @@ -382,7 +496,9 @@ public class SparkDedupTest implements Serializable { "-la", "lookupurl", "-w", - testOutputBasePath + testOutputBasePath, + "-h", + "" }); new SparkCreateMergeRels(parser, spark).run(isLookUpService); @@ -437,10 +553,10 @@ public class SparkDedupTest implements Serializable { }); assertEquals(1268, orgs_mergerel); - assertEquals(1450, pubs.count()); - assertEquals(286, sw_mergerel); - assertEquals(472, ds_mergerel); - assertEquals(738, orp_mergerel); + assertEquals(1112, pubs.count()); + assertEquals(292, sw_mergerel); + assertEquals(476, ds_mergerel); + assertEquals(742, orp_mergerel); // System.out.println("orgs_mergerel = " + orgs_mergerel); // System.out.println("pubs_mergerel = " + pubs_mergerel); // System.out.println("sw_mergerel = " + sw_mergerel); @@ -492,8 +608,8 @@ public class SparkDedupTest implements Serializable { .count(); assertEquals(86, orgs_deduprecord); - assertEquals(68, pubs.count()); - assertEquals(49, sw_deduprecord); + assertEquals(91, pubs.count()); + assertEquals(47, sw_deduprecord); assertEquals(97, ds_deduprecord); assertEquals(92, orp_deduprecord); @@ -629,11 +745,11 @@ public class SparkDedupTest implements Serializable { .distinct() .count(); - assertEquals(902, publications); + assertEquals(925, publications); assertEquals(839, organizations); assertEquals(100, projects); 
assertEquals(100, datasource); - assertEquals(198, softwares); + assertEquals(196, softwares); assertEquals(389, dataset); assertEquals(520, otherresearchproduct); diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json index fa889d63b7..ff6670f1e4 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/ds.curr.conf.json @@ -101,7 +101,8 @@ "type" : "String", "path" : "$.title[?(@.qualifier.classid == 'main title')].value", "length" : 250, - "size" : 5 + "size" : 5, + "clean": "title" }, { "name" : "authors", diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json index b45b6ae832..a4a3761a36 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/orp.curr.conf.json @@ -101,7 +101,8 @@ "type" : "String", "path" : "$.title[?(@.qualifier.classid == 'main title')].value", "length" : 250, - "size" : 5 + "size" : 5, + "clean": "title" }, { "name" : "authors", diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json index 15ebc7a6a0..c3a769874a 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json @@ -29,9 +29,8 @@ }, "pace": { "clustering" : [ - { "name" : "ngrampairs", "fields" : [ "title" ], 
"params" : { "max" : "1", "ngramLen" : "3"} }, - { "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } }, - { "name" : "lowercase", "fields" : [ "doi" ], "params" : { } } + { "name" : "numAuthorsTitleSuffixPrefixChain", "fields" : [ "num_authors", "title" ], "params" : { "mod" : "10" } }, + { "name" : "jsonlistclustering", "fields" : [ "pid" ], "params" : { "jpath_value": "$.value", "jpath_classid": "$.qualifier.classid"} } ], "decisionTree": { "start": { @@ -79,13 +78,37 @@ "ignoreUndefined": "false" }, "layer3": { + "fields": [ + { + "field": "authors", + "comparator": "authorsMatch", + "weight": 1.0, + "countIfUndefined": "false", + "params": { + "surname_th": 0.75, + "fullname_th": 0.75, + "threshold": 0.6, + "mode": "full" + } + } + ], + "threshold": 0.6, + "aggregation": "MAX", + "positive": "layer4", + "negative": "NO_MATCH", + "undefined": "MATCH", + "ignoreUndefined": "true" + }, + "layer4": { "fields": [ { "field": "title", "comparator": "levensteinTitle", "weight": 1.0, "countIfUndefined": "true", - "params": {} + "params": { + "threshold": "0.99" + } } ], "threshold": 0.99, @@ -97,23 +120,25 @@ } }, "model": [ - { - "name": "doi", - "type": "String", - "path": "$.pid[?(@.qualifier.classid == 'doi')].value" - }, { "name": "pid", "type": "JSON", "path": "$.pid", "overrideMatch": "true" }, + { + "name": "alternateid", + "type": "JSON", + "path": "$.instance[*].alternateIdentifier[*]", + "overrideMatch": "true" + }, { "name": "title", "type": "String", "path": "$.title[?(@.qualifier.classid == 'main title')].value", "length": 250, - "size": 5 + "size": 5, + "clean": "title" }, { "name": "authors", @@ -122,9 +147,9 @@ "size": 200 }, { - "name": "resulttype", + "name": "num_authors", "type": "String", - "path": "$.resulttype.classid" + "path": "$.author.length()" } ], "blacklists": { diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json 
b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json index f53ff385f9..3c6c8aa5f0 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json @@ -75,7 +75,8 @@ "type" : "String", "path" : "$.title[?(@.qualifier.classid == 'main title')].value", "length" : 250, - "size" : 5 + "size" : 5, + "clean": "title" }, { "name" : "url", diff --git a/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/pivot_history/pivot_history.json b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/pivot_history/pivot_history.json new file mode 100644 index 0000000000..8af1a6d069 --- /dev/null +++ b/dhp-workflows/dhp-dedup-openaire/src/test/resources/eu/dnetlib/dhp/dedup/pivot_history/pivot_history.json @@ -0,0 +1 @@ +{"id": "50|arXiv_______::c93aeb433eb90ed7a86e29be00791b7c", "firstUsage": "2022-01-01", "lastUsage": "2022-01-01", "dedupId": "50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c" } \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java index d121b8b7e2..06d0f95c25 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/api/Utils.java @@ -167,4 +167,11 @@ public class Utils implements Serializable { }); return projectMap; } + + public static List getCommunityIdList(String baseURL) throws IOException { + return getValidCommunities(baseURL) + .stream() + .map(community -> community.getId()) + .collect(Collectors.toList()); + } } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java index 
5d1b2b38d1..e20fcb081a 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/bulktag/SparkBulkTagJob.java @@ -45,7 +45,7 @@ public class SparkBulkTagJob { .toString( SparkBulkTagJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/bulktag/input_bulkTag_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); @@ -105,7 +105,6 @@ public class SparkBulkTagJob { Map>> dsm = cc.getEoscDatasourceMap(); for (String ds : datasources.collectAsList()) { - // final String dsId = ds.substring(3); if (!dsm.containsKey(ds)) { ArrayList> eoscList = new ArrayList<>(); dsm.put(ds, eoscList); @@ -116,13 +115,11 @@ public class SparkBulkTagJob { private static boolean isOKDatasource(Datasource ds) { final String compatibility = ds.getOpenairecompatibility().getClassid(); - boolean isOk = (compatibility.equalsIgnoreCase(OPENAIRE_3) || + return (compatibility.equalsIgnoreCase(OPENAIRE_3) || compatibility.equalsIgnoreCase(OPENAIRE_4) || compatibility.equalsIgnoreCase(OPENAIRE_CRIS) || compatibility.equalsIgnoreCase(OPENAIRE_DATA)) && ds.getCollectedfrom().stream().anyMatch(cf -> cf.getKey().equals(EOSC)); - - return isOk; } private static void execBulkTag( @@ -151,7 +148,13 @@ public class SparkBulkTagJob { .write() .mode(SaveMode.Overwrite) .option("compression", "gzip") - .json(outputPath + e.name()); + .json(outputPath + e.name());// writing the tagging in the working dir for entity + + readPath(spark, outputPath + e.name(), resultClazz) // copy the tagging in the actual result output path + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(inputPath + e.name()); }); } diff --git 
a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java index b9f3bff523..a016509e57 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareDatasourceCountryAssociation.java @@ -45,7 +45,7 @@ public class PrepareDatasourceCountryAssociation { .toString( PrepareDatasourceCountryAssociation.class .getResourceAsStream( - "/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareassoc_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); @@ -66,7 +66,7 @@ public class PrepareDatasourceCountryAssociation { conf, isSparkSessionManaged, spark -> { - removeOutputDir(spark, outputPath); + // removeOutputDir(spark, outputPath); prepareDatasourceCountryAssociation( spark, Arrays.asList(parser.get("whitelist").split(";")), @@ -90,7 +90,8 @@ public class PrepareDatasourceCountryAssociation { (FilterFunction) ds -> !ds.getDataInfo().getDeletedbyinference() && Optional.ofNullable(ds.getDatasourcetype()).isPresent() && Optional.ofNullable(ds.getDatasourcetype().getClassid()).isPresent() && - (allowedtypes.contains(ds.getDatasourcetype().getClassid()) || + ((Optional.ofNullable(ds.getJurisdiction()).isPresent() && + allowedtypes.contains(ds.getJurisdiction().getClassid())) || whitelist.contains(ds.getId()))); // filtering of the relations taking the non deleted by inference and those with IsProvidedBy as relclass diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java index 184d24751b..884aa0e47e 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/PrepareResultCountrySet.java @@ -32,7 +32,7 @@ public class PrepareResultCountrySet { .toString( PrepareResultCountrySet.class .getResourceAsStream( - "/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareresultcountry_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java index d9f6433a07..92930c18bd 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/countrypropagation/SparkCountryPropagationJob.java @@ -35,7 +35,7 @@ public class SparkCountryPropagationJob { .toString( SparkCountryPropagationJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_countrypropagation_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); @@ -97,6 +97,12 @@ public class SparkCountryPropagationJob { .mode(SaveMode.Overwrite) .json(outputPath); + readPath(spark, outputPath, resultClazz) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(sourcePath); + } private static MapFunction, R> getCountryMergeFn() { diff --git 
a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java index 8d3432f062..bdfdde13bd 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/PrepareInfo.java @@ -60,7 +60,7 @@ public class PrepareInfo implements Serializable { .toString( SparkResultToOrganizationFromIstRepoJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_preparation_parameter.json")); + "/eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/input_preparation_parameter.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkEntityToOrganizationFromSemRel.java similarity index 97% rename from dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java rename to dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkEntityToOrganizationFromSemRel.java index 27e502aba0..4e30a6d6a8 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkResultToOrganizationFromSemRel.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkEntityToOrganizationFromSemRel.java @@ -27,8 +27,8 @@ import eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganization import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; -public class 
SparkResultToOrganizationFromSemRel implements Serializable { - private static final Logger log = LoggerFactory.getLogger(SparkResultToOrganizationFromSemRel.class); +public class SparkEntityToOrganizationFromSemRel implements Serializable { + private static final Logger log = LoggerFactory.getLogger(SparkEntityToOrganizationFromSemRel.class); private static final int MAX_ITERATION = 5; public static final String NEW_RESULT_RELATION_PATH = "/newResultRelation"; public static final String NEW_PROJECT_RELATION_PATH = "/newProjectRelation"; @@ -39,7 +39,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable { .toString( SparkResultToOrganizationFromIstRepoJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_propagation_parameter.json")); + "/eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/input_propagation_parameter.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java index 386ea1a5cd..36a7523c50 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/StepActions.java @@ -3,8 +3,8 @@ package eu.dnetlib.dhp.entitytoorganizationfromsemrel; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.PropagationConstant.readPath; -import static eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkResultToOrganizationFromSemRel.NEW_PROJECT_RELATION_PATH; -import static eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkResultToOrganizationFromSemRel.NEW_RESULT_RELATION_PATH; +import static 
eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkEntityToOrganizationFromSemRel.NEW_PROJECT_RELATION_PATH; +import static eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkEntityToOrganizationFromSemRel.NEW_RESULT_RELATION_PATH; import java.io.Serializable; import java.util.*; @@ -20,7 +20,6 @@ import org.jetbrains.annotations.NotNull; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.KeyValueSet; -import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.Relation; import scala.Tuple2; diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java index 95b870292d..bc72a2ae1b 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep1.java @@ -31,7 +31,7 @@ public class PrepareResultOrcidAssociationStep1 { .toString( PrepareResultOrcidAssociationStep1.class .getResourceAsStream( - "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConf); parser.parseArgument(args); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java index c60012a748..46894d0e1b 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java +++ 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/PrepareResultOrcidAssociationStep2.java @@ -29,7 +29,7 @@ public class PrepareResultOrcidAssociationStep2 { .toString( PrepareResultOrcidAssociationStep2.class .getResourceAsStream( - "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json")); + "/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java index a38b4da2e8..c5d6326581 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/orcidtoresultfromsemrel/SparkOrcidToResultFromSemRelJob.java @@ -2,7 +2,7 @@ package eu.dnetlib.dhp.orcidtoresultfromsemrel; import static eu.dnetlib.dhp.PropagationConstant.*; -import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.util.List; import java.util.Optional; @@ -36,7 +36,7 @@ public class SparkOrcidToResultFromSemRelJob { .toString( SparkOrcidToResultFromSemRelJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); @@ -65,9 +65,8 @@ public class SparkOrcidToResultFromSemRelJob { Class resultClazz = (Class) Class.forName(resultClassName); SparkConf conf = 
new SparkConf(); - conf.set("hive.metastore.uris", parser.get("hive_metastore_uris")); - runWithSparkHiveSession( + runWithSparkSession( conf, isSparkSessionManaged, spark -> { diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java index ac61e26f94..8f4e2ad9a5 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/PrepareProjectResultsAssociation.java @@ -28,7 +28,7 @@ public class PrepareProjectResultsAssociation { .toString( PrepareProjectResultsAssociation.class .getResourceAsStream( - "/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/projecttoresult/input_prepareprojecttoresult_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java index 1ec521af18..e7518673d8 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/projecttoresult/SparkResultToProjectThroughSemRelJob.java @@ -33,7 +33,7 @@ public class SparkResultToProjectThroughSemRelJob { .toString( SparkResultToProjectThroughSemRelJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/projecttoresult/input_projecttoresult_parameters.json")); final ArgumentApplicationParser parser = new 
ArgumentApplicationParser(jsonConfiguration); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java index 54fa601681..be31cd46cc 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/PrepareResultCommunitySet.java @@ -34,7 +34,7 @@ public class PrepareResultCommunitySet { .toString( PrepareResultCommunitySet.class .getResourceAsStream( - "/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java index df8ca3805b..cc87b80e5e 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromorganization/SparkResultToCommunityFromOrganizationJob.java @@ -36,7 +36,7 @@ public class SparkResultToCommunityFromOrganizationJob { .toString( SparkResultToCommunityFromOrganizationJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json")); + 
"/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromorganization/input_communitytoresult_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); @@ -92,6 +92,12 @@ public class SparkResultToCommunityFromOrganizationJob { .mode(SaveMode.Overwrite) .option("compression", "gzip") .json(outputPath + e.name()); + + readPath(spark, outputPath + e.name(), resultClazz) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(inputPath + e.name()); } }); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromproject/PrepareResultCommunitySet.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromproject/PrepareResultCommunitySet.java index 467e11a969..512dfa9bed 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromproject/PrepareResultCommunitySet.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromproject/PrepareResultCommunitySet.java @@ -38,7 +38,7 @@ public class PrepareResultCommunitySet { .toString( PrepareResultCommunitySet.class .getResourceAsStream( - "/eu/dnetlib/dhp/resulttocommunityfromproject/input_preparecommunitytoresult_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/input_preparecommunitytoresult_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); parser.parseArgument(args); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromproject/SparkResultToCommunityFromProject.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromproject/SparkResultToCommunityFromProject.java index 6e298cf946..dde5340617 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromproject/SparkResultToCommunityFromProject.java +++ 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromproject/SparkResultToCommunityFromProject.java @@ -44,7 +44,7 @@ public class SparkResultToCommunityFromProject implements Serializable { .toString( SparkResultToCommunityFromProject.class .getResourceAsStream( - "/eu/dnetlib/dhp/resulttocommunityfromproject/input_communitytoresult_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/input_communitytoresult_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); @@ -102,6 +102,12 @@ public class SparkResultToCommunityFromProject implements Serializable { .mode(SaveMode.Overwrite) .option("compression", "gzip") .json(outputPath + e.name()); + + readPath(spark, outputPath + e.name(), resultClazz) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(inputPath + e.name()); } }); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java index 0c836a3ba9..aede9ef05b 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep1.java @@ -4,6 +4,7 @@ package eu.dnetlib.dhp.resulttocommunityfromsemrel; import static eu.dnetlib.dhp.PropagationConstant.*; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession; +import java.io.IOException; import java.util.Arrays; import java.util.List; @@ -15,6 +16,7 @@ import org.slf4j.LoggerFactory; import com.google.gson.Gson; +import eu.dnetlib.dhp.api.Utils; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import 
eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList; import eu.dnetlib.dhp.schema.oaf.Relation; @@ -26,11 +28,6 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; public class PrepareResultCommunitySetStep1 { private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySetStep1.class); - private static final String COMMUNITY_LIST_XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType')" - + " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri']" - + " and $x//CONFIGURATION/context/param[./@name='status']/text() != 'hidden'" - + " return $x//CONFIGURATION/context/@id/string()"; - /** * associates to each result the set of community contexts they are associated to; associates to each target of a * relation with allowed semantics the set of community context it could possibly inherit from the source of the @@ -64,7 +61,7 @@ public class PrepareResultCommunitySetStep1 { .toString( PrepareResultCommunitySetStep1.class .getResourceAsStream( - "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); @@ -88,10 +85,10 @@ public class PrepareResultCommunitySetStep1 { final List allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";")); log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel)); - final String isLookupUrl = parser.get("isLookUpUrl"); - log.info("isLookupUrl: {}", isLookupUrl); + final String baseURL = parser.get("baseURL"); + log.info("baseURL: {}", baseURL); - final List communityIdList = getCommunityList(isLookupUrl); + final List communityIdList = getCommunityList(baseURL); log.info("communityIdList: {}", new Gson().toJson(communityIdList)); final String resultType = 
resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase(); @@ -159,9 +156,8 @@ public class PrepareResultCommunitySetStep1 { .json(outputResultPath); } - public static List getCommunityList(final String isLookupUrl) throws ISLookUpException { - ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl); - return isLookUp.quickSearchProfile(COMMUNITY_LIST_XQUERY); + public static List getCommunityList(final String baseURL) throws IOException { + return Utils.getCommunityIdList(baseURL); } } diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java index 0ddb19a1ac..a53d3dfe32 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/PrepareResultCommunitySetStep2.java @@ -31,7 +31,7 @@ public class PrepareResultCommunitySetStep2 { .toString( PrepareResultCommunitySetStep2.class .getResourceAsStream( - "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java index f31a262307..4929c7582d 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java +++ 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttocommunityfromsemrel/SparkResultToCommunityThroughSemRelJob.java @@ -33,7 +33,7 @@ public class SparkResultToCommunityThroughSemRelJob { .toString( SparkResultToCommunityThroughSemRelJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/input_communitytoresult_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); @@ -100,6 +100,12 @@ public class SparkResultToCommunityThroughSemRelJob { .mode(SaveMode.Overwrite) .option("compression", "gzip") .json(outputPath); + + readPath(spark, outputPath, resultClazz) + .write() + .mode(SaveMode.Overwrite) + .option("compression", "gzip") + .json(inputPath); } private static MapFunction, R> contextUpdaterFn() { diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/AppendNewRelations.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/AppendNewRelations.java new file mode 100644 index 0000000000..11e9421426 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/AppendNewRelations.java @@ -0,0 +1,68 @@ + +package eu.dnetlib.dhp.resulttoorganizationfrominstrepo; + +import static eu.dnetlib.dhp.PropagationConstant.*; +import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; + +import java.io.Serializable; + +import org.apache.commons.io.IOUtils; +import org.apache.spark.SparkConf; +import org.apache.spark.sql.SaveMode; +import org.apache.spark.sql.SparkSession; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import eu.dnetlib.dhp.application.ArgumentApplicationParser; +import eu.dnetlib.dhp.schema.oaf.Relation; + +/** + * @author miriam.baglioni + * @Date 09/12/23 + */ +public class 
AppendNewRelations implements Serializable { + + private static final Logger log = LoggerFactory.getLogger(AppendNewRelations.class); + + public static void main(String[] args) throws Exception { + + String jsonConfiguration = IOUtils + .toString( + AppendNewRelations.class + .getResourceAsStream( + "/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/input_newrelation_parameters.json")); + + final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); + + parser.parseArgument(args); + + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); + log.info("isSparkSessionManaged: {}", isSparkSessionManaged); + + String inputPath = parser.get("sourcePath"); + log.info("inputPath: {}", inputPath); + + final String outputPath = parser.get("outputPath"); + log.info("outputPath: {}", outputPath); + + SparkConf conf = new SparkConf(); + + runWithSparkSession( + conf, + isSparkSessionManaged, + spark -> appendNewRelation(spark, inputPath, outputPath)); + } + + private static void appendNewRelation(SparkSession spark, String inputPath, String outputPath) { + + readPath(spark, inputPath + "publication/relation", Relation.class) + .union(readPath(spark, inputPath + "dataset/relation", Relation.class)) + .union(readPath(spark, inputPath + "otherresearchproduct/relation", Relation.class)) + .union(readPath(spark, inputPath + "software/relation", Relation.class)) + .write() + .mode(SaveMode.Append) + .option("compression", "gzip") + .json(outputPath); + } + +} diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java index 1663afb32e..57488bd209 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java +++ 
b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/PrepareResultInstRepoAssociation.java @@ -40,7 +40,7 @@ public class PrepareResultInstRepoAssociation { .toString( PrepareResultInstRepoAssociation.class .getResourceAsStream( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); @@ -52,10 +52,13 @@ public class PrepareResultInstRepoAssociation { String inputPath = parser.get("sourcePath"); log.info("inputPath: {}", inputPath); - final String datasourceOrganizationPath = parser.get("datasourceOrganizationPath"); + final String workingPath = parser.get("workingPath"); + log.info("workingPath : {}", workingPath); + + final String datasourceOrganizationPath = workingPath + "/preparedInfo/datasourceOrganization"; log.info("datasourceOrganizationPath {}: ", datasourceOrganizationPath); - final String alreadyLinkedPath = parser.get("alreadyLinkedPath"); + final String alreadyLinkedPath = workingPath + "/preparedInfo/alreadyLinked"; log.info("alreadyLinkedPath {}: ", alreadyLinkedPath); List blacklist = Optional diff --git a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java index 0757ebccd4..c8862b10c4 100644 --- a/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java +++ b/dhp-workflows/dhp-enrichment/src/main/java/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/SparkResultToOrganizationFromIstRepoJob.java @@ -47,7 +47,7 @@ public class SparkResultToOrganizationFromIstRepoJob { .toString( 
SparkResultToOrganizationFromIstRepoJob.class .getResourceAsStream( - "/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json")); + "/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json")); final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration); @@ -119,7 +119,7 @@ public class SparkResultToOrganizationFromIstRepoJob { "left_outer") .flatMap(createRelationFn(), Encoders.bean(Relation.class)) .write() - .mode(SaveMode.Append) + .mode(SaveMode.Overwrite) .option("compression", "gzip") .json(outputPath); } diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/datasourcemaster_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/datasourcemaster_parameters.json deleted file mode 100644 index 9a2eadaa7d..0000000000 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/datasourcemaster_parameters.json +++ /dev/null @@ -1,32 +0,0 @@ -[ - { - "paramName": "p", - "paramLongName": "hdfsPath", - "paramDescription": "the path where storing the sequential file", - "paramRequired": true - }, - { - "paramName": "nn", - "paramLongName": "hdfsNameNode", - "paramDescription": "the name node on hdfs", - "paramRequired": true - }, - { - "paramName": "pgurl", - "paramLongName": "postgresUrl", - "paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb", - "paramRequired": true - }, - { - "paramName": "pguser", - "paramLongName": "postgresUser", - "paramDescription": "postgres user", - "paramRequired": false - }, - { - "paramName": "pgpasswd", - "paramLongName": "postgresPassword", - "paramDescription": "postgres password", - "paramRequired": false - } -] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_eoscTag_parameters.json 
b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_eoscTag_parameters.json deleted file mode 100644 index 4c25fea019..0000000000 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_eoscTag_parameters.json +++ /dev/null @@ -1,21 +0,0 @@ -[ - { - "paramName":"s", - "paramLongName":"sourcePath", - "paramDescription": "the path of the sequencial file to read", - "paramRequired": true - }, - { - "paramName": "wp", - "paramLongName": "workingPath", - "paramDescription": "the path used to store temporary output files", - "paramRequired": true - }, - { - "paramName": "ssm", - "paramLongName": "isSparkSessionManaged", - "paramDescription": "true if the spark session is managed, false otherwise", - "paramRequired": false - } - -] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_eosc_bulkTag_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_eosc_bulkTag_parameters.json deleted file mode 100644 index 5aace346d9..0000000000 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_eosc_bulkTag_parameters.json +++ /dev/null @@ -1,41 +0,0 @@ -[ - - { - "paramName":"s", - "paramLongName":"sourcePath", - "paramDescription": "the path of the sequencial file to read", - "paramRequired": true - }, - { - "paramName": "dmp", - "paramLongName":"datasourceMapPath", - "paramDescription": "the path where the association datasource master has been stored", - "paramRequired": true - }, - { - "paramName":"tn", - "paramLongName":"resultTableName", - "paramDescription": "the name of the result table we are currently working on", - "paramRequired": true - }, - { - "paramName": "wp", - "paramLongName": "workingPath", - "paramDescription": "the path used to store temporary output files", - "paramRequired": true - }, - { - "paramName": "ssm", - "paramLongName": "isSparkSessionManaged", - 
"paramDescription": "true if the spark session is managed, false otherwise", - "paramRequired": false - }, - { - - "paramName": "rt", - "paramLongName": "resultType", - "paramDescription": "the result type", - "paramRequired": true - } - -] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/oozie_app/workflow.xml deleted file mode 100644 index 16c8c4e19d..0000000000 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/oozie_app/workflow.xml +++ /dev/null @@ -1,197 +0,0 @@ - - - - sourcePath - the source path - - - outputPath - sets the outputPath - - - - - ${jobTracker} - ${nameNode} - - - oozie.action.sharelib.for.spark - ${oozieActionShareLibForSpark2} - - - - - - - - Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - - ${wf:conf('resumeFrom') eq 'PrepareInfo'} - - - - - - - - - - - - - - - - - - - - - - - - - - - ${nameNode}/${sourcePath}/relation - ${nameNode}/${outputPath}/relation - - - - - - - - ${nameNode}/${sourcePath}/publication - ${nameNode}/${outputPath}/publication - - - - - - - - ${nameNode}/${sourcePath}/dataset - ${nameNode}/${outputPath}/dataset - - - - - - - - ${nameNode}/${sourcePath}/otherresearchproduct - ${nameNode}/${outputPath}/otherresearchproduct - - - - - - - - ${nameNode}/${sourcePath}/software - ${nameNode}/${outputPath}/software - - - - - - - - ${nameNode}/${sourcePath}/organization - ${nameNode}/${outputPath}/organization - - - - - - - - ${nameNode}/${sourcePath}/project - ${nameNode}/${outputPath}/project - - - - - - - - ${nameNode}/${sourcePath}/datasource - ${nameNode}/${outputPath}/datasource - - - - - - - - - - - yarn - cluster - PrepareResultOrganizationAssociation - eu.dnetlib.dhp.entitytoorganizationfromsemrel.PrepareInfo - 
dhp-enrichment-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - - --graphPath${sourcePath} - --hive_metastore_uris${hive_metastore_uris} - --leavesPath${workingDir}/preparedInfo/leavesPath - --childParentPath${workingDir}/preparedInfo/childParentPath - --resultOrgPath${workingDir}/preparedInfo/resultOrgPath - --projectOrganizationPath${workingDir}/preparedInfo/projectOrganizationPath - --relationPath${workingDir}/preparedInfo/relation - - - - - - - - yarn - cluster - resultToOrganizationFromSemRel - eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkResultToOrganizationFromSemRel - dhp-enrichment-${projectVersion}.jar - - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} - --driver-memory=${sparkDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.dynamicAllocation.enabled=true - --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - --conf spark.sql.shuffle.partitions=3840 - - --relationPath${workingDir}/preparedInfo/relation - --outputPath${outputPath}/relation - --leavesPath${workingDir}/preparedInfo/leavesPath - --childParentPath${workingDir}/preparedInfo/childParentPath - --resultOrgPath${workingDir}/preparedInfo/resultOrgPath - --projectOrganizationPath${workingDir}/preparedInfo/projectOrganizationPath - --hive_metastore_uris${hive_metastore_uris} - --workingDir${workingDir}/working - --iterations${iterations} - 
- - - - - - - - - \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/job.properties b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/job.properties new file mode 100644 index 0000000000..4cb759343c --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/job.properties @@ -0,0 +1,28 @@ +sourcePath=/tmp/beta_provision/graph/09_graph_dedup_enriched +resumeFrom=CountryPropagation +allowedsemrelsorcidprop=isSupplementedBy;isSupplementTo +allowedsemrelsresultproject=isSupplementedBy;isSupplementTo +allowedsemrelscommunitysemrel=isSupplementedBy;isSupplementTo +datasourceWhitelistForCountryPropagation=10|opendoar____::16e6a3326dd7d868cbc926602a61e4d0;10|openaire____::fdb035c8b3e0540a8d9a561a6c44f4de;10|eurocrisdris::fe4903425d9040f680d8610d9079ea14;10|openaire____::5b76240cc27a58c6f7ceef7d8c36660e;10|openaire____::172bbccecf8fca44ab6a6653e84cb92a;10|openaire____::149c6590f8a06b46314eed77bfca693f;10|eurocrisdris::a6026877c1a174d60f81fd71f62df1c1;10|openaire____::4692342f0992d91f9e705c26959f09e0;10|openaire____::8d529dbb05ec0284662b391789e8ae2a;10|openaire____::345c9d171ef3c5d706d08041d506428c;10|opendoar____::1c1d4df596d01da60385f0bb17a4a9e0;10|opendoar____::7a614fd06c325499f1680b9896beedeb;10|opendoar____::1ee3dfcd8a0645a25a35977997223d22;10|opendoar____::d296c101daa88a51f6ca8cfc1ac79b50;10|opendoar____::798ed7d4ee7138d49b8828958048130a;10|openaire____::c9d2209ecc4d45ba7b4ca7597acb88a2;10|eurocrisdris::c49e0fe4b9ba7b7fab717d1f0f0a674d;10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539;10|eurocrisdris::432ca599953ff50cd4eeffe22faf3e48 +#allowedtypes=pubsrepository::institutional +allowedtypes=Institutional +outputPath=/tmp/miriam/enrichment_one_step +pathMap ={"author":"$['author'][*]['fullname']", \ + "title":"$['title'][*]['value']",\ + "orcid":"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid')]['value']" ,\ + 
"orcid_pending":"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid_pending')]['value']" ,\ + "contributor" : "$['contributor'][*]['value']",\ + "description" : "$['description'][*]['value']",\ + "subject" :"$['subject'][*]['value']" , \ + "fos" : "$['subject'][?(@['qualifier']['classid']=='FOS')].value" ,\ + "sdg" : "$['subject'][?(@['qualifier']['classid']=='SDG')].value",\ + "journal":"$['journal'].name",\ + "hostedby":"$['instance'][*]['hostedby']['key']",\ + "collectedfrom":"$['instance'][*]['collectedfrom']['key']",\ + "publisher":"$['publisher'].value",\ + "publicationyear":"$['dateofacceptance'].value"} +blacklist=empty +allowedpids=orcid;orcid_pending +baseURL = https://services.openaire.eu/openaire/community/ +iterations=1 + diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/config-default.xml new file mode 100644 index 0000000000..d262cb6e05 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/config-default.xml @@ -0,0 +1,30 @@ + + + jobTracker + yarnRM + + + nameNode + hdfs://nameservice1 + + + oozie.use.system.libpath + true + + + hiveMetastoreUris + thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 + + + hiveJdbcUrl + jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000 + + + hiveDbName + openaire + + + oozie.launcher.mapreduce.user.classpath.first + true + + diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/import.txt b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/import.txt new file mode 100644 index 0000000000..b202594148 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/import.txt @@ -0,0 +1,10 @@ +## This is a classpath-based import file (this header is required) +orcid_propagation classpath 
eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app +bulk_tagging classpath eu/dnetlib/dhp/wf/subworkflows/bulktag/oozie_app +affiliation_inst_repo classpath eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/oozie_app +entity_semantic_relation classpath eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/oozie_app +community_organization classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromorganization/oozie_app +result_project classpath eu/dnetlib/dhp/wf/subworkflows/projecttoresult/oozie_app +community_project classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/oozie_app +community_sem_rel classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/oozie_app +country_propagation classpath eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/workflow.xml new file mode 100644 index 0000000000..8e91707b6e --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/main/oozie_app/workflow.xml @@ -0,0 +1,324 @@ + + + + + sourcePath + the source path + + + allowedsemrelsorcidprop + the semantic relationships allowed for propagation + + + allowedsemrelsresultproject + the allowed semantics + + + allowedsemrelscommunitysemrel + the semantic relationships allowed for propagation + + + datasourceWhitelistForCountryPropagation + the white list + + + allowedtypes + the allowed types + + + outputPath + the output path + + + pathMap + the json path associated to each selection field + + + blacklist + list of datasources in blacklist for the affiliation from instrepo propagation + + + + hiveDbName + the target hive database name + + + hiveJdbcUrl + hive server jdbc url + + + hiveMetastoreUris + hive server metastore URIs + + + sparkDriverMemory + memory for 
driver process + + + sparkExecutorMemory + memory for individual executor + + + sparkExecutorCores + number of cores used by single executor + + + oozieActionShareLibForSpark2 + oozie action sharelib for spark 2.* + + + spark2ExtraListeners + com.cloudera.spark.lineage.NavigatorAppListener + spark 2.* extra listeners classname + + + spark2SqlQueryExecutionListeners + com.cloudera.spark.lineage.NavigatorQueryListener + spark 2.* sql query execution listeners classname + + + spark2YarnHistoryServerAddress + spark 2.* yarn history server address + + + spark2EventLogDir + spark 2.* event log dir location + + + + + ${jobTracker} + ${nameNode} + + + mapreduce.job.queuename + ${queueName} + + + oozie.launcher.mapred.job.queue.name + ${oozieLauncherQueueName} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + ${wf:conf('resumeFrom') eq 'BulkTagging'} + ${wf:conf('resumeFrom') eq 'AffiliationInstitutionalRepository'} + ${wf:conf('resumeFrom') eq 'AffiliationSemanticRelation'} + ${wf:conf('resumeFrom') eq 'CommunityOrganization'} + ${wf:conf('resumeFrom') eq 'ResultProject'} + ${wf:conf('resumeFrom') eq 'CommunityProject'} + ${wf:conf('resumeFrom') eq 'CommunitySemanticRelation'} + ${wf:conf('resumeFrom') eq 'CountryPropagation'} + + + + + + + + ${wf:appPath()}/orcid_propagation + + + + + sourcePath + ${sourcePath} + + + allowedsemrels + ${allowedsemrelsorcidprop} + + + outputPath + ${outputPath} + + + + + + + + + + ${wf:appPath()}/bulk_tagging + + + + + sourcePath + ${outputPath} + + + baseURL + ${baseURL} + + + pathMap + ${pathMap} + + + + + + + + + + ${wf:appPath()}/affiliation_inst_repo + + + + + sourcePath + ${outputPath} + + + blacklist + ${blacklist} + + + + + + + + + + ${wf:appPath()}/entity_semantic_relation + + + + + sourcePath + ${outputPath} + + + iterations + ${iterations} + + + + + + + + + + ${wf:appPath()}/community_organization + + + + + 
sourcePath + ${outputPath} + + + baseURL + ${baseURL} + + + + + + + + + + ${wf:appPath()}/result_project + + + + + sourcePath + ${outputPath} + + + allowedsemrels + ${allowedsemrelsresultproject} + + + + + + + + + + ${wf:appPath()}/community_project + + + + + sourcePath + ${outputPath} + + + + + + + + + + ${wf:appPath()}/community_sem_rel + + + + + sourcePath + ${outputPath} + + + allowedsemrels + ${allowedsemrelscommunitysemrel} + + + baseURL + ${baseURL} + + + + + + + + + + ${wf:appPath()}/country_propagation + + + + + sourcePath + ${outputPath} + + + whitelist + ${datasourceWhitelistForCountryPropagation} + + + allowedtypes + ${allowedtypes} + + + + + + + + + + diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/bulktag/input_bulkTag_parameters.json similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/bulktag/input_bulkTag_parameters.json diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/bulktag/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/bulktag/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/bulktag/oozie_app/workflow.xml similarity index 54% rename from 
dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/bulktag/oozie_app/workflow.xml index 03373eda0b..6c51634484 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/bulktag/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/bulktag/oozie_app/workflow.xml @@ -8,14 +8,11 @@ pathMap the json path associated to each selection field - - outputPath - the output path - baseURL - the community API base URL + The URL to access the community APIs + @@ -37,63 +34,18 @@ - - + + - + - - - - - - - - - - ${nameNode}/${sourcePath}/relation - ${nameNode}/${outputPath}/relation - - - - - - - - ${nameNode}/${sourcePath}/organization - ${nameNode}/${outputPath}/organization - - - - - - - - ${nameNode}/${sourcePath}/project - ${nameNode}/${outputPath}/project - - - - - - - - ${nameNode}/${sourcePath}/datasource - ${nameNode}/${outputPath}/datasource - - - - - - - yarn-cluster cluster - bulkTagging-result + bulkTagging-publication eu.dnetlib.dhp.bulktag.SparkBulkTagJob dhp-enrichment-${projectVersion}.jar @@ -107,7 +59,7 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --sourcePath${sourcePath}/ - --outputPath${outputPath}/ + --outputPath${workingDir}/bulktag/ --pathMap${pathMap} --baseURL${baseURL} @@ -115,6 +67,8 @@ + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_countrypropagation_parameters.json similarity index 99% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json rename to 
dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_countrypropagation_parameters.json index f217e24582..d3cde8b747 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_countrypropagation_parameters.json @@ -29,4 +29,4 @@ "paramDescription": "true if the spark session is managed, false otherwise", "paramRequired": false } -] \ No newline at end of file +] diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareassoc_parameters.json similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareassoc_parameters.json diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareresultcountry_parameters.json similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareresultcountry_parameters.json diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/config-default.xml 
b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app/workflow.xml similarity index 82% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app/workflow.xml index 271ccbf722..81d6dc3dc1 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/countrypropagation/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app/workflow.xml @@ -12,11 +12,6 @@ allowedtypes the allowed types - - outputPath - the output path - - @@ -38,57 +33,13 @@ - - + + - + - - - - - - - - - - ${nameNode}/${sourcePath}/relation - ${nameNode}/${outputPath}/relation - - - - - - - - ${nameNode}/${sourcePath}/organization - ${nameNode}/${outputPath}/organization - - - - - - - - ${nameNode}/${sourcePath}/project - ${nameNode}/${outputPath}/project - - - - - - - - ${nameNode}/${sourcePath}/datasource - ${nameNode}/${outputPath}/datasource - - - - - - @@ -112,18 +63,18 @@ --allowedtypes${allowedtypes} --outputPath${workingDir}/preparedInfo - + - - - - - + + + + + - + yarn cluster @@ -153,7 +104,7 @@ - + yarn cluster @@ -183,7 +134,7 @@ - + yarn cluster @@ -213,7 +164,7 @@ - + yarn cluster @@ -243,16 +194,16 @@ - + - - - - - + + + + + - + yarn cluster @@ 
-275,13 +226,13 @@ --sourcePath${sourcePath}/publication --preparedInfoPath${workingDir}/publication --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication - --outputPath${outputPath}/publication + --outputPath${workingDir}/country/publication - + yarn cluster @@ -304,13 +255,13 @@ --sourcePath${sourcePath}/dataset --preparedInfoPath${workingDir}/dataset --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset - --outputPath${outputPath}/dataset + --outputPath${workingDir}/country/dataset - + yarn cluster @@ -333,13 +284,13 @@ --sourcePath${sourcePath}/otherresearchproduct --preparedInfoPath${workingDir}/otherresearchproduct --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --outputPath${outputPath}/otherresearchproduct + --outputPath${workingDir}/country/otherresearchproduct - + yarn cluster @@ -362,14 +313,21 @@ --sourcePath${sourcePath}/software --preparedInfoPath${workingDir}/software --resultTableNameeu.dnetlib.dhp.schema.oaf.Software - --outputPath${outputPath}/software - + --outputPath${workingDir}/country/software + - - + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_preparation_parameter.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/input_preparation_parameter.json similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_preparation_parameter.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/input_preparation_parameter.json diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_propagation_parameter.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/input_propagation_parameter.json similarity 
index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_propagation_parameter.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/input_propagation_parameter.json diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/entitytoorganizationfromsemrel/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/oozie_app/workflow.xml new file mode 100644 index 0000000000..05824d209b --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/oozie_app/workflow.xml @@ -0,0 +1,101 @@ + + + + sourcePath + the source path + + + + + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + + + + + Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + + + + + + + + + + + yarn + cluster + PrepareResultProjectOrganizationAssociation + eu.dnetlib.dhp.entitytoorganizationfromsemrel.PrepareInfo + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf 
spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --graphPath${sourcePath} + --hive_metastore_uris${hive_metastore_uris} + --leavesPath${workingDir}/entitiesSemanticRelation/preparedInfo/leavesPath + --childParentPath${workingDir}/entitiesSemanticRelation/preparedInfo/childParentPath + --resultOrgPath${workingDir}/entitiesSemanticRelation/preparedInfo/resultOrgPath + --projectOrganizationPath${workingDir}/entitiesSemanticRelation/preparedInfo/projectOrganizationPath + --relationPath${workingDir}/entitiesSemanticRelation/preparedInfo/relation + + + + + + + + yarn + cluster + entityToOrganizationFromSemRel + eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkEntityToOrganizationFromSemRel + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + --conf spark.sql.shuffle.partitions=3840 + + --relationPath${workingDir}/entitiesSemanticRelation/preparedInfo/relation + --outputPath${sourcePath}/relation + --leavesPath${workingDir}/entitiesSemanticRelation/preparedInfo/leavesPath + --childParentPath${workingDir}/entitiesSemanticRelation/preparedInfo/childParentPath + --resultOrgPath${workingDir}/entitiesSemanticRelation/preparedInfo/resultOrgPath + --projectOrganizationPath${workingDir}/entitiesSemanticRelation/preparedInfo/projectOrganizationPath + --hive_metastore_uris${hive_metastore_uris} + 
--workingDir${workingDir}/entitiesSemanticRelation/working + --iterations${iterations} + + + + + + + + + + \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json similarity index 88% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json index d8aa7eb9a9..3cbaa23bb6 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json @@ -11,12 +11,6 @@ "paramDescription": "true if the new version of the graph must be saved", "paramRequired": false }, - { - "paramName":"h", - "paramLongName":"hive_metastore_uris", - "paramDescription": "the hive metastore uris", - "paramRequired": true - }, { "paramName": "out", "paramLongName": "outputPath", diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json diff --git 
a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app/workflow.xml similarity index 95% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app/workflow.xml index 5f52c16585..483a805b10 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/orcidtoresultfromsemrel/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app/workflow.xml @@ -225,8 +225,8 @@ --conf 
spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - --sourcePath${workingDir}/preparedInfo/targetOrcidAssoc - --outputPath${workingDir}/preparedInfo/mergedOrcidAssoc + --sourcePath${workingDir}/orcidprop + --outputPath${workingDir}/orcidprop/mergedOrcidAssoc @@ -261,9 +261,8 @@ --conf spark.hadoop.mapreduce.reduce.speculative=false --conf spark.sql.shuffle.partitions=3840 - --possibleUpdatesPath${workingDir}/preparedInfo/mergedOrcidAssoc + --possibleUpdatesPath${workingDir}/orcidprop/mergedOrcidAssoc --sourcePath${sourcePath}/publication - --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication --outputPath${outputPath}/publication @@ -292,9 +291,8 @@ --conf spark.hadoop.mapreduce.map.speculative=false --conf spark.hadoop.mapreduce.reduce.speculative=false - --possibleUpdatesPath${workingDir}/preparedInfo/mergedOrcidAssoc + --possibleUpdatesPath${workingDir}/orcidprop/mergedOrcidAssoc --sourcePath${sourcePath}/dataset - --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset --outputPath${outputPath}/dataset @@ -323,9 +321,8 @@ --conf spark.hadoop.mapreduce.map.speculative=false --conf spark.hadoop.mapreduce.reduce.speculative=false - --possibleUpdatesPath${workingDir}/preparedInfo/mergedOrcidAssoc + --possibleUpdatesPath${workingDir}/orcidprop/mergedOrcidAssoc --sourcePath${sourcePath}/otherresearchproduct - --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct --outputPath${outputPath}/otherresearchproduct @@ -354,9 +351,8 @@ --conf spark.hadoop.mapreduce.map.speculative=false --conf spark.hadoop.mapreduce.reduce.speculative=false - --possibleUpdatesPath${workingDir}/preparedInfo/mergedOrcidAssoc + --possibleUpdatesPath${workingDir}/orcidprop/mergedOrcidAssoc --sourcePath${sourcePath}/software - --hive_metastore_uris${hive_metastore_uris} 
--resultTableNameeu.dnetlib.dhp.schema.oaf.Software --outputPath${outputPath}/software @@ -365,6 +361,15 @@ + + + + + + + + + diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/projecttoresult/input_prepareprojecttoresult_parameters.json similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/projecttoresult/input_prepareprojecttoresult_parameters.json diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/projecttoresult/input_projecttoresult_parameters.json similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/projecttoresult/input_projecttoresult_parameters.json diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/projecttoresult/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/projecttoresult/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml 
b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/projecttoresult/oozie_app/workflow.xml similarity index 50% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/projecttoresult/oozie_app/workflow.xml index 9e91c06fb3..f0db9c777f 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/projecttoresult/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/projecttoresult/oozie_app/workflow.xml @@ -8,10 +8,7 @@ allowedsemrels the allowed semantics - - outputPath - the output path - + @@ -33,98 +30,13 @@ - - + + - + - - - - - - - - - - - - - - ${nameNode}/${sourcePath}/relation - ${nameNode}/${outputPath}/relation - - - - - - - - ${nameNode}/${sourcePath}/publication - ${nameNode}/${outputPath}/publication - - - - - - - - ${nameNode}/${sourcePath}/dataset - ${nameNode}/${outputPath}/dataset - - - - - - - - ${nameNode}/${sourcePath}/otherresearchproduct - ${nameNode}/${outputPath}/otherresearchproduct - - - - - - - - ${nameNode}/${sourcePath}/software - ${nameNode}/${outputPath}/software - - - - - - - - ${nameNode}/${sourcePath}/organization - ${nameNode}/${outputPath}/organization - - - - - - - - ${nameNode}/${sourcePath}/project - ${nameNode}/${outputPath}/project - - - - - - - - ${nameNode}/${sourcePath}/datasource - ${nameNode}/${outputPath}/datasource - - - - - - - yarn @@ -144,8 +56,8 @@ --sourcePath${sourcePath}/relation --allowedsemrels${allowedsemrels} --hive_metastore_uris${hive_metastore_uris} - --potentialUpdatePath${workingDir}/preparedInfo/potentialUpdates - --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --potentialUpdatePath${workingDir}/resultproject/preparedInfo/potentialUpdates + --alreadyLinkedPath${workingDir}/resultproject/preparedInfo/alreadyLinked @@ -169,16 +81,22 @@ --conf 
spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - --saveGraph${saveGraph} --hive_metastore_uris${hive_metastore_uris} - --outputPath${outputPath}/relation - --potentialUpdatePath${workingDir}/preparedInfo/potentialUpdates - --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --outputPath${sourcePath}/relation + --potentialUpdatePath${workingDir}/resultproject/preparedInfo/potentialUpdates + --alreadyLinkedPath${workingDir}/resultproject/preparedInfo/alreadyLinked + + + + + + + + - \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromorganization/input_communitytoresult_parameters.json similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromorganization/input_communitytoresult_parameters.json diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json diff --git 
a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromorganization/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromorganization/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromorganization/oozie_app/workflow.xml similarity index 62% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromorganization/oozie_app/workflow.xml index e342bce231..6aeffb4574 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromorganization/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromorganization/oozie_app/workflow.xml @@ -4,13 +4,9 @@ sourcePath the source path - - outputPath - the output path - baseURL - the community API base URL + the baseURL from where to reach the community APIs @@ -33,58 +29,13 @@ - - + + - + - - - - - - - - - - ${nameNode}/${sourcePath}/relation - ${nameNode}/${outputPath}/relation - - - - - - - - ${nameNode}/${sourcePath}/organization - ${nameNode}/${outputPath}/organization - - - - - - - - ${nameNode}/${sourcePath}/project - ${nameNode}/${outputPath}/project - - - - - - - - ${nameNode}/${sourcePath}/datasource - 
${nameNode}/${outputPath}/datasource - - - - - - - yarn @@ -102,10 +53,11 @@ --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/relation - --outputPath${workingDir}/preparedInfo/resultCommunityList + --outputPath${workingDir}/communityorganization/preparedInfo/resultCommunityList --hive_metastore_uris${hive_metastore_uris} --baseURL${baseURL} @@ -117,7 +69,7 @@ yarn cluster - community2resultfromorganization + community2resultfromorganization-Publication eu.dnetlib.dhp.resulttocommunityfromorganization.SparkResultToCommunityFromOrganizationJob dhp-enrichment-${projectVersion}.jar @@ -132,16 +84,14 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList + --preparedInfoPath${workingDir}/communityorganization/preparedInfo/resultCommunityList --sourcePath${sourcePath}/ - --outputPath${outputPath}/ + --outputPath${workingDir}/communityorganization/resulttocommunityfromorganization/ - - \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromproject/input_communitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/input_communitytoresult_parameters.json similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromproject/input_communitytoresult_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/input_communitytoresult_parameters.json diff --git 
a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromproject/input_preparecommunitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/input_preparecommunitytoresult_parameters.json similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromproject/input_preparecommunitytoresult_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/input_preparecommunitytoresult_parameters.json diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromproject/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromproject/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromproject/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/oozie_app/workflow.xml similarity index 63% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromproject/oozie_app/workflow.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/oozie_app/workflow.xml index d0784c94d3..dd845064b2 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromproject/oozie_app/workflow.xml +++ 
b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/oozie_app/workflow.xml @@ -4,13 +4,9 @@ sourcePath the source path - - outputPath - the output path - baseURL - the community API base URL + the base URL to use to select the right community APIs @@ -30,61 +26,15 @@ Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - + + - + - - - - - - - - - - ${nameNode}/${sourcePath}/relation - ${nameNode}/${outputPath}/relation - - - - - - - - ${nameNode}/${sourcePath}/organization - ${nameNode}/${outputPath}/organization - - - - - - - - ${nameNode}/${sourcePath}/project - ${nameNode}/${outputPath}/project - - - - - - - - ${nameNode}/${sourcePath}/datasource - ${nameNode}/${outputPath}/datasource - - - - - - - yarn @@ -106,7 +56,7 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/relation - --outputPath${workingDir}/preparedInfo/resultCommunityList + --outputPath${workingDir}/communitythroughproject/preparedInfo/resultCommunityList --baseURL${baseURL} @@ -132,9 +82,9 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - --preparedInfoPath${workingDir}/preparedInfo/resultCommunityList + --preparedInfoPath${workingDir}/communitythroughproject/preparedInfo/resultCommunityList --sourcePath${sourcePath}/ - --outputPath${outputPath}/ + --outputPath${workingDir}/communitythroughproject/ diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/input_communitytoresult_parameters.json similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json rename to 
dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/input_communitytoresult_parameters.json diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json similarity index 80% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json index 8c99da673c..c6389ec8da 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json @@ -1,7 +1,7 @@ [ { - "paramName":"is", - "paramLongName":"isLookUpUrl", + "paramName":"bu", + "paramLongName":"baseURL", "paramDescription": "URL of the isLookUp Service", "paramRequired": true }, @@ -36,9 +36,9 @@ 
"paramRequired": true }, { - "paramName":"tn", - "paramLongName":"resultTableName", - "paramDescription": "the name of the result table we are currently working on", - "paramRequired": true - } + "paramName":"tn", + "paramLongName":"resultTableName", + "paramDescription": "the name of the result table we are currently working on", + "paramRequired": true + } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/oozie_app/workflow.xml similarity index 73% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/oozie_app/workflow.xml index 81b51443c6..773c7fba76 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttocommunityfromsemrel/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/oozie_app/workflow.xml @@ -9,7 +9,7 @@ the semantic relationships allowed for propagation - isLookUpUrl + baseURL the isLookup service endpoint @@ -26,66 +26,13 @@ - - + + - + - - - - - - - - - - ${jobTracker} - 
${nameNode} - ${nameNode}/${sourcePath}/relation - ${nameNode}/${outputPath}/relation - - - - - - - - ${jobTracker} - ${nameNode} - ${nameNode}/${sourcePath}/organization - ${nameNode}/${outputPath}/organization - - - - - - - - ${jobTracker} - ${nameNode} - ${nameNode}/${sourcePath}/project - ${nameNode}/${outputPath}/project - - - - - - - - ${jobTracker} - ${nameNode} - ${nameNode}/${sourcePath}/datasource - ${nameNode}/${outputPath}/datasource - - - - - - - @@ -101,8 +48,10 @@ eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1 dhp-enrichment-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} + --executor-cores=6 + --executor-memory=5G + --conf spark.executor.memoryOverhead=3g + --conf spark.sql.shuffle.partitions=3284 --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} @@ -114,9 +63,9 @@ --sourcePath${sourcePath} --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication - --outputPath${workingDir}/preparedInfo/targetCommunityAssoc + --outputPath${workingDir}/communitysemrel/preparedInfo/targetCommunityAssoc --allowedsemrels${allowedsemrels} - --isLookUpUrl${isLookUpUrl} + --baseURL${baseURL} @@ -130,8 +79,10 @@ eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1 dhp-enrichment-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} + --executor-cores=6 + --executor-memory=5G + --conf spark.executor.memoryOverhead=3g + --conf spark.sql.shuffle.partitions=3284 --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} @@ -143,9 +94,9 @@ --sourcePath${sourcePath} --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset - 
--outputPath${workingDir}/preparedInfo/targetCommunityAssoc + --outputPath${workingDir}/communitysemrel/preparedInfo/targetCommunityAssoc --allowedsemrels${allowedsemrels} - --isLookUpUrl${isLookUpUrl} + --baseURL${baseURL} @@ -159,8 +110,10 @@ eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1 dhp-enrichment-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} + --executor-cores=6 + --executor-memory=5G + --conf spark.executor.memoryOverhead=3g + --conf spark.sql.shuffle.partitions=3284 --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} @@ -172,9 +125,9 @@ --sourcePath${sourcePath} --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - --outputPath${workingDir}/preparedInfo/targetCommunityAssoc + --outputPath${workingDir}/communitysemrel/preparedInfo/targetCommunityAssoc --allowedsemrels${allowedsemrels} - --isLookUpUrl${isLookUpUrl} + --baseURL${baseURL} @@ -188,8 +141,10 @@ eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep1 dhp-enrichment-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} + --executor-cores=6 + --executor-memory=5G + --conf spark.executor.memoryOverhead=3g + --conf spark.sql.shuffle.partitions=3284 --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} @@ -201,9 +156,9 @@ --sourcePath${sourcePath} --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Software - --outputPath${workingDir}/preparedInfo/targetCommunityAssoc + --outputPath${workingDir}/communitysemrel/preparedInfo/targetCommunityAssoc --allowedsemrels${allowedsemrels} - --isLookUpUrl${isLookUpUrl} + --baseURL${baseURL} @@ 
-219,8 +174,10 @@ eu.dnetlib.dhp.resulttocommunityfromsemrel.PrepareResultCommunitySetStep2 dhp-enrichment-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} + --executor-cores=6 + --executor-memory=5G + --conf spark.executor.memoryOverhead=3g + --conf spark.sql.shuffle.partitions=3284 --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} @@ -229,8 +186,8 @@ --conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - --sourcePath${workingDir}/preparedInfo/targetCommunityAssoc - --outputPath${workingDir}/preparedInfo/mergedCommunityAssoc + --sourcePath${workingDir}/communitysemrel/preparedInfo/targetCommunityAssoc + --outputPath${workingDir}/communitysemrel/preparedInfo/mergedCommunityAssoc @@ -251,8 +208,10 @@ eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob dhp-enrichment-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} + --executor-cores=6 + --executor-memory=5G + --conf spark.executor.memoryOverhead=3g + --conf spark.sql.shuffle.partitions=3284 --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} @@ -261,12 +220,12 @@ --conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - --preparedInfoPath${workingDir}/preparedInfo/mergedCommunityAssoc + --preparedInfoPath${workingDir}/communitysemrel/preparedInfo/mergedCommunityAssoc --sourcePath${sourcePath}/publication --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication - --outputPath${outputPath}/publication - --saveGraph${saveGraph} + --outputPath${workingDir}/communitysemrel/publication + @@ -280,8 +239,10 
@@ eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob dhp-enrichment-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} + --executor-cores=6 + --executor-memory=5G + --conf spark.executor.memoryOverhead=3g + --conf spark.sql.shuffle.partitions=3284 --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} @@ -290,12 +251,12 @@ --conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - --preparedInfoPath${workingDir}/preparedInfo/mergedCommunityAssoc + --preparedInfoPath${workingDir}/communitysemrel/preparedInfo/mergedCommunityAssoc --sourcePath${sourcePath}/dataset --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset - --outputPath${outputPath}/dataset - --saveGraph${saveGraph} + --outputPath${workingDir}/communitysemrel/dataset + @@ -309,8 +270,10 @@ eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob dhp-enrichment-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} + --executor-cores=6 + --executor-memory=5G + --conf spark.executor.memoryOverhead=3g + --conf spark.sql.shuffle.partitions=3284 --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} @@ -319,12 +282,12 @@ --conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - --preparedInfoPath${workingDir}/preparedInfo/mergedCommunityAssoc + --preparedInfoPath${workingDir}/communitysemrel/preparedInfo/mergedCommunityAssoc --sourcePath${sourcePath}/otherresearchproduct --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct - 
--outputPath${outputPath}/otherresearchproduct - --saveGraph${saveGraph} + --outputPath${workingDir}/communitysemrel/otherresearchproduct + @@ -338,8 +301,10 @@ eu.dnetlib.dhp.resulttocommunityfromsemrel.SparkResultToCommunityThroughSemRelJob dhp-enrichment-${projectVersion}.jar - --executor-cores=${sparkExecutorCores} - --executor-memory=${sparkExecutorMemory} + --executor-cores=6 + --executor-memory=5G + --conf spark.executor.memoryOverhead=3g + --conf spark.sql.shuffle.partitions=3284 --driver-memory=${sparkDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} @@ -348,12 +313,12 @@ --conf spark.dynamicAllocation.enabled=true --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} - --preparedInfoPath${workingDir}/preparedInfo/mergedCommunityAssoc + --preparedInfoPath${workingDir}/communitysemrel/preparedInfo/mergedCommunityAssoc --sourcePath${sourcePath}/software --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Software - --outputPath${outputPath}/software - --saveGraph${saveGraph} + --outputPath${workingDir}/communitysemrel/software + diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/input_newrelation_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/input_newrelation_parameters.json new file mode 100644 index 0000000000..5fe92cff13 --- /dev/null +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/input_newrelation_parameters.json @@ -0,0 +1,20 @@ +[ + { + "paramName":"s", + "paramLongName":"sourcePath", + "paramDescription": "the path of the sequencial file to read", + "paramRequired": true + }, + + { + "paramName": "ssm", + "paramLongName": "isSparkSessionManaged", + "paramDescription": "the path where 
prepared info have been stored", + "paramRequired": false + },{ + "paramName": "o", + "paramLongName": "outputPath", + "paramDescription": "the path where the new relations are written", + "paramRequired": false +} +] \ No newline at end of file diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json similarity index 66% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json index 2f00bacae3..3f4b1d151b 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json +++ b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json @@ -11,16 +11,11 @@ "paramDescription": "the hive metastore uris", "paramRequired": true }, + { - "paramName":"dop", - "paramLongName":"datasourceOrganizationPath", - "paramDescription": "path where to store/find association from datasource and organization", - "paramRequired": true - }, - { - "paramName":"alp", - "paramLongName":"alreadyLinkedPath", - "paramDescription": "path where to store/find already linked results and organizations", + "paramName":"wp", + "paramLongName":"workingPath", + "paramDescription": "the working path", "paramRequired": true }, { diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json
b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/config-default.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/oozie_app/config-default.xml similarity index 100% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/config-default.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/oozie_app/config-default.xml diff --git a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/oozie_app/workflow.xml similarity index 67% rename from dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml rename to dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/oozie_app/workflow.xml index edfff8817d..e963453da9 100644 --- a/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/oozie_app/workflow.xml +++ 
b/dhp-workflows/dhp-enrichment/src/main/resources/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/oozie_app/workflow.xml @@ -5,8 +5,8 @@ the source path - outputPath - sets the outputPath + blacklist + The list of institutional repositories that should not be used for the propagation @@ -29,97 +29,13 @@ - - + + - + - - - - - - - - - - - - - - ${nameNode}/${sourcePath}/relation - ${nameNode}/${outputPath}/relation - - - - - - - - ${nameNode}/${sourcePath}/publication - ${nameNode}/${outputPath}/publication - - - - - - - - ${nameNode}/${sourcePath}/dataset - ${nameNode}/${outputPath}/dataset - - - - - - - - ${nameNode}/${sourcePath}/otherresearchproduct - ${nameNode}/${outputPath}/otherresearchproduct - - - - - - - - ${nameNode}/${sourcePath}/software - ${nameNode}/${outputPath}/software - - - - - - - - ${nameNode}/${sourcePath}/organization - ${nameNode}/${outputPath}/organization - - - - - - - - ${nameNode}/${sourcePath}/project - ${nameNode}/${outputPath}/project - - - - - - - - ${nameNode}/${sourcePath}/datasource - ${nameNode}/${outputPath}/datasource - - - - - - @@ -138,10 +54,9 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --sourcePath${sourcePath} - --hive_metastore_uris${hive_metastore_uris} - --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization - --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --workingPath${workingDir}/affiliationInstRepo --blacklist${blacklist} + --hive_metastore_uris${hive_metastore_uris} @@ -173,9 +88,9 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/publication - --outputPath${outputPath}/relation - --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization - --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --outputPath${workingDir}/affiliationInstRepo/publication/relation + --datasourceOrganizationPath${workingDir}/affiliationInstRepo/preparedInfo/datasourceOrganization + 
--alreadyLinkedPath${workingDir}/affiliationInstRepo/preparedInfo/alreadyLinked --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Publication @@ -202,9 +117,9 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/dataset - --outputPath${outputPath}/relation - --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization - --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --outputPath${workingDir}/affiliationInstRepo/dataset/relation + --datasourceOrganizationPath${workingDir}/affiliationInstRepo/preparedInfo/datasourceOrganization + --alreadyLinkedPath${workingDir}/affiliationInstRepo/preparedInfo/alreadyLinked --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Dataset @@ -231,9 +146,9 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/otherresearchproduct - --outputPath${outputPath}/relation - --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization - --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --outputPath${workingDir}/affiliationInstRepo/otherresearchproduct/relation + --datasourceOrganizationPath${workingDir}/affiliationInstRepo/preparedInfo/datasourceOrganization + --alreadyLinkedPath${workingDir}/affiliationInstRepo/preparedInfo/alreadyLinked --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.OtherResearchProduct @@ -260,9 +175,9 @@ --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} --sourcePath${sourcePath}/software - --outputPath${outputPath}/relation - --datasourceOrganizationPath${workingDir}/preparedInfo/datasourceOrganization - --alreadyLinkedPath${workingDir}/preparedInfo/alreadyLinked + --outputPath${workingDir}/affiliationInstRepo/software/relation + --datasourceOrganizationPath${workingDir}/affiliationInstRepo/preparedInfo/datasourceOrganization + 
--alreadyLinkedPath${workingDir}/affiliationInstRepo/preparedInfo/alreadyLinked --hive_metastore_uris${hive_metastore_uris} --resultTableNameeu.dnetlib.dhp.schema.oaf.Software @@ -270,7 +185,32 @@ - + + + + + yarn + cluster + append new relations + eu.dnetlib.dhp.resulttoorganizationfrominstrepo.AppendNewRelations + dhp-enrichment-${projectVersion}.jar + + --executor-cores=${sparkExecutorCores} + --executor-memory=${sparkExecutorMemory} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.dynamicAllocation.enabled=true + --conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors} + + --outputPath${sourcePath}/relation + --sourcePath${workingDir}/affiliationInstRepo/ + + + + diff --git a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java index 517a20cd96..db917658a6 100644 --- a/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java +++ b/dhp-workflows/dhp-enrichment/src/test/java/eu/dnetlib/dhp/entitytoorganizationfromsemrel/SparkJobTest.java @@ -114,7 +114,7 @@ public class SparkJobTest { .option("compression", "gzip") .json(workingDir.toString() + "/projectInput"); - SparkResultToOrganizationFromSemRel + SparkEntityToOrganizationFromSemRel .main( new String[] { @@ -395,7 +395,7 @@ public class SparkJobTest { .option("compression", "gzip") .json(workingDir.toString() + "/projectInput"); - SparkResultToOrganizationFromSemRel + SparkEntityToOrganizationFromSemRel .main( new String[] { @@ -678,7 +678,7 @@ public class SparkJobTest { .option("compression", "gzip") 
.json(workingDir.toString() + "/projectInput"); - SparkResultToOrganizationFromSemRel + SparkEntityToOrganizationFromSemRel .main( new String[] { diff --git a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hostedbymap/oozie_app/download.sh b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hostedbymap/oozie_app/download.sh index 35220bd8c9..9877fe7de5 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hostedbymap/oozie_app/download.sh +++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/hostedbymap/oozie_app/download.sh @@ -1,3 +1,3 @@ #!/bin/bash curl -LSs $1 | hdfs dfs -put - $2/$3 -curl -LSs http://api.crossref.org/works/10.1099/jgv.0.001453 > prova.txt \ No newline at end of file +#curl -LSs http://api.crossref.org/works/10.1099/jgv.0.001453 > prova.txt \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java index e07ba1b4ea..e728830554 100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/IndexRecordTransformerTest.java @@ -82,7 +82,7 @@ public class IndexRecordTransformerTest { void testPeerReviewed() throws IOException, TransformerException { final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, - XmlConverterJob.schemaLocation); + XmlConverterJob.schemaLocation); final Publication p = load("publication.json", Publication.class); diff --git a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJobTest.java b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJobTest.java index b62acbac34..a3a140cf64 
100644 --- a/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJobTest.java +++ b/dhp-workflows/dhp-graph-provision/src/test/java/eu/dnetlib/dhp/oa/provision/XmlIndexingJobTest.java @@ -1,6 +1,8 @@ package eu.dnetlib.dhp.oa.provision; +import static org.junit.jupiter.api.Assertions.assertEquals; + import java.io.IOException; import java.io.StringReader; import java.net.URI; @@ -32,8 +34,6 @@ import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; -import static org.junit.jupiter.api.Assertions.assertEquals; - @ExtendWith(MockitoExtension.class) public class XmlIndexingJobTest extends SolrTest { @@ -110,34 +110,33 @@ public class XmlIndexingJobTest extends SolrTest { QueryResponse rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "*:*")); assertEquals( - nRecord, rsp.getResults().getNumFound(), - "the number of indexed records should be equal to the number of input records"); - + nRecord, rsp.getResults().getNumFound(), + "the number of indexed records should be equal to the number of input records"); rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "isgreen:true")); assertEquals( - 0, rsp.getResults().getNumFound(), - "the number of indexed records having isgreen = true"); + 0, rsp.getResults().getNumFound(), + "the number of indexed records having isgreen = true"); rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "openaccesscolor:bronze")); assertEquals( - 0, rsp.getResults().getNumFound(), - "the number of indexed records having openaccesscolor = bronze"); + 0, rsp.getResults().getNumFound(), + "the number of indexed records having openaccesscolor = bronze"); rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "isindiamondjournal:true")); assertEquals( - 0, rsp.getResults().getNumFound(), - "the number of 
indexed records having isindiamondjournal = true"); + 0, rsp.getResults().getNumFound(), + "the number of indexed records having isindiamondjournal = true"); rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "publiclyfunded:true")); assertEquals( - 0, rsp.getResults().getNumFound(), - "the number of indexed records having publiclyfunded = true"); + 0, rsp.getResults().getNumFound(), + "the number of indexed records having publiclyfunded = true"); rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "peerreviewed:true")); assertEquals( - 0, rsp.getResults().getNumFound(), - "the number of indexed records having peerreviewed = true"); + 0, rsp.getResults().getNumFound(), + "the number of indexed records having peerreviewed = true"); } @Test diff --git a/pom.xml b/pom.xml index 3fd351c1db..6ef320253e 100644 --- a/pom.xml +++ b/pom.xml @@ -931,5 +931,25 @@ --> + + + + arm-silicon-mac + + + aarch64 + mac + + + + + + org.xerial.snappy + snappy-java + 1.1.8.4 + + + + \ No newline at end of file