forked from antonis.lempesis/dnet-hadoop
Merge branch 'beta' into resource_types
This commit is contained in:
commit
cb9e739484
|
@ -14,9 +14,9 @@ import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
|
public abstract class AbstractClusteringFunction extends AbstractPaceFunctions implements ClusteringFunction {
|
||||||
|
|
||||||
protected Map<String, Integer> params;
|
protected Map<String, Object> params;
|
||||||
|
|
||||||
public AbstractClusteringFunction(final Map<String, Integer> params) {
|
public AbstractClusteringFunction(final Map<String, Object> params) {
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -27,7 +27,7 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i
|
||||||
return fields
|
return fields
|
||||||
.stream()
|
.stream()
|
||||||
.filter(f -> !f.isEmpty())
|
.filter(f -> !f.isEmpty())
|
||||||
.map(this::normalize)
|
.map(s -> normalize(s))
|
||||||
.map(s -> filterAllStopWords(s))
|
.map(s -> filterAllStopWords(s))
|
||||||
.map(s -> doApply(conf, s))
|
.map(s -> doApply(conf, s))
|
||||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||||
|
@ -36,11 +36,24 @@ public abstract class AbstractClusteringFunction extends AbstractPaceFunctions i
|
||||||
.collect(Collectors.toCollection(HashSet::new));
|
.collect(Collectors.toCollection(HashSet::new));
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, Integer> getParams() {
|
public Map<String, Object> getParams() {
|
||||||
return params;
|
return params;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Integer param(String name) {
|
protected Integer param(String name) {
|
||||||
return params.get(name);
|
Object val = params.get(name);
|
||||||
|
if (val == null)
|
||||||
|
return null;
|
||||||
|
if (val instanceof Number) {
|
||||||
|
return ((Number) val).intValue();
|
||||||
|
}
|
||||||
|
return Integer.parseInt(val.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
protected int paramOrDefault(String name, int i) {
|
||||||
|
Integer res = param(name);
|
||||||
|
if (res == null)
|
||||||
|
res = i;
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,7 +13,7 @@ import eu.dnetlib.pace.config.Config;
|
||||||
@ClusteringClass("acronyms")
|
@ClusteringClass("acronyms")
|
||||||
public class Acronyms extends AbstractClusteringFunction {
|
public class Acronyms extends AbstractClusteringFunction {
|
||||||
|
|
||||||
public Acronyms(Map<String, Integer> params) {
|
public Acronyms(Map<String, Object> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -11,6 +11,6 @@ public interface ClusteringFunction {
|
||||||
|
|
||||||
public Collection<String> apply(Config config, List<String> fields);
|
public Collection<String> apply(Config config, List<String> fields);
|
||||||
|
|
||||||
public Map<String, Integer> getParams();
|
public Map<String, Object> getParams();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -12,7 +12,7 @@ import eu.dnetlib.pace.config.Config;
|
||||||
@ClusteringClass("immutablefieldvalue")
|
@ClusteringClass("immutablefieldvalue")
|
||||||
public class ImmutableFieldValue extends AbstractClusteringFunction {
|
public class ImmutableFieldValue extends AbstractClusteringFunction {
|
||||||
|
|
||||||
public ImmutableFieldValue(final Map<String, Integer> params) {
|
public ImmutableFieldValue(final Map<String, Object> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,69 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
|
import java.util.Collection;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import com.jayway.jsonpath.Configuration;
|
||||||
|
import com.jayway.jsonpath.DocumentContext;
|
||||||
|
import com.jayway.jsonpath.JsonPath;
|
||||||
|
import com.jayway.jsonpath.Option;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
|
|
||||||
|
@ClusteringClass("jsonlistclustering")
|
||||||
|
public class JSONListClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
||||||
|
|
||||||
|
private Map<String, Object> params;
|
||||||
|
|
||||||
|
public JSONListClustering(Map<String, Object> params) {
|
||||||
|
this.params = params;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Map<String, Object> getParams() {
|
||||||
|
return params;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Collection<String> apply(Config conf, List<String> fields) {
|
||||||
|
return fields
|
||||||
|
.stream()
|
||||||
|
.filter(f -> !f.isEmpty())
|
||||||
|
.map(s -> doApply(conf, s))
|
||||||
|
.filter(StringUtils::isNotBlank)
|
||||||
|
.collect(Collectors.toCollection(HashSet::new));
|
||||||
|
}
|
||||||
|
|
||||||
|
private String doApply(Config conf, String json) {
|
||||||
|
StringBuilder st = new StringBuilder(); // to build the string used for comparisons basing on the jpath into
|
||||||
|
// parameters
|
||||||
|
final DocumentContext documentContext = JsonPath
|
||||||
|
.using(Configuration.defaultConfiguration().addOptions(Option.SUPPRESS_EXCEPTIONS))
|
||||||
|
.parse(json);
|
||||||
|
|
||||||
|
// for each path in the param list
|
||||||
|
for (String key : params.keySet().stream().filter(k -> k.contains("jpath")).collect(Collectors.toList())) {
|
||||||
|
String path = params.get(key).toString();
|
||||||
|
String value = MapDocumentUtil.getJPathString(path, documentContext);
|
||||||
|
if (value == null || value.isEmpty())
|
||||||
|
value = "";
|
||||||
|
st.append(value);
|
||||||
|
st.append(" ");
|
||||||
|
}
|
||||||
|
|
||||||
|
st.setLength(st.length() - 1);
|
||||||
|
|
||||||
|
if (StringUtils.isBlank(st)) {
|
||||||
|
return "1";
|
||||||
|
}
|
||||||
|
return st.toString();
|
||||||
|
}
|
||||||
|
}
|
|
@ -11,7 +11,7 @@ import eu.dnetlib.pace.config.Config;
|
||||||
@ClusteringClass("keywordsclustering")
|
@ClusteringClass("keywordsclustering")
|
||||||
public class KeywordsClustering extends AbstractClusteringFunction {
|
public class KeywordsClustering extends AbstractClusteringFunction {
|
||||||
|
|
||||||
public KeywordsClustering(Map<String, Integer> params) {
|
public KeywordsClustering(Map<String, Object> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -19,8 +19,8 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
||||||
protected Collection<String> doApply(final Config conf, String s) {
|
protected Collection<String> doApply(final Config conf, String s) {
|
||||||
|
|
||||||
// takes city codes and keywords codes without duplicates
|
// takes city codes and keywords codes without duplicates
|
||||||
Set<String> keywords = getKeywords(s, conf.translationMap(), params.getOrDefault("windowSize", 4));
|
Set<String> keywords = getKeywords(s, conf.translationMap(), paramOrDefault("windowSize", 4));
|
||||||
Set<String> cities = getCities(s, params.getOrDefault("windowSize", 4));
|
Set<String> cities = getCities(s, paramOrDefault("windowSize", 4));
|
||||||
|
|
||||||
// list of combination to return as result
|
// list of combination to return as result
|
||||||
final Collection<String> combinations = new LinkedHashSet<String>();
|
final Collection<String> combinations = new LinkedHashSet<String>();
|
||||||
|
@ -28,7 +28,7 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
||||||
for (String keyword : keywordsToCodes(keywords, conf.translationMap())) {
|
for (String keyword : keywordsToCodes(keywords, conf.translationMap())) {
|
||||||
for (String city : citiesToCodes(cities)) {
|
for (String city : citiesToCodes(cities)) {
|
||||||
combinations.add(keyword + "-" + city);
|
combinations.add(keyword + "-" + city);
|
||||||
if (combinations.size() >= params.getOrDefault("max", 2)) {
|
if (combinations.size() >= paramOrDefault("max", 2)) {
|
||||||
return combinations;
|
return combinations;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -42,8 +42,8 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
||||||
return fields
|
return fields
|
||||||
.stream()
|
.stream()
|
||||||
.filter(f -> !f.isEmpty())
|
.filter(f -> !f.isEmpty())
|
||||||
.map(this::cleanup)
|
.map(KeywordsClustering::cleanup)
|
||||||
.map(this::normalize)
|
.map(KeywordsClustering::normalize)
|
||||||
.map(s -> filterAllStopWords(s))
|
.map(s -> filterAllStopWords(s))
|
||||||
.map(s -> doApply(conf, s))
|
.map(s -> doApply(conf, s))
|
||||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||||
|
|
|
@ -16,7 +16,7 @@ public class LastNameFirstInitial extends AbstractClusteringFunction {
|
||||||
|
|
||||||
private boolean DEFAULT_AGGRESSIVE = true;
|
private boolean DEFAULT_AGGRESSIVE = true;
|
||||||
|
|
||||||
public LastNameFirstInitial(final Map<String, Integer> params) {
|
public LastNameFirstInitial(final Map<String, Object> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -25,7 +25,7 @@ public class LastNameFirstInitial extends AbstractClusteringFunction {
|
||||||
return fields
|
return fields
|
||||||
.stream()
|
.stream()
|
||||||
.filter(f -> !f.isEmpty())
|
.filter(f -> !f.isEmpty())
|
||||||
.map(this::normalize)
|
.map(LastNameFirstInitial::normalize)
|
||||||
.map(s -> doApply(conf, s))
|
.map(s -> doApply(conf, s))
|
||||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||||
.flatMap(c -> c.stream())
|
.flatMap(c -> c.stream())
|
||||||
|
@ -33,8 +33,7 @@ public class LastNameFirstInitial extends AbstractClusteringFunction {
|
||||||
.collect(Collectors.toCollection(HashSet::new));
|
.collect(Collectors.toCollection(HashSet::new));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
public static String normalize(final String s) {
|
||||||
protected String normalize(final String s) {
|
|
||||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
||||||
// strings
|
// strings
|
||||||
|
|
|
@ -15,7 +15,7 @@ import eu.dnetlib.pace.config.Config;
|
||||||
@ClusteringClass("lowercase")
|
@ClusteringClass("lowercase")
|
||||||
public class LowercaseClustering extends AbstractClusteringFunction {
|
public class LowercaseClustering extends AbstractClusteringFunction {
|
||||||
|
|
||||||
public LowercaseClustering(final Map<String, Integer> params) {
|
public LowercaseClustering(final Map<String, Object> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -12,11 +12,11 @@ import eu.dnetlib.pace.config.Config;
|
||||||
@ClusteringClass("ngrampairs")
|
@ClusteringClass("ngrampairs")
|
||||||
public class NgramPairs extends Ngrams {
|
public class NgramPairs extends Ngrams {
|
||||||
|
|
||||||
public NgramPairs(Map<String, Integer> params) {
|
public NgramPairs(Map<String, Object> params) {
|
||||||
super(params, false);
|
super(params, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
public NgramPairs(Map<String, Integer> params, boolean sorted) {
|
public NgramPairs(Map<String, Object> params, boolean sorted) {
|
||||||
super(params, sorted);
|
super(params, sorted);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -10,11 +10,11 @@ public class Ngrams extends AbstractClusteringFunction {
|
||||||
|
|
||||||
private final boolean sorted;
|
private final boolean sorted;
|
||||||
|
|
||||||
public Ngrams(Map<String, Integer> params) {
|
public Ngrams(Map<String, Object> params) {
|
||||||
this(params, false);
|
this(params, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Ngrams(Map<String, Integer> params, boolean sorted) {
|
public Ngrams(Map<String, Object> params, boolean sorted) {
|
||||||
super(params);
|
super(params);
|
||||||
this.sorted = sorted;
|
this.sorted = sorted;
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,113 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.StreamSupport;
|
||||||
|
|
||||||
|
import com.google.common.base.Splitter;
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
|
@ClusteringClass("numAuthorsTitleSuffixPrefixChain")
|
||||||
|
public class NumAuthorsTitleSuffixPrefixChain extends AbstractClusteringFunction {
|
||||||
|
|
||||||
|
public NumAuthorsTitleSuffixPrefixChain(Map<String, Object> params) {
|
||||||
|
super(params);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Collection<String> apply(Config conf, List<String> fields) {
|
||||||
|
|
||||||
|
try {
|
||||||
|
int num_authors = Math.min(Integer.parseInt(fields.get(0)), 21); // SIZE threshold is 20, +1
|
||||||
|
|
||||||
|
if (num_authors > 0) {
|
||||||
|
return super.apply(conf, fields.subList(1, fields.size()))
|
||||||
|
.stream()
|
||||||
|
.map(s -> num_authors + "-" + s)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
// missing or null authors array
|
||||||
|
}
|
||||||
|
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected Collection<String> doApply(Config conf, String s) {
|
||||||
|
return suffixPrefixChain(cleanup(s), param("mod"));
|
||||||
|
}
|
||||||
|
|
||||||
|
private Collection<String> suffixPrefixChain(String s, int mod) {
|
||||||
|
// create the list of words from the string (remove short words)
|
||||||
|
List<String> wordsList = Arrays
|
||||||
|
.stream(s.split(" "))
|
||||||
|
.filter(si -> si.length() > 3)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
final int words = wordsList.size();
|
||||||
|
final int letters = s.length();
|
||||||
|
|
||||||
|
// create the prefix: number of words + number of letters/mod
|
||||||
|
String prefix = words / mod + "-";
|
||||||
|
|
||||||
|
return doSuffixPrefixChain(wordsList, prefix);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private Collection<String> doSuffixPrefixChain(List<String> wordsList, String prefix) {
|
||||||
|
|
||||||
|
Set<String> set = Sets.newLinkedHashSet();
|
||||||
|
switch (wordsList.size()) {
|
||||||
|
case 0:
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
set.add(wordsList.get(0));
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
set
|
||||||
|
.add(
|
||||||
|
prefix +
|
||||||
|
suffix(wordsList.get(0), 3) +
|
||||||
|
prefix(wordsList.get(1), 3));
|
||||||
|
|
||||||
|
set
|
||||||
|
.add(
|
||||||
|
prefix +
|
||||||
|
prefix(wordsList.get(0), 3) +
|
||||||
|
suffix(wordsList.get(1), 3));
|
||||||
|
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
set
|
||||||
|
.add(
|
||||||
|
prefix +
|
||||||
|
suffix(wordsList.get(0), 3) +
|
||||||
|
prefix(wordsList.get(1), 3) +
|
||||||
|
suffix(wordsList.get(2), 3));
|
||||||
|
|
||||||
|
set
|
||||||
|
.add(
|
||||||
|
prefix +
|
||||||
|
prefix(wordsList.get(0), 3) +
|
||||||
|
suffix(wordsList.get(1), 3) +
|
||||||
|
prefix(wordsList.get(2), 3));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
return set;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private String suffix(String s, int len) {
|
||||||
|
return s.substring(s.length() - len);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String prefix(String s, int len) {
|
||||||
|
return s.substring(0, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -17,11 +17,11 @@ import eu.dnetlib.pace.model.Person;
|
||||||
@ClusteringClass("personClustering")
|
@ClusteringClass("personClustering")
|
||||||
public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
public class PersonClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
||||||
|
|
||||||
private Map<String, Integer> params;
|
private Map<String, Object> params;
|
||||||
|
|
||||||
private static final int MAX_TOKENS = 5;
|
private static final int MAX_TOKENS = 5;
|
||||||
|
|
||||||
public PersonClustering(final Map<String, Integer> params) {
|
public PersonClustering(final Map<String, Object> params) {
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -77,7 +77,7 @@ public class PersonClustering extends AbstractPaceFunctions implements Clusterin
|
||||||
// }
|
// }
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Map<String, Integer> getParams() {
|
public Map<String, Object> getParams() {
|
||||||
return params;
|
return params;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,7 @@ public class PersonHash extends AbstractClusteringFunction {
|
||||||
|
|
||||||
private boolean DEFAULT_AGGRESSIVE = false;
|
private boolean DEFAULT_AGGRESSIVE = false;
|
||||||
|
|
||||||
public PersonHash(final Map<String, Integer> params) {
|
public PersonHash(final Map<String, Object> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -8,7 +8,7 @@ import eu.dnetlib.pace.config.Config;
|
||||||
|
|
||||||
public class RandomClusteringFunction extends AbstractClusteringFunction {
|
public class RandomClusteringFunction extends AbstractClusteringFunction {
|
||||||
|
|
||||||
public RandomClusteringFunction(Map<String, Integer> params) {
|
public RandomClusteringFunction(Map<String, Object> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,10 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.clustering;
|
package eu.dnetlib.pace.clustering;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.Collection;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
import com.google.common.base.Joiner;
|
import com.google.common.base.Joiner;
|
||||||
import com.google.common.base.Splitter;
|
import com.google.common.base.Splitter;
|
||||||
|
@ -12,7 +15,7 @@ import eu.dnetlib.pace.config.Config;
|
||||||
@ClusteringClass("sortedngrampairs")
|
@ClusteringClass("sortedngrampairs")
|
||||||
public class SortedNgramPairs extends NgramPairs {
|
public class SortedNgramPairs extends NgramPairs {
|
||||||
|
|
||||||
public SortedNgramPairs(Map<String, Integer> params) {
|
public SortedNgramPairs(Map<String, Object> params) {
|
||||||
super(params, false);
|
super(params, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,7 @@ import eu.dnetlib.pace.config.Config;
|
||||||
@ClusteringClass("spacetrimmingfieldvalue")
|
@ClusteringClass("spacetrimmingfieldvalue")
|
||||||
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
|
public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
|
||||||
|
|
||||||
public SpaceTrimmingFieldValue(final Map<String, Integer> params) {
|
public SpaceTrimmingFieldValue(final Map<String, Object> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -25,7 +25,7 @@ public class SpaceTrimmingFieldValue extends AbstractClusteringFunction {
|
||||||
|
|
||||||
res
|
res
|
||||||
.add(
|
.add(
|
||||||
StringUtils.isBlank(s) ? RandomStringUtils.random(getParams().get("randomLength"))
|
StringUtils.isBlank(s) ? RandomStringUtils.random(param("randomLength"))
|
||||||
: s.toLowerCase().replaceAll("\\s+", ""));
|
: s.toLowerCase().replaceAll("\\s+", ""));
|
||||||
|
|
||||||
return res;
|
return res;
|
||||||
|
|
|
@ -12,7 +12,7 @@ import eu.dnetlib.pace.config.Config;
|
||||||
@ClusteringClass("suffixprefix")
|
@ClusteringClass("suffixprefix")
|
||||||
public class SuffixPrefix extends AbstractClusteringFunction {
|
public class SuffixPrefix extends AbstractClusteringFunction {
|
||||||
|
|
||||||
public SuffixPrefix(Map<String, Integer> params) {
|
public SuffixPrefix(Map<String, Object> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -15,12 +15,17 @@ import eu.dnetlib.pace.config.Config;
|
||||||
@ClusteringClass("urlclustering")
|
@ClusteringClass("urlclustering")
|
||||||
public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
public class UrlClustering extends AbstractPaceFunctions implements ClusteringFunction {
|
||||||
|
|
||||||
protected Map<String, Integer> params;
|
protected Map<String, Object> params;
|
||||||
|
|
||||||
public UrlClustering(final Map<String, Integer> params) {
|
public UrlClustering(final Map<String, Object> params) {
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Map<String, Object> getParams() {
|
||||||
|
return params;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public Collection<String> apply(final Config conf, List<String> fields) {
|
public Collection<String> apply(final Config conf, List<String> fields) {
|
||||||
try {
|
try {
|
||||||
|
@ -35,11 +40,6 @@ public class UrlClustering extends AbstractPaceFunctions implements ClusteringFu
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
|
||||||
public Map<String, Integer> getParams() {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
private URL asUrl(String value) {
|
private URL asUrl(String value) {
|
||||||
try {
|
try {
|
||||||
return new URL(value);
|
return new URL(value);
|
||||||
|
|
|
@ -11,7 +11,7 @@ import eu.dnetlib.pace.config.Config;
|
||||||
@ClusteringClass("wordsStatsSuffixPrefixChain")
|
@ClusteringClass("wordsStatsSuffixPrefixChain")
|
||||||
public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction {
|
public class WordsStatsSuffixPrefixChain extends AbstractClusteringFunction {
|
||||||
|
|
||||||
public WordsStatsSuffixPrefixChain(Map<String, Integer> params) {
|
public WordsStatsSuffixPrefixChain(Map<String, Object> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -12,7 +12,7 @@ import eu.dnetlib.pace.config.Config;
|
||||||
@ClusteringClass("wordssuffixprefix")
|
@ClusteringClass("wordssuffixprefix")
|
||||||
public class WordsSuffixPrefix extends AbstractClusteringFunction {
|
public class WordsSuffixPrefix extends AbstractClusteringFunction {
|
||||||
|
|
||||||
public WordsSuffixPrefix(Map<String, Integer> params) {
|
public WordsSuffixPrefix(Map<String, Object> params) {
|
||||||
super(params);
|
super(params);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -16,7 +16,6 @@ import org.apache.commons.lang3.StringUtils;
|
||||||
import com.google.common.base.Joiner;
|
import com.google.common.base.Joiner;
|
||||||
import com.google.common.base.Splitter;
|
import com.google.common.base.Splitter;
|
||||||
import com.google.common.collect.Iterables;
|
import com.google.common.collect.Iterables;
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
import com.ibm.icu.text.Transliterator;
|
import com.ibm.icu.text.Transliterator;
|
||||||
|
|
||||||
|
@ -27,7 +26,7 @@ import eu.dnetlib.pace.clustering.NGramUtils;
|
||||||
*
|
*
|
||||||
* @author claudio
|
* @author claudio
|
||||||
*/
|
*/
|
||||||
public abstract class AbstractPaceFunctions {
|
public class AbstractPaceFunctions {
|
||||||
|
|
||||||
// city map to be used when translating the city names into codes
|
// city map to be used when translating the city names into codes
|
||||||
private static Map<String, String> cityMap = AbstractPaceFunctions
|
private static Map<String, String> cityMap = AbstractPaceFunctions
|
||||||
|
@ -62,11 +61,14 @@ public abstract class AbstractPaceFunctions {
|
||||||
|
|
||||||
private static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
|
private static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
|
||||||
|
|
||||||
protected String concat(final List<String> l) {
|
private static Pattern romanNumberPattern = Pattern
|
||||||
|
.compile("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$");
|
||||||
|
|
||||||
|
protected static String concat(final List<String> l) {
|
||||||
return Joiner.on(" ").skipNulls().join(l);
|
return Joiner.on(" ").skipNulls().join(l);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String cleanup(final String s) {
|
public static String cleanup(final String s) {
|
||||||
final String s1 = HTML_REGEX.matcher(s).replaceAll("");
|
final String s1 = HTML_REGEX.matcher(s).replaceAll("");
|
||||||
final String s2 = unicodeNormalization(s1.toLowerCase());
|
final String s2 = unicodeNormalization(s1.toLowerCase());
|
||||||
final String s3 = nfd(s2);
|
final String s3 = nfd(s2);
|
||||||
|
@ -82,7 +84,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
return s12;
|
return s12;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String fixXML(final String a) {
|
protected static String fixXML(final String a) {
|
||||||
|
|
||||||
return a
|
return a
|
||||||
.replaceAll("–", " ")
|
.replaceAll("–", " ")
|
||||||
|
@ -91,7 +93,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
.replaceAll("−", " ");
|
.replaceAll("−", " ");
|
||||||
}
|
}
|
||||||
|
|
||||||
protected boolean checkNumbers(final String a, final String b) {
|
protected static boolean checkNumbers(final String a, final String b) {
|
||||||
final String numbersA = getNumbers(a);
|
final String numbersA = getNumbers(a);
|
||||||
final String numbersB = getNumbers(b);
|
final String numbersB = getNumbers(b);
|
||||||
final String romansA = getRomans(a);
|
final String romansA = getRomans(a);
|
||||||
|
@ -99,7 +101,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
return !numbersA.equals(numbersB) || !romansA.equals(romansB);
|
return !numbersA.equals(numbersB) || !romansA.equals(romansB);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getRomans(final String s) {
|
protected static String getRomans(final String s) {
|
||||||
final StringBuilder sb = new StringBuilder();
|
final StringBuilder sb = new StringBuilder();
|
||||||
for (final String t : s.split(" ")) {
|
for (final String t : s.split(" ")) {
|
||||||
sb.append(isRoman(t) ? t : "");
|
sb.append(isRoman(t) ? t : "");
|
||||||
|
@ -107,13 +109,12 @@ public abstract class AbstractPaceFunctions {
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected boolean isRoman(final String s) {
|
protected static boolean isRoman(final String s) {
|
||||||
return s
|
Matcher m = romanNumberPattern.matcher(s);
|
||||||
.replaceAll("^M{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$", "qwertyuiop")
|
return m.matches() && m.hitEnd();
|
||||||
.equals("qwertyuiop");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String getNumbers(final String s) {
|
protected static String getNumbers(final String s) {
|
||||||
final StringBuilder sb = new StringBuilder();
|
final StringBuilder sb = new StringBuilder();
|
||||||
for (final String t : s.split(" ")) {
|
for (final String t : s.split(" ")) {
|
||||||
sb.append(isNumber(t) ? t : "");
|
sb.append(isNumber(t) ? t : "");
|
||||||
|
@ -121,7 +122,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isNumber(String strNum) {
|
public static boolean isNumber(String strNum) {
|
||||||
if (strNum == null) {
|
if (strNum == null) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -147,7 +148,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String removeSymbols(final String s) {
|
protected static String removeSymbols(final String s) {
|
||||||
final StringBuilder sb = new StringBuilder();
|
final StringBuilder sb = new StringBuilder();
|
||||||
|
|
||||||
s.chars().forEach(ch -> {
|
s.chars().forEach(ch -> {
|
||||||
|
@ -157,11 +158,11 @@ public abstract class AbstractPaceFunctions {
|
||||||
return sb.toString().replaceAll("\\s+", " ");
|
return sb.toString().replaceAll("\\s+", " ");
|
||||||
}
|
}
|
||||||
|
|
||||||
protected boolean notNull(final String s) {
|
protected static boolean notNull(final String s) {
|
||||||
return s != null;
|
return s != null;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String normalize(final String s) {
|
public static String normalize(final String s) {
|
||||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||||
.toLowerCase()
|
.toLowerCase()
|
||||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
||||||
|
@ -174,16 +175,16 @@ public abstract class AbstractPaceFunctions {
|
||||||
.trim();
|
.trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String nfd(final String s) {
|
public static String nfd(final String s) {
|
||||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String utf8(final String s) {
|
public static String utf8(final String s) {
|
||||||
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
|
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
|
||||||
return new String(bytes, StandardCharsets.UTF_8);
|
return new String(bytes, StandardCharsets.UTF_8);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String unicodeNormalization(final String s) {
|
public static String unicodeNormalization(final String s) {
|
||||||
|
|
||||||
Matcher m = hexUnicodePattern.matcher(s);
|
Matcher m = hexUnicodePattern.matcher(s);
|
||||||
StringBuffer buf = new StringBuffer(s.length());
|
StringBuffer buf = new StringBuffer(s.length());
|
||||||
|
@ -195,7 +196,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
return buf.toString();
|
return buf.toString();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String filterStopWords(final String s, final Set<String> stopwords) {
|
protected static String filterStopWords(final String s, final Set<String> stopwords) {
|
||||||
final StringTokenizer st = new StringTokenizer(s);
|
final StringTokenizer st = new StringTokenizer(s);
|
||||||
final StringBuilder sb = new StringBuilder();
|
final StringBuilder sb = new StringBuilder();
|
||||||
while (st.hasMoreTokens()) {
|
while (st.hasMoreTokens()) {
|
||||||
|
@ -208,7 +209,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
return sb.toString().trim();
|
return sb.toString().trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
public String filterAllStopWords(String s) {
|
public static String filterAllStopWords(String s) {
|
||||||
|
|
||||||
s = filterStopWords(s, stopwords_en);
|
s = filterStopWords(s, stopwords_en);
|
||||||
s = filterStopWords(s, stopwords_de);
|
s = filterStopWords(s, stopwords_de);
|
||||||
|
@ -221,7 +222,8 @@ public abstract class AbstractPaceFunctions {
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Collection<String> filterBlacklisted(final Collection<String> set, final Set<String> ngramBlacklist) {
|
protected static Collection<String> filterBlacklisted(final Collection<String> set,
|
||||||
|
final Set<String> ngramBlacklist) {
|
||||||
final Set<String> newset = Sets.newLinkedHashSet();
|
final Set<String> newset = Sets.newLinkedHashSet();
|
||||||
for (final String s : set) {
|
for (final String s : set) {
|
||||||
if (!ngramBlacklist.contains(s)) {
|
if (!ngramBlacklist.contains(s)) {
|
||||||
|
@ -268,7 +270,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
return m;
|
return m;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String removeKeywords(String s, Set<String> keywords) {
|
public static String removeKeywords(String s, Set<String> keywords) {
|
||||||
|
|
||||||
s = " " + s + " ";
|
s = " " + s + " ";
|
||||||
for (String k : keywords) {
|
for (String k : keywords) {
|
||||||
|
@ -278,39 +280,39 @@ public abstract class AbstractPaceFunctions {
|
||||||
return s.trim();
|
return s.trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
public double commonElementsPercentage(Set<String> s1, Set<String> s2) {
|
public static double commonElementsPercentage(Set<String> s1, Set<String> s2) {
|
||||||
|
|
||||||
double longer = Math.max(s1.size(), s2.size());
|
double longer = Math.max(s1.size(), s2.size());
|
||||||
return (double) s1.stream().filter(s2::contains).count() / longer;
|
return (double) s1.stream().filter(s2::contains).count() / longer;
|
||||||
}
|
}
|
||||||
|
|
||||||
// convert the set of keywords to codes
|
// convert the set of keywords to codes
|
||||||
public Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
|
public static Set<String> toCodes(Set<String> keywords, Map<String, String> translationMap) {
|
||||||
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
|
return keywords.stream().map(s -> translationMap.get(s)).collect(Collectors.toSet());
|
||||||
}
|
}
|
||||||
|
|
||||||
public Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) {
|
public static Set<String> keywordsToCodes(Set<String> keywords, Map<String, String> translationMap) {
|
||||||
return toCodes(keywords, translationMap);
|
return toCodes(keywords, translationMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Set<String> citiesToCodes(Set<String> keywords) {
|
public static Set<String> citiesToCodes(Set<String> keywords) {
|
||||||
return toCodes(keywords, cityMap);
|
return toCodes(keywords, cityMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
protected String firstLC(final String s) {
|
protected static String firstLC(final String s) {
|
||||||
return StringUtils.substring(s, 0, 1).toLowerCase();
|
return StringUtils.substring(s, 0, 1).toLowerCase();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected Iterable<String> tokens(final String s, final int maxTokens) {
|
protected static Iterable<String> tokens(final String s, final int maxTokens) {
|
||||||
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String normalizePid(String pid) {
|
public static String normalizePid(String pid) {
|
||||||
return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll("");
|
return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll("");
|
||||||
}
|
}
|
||||||
|
|
||||||
// get the list of keywords into the input string
|
// get the list of keywords into the input string
|
||||||
public Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize) {
|
public static Set<String> getKeywords(String s1, Map<String, String> translationMap, int windowSize) {
|
||||||
|
|
||||||
String s = s1;
|
String s = s1;
|
||||||
|
|
||||||
|
@ -340,7 +342,7 @@ public abstract class AbstractPaceFunctions {
|
||||||
return codes;
|
return codes;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Set<String> getCities(String s1, int windowSize) {
|
public static Set<String> getCities(String s1, int windowSize) {
|
||||||
return getKeywords(s1, cityMap, windowSize);
|
return getKeywords(s1, cityMap, windowSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -18,7 +18,7 @@ public class ClusteringDef implements Serializable {
|
||||||
|
|
||||||
private List<String> fields;
|
private List<String> fields;
|
||||||
|
|
||||||
private Map<String, Integer> params;
|
private Map<String, Object> params;
|
||||||
|
|
||||||
public ClusteringDef() {
|
public ClusteringDef() {
|
||||||
}
|
}
|
||||||
|
@ -43,11 +43,11 @@ public class ClusteringDef implements Serializable {
|
||||||
this.fields = fields;
|
this.fields = fields;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, Integer> getParams() {
|
public Map<String, Object> getParams() {
|
||||||
return params;
|
return params;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setParams(final Map<String, Integer> params) {
|
public void setParams(final Map<String, Object> params) {
|
||||||
this.params = params;
|
this.params = params;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@
|
||||||
package eu.dnetlib.pace.model;
|
package eu.dnetlib.pace.model;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||||
|
@ -36,6 +37,16 @@ public class FieldDef implements Serializable {
|
||||||
*/
|
*/
|
||||||
private int length = -1;
|
private int length = -1;
|
||||||
|
|
||||||
|
private HashSet<String> filter;
|
||||||
|
|
||||||
|
private boolean sorted;
|
||||||
|
|
||||||
|
public boolean isSorted() {
|
||||||
|
return sorted;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String clean;
|
||||||
|
|
||||||
public FieldDef() {
|
public FieldDef() {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -91,6 +102,30 @@ public class FieldDef implements Serializable {
|
||||||
this.path = path;
|
this.path = path;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public HashSet<String> getFilter() {
|
||||||
|
return filter;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setFilter(HashSet<String> filter) {
|
||||||
|
this.filter = filter;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean getSorted() {
|
||||||
|
return sorted;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSorted(boolean sorted) {
|
||||||
|
this.sorted = sorted;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getClean() {
|
||||||
|
return clean;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setClean(String clean) {
|
||||||
|
this.clean = clean;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -5,9 +5,9 @@ import eu.dnetlib.pace.util.{BlockProcessor, SparkReporter}
|
||||||
import org.apache.spark.SparkContext
|
import org.apache.spark.SparkContext
|
||||||
import org.apache.spark.sql.catalyst.expressions.Literal
|
import org.apache.spark.sql.catalyst.expressions.Literal
|
||||||
import org.apache.spark.sql.expressions._
|
import org.apache.spark.sql.expressions._
|
||||||
import org.apache.spark.sql.functions.{col, lit, udf}
|
import org.apache.spark.sql.functions.{col, desc, expr, lit, udf}
|
||||||
import org.apache.spark.sql.types._
|
import org.apache.spark.sql.types._
|
||||||
import org.apache.spark.sql.{Column, Dataset, Row, functions}
|
import org.apache.spark.sql.{Column, Dataset, Row, SaveMode, functions}
|
||||||
|
|
||||||
import java.util.function.Predicate
|
import java.util.function.Predicate
|
||||||
import java.util.stream.Collectors
|
import java.util.stream.Collectors
|
||||||
|
@ -80,6 +80,8 @@ case class SparkDeduper(conf: DedupConfig) extends Serializable {
|
||||||
.withColumn("key", functions.explode(clusterValuesUDF(cd).apply(functions.array(inputColumns: _*))))
|
.withColumn("key", functions.explode(clusterValuesUDF(cd).apply(functions.array(inputColumns: _*))))
|
||||||
// Add position column having the position of the row within the set of rows having the same key value ordered by the sorting value
|
// Add position column having the position of the row within the set of rows having the same key value ordered by the sorting value
|
||||||
.withColumn("position", functions.row_number().over(Window.partitionBy("key").orderBy(col(model.orderingFieldName), col(model.identifierFieldName))))
|
.withColumn("position", functions.row_number().over(Window.partitionBy("key").orderBy(col(model.orderingFieldName), col(model.identifierFieldName))))
|
||||||
|
// .withColumn("count", functions.max("position").over(Window.partitionBy("key").orderBy(col(model.orderingFieldName), col(model.identifierFieldName)).rowsBetween(Window.unboundedPreceding,Window.unboundedFollowing) ))
|
||||||
|
// .filter("count > 1")
|
||||||
|
|
||||||
if (df_with_clustering_keys == null)
|
if (df_with_clustering_keys == null)
|
||||||
df_with_clustering_keys = ds
|
df_with_clustering_keys = ds
|
||||||
|
@ -88,20 +90,44 @@ case class SparkDeduper(conf: DedupConfig) extends Serializable {
|
||||||
}
|
}
|
||||||
|
|
||||||
//TODO: analytics
|
//TODO: analytics
|
||||||
|
/*df_with_clustering_keys.groupBy(col("clustering"), col("key"))
|
||||||
|
.agg(expr("max(count) AS size"))
|
||||||
|
.orderBy(desc("size"))
|
||||||
|
.show*/
|
||||||
|
|
||||||
val df_with_blocks = df_with_clustering_keys
|
val df_with_blocks = df_with_clustering_keys
|
||||||
// filter out rows with position exceeding the maxqueuesize parameter
|
// split the clustering block into smaller blocks of queuemaxsize
|
||||||
.filter(col("position").leq(conf.getWf.getQueueMaxSize))
|
.groupBy(col("clustering"), col("key"), functions.floor(col("position").divide(lit(conf.getWf.getQueueMaxSize))))
|
||||||
.groupBy("clustering", "key")
|
|
||||||
.agg(functions.collect_set(functions.struct(model.schema.fieldNames.map(col): _*)).as("block"))
|
.agg(functions.collect_set(functions.struct(model.schema.fieldNames.map(col): _*)).as("block"))
|
||||||
.filter(functions.size(new Column("block")).gt(1))
|
.filter(functions.size(new Column("block")).gt(1))
|
||||||
|
.union(
|
||||||
|
//adjacency blocks
|
||||||
|
df_with_clustering_keys
|
||||||
|
// filter out leading and trailing elements
|
||||||
|
.filter(col("position").gt(conf.getWf.getSlidingWindowSize/2))
|
||||||
|
//.filter(col("position").lt(col("count").minus(conf.getWf.getSlidingWindowSize/2)))
|
||||||
|
// create small blocks of records on "the border" of maxqueuesize: getSlidingWindowSize/2 elements before and after
|
||||||
|
.filter(
|
||||||
|
col("position").mod(conf.getWf.getQueueMaxSize).lt(conf.getWf.getSlidingWindowSize/2) // slice of the start of block
|
||||||
|
|| col("position").mod(conf.getWf.getQueueMaxSize).gt(conf.getWf.getQueueMaxSize - (conf.getWf.getSlidingWindowSize/2)) //slice of the end of the block
|
||||||
|
)
|
||||||
|
.groupBy(col("clustering"), col("key"), functions.floor((col("position") + lit(conf.getWf.getSlidingWindowSize/2)).divide(lit(conf.getWf.getQueueMaxSize))))
|
||||||
|
.agg(functions.collect_set(functions.struct(model.schema.fieldNames.map(col): _*)).as("block"))
|
||||||
|
.filter(functions.size(new Column("block")).gt(1))
|
||||||
|
)
|
||||||
|
|
||||||
df_with_blocks
|
df_with_blocks
|
||||||
}
|
}
|
||||||
|
|
||||||
def clusterValuesUDF(cd: ClusteringDef) = {
|
def clusterValuesUDF(cd: ClusteringDef) = {
|
||||||
udf[mutable.WrappedArray[String], mutable.WrappedArray[Any]](values => {
|
udf[mutable.WrappedArray[String], mutable.WrappedArray[Any]](values => {
|
||||||
values.flatMap(f => cd.clusteringFunction().apply(conf, Seq(f.toString).asJava).asScala)
|
val valueList = values.flatMap {
|
||||||
|
case a: mutable.WrappedArray[Any] => a.map(_.toString)
|
||||||
|
case s: Any => Seq(s.toString)
|
||||||
|
}.asJava;
|
||||||
|
|
||||||
|
mutable.WrappedArray.make(cd.clusteringFunction().apply(conf, valueList).toArray())
|
||||||
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,13 +1,16 @@
|
||||||
package eu.dnetlib.pace.model
|
package eu.dnetlib.pace.model
|
||||||
|
|
||||||
import com.jayway.jsonpath.{Configuration, JsonPath}
|
import com.jayway.jsonpath.{Configuration, JsonPath}
|
||||||
|
import eu.dnetlib.pace.common.AbstractPaceFunctions
|
||||||
import eu.dnetlib.pace.config.{DedupConfig, Type}
|
import eu.dnetlib.pace.config.{DedupConfig, Type}
|
||||||
import eu.dnetlib.pace.util.MapDocumentUtil
|
import eu.dnetlib.pace.util.MapDocumentUtil
|
||||||
|
import org.apache.commons.lang3.StringUtils
|
||||||
import org.apache.spark.sql.catalyst.encoders.RowEncoder
|
import org.apache.spark.sql.catalyst.encoders.RowEncoder
|
||||||
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
|
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
|
||||||
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
|
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
|
||||||
import org.apache.spark.sql.{Dataset, Row}
|
import org.apache.spark.sql.{Dataset, Row}
|
||||||
|
|
||||||
|
import java.util.Locale
|
||||||
import java.util.regex.Pattern
|
import java.util.regex.Pattern
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
|
@ -60,7 +63,7 @@ case class SparkModel(conf: DedupConfig) {
|
||||||
values(identityFieldPosition) = MapDocumentUtil.getJPathString(conf.getWf.getIdPath, documentContext)
|
values(identityFieldPosition) = MapDocumentUtil.getJPathString(conf.getWf.getIdPath, documentContext)
|
||||||
|
|
||||||
schema.fieldNames.zipWithIndex.foldLeft(values) {
|
schema.fieldNames.zipWithIndex.foldLeft(values) {
|
||||||
case ((res, (fname, index))) => {
|
case ((res, (fname, index))) =>
|
||||||
val fdef = conf.getPace.getModelMap.get(fname)
|
val fdef = conf.getPace.getModelMap.get(fname)
|
||||||
|
|
||||||
if (fdef != null) {
|
if (fdef != null) {
|
||||||
|
@ -96,13 +99,52 @@ case class SparkModel(conf: DedupConfig) {
|
||||||
case Type.DoubleArray =>
|
case Type.DoubleArray =>
|
||||||
MapDocumentUtil.getJPathArray(fdef.getPath, json)
|
MapDocumentUtil.getJPathArray(fdef.getPath, json)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
val filter = fdef.getFilter
|
||||||
|
|
||||||
|
if (StringUtils.isNotBlank(fdef.getClean)) {
|
||||||
|
res(index) = res(index) match {
|
||||||
|
case x: Seq[String] => x.map(clean(_, fdef.getClean)).toSeq
|
||||||
|
case _ => clean(res(index).toString, fdef.getClean)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (filter != null && !filter.isEmpty) {
|
||||||
|
res(index) = res(index) match {
|
||||||
|
case x: String if filter.contains(x.toLowerCase(Locale.ROOT)) => null
|
||||||
|
case x: Seq[String] => x.filter(s => !filter.contains(s.toLowerCase(Locale.ROOT))).toSeq
|
||||||
|
case _ => res(index)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (fdef.getSorted) {
|
||||||
|
res(index) = res(index) match {
|
||||||
|
case x: Seq[String] => x.sorted.toSeq
|
||||||
|
case _ => res(index)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
res
|
res
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
new GenericRowWithSchema(values, schema)
|
new GenericRowWithSchema(values, schema)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Applies the cleaning strategy named by `cleantype` to `value`.
  *
  * Only "title" is recognised, which delegates to `AbstractPaceFunctions.cleanup`;
  * any other strategy name leaves the value untouched.
  *
  * @param value     the raw field value
  * @param cleantype the name of the cleaning strategy
  * @return the cleaned value, or `value` itself for an unrecognised strategy
  */
def clean(value: String, cleantype: String): String =
  cleantype match {
    case "title" => AbstractPaceFunctions.cleanup(value)
    case _ => value
  }
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -23,7 +23,6 @@ public class AuthorsMatch extends AbstractListComparator {
|
||||||
private String MODE; // full or surname
|
private String MODE; // full or surname
|
||||||
private int SIZE_THRESHOLD;
|
private int SIZE_THRESHOLD;
|
||||||
private String TYPE; // count or percentage
|
private String TYPE; // count or percentage
|
||||||
private int common;
|
|
||||||
|
|
||||||
public AuthorsMatch(Map<String, String> params) {
|
public AuthorsMatch(Map<String, String> params) {
|
||||||
super(params, new com.wcohen.ss.JaroWinkler());
|
super(params, new com.wcohen.ss.JaroWinkler());
|
||||||
|
@ -35,7 +34,6 @@ public class AuthorsMatch extends AbstractListComparator {
|
||||||
FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
|
FULLNAME_THRESHOLD = Double.parseDouble(params.getOrDefault("fullname_th", "0.9"));
|
||||||
SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
|
SIZE_THRESHOLD = Integer.parseInt(params.getOrDefault("size_th", "20"));
|
||||||
TYPE = params.getOrDefault("type", "percentage");
|
TYPE = params.getOrDefault("type", "percentage");
|
||||||
common = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
protected AuthorsMatch(double w, AbstractStringDistance ssalgo) {
|
protected AuthorsMatch(double w, AbstractStringDistance ssalgo) {
|
||||||
|
@ -44,22 +42,27 @@ public class AuthorsMatch extends AbstractListComparator {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
public double compare(final List<String> a, final List<String> b, final Config conf) {
|
||||||
|
|
||||||
if (a.isEmpty() || b.isEmpty())
|
if (a.isEmpty() || b.isEmpty())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
|
if (a.size() > SIZE_THRESHOLD || b.size() > SIZE_THRESHOLD)
|
||||||
return 1.0;
|
return 1.0;
|
||||||
|
|
||||||
List<Person> aList = a.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
int maxMiss = Integer.MAX_VALUE;
|
||||||
List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
List<Person> bList = b.stream().map(author -> new Person(author, false)).collect(Collectors.toList());
|
||||||
|
|
||||||
common = 0;
|
Double threshold = getDoubleParam("threshold");
|
||||||
|
|
||||||
|
if (threshold != null && threshold >= 0.0 && threshold <= 1.0 && a.size() == b.size()) {
|
||||||
|
maxMiss = (int) Math.floor((1 - threshold) * Math.max(a.size(), b.size()));
|
||||||
|
}
|
||||||
|
|
||||||
|
int common = 0;
|
||||||
// compare each element of List1 with each element of List2
|
// compare each element of List1 with each element of List2
|
||||||
for (Person p1 : aList)
|
for (int i = 0; i < a.size(); i++) {
|
||||||
|
Person p1 = new Person(a.get(i), false);
|
||||||
|
|
||||||
for (Person p2 : bList) {
|
for (Person p2 : bList) {
|
||||||
|
|
||||||
// both persons are inaccurate
|
// both persons are inaccurate
|
||||||
if (!p1.isAccurate() && !p2.isAccurate()) {
|
if (!p1.isAccurate() && !p2.isAccurate()) {
|
||||||
// compare just normalized fullnames
|
// compare just normalized fullnames
|
||||||
|
@ -118,11 +121,15 @@ public class AuthorsMatch extends AbstractListComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (i - common > maxMiss) {
|
||||||
|
return 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// normalization factor to compute the score
|
// normalization factor to compute the score
|
||||||
int normFactor = aList.size() == bList.size() ? aList.size() : (aList.size() + bList.size() - common);
|
int normFactor = a.size() == b.size() ? a.size() : (a.size() + b.size() - common);
|
||||||
|
|
||||||
if (TYPE.equals("percentage")) {
|
if (TYPE.equals("percentage")) {
|
||||||
return (double) common / normFactor;
|
return (double) common / normFactor;
|
||||||
|
|
|
@ -25,6 +25,7 @@ public class InstanceTypeMatch extends AbstractListComparator {
|
||||||
translationMap.put("Conference object", "*");
|
translationMap.put("Conference object", "*");
|
||||||
translationMap.put("Other literature type", "*");
|
translationMap.put("Other literature type", "*");
|
||||||
translationMap.put("Unknown", "*");
|
translationMap.put("Unknown", "*");
|
||||||
|
translationMap.put("UNKNOWN", "*");
|
||||||
|
|
||||||
// article types
|
// article types
|
||||||
translationMap.put("Article", "Article");
|
translationMap.put("Article", "Article");
|
||||||
|
@ -76,5 +77,4 @@ public class InstanceTypeMatch extends AbstractListComparator {
|
||||||
protected double normalize(final double d) {
|
protected double normalize(final double d) {
|
||||||
return d;
|
return d;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,6 +3,7 @@ package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
|
||||||
|
@ -30,16 +31,25 @@ public class LevensteinTitle extends AbstractStringComparator {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public double distance(final String a, final String b, final Config conf) {
|
public double distance(final String ca, final String cb, final Config conf) {
|
||||||
final String ca = cleanup(a);
|
|
||||||
final String cb = cleanup(b);
|
|
||||||
|
|
||||||
final boolean check = checkNumbers(ca, cb);
|
final boolean check = checkNumbers(ca, cb);
|
||||||
|
|
||||||
if (check)
|
if (check)
|
||||||
return 0.5;
|
return 0.5;
|
||||||
|
|
||||||
return normalize(ssalgo.score(ca, cb), ca.length(), cb.length());
|
Double threshold = getDoubleParam("threshold");
|
||||||
|
|
||||||
|
// reduce Levenshtein algo complexity when target threshold is known
|
||||||
|
if (threshold != null && threshold >= 0.0 && threshold <= 1.0) {
|
||||||
|
int maxdistance = (int) Math.floor((1 - threshold) * Math.max(ca.length(), cb.length()));
|
||||||
|
int score = StringUtils.getLevenshteinDistance(ca, cb, maxdistance);
|
||||||
|
if (score == -1) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return normalize(score, ca.length(), cb.length());
|
||||||
|
} else {
|
||||||
|
return normalize(StringUtils.getLevenshteinDistance(ca, cb), ca.length(), cb.length());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private double normalize(final double score, final int la, final int lb) {
|
private double normalize(final double score, final int la, final int lb) {
|
||||||
|
|
|
@ -0,0 +1,29 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.tree;
|
||||||
|
|
||||||
|
import java.util.Map;
|
||||||
|
|
||||||
|
import eu.dnetlib.pace.config.Config;
|
||||||
|
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||||
|
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||||
|
|
||||||
|
@ComparatorClass("maxLengthMatch")
|
||||||
|
public class MaxLengthMatch extends AbstractStringComparator {
|
||||||
|
|
||||||
|
private final int limit;
|
||||||
|
|
||||||
|
public MaxLengthMatch(Map<String, String> params) {
|
||||||
|
super(params);
|
||||||
|
|
||||||
|
limit = Integer.parseInt(params.getOrDefault("limit", "200"));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double compare(String a, String b, final Config conf) {
|
||||||
|
return a.length() < limit && b.length() < limit ? 1.0 : -1.0;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected String toString(final Object object) {
|
||||||
|
return toFirstString(object);
|
||||||
|
}
|
||||||
|
}
|
|
@ -127,4 +127,14 @@ public abstract class AbstractComparator<T> extends AbstractPaceFunctions implem
|
||||||
return this.weight;
|
return this.weight;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Double getDoubleParam(String name) {
|
||||||
|
String svalue = params.get(name);
|
||||||
|
|
||||||
|
try {
|
||||||
|
return Double.parseDouble(svalue);
|
||||||
|
} catch (Throwable t) {
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -67,8 +67,10 @@ public class BlockProcessor {
|
||||||
|
|
||||||
private void processRows(final List<Row> queue, final Reporter context) {
|
private void processRows(final List<Row> queue, final Reporter context) {
|
||||||
|
|
||||||
for (int pivotPos = 0; pivotPos < queue.size(); pivotPos++) {
|
IncrementalConnectedComponents icc = new IncrementalConnectedComponents(queue.size());
|
||||||
final Row pivot = queue.get(pivotPos);
|
|
||||||
|
for (int i = 0; i < queue.size(); i++) {
|
||||||
|
final Row pivot = queue.get(i);
|
||||||
|
|
||||||
final String idPivot = pivot.getString(identifierFieldPos); // identifier
|
final String idPivot = pivot.getString(identifierFieldPos); // identifier
|
||||||
final Object fieldsPivot = getJavaValue(pivot, orderFieldPos);
|
final Object fieldsPivot = getJavaValue(pivot, orderFieldPos);
|
||||||
|
@ -76,9 +78,9 @@ public class BlockProcessor {
|
||||||
final WfConfig wf = dedupConf.getWf();
|
final WfConfig wf = dedupConf.getWf();
|
||||||
|
|
||||||
if (fieldPivot != null) {
|
if (fieldPivot != null) {
|
||||||
int i = 0;
|
for (int j = icc.nextUnconnected(i, i + 1); j >= 0
|
||||||
for (int windowPos = pivotPos + 1; windowPos < queue.size(); windowPos++) {
|
&& j < queue.size(); j = icc.nextUnconnected(i, j + 1)) {
|
||||||
final Row curr = queue.get(windowPos);
|
final Row curr = queue.get(j);
|
||||||
final String idCurr = curr.getString(identifierFieldPos); // identifier
|
final String idCurr = curr.getString(identifierFieldPos); // identifier
|
||||||
|
|
||||||
if (mustSkip(idCurr)) {
|
if (mustSkip(idCurr)) {
|
||||||
|
@ -86,7 +88,7 @@ public class BlockProcessor {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (++i > wf.getSlidingWindowSize()) {
|
if (wf.getSlidingWindowSize() > 0 && (j - i) > wf.getSlidingWindowSize()) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -97,7 +99,9 @@ public class BlockProcessor {
|
||||||
|
|
||||||
final TreeProcessor treeProcessor = new TreeProcessor(dedupConf);
|
final TreeProcessor treeProcessor = new TreeProcessor(dedupConf);
|
||||||
|
|
||||||
emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context);
|
if (emitOutput(treeProcessor.compare(pivot, curr), idPivot, idCurr, context)) {
|
||||||
|
icc.connect(i, j);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -115,7 +119,8 @@ public class BlockProcessor {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void emitOutput(final boolean result, final String idPivot, final String idCurr, final Reporter context) {
|
private boolean emitOutput(final boolean result, final String idPivot, final String idCurr,
|
||||||
|
final Reporter context) {
|
||||||
|
|
||||||
if (result) {
|
if (result) {
|
||||||
if (idPivot.compareTo(idCurr) <= 0) {
|
if (idPivot.compareTo(idCurr) <= 0) {
|
||||||
|
@ -127,6 +132,8 @@ public class BlockProcessor {
|
||||||
} else {
|
} else {
|
||||||
context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
|
context.incrementCounter(dedupConf.getWf().getEntityType(), "d < " + dedupConf.getWf().getThreshold(), 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean mustSkip(final String idPivot) {
|
private boolean mustSkip(final String idPivot) {
|
||||||
|
@ -142,5 +149,4 @@ public class BlockProcessor {
|
||||||
|
|
||||||
context.emit(type, from, to);
|
context.emit(type, from, to);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,50 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.util;
|
||||||
|
|
||||||
|
import java.util.BitSet;
|
||||||
|
|
||||||
|
/**
 * Incrementally tracks connected components over the positions {@code 0..size-1}.
 *
 * Every position that belongs to a component references the same shared {@link BitSet},
 * whose set bits are the members of that component. Positions that were never connected
 * have no set ({@code null}).
 */
public class IncrementalConnectedComponents {
	final private int size;

	/** indexes[k] is the shared member set of k's component, or null when k is unconnected. */
	final private BitSet[] indexes;

	/**
	 * @param size number of positions tracked
	 */
	IncrementalConnectedComponents(int size) {
		this.size = size;
		this.indexes = new BitSet[size];
	}

	/**
	 * Records that positions {@code i} and {@code j} are connected, merging their components.
	 *
	 * @param i first position
	 * @param j second position
	 */
	public void connect(int i, int j) {
		final BitSet left = indexes[i];
		final BitSet right = indexes[j];
		final BitSet merged;

		if (left == null && right == null) {
			merged = new BitSet(size);
		} else if (left == null) {
			merged = right;
		} else if (right == null || right == left) {
			merged = left;
		} else {
			// two distinct components: absorb j's into i's and repoint EVERY member of the
			// absorbed component, otherwise those members keep seeing the stale, smaller set
			merged = left;
			merged.or(right);
			for (int k = right.nextSetBit(0); k >= 0; k = right.nextSetBit(k + 1)) {
				indexes[k] = merged;
			}
		}

		merged.set(i);
		merged.set(j);
		indexes[i] = merged;
		indexes[j] = merged;
	}

	/**
	 * Finds the first position at or after {@code j} that is not in the component of {@code i}.
	 *
	 * @param i position whose component is inspected
	 * @param j first candidate position
	 * @return {@code j} itself when {@code i} has no component yet (no range check is applied
	 *         in that case); otherwise the first position {@code >= j} outside the component,
	 *         or -1 when there is none below {@code size}
	 */
	public int nextUnconnected(int i, int j) {
		final BitSet component = indexes[i];
		if (component == null) {
			return j;
		}

		final int candidate = component.nextClearBit(j);
		return (candidate >= size) ? -1 : candidate;
	}

	/**
	 * @param i position to look up
	 * @return the live (not copied) member set of {@code i}'s component, or {@code null}
	 *         when {@code i} has never been connected
	 */
	public BitSet getConnections(int i) {
		return indexes[i];
	}
}
|
|
@ -97,6 +97,8 @@ public class MapDocumentUtil {
|
||||||
Object o = json.read(jsonPath);
|
Object o = json.read(jsonPath);
|
||||||
if (o instanceof String)
|
if (o instanceof String)
|
||||||
return (String) o;
|
return (String) o;
|
||||||
|
if (o instanceof Number)
|
||||||
|
return (String) o.toString();
|
||||||
if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
|
if (o instanceof JSONArray && ((JSONArray) o).size() > 0)
|
||||||
return (String) ((JSONArray) o).get(0);
|
return (String) ((JSONArray) o).get(0);
|
||||||
return "";
|
return "";
|
||||||
|
|
|
@ -40,7 +40,7 @@ public class PaceResolver implements Serializable {
|
||||||
Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>) cl));
|
Collectors.toMap(cl -> cl.getAnnotation(ComparatorClass.class).value(), cl -> (Class<Comparator>) cl));
|
||||||
}
|
}
|
||||||
|
|
||||||
public ClusteringFunction getClusteringFunction(String name, Map<String, Integer> params) throws PaceException {
|
public ClusteringFunction getClusteringFunction(String name, Map<String, Object> params) throws PaceException {
|
||||||
try {
|
try {
|
||||||
return clusteringFunctions.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
return clusteringFunctions.get(name).getDeclaredConstructor(Map.class).newInstance(params);
|
||||||
} catch (InstantiationException | IllegalAccessException | InvocationTargetException
|
} catch (InstantiationException | IllegalAccessException | InvocationTargetException
|
||||||
|
|
|
@ -15,7 +15,7 @@ import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
|
||||||
public class ClusteringFunctionTest extends AbstractPaceTest {
|
public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
private static Map<String, Integer> params;
|
private static Map<String, Object> params;
|
||||||
private static DedupConfig conf;
|
private static DedupConfig conf;
|
||||||
|
|
||||||
@BeforeAll
|
@BeforeAll
|
||||||
|
@ -40,10 +40,10 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testNgram() {
|
public void testNgram() {
|
||||||
params.put("ngramLen", 3);
|
params.put("ngramLen", "3");
|
||||||
params.put("max", 8);
|
params.put("max", "8");
|
||||||
params.put("maxPerToken", 2);
|
params.put("maxPerToken", "2");
|
||||||
params.put("minNgramLen", 1);
|
params.put("minNgramLen", "1");
|
||||||
|
|
||||||
final ClusteringFunction ngram = new Ngrams(params);
|
final ClusteringFunction ngram = new Ngrams(params);
|
||||||
|
|
||||||
|
@ -54,8 +54,8 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testNgramPairs() {
|
public void testNgramPairs() {
|
||||||
params.put("ngramLen", 3);
|
params.put("ngramLen", "3");
|
||||||
params.put("max", 2);
|
params.put("max", "2");
|
||||||
|
|
||||||
final ClusteringFunction np = new NgramPairs(params);
|
final ClusteringFunction np = new NgramPairs(params);
|
||||||
|
|
||||||
|
@ -66,8 +66,8 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testSortedNgramPairs() {
|
public void testSortedNgramPairs() {
|
||||||
params.put("ngramLen", 3);
|
params.put("ngramLen", "3");
|
||||||
params.put("max", 2);
|
params.put("max", "2");
|
||||||
|
|
||||||
final ClusteringFunction np = new SortedNgramPairs(params);
|
final ClusteringFunction np = new SortedNgramPairs(params);
|
||||||
|
|
||||||
|
@ -87,9 +87,9 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testAcronym() {
|
public void testAcronym() {
|
||||||
params.put("max", 4);
|
params.put("max", "4");
|
||||||
params.put("minLen", 1);
|
params.put("minLen", "1");
|
||||||
params.put("maxLen", 3);
|
params.put("maxLen", "3");
|
||||||
|
|
||||||
final ClusteringFunction acro = new Acronyms(params);
|
final ClusteringFunction acro = new Acronyms(params);
|
||||||
|
|
||||||
|
@ -100,8 +100,8 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testSuffixPrefix() {
|
public void testSuffixPrefix() {
|
||||||
params.put("len", 3);
|
params.put("len", "3");
|
||||||
params.put("max", 4);
|
params.put("max", "4");
|
||||||
|
|
||||||
final ClusteringFunction sp = new SuffixPrefix(params);
|
final ClusteringFunction sp = new SuffixPrefix(params);
|
||||||
|
|
||||||
|
@ -109,8 +109,8 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
System.out.println(s);
|
System.out.println(s);
|
||||||
System.out.println(sp.apply(conf, Lists.newArrayList(s)));
|
System.out.println(sp.apply(conf, Lists.newArrayList(s)));
|
||||||
|
|
||||||
params.put("len", 3);
|
params.put("len", "3");
|
||||||
params.put("max", 1);
|
params.put("max", "1");
|
||||||
|
|
||||||
System.out.println(sp.apply(conf, Lists.newArrayList("Framework for general-purpose deduplication")));
|
System.out.println(sp.apply(conf, Lists.newArrayList("Framework for general-purpose deduplication")));
|
||||||
}
|
}
|
||||||
|
@ -118,8 +118,8 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
@Test
|
@Test
|
||||||
public void testWordsSuffixPrefix() {
|
public void testWordsSuffixPrefix() {
|
||||||
|
|
||||||
params.put("len", 3);
|
params.put("len", "3");
|
||||||
params.put("max", 4);
|
params.put("max", "4");
|
||||||
|
|
||||||
final ClusteringFunction sp = new WordsSuffixPrefix(params);
|
final ClusteringFunction sp = new WordsSuffixPrefix(params);
|
||||||
|
|
||||||
|
@ -130,7 +130,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testWordsStatsSuffixPrefix() {
|
public void testWordsStatsSuffixPrefix() {
|
||||||
params.put("mod", 10);
|
params.put("mod", "10");
|
||||||
|
|
||||||
final ClusteringFunction sp = new WordsStatsSuffixPrefixChain(params);
|
final ClusteringFunction sp = new WordsStatsSuffixPrefixChain(params);
|
||||||
|
|
||||||
|
@ -167,7 +167,7 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
||||||
@Test
|
@Test
|
||||||
public void testFieldValue() {
|
public void testFieldValue() {
|
||||||
|
|
||||||
params.put("randomLength", 5);
|
params.put("randomLength", "5");
|
||||||
|
|
||||||
final ClusteringFunction sp = new SpaceTrimmingFieldValue(params);
|
final ClusteringFunction sp = new SpaceTrimmingFieldValue(params);
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,40 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.util;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
public class IncrementalConnectedComponentsTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void transitiveClosureTest() {
|
||||||
|
IncrementalConnectedComponents icc = new IncrementalConnectedComponents(10);
|
||||||
|
|
||||||
|
icc.connect(0, 1);
|
||||||
|
icc.connect(0, 2);
|
||||||
|
icc.connect(0, 3);
|
||||||
|
|
||||||
|
icc.connect(1, 2);
|
||||||
|
icc.connect(1, 4);
|
||||||
|
icc.connect(1, 5);
|
||||||
|
|
||||||
|
icc.connect(6, 7);
|
||||||
|
icc.connect(6, 9);
|
||||||
|
|
||||||
|
assertEquals(icc.getConnections(0).toString(), "{0, 1, 2, 3, 4, 5}");
|
||||||
|
assertEquals(icc.getConnections(1).toString(), "{0, 1, 2, 3, 4, 5}");
|
||||||
|
assertEquals(icc.getConnections(2).toString(), "{0, 1, 2, 3, 4, 5}");
|
||||||
|
assertEquals(icc.getConnections(3).toString(), "{0, 1, 2, 3, 4, 5}");
|
||||||
|
assertEquals(icc.getConnections(4).toString(), "{0, 1, 2, 3, 4, 5}");
|
||||||
|
assertEquals(icc.getConnections(5).toString(), "{0, 1, 2, 3, 4, 5}");
|
||||||
|
|
||||||
|
assertEquals(icc.getConnections(6).toString(), "{6, 7, 9}");
|
||||||
|
assertEquals(icc.getConnections(7).toString(), "{6, 7, 9}");
|
||||||
|
assertEquals(icc.getConnections(9).toString(), "{6, 7, 9}");
|
||||||
|
|
||||||
|
assertNull(icc.getConnections(8));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -101,6 +101,10 @@ abstract class AbstractSparkAction implements Serializable {
|
||||||
return SparkSession.builder().config(conf).getOrCreate();
|
return SparkSession.builder().config(conf).getOrCreate();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected static SparkSession getSparkWithHiveSession(SparkConf conf) {
|
||||||
|
return SparkSession.builder().enableHiveSupport().config(conf).getOrCreate();
|
||||||
|
}
|
||||||
|
|
||||||
protected static <T> void save(Dataset<T> dataset, String outPath, SaveMode mode) {
|
protected static <T> void save(Dataset<T> dataset, String outPath, SaveMode mode) {
|
||||||
dataset.write().option("compression", "gzip").mode(mode).json(outPath);
|
dataset.write().option("compression", "gzip").mode(mode).json(outPath);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,128 +1,187 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
import java.lang.reflect.InvocationTargetException;
|
|
||||||
import java.util.*;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.commons.beanutils.BeanUtils;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
|
||||||
import org.apache.spark.sql.Dataset;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
||||||
import eu.dnetlib.dhp.oa.merge.AuthorMerger;
|
import eu.dnetlib.dhp.oa.merge.AuthorMerger;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
import org.apache.commons.beanutils.BeanUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||||
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
import org.apache.spark.api.java.function.ReduceFunction;
|
||||||
|
import org.apache.spark.sql.*;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
import scala.Tuple3;
|
||||||
|
import scala.collection.JavaConversions;
|
||||||
|
|
||||||
|
import java.util.*;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
public class DedupRecordFactory {
|
public class DedupRecordFactory {
|
||||||
|
public static final class DedupRecordReduceState {
|
||||||
|
public final String dedupId;
|
||||||
|
|
||||||
protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
public final ArrayList<String> aliases = new ArrayList<>();
|
||||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
|
||||||
|
|
||||||
private DedupRecordFactory() {
|
public final HashSet<String> acceptanceDate = new HashSet<>();
|
||||||
}
|
|
||||||
|
|
||||||
public static <T extends OafEntity> Dataset<T> createDedupRecord(
|
public OafEntity entity;
|
||||||
final SparkSession spark,
|
|
||||||
final DataInfo dataInfo,
|
|
||||||
final String mergeRelsInputPath,
|
|
||||||
final String entitiesInputPath,
|
|
||||||
final Class<T> clazz) {
|
|
||||||
|
|
||||||
long ts = System.currentTimeMillis();
|
public DedupRecordReduceState(String dedupId, String id, OafEntity entity) {
|
||||||
|
this.dedupId = dedupId;
|
||||||
|
this.entity = entity;
|
||||||
|
if (entity == null) {
|
||||||
|
aliases.add(id);
|
||||||
|
} else {
|
||||||
|
if (Result.class.isAssignableFrom(entity.getClass())) {
|
||||||
|
Result result = (Result) entity;
|
||||||
|
if (result.getDateofacceptance() != null && StringUtils.isNotBlank(result.getDateofacceptance().getValue())) {
|
||||||
|
acceptanceDate.add(result.getDateofacceptance().getValue());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// <id, json_entity>
|
public String getDedupId() {
|
||||||
Dataset<Tuple2<String, T>> entities = spark
|
return dedupId;
|
||||||
.read()
|
}
|
||||||
.textFile(entitiesInputPath)
|
}
|
||||||
.map(
|
private static final int MAX_ACCEPTANCE_DATE = 20;
|
||||||
(MapFunction<String, Tuple2<String, T>>) it -> {
|
|
||||||
T entity = OBJECT_MAPPER.readValue(it, clazz);
|
|
||||||
return new Tuple2<>(entity.getId(), entity);
|
|
||||||
},
|
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
|
|
||||||
|
|
||||||
// <source, target>: source is the dedup_id, target is the id of the mergedIn
|
private DedupRecordFactory() {
|
||||||
Dataset<Tuple2<String, String>> mergeRels = spark
|
}
|
||||||
.read()
|
|
||||||
.load(mergeRelsInputPath)
|
|
||||||
.as(Encoders.bean(Relation.class))
|
|
||||||
.where("relClass == 'merges'")
|
|
||||||
.map(
|
|
||||||
(MapFunction<Relation, Tuple2<String, String>>) r -> new Tuple2<>(r.getSource(), r.getTarget()),
|
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
|
|
||||||
|
|
||||||
return mergeRels
|
public static Dataset<OafEntity> createDedupRecord(
|
||||||
.joinWith(entities, mergeRels.col("_2").equalTo(entities.col("_1")), "inner")
|
final SparkSession spark,
|
||||||
.map(
|
final DataInfo dataInfo,
|
||||||
(MapFunction<Tuple2<Tuple2<String, String>, Tuple2<String, T>>, Tuple2<String, T>>) value -> new Tuple2<>(
|
final String mergeRelsInputPath,
|
||||||
value._1()._1(), value._2()._2()),
|
final String entitiesInputPath,
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)))
|
final Class<OafEntity> clazz) {
|
||||||
.groupByKey(
|
|
||||||
(MapFunction<Tuple2<String, T>, String>) Tuple2::_1, Encoders.STRING())
|
|
||||||
.mapGroups(
|
|
||||||
(MapGroupsFunction<String, Tuple2<String, T>, T>) (key,
|
|
||||||
values) -> entityMerger(key, values, ts, dataInfo, clazz),
|
|
||||||
Encoders.bean(clazz));
|
|
||||||
}
|
|
||||||
|
|
||||||
public static <T extends OafEntity> T entityMerger(
|
final long ts = System.currentTimeMillis();
|
||||||
String id, Iterator<Tuple2<String, T>> entities, long ts, DataInfo dataInfo, Class<T> clazz)
|
final Encoder<OafEntity> beanEncoder = Encoders.bean(clazz);
|
||||||
throws IllegalAccessException, InstantiationException, InvocationTargetException {
|
final Encoder<OafEntity> kryoEncoder = Encoders.kryo(clazz);
|
||||||
|
|
||||||
final Comparator<Identifier<T>> idComparator = new IdentifierComparator<>();
|
// <id, json_entity>
|
||||||
|
Dataset<Row> entities = spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(clazz).schema())
|
||||||
|
.json(entitiesInputPath)
|
||||||
|
.as(beanEncoder)
|
||||||
|
.map(
|
||||||
|
(MapFunction<OafEntity, Tuple2<String, OafEntity>>) entity -> {
|
||||||
|
return new Tuple2<>(entity.getId(), entity);
|
||||||
|
},
|
||||||
|
Encoders.tuple(Encoders.STRING(), kryoEncoder))
|
||||||
|
.selectExpr("_1 AS id", "_2 AS kryoObject");
|
||||||
|
|
||||||
final LinkedList<T> entityList = Lists
|
// <source, target>: source is the dedup_id, target is the id of the mergedIn
|
||||||
.newArrayList(entities)
|
Dataset<Row> mergeRels = spark
|
||||||
.stream()
|
.read()
|
||||||
.map(t -> Identifier.newInstance(t._2()))
|
.load(mergeRelsInputPath)
|
||||||
.sorted(idComparator)
|
.where("relClass == 'merges'")
|
||||||
.map(Identifier::getEntity)
|
.selectExpr("source as dedupId", "target as id");
|
||||||
.collect(Collectors.toCollection(LinkedList::new));
|
|
||||||
|
|
||||||
final T entity = clazz.newInstance();
|
return mergeRels
|
||||||
final T first = entityList.removeFirst();
|
.join(entities, JavaConversions.asScalaBuffer(Collections.singletonList("id")), "left")
|
||||||
|
.select("dedupId", "id", "kryoObject")
|
||||||
|
.as(Encoders.tuple(Encoders.STRING(), Encoders.STRING(), kryoEncoder))
|
||||||
|
.map((MapFunction<Tuple3<String, String, OafEntity>, DedupRecordReduceState>) t -> new DedupRecordReduceState(t._1(), t._2(), t._3()), Encoders.kryo(DedupRecordReduceState.class))
|
||||||
|
.groupByKey((MapFunction<DedupRecordReduceState, String>) DedupRecordReduceState::getDedupId, Encoders.STRING())
|
||||||
|
.reduceGroups(
|
||||||
|
(ReduceFunction<DedupRecordReduceState>) (t1, t2) -> {
|
||||||
|
if (t1.entity == null) {
|
||||||
|
t2.aliases.addAll(t1.aliases);
|
||||||
|
return t2;
|
||||||
|
}
|
||||||
|
if (t1.acceptanceDate.size() < MAX_ACCEPTANCE_DATE) {
|
||||||
|
t1.acceptanceDate.addAll(t2.acceptanceDate);
|
||||||
|
}
|
||||||
|
t1.aliases.addAll(t2.aliases);
|
||||||
|
t1.entity = reduceEntity(t1.entity, t2.entity);
|
||||||
|
|
||||||
BeanUtils.copyProperties(entity, first);
|
return t1;
|
||||||
|
}
|
||||||
|
)
|
||||||
|
.flatMap
|
||||||
|
((FlatMapFunction<Tuple2<String, DedupRecordReduceState>, OafEntity>) t -> {
|
||||||
|
String dedupId = t._1();
|
||||||
|
DedupRecordReduceState agg = t._2();
|
||||||
|
|
||||||
final List<List<Author>> authors = Lists.newArrayList();
|
if (agg.acceptanceDate.size() >= MAX_ACCEPTANCE_DATE) {
|
||||||
|
return Collections.emptyIterator();
|
||||||
|
}
|
||||||
|
|
||||||
entityList
|
return Stream.concat(Stream.of(agg.getDedupId()), agg.aliases.stream())
|
||||||
.forEach(
|
.map(id -> {
|
||||||
duplicate -> {
|
try {
|
||||||
entity.mergeFrom(duplicate);
|
OafEntity res = (OafEntity) BeanUtils.cloneBean(agg.entity);
|
||||||
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
res.setId(id);
|
||||||
Result r1 = (Result) duplicate;
|
res.setDataInfo(dataInfo);
|
||||||
Optional
|
res.setLastupdatetimestamp(ts);
|
||||||
.ofNullable(r1.getAuthor())
|
return res;
|
||||||
.ifPresent(a -> authors.add(a));
|
} catch (Exception e) {
|
||||||
}
|
throw new RuntimeException(e);
|
||||||
});
|
}
|
||||||
|
}).iterator();
|
||||||
|
}, beanEncoder);
|
||||||
|
}
|
||||||
|
|
||||||
// set authors and date
|
private static OafEntity reduceEntity(OafEntity entity, OafEntity duplicate) {
|
||||||
if (ModelSupport.isSubClass(entity, Result.class)) {
|
|
||||||
Optional
|
|
||||||
.ofNullable(((Result) entity).getAuthor())
|
|
||||||
.ifPresent(a -> authors.add(a));
|
|
||||||
|
|
||||||
((Result) entity).setAuthor(AuthorMerger.merge(authors));
|
if (duplicate == null) {
|
||||||
|
return entity;
|
||||||
}
|
}
|
||||||
|
|
||||||
entity.setId(id);
|
|
||||||
|
|
||||||
entity.setLastupdatetimestamp(ts);
|
int compare = new IdentifierComparator<>()
|
||||||
entity.setDataInfo(dataInfo);
|
.compare(Identifier.newInstance(entity), Identifier.newInstance(duplicate));
|
||||||
|
|
||||||
return entity;
|
if (compare > 0) {
|
||||||
}
|
OafEntity swap = duplicate;
|
||||||
|
duplicate = entity;
|
||||||
|
entity = swap;
|
||||||
|
}
|
||||||
|
|
||||||
|
entity.mergeFrom(duplicate);
|
||||||
|
|
||||||
|
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
||||||
|
Result re = (Result) entity;
|
||||||
|
Result rd = (Result) duplicate;
|
||||||
|
|
||||||
|
List<List<Author>> authors = new ArrayList<>();
|
||||||
|
if (re.getAuthor() != null) {
|
||||||
|
authors.add(re.getAuthor());
|
||||||
|
}
|
||||||
|
if (rd.getAuthor() != null) {
|
||||||
|
authors.add(rd.getAuthor());
|
||||||
|
}
|
||||||
|
|
||||||
|
re.setAuthor(AuthorMerger.merge(authors));
|
||||||
|
}
|
||||||
|
|
||||||
|
return entity;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T extends OafEntity> T entityMerger(
|
||||||
|
String id, Iterator<Tuple2<String, T>> entities, long ts, DataInfo dataInfo, Class<T> clazz) {
|
||||||
|
T base = entities.next()._2();
|
||||||
|
|
||||||
|
while (entities.hasNext()) {
|
||||||
|
T duplicate = entities.next()._2();
|
||||||
|
if (duplicate != null)
|
||||||
|
base = (T) reduceEntity(base, duplicate);
|
||||||
|
}
|
||||||
|
|
||||||
|
base.setId(id);
|
||||||
|
base.setDataInfo(dataInfo);
|
||||||
|
base.setLastupdatetimestamp(ts);
|
||||||
|
|
||||||
|
return base;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.utils.DHPUtils.md5;
|
||||||
import static org.apache.commons.lang3.StringUtils.substringAfter;
|
import static org.apache.commons.lang3.StringUtils.substringAfter;
|
||||||
import static org.apache.commons.lang3.StringUtils.substringBefore;
|
import static org.apache.commons.lang3.StringUtils.substringBefore;
|
||||||
|
|
||||||
|
@ -14,33 +15,36 @@ import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||||
public class IdGenerator implements Serializable {
|
public class IdGenerator implements Serializable {
|
||||||
|
|
||||||
// pick the best pid from the list (consider date and pidtype)
|
// pick the best pid from the list (consider date and pidtype)
|
||||||
public static <T extends OafEntity> String generate(List<Identifier<T>> pids, String defaultID) {
|
public static <T extends OafEntity> String generate(List<? extends Identifier> pids, String defaultID) {
|
||||||
if (pids == null || pids.isEmpty())
|
if (pids == null || pids.isEmpty())
|
||||||
return defaultID;
|
return defaultID;
|
||||||
|
|
||||||
return generateId(pids);
|
return generateId(pids);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <T extends OafEntity> String generateId(List<Identifier<T>> pids) {
|
private static String generateId(List<? extends Identifier> pids) {
|
||||||
Identifier<T> bp = pids
|
Identifier bp = pids
|
||||||
.stream()
|
.stream()
|
||||||
.min(Identifier::compareTo)
|
.min(Identifier::compareTo)
|
||||||
.orElseThrow(() -> new IllegalStateException("unable to generate id"));
|
.orElseThrow(() -> new IllegalStateException("unable to generate id"));
|
||||||
|
|
||||||
String prefix = substringBefore(bp.getOriginalID(), "|");
|
return generate(bp.getOriginalID());
|
||||||
String ns = substringBefore(substringAfter(bp.getOriginalID(), "|"), "::");
|
}
|
||||||
String suffix = substringAfter(bp.getOriginalID(), "::");
|
|
||||||
|
public static String generate(String originalId) {
|
||||||
|
String prefix = substringBefore(originalId, "|");
|
||||||
|
String ns = substringBefore(substringAfter(originalId, "|"), "::");
|
||||||
|
String suffix = substringAfter(originalId, "::");
|
||||||
|
|
||||||
final String pidType = substringBefore(ns, "_");
|
final String pidType = substringBefore(ns, "_");
|
||||||
if (PidType.isValid(pidType)) {
|
if (PidType.isValid(pidType)) {
|
||||||
return prefix + "|" + dedupify(ns) + "::" + suffix;
|
return prefix + "|" + dedupify(ns) + "::" + suffix;
|
||||||
} else {
|
} else {
|
||||||
return prefix + "|dedup_wf_001::" + suffix;
|
return prefix + "|dedup_wf_002::" + md5(originalId); // hash the whole originalId to avoid collisions
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String dedupify(String ns) {
|
private static String dedupify(String ns) {
|
||||||
|
|
||||||
StringBuilder prefix;
|
StringBuilder prefix;
|
||||||
if (PidType.valueOf(substringBefore(ns, "_")) == PidType.openorgs) {
|
if (PidType.valueOf(substringBefore(ns, "_")) == PidType.openorgs) {
|
||||||
prefix = new StringBuilder(substringBefore(ns, "_"));
|
prefix = new StringBuilder(substringBefore(ns, "_"));
|
||||||
|
@ -53,5 +57,4 @@ public class IdGenerator implements Serializable {
|
||||||
}
|
}
|
||||||
return prefix.substring(0, 12);
|
return prefix.substring(0, 12);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -3,49 +3,47 @@ package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVENANCE_DEDUP;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVENANCE_DEDUP;
|
||||||
|
import static org.apache.spark.sql.functions.*;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.*;
|
import java.time.LocalDate;
|
||||||
import java.util.stream.Collectors;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.sql.*;
|
||||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
|
||||||
import org.apache.spark.graphx.Edge;
|
|
||||||
import org.apache.spark.rdd.RDD;
|
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.catalyst.encoders.RowEncoder;
|
||||||
import org.apache.spark.sql.SaveMode;
|
import org.apache.spark.sql.expressions.UserDefinedFunction;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.expressions.Window;
|
||||||
|
import org.apache.spark.sql.expressions.WindowSpec;
|
||||||
|
import org.apache.spark.sql.types.DataTypes;
|
||||||
|
import org.apache.spark.sql.types.StructType;
|
||||||
import org.dom4j.DocumentException;
|
import org.dom4j.DocumentException;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.xml.sax.SAXException;
|
import org.xml.sax.SAXException;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
import com.google.common.hash.Hashing;
|
import com.google.common.hash.Hashing;
|
||||||
|
import com.kwartile.lib.cc.ConnectedComponent;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.oa.dedup.graph.ConnectedComponent;
|
|
||||||
import eu.dnetlib.dhp.oa.dedup.graph.GraphProcessor;
|
|
||||||
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
import scala.Tuple3;
|
||||||
import scala.Tuple2;
|
import scala.collection.JavaConversions;
|
||||||
|
|
||||||
public class SparkCreateMergeRels extends AbstractSparkAction {
|
public class SparkCreateMergeRels extends AbstractSparkAction {
|
||||||
|
|
||||||
|
@ -68,10 +66,12 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
|
||||||
log.info("isLookupUrl {}", isLookUpUrl);
|
log.info("isLookupUrl {}", isLookUpUrl);
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
|
conf.set("hive.metastore.uris", parser.get("hiveMetastoreUris"));
|
||||||
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||||
|
|
||||||
conf.registerKryoClasses(ModelSupport.getOafModelClasses());
|
conf.registerKryoClasses(ModelSupport.getOafModelClasses());
|
||||||
|
|
||||||
new SparkCreateMergeRels(parser, getSparkSession(conf))
|
new SparkCreateMergeRels(parser, getSparkWithHiveSession(conf))
|
||||||
.run(ISLookupClientFactory.getLookUpService(isLookUpUrl));
|
.run(ISLookupClientFactory.getLookUpService(isLookUpUrl));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -87,14 +87,15 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
|
||||||
.ofNullable(parser.get("cutConnectedComponent"))
|
.ofNullable(parser.get("cutConnectedComponent"))
|
||||||
.map(Integer::valueOf)
|
.map(Integer::valueOf)
|
||||||
.orElse(0);
|
.orElse(0);
|
||||||
|
|
||||||
|
final String pivotHistoryDatabase = parser.get("pivotHistoryDatabase");
|
||||||
|
|
||||||
log.info("connected component cut: '{}'", cut);
|
log.info("connected component cut: '{}'", cut);
|
||||||
log.info("graphBasePath: '{}'", graphBasePath);
|
log.info("graphBasePath: '{}'", graphBasePath);
|
||||||
log.info("isLookUpUrl: '{}'", isLookUpUrl);
|
log.info("isLookUpUrl: '{}'", isLookUpUrl);
|
||||||
log.info("actionSetId: '{}'", actionSetId);
|
log.info("actionSetId: '{}'", actionSetId);
|
||||||
log.info("workingPath: '{}'", workingPath);
|
log.info("workingPath: '{}'", workingPath);
|
||||||
|
|
||||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
|
||||||
|
|
||||||
for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) {
|
for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) {
|
||||||
final String subEntity = dedupConf.getWf().getSubEntityValue();
|
final String subEntity = dedupConf.getWf().getSubEntityValue();
|
||||||
final Class<OafEntity> clazz = ModelSupport.entityTypes.get(EntityType.valueOf(subEntity));
|
final Class<OafEntity> clazz = ModelSupport.entityTypes.get(EntityType.valueOf(subEntity));
|
||||||
|
@ -106,113 +107,172 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
|
||||||
|
|
||||||
final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity);
|
final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity);
|
||||||
|
|
||||||
// <hash(id), id>
|
final Dataset<Row> simRels = spark
|
||||||
JavaPairRDD<Object, String> vertexes = createVertexes(sc, graphBasePath, subEntity, dedupConf);
|
|
||||||
|
|
||||||
final RDD<Edge<String>> edgeRdd = spark
|
|
||||||
.read()
|
.read()
|
||||||
.load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity))
|
.load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity))
|
||||||
.as(Encoders.bean(Relation.class))
|
.select("source", "target");
|
||||||
.javaRDD()
|
|
||||||
.map(it -> new Edge<>(hash(it.getSource()), hash(it.getTarget()), it.getRelClass()))
|
|
||||||
.rdd();
|
|
||||||
|
|
||||||
Dataset<Tuple2<String, String>> rawMergeRels = spark
|
UserDefinedFunction hashUDF = functions
|
||||||
.createDataset(
|
.udf(
|
||||||
GraphProcessor
|
(String s) -> hash(s), DataTypes.LongType);
|
||||||
.findCCs(vertexes.rdd(), edgeRdd, maxIterations, cut)
|
|
||||||
.toJavaRDD()
|
|
||||||
.filter(k -> k.getIds().size() > 1)
|
|
||||||
.flatMap(this::ccToRels)
|
|
||||||
.rdd(),
|
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
|
|
||||||
|
|
||||||
Dataset<Tuple2<String, OafEntity>> entities = spark
|
// <hash(id), id>
|
||||||
|
Dataset<Row> vertexIdMap = simRels
|
||||||
|
.selectExpr("source as id")
|
||||||
|
.union(simRels.selectExpr("target as id"))
|
||||||
|
.distinct()
|
||||||
|
.withColumn("vertexId", hashUDF.apply(functions.col("id")));
|
||||||
|
|
||||||
|
// transform simrels into pairs of numeric ids
|
||||||
|
final Dataset<Row> edges = spark
|
||||||
.read()
|
.read()
|
||||||
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
|
.load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity))
|
||||||
.map(
|
.select("source", "target")
|
||||||
(MapFunction<String, Tuple2<String, OafEntity>>) it -> {
|
.withColumn("source", hashUDF.apply(functions.col("source")))
|
||||||
OafEntity entity = OBJECT_MAPPER.readValue(it, clazz);
|
.withColumn("target", hashUDF.apply(functions.col("target")));
|
||||||
return new Tuple2<>(entity.getId(), entity);
|
|
||||||
},
|
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
|
|
||||||
|
|
||||||
Dataset<Relation> mergeRels = rawMergeRels
|
// resolve connected components
|
||||||
.joinWith(entities, rawMergeRels.col("_2").equalTo(entities.col("_1")), "inner")
|
// ("vertexId", "groupId")
|
||||||
// <tmp_source,target>,<target,entity>
|
Dataset<Row> cliques = ConnectedComponent
|
||||||
.map(
|
.runOnPairs(edges, 50, spark);
|
||||||
(MapFunction<Tuple2<Tuple2<String, String>, Tuple2<String, OafEntity>>, Tuple2<String, OafEntity>>) value -> new Tuple2<>(
|
|
||||||
value._1()._1(), value._2()._2()),
|
// transform "vertexId" back to its original string value
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)))
|
// groupId is kept numeric as its string value is not used
|
||||||
// <tmp_source,entity>
|
// ("id", "groupId")
|
||||||
.groupByKey(
|
Dataset<Row> rawMergeRels = cliques
|
||||||
(MapFunction<Tuple2<String, OafEntity>, String>) Tuple2::_1, Encoders.STRING())
|
.join(vertexIdMap, JavaConversions.asScalaBuffer(Collections.singletonList("vertexId")), "inner")
|
||||||
.mapGroups(
|
.drop("vertexId")
|
||||||
(MapGroupsFunction<String, Tuple2<String, OafEntity>, ConnectedComponent>) this::generateID,
|
.distinct();
|
||||||
Encoders.bean(ConnectedComponent.class))
|
|
||||||
// <root_id, list(target)>
|
// empty dataframe if historydatabase is not used
|
||||||
|
Dataset<Row> pivotHistory = spark
|
||||||
|
.createDataset(
|
||||||
|
Collections.emptyList(),
|
||||||
|
RowEncoder
|
||||||
|
.apply(StructType.fromDDL("id STRING, lastUsage STRING")));
|
||||||
|
|
||||||
|
if (StringUtils.isNotBlank(pivotHistoryDatabase)) {
|
||||||
|
pivotHistory = spark
|
||||||
|
.read()
|
||||||
|
.table(pivotHistoryDatabase + "." + subEntity)
|
||||||
|
.selectExpr("id", "lastUsage");
|
||||||
|
}
|
||||||
|
|
||||||
|
// depending on the result type, collectedfrom and dateofacceptance are evaluated differently
|
||||||
|
String collectedfromExpr = "false AS collectedfrom";
|
||||||
|
String dateExpr = "'' AS date";
|
||||||
|
|
||||||
|
if (Result.class.isAssignableFrom(clazz)) {
|
||||||
|
if (Publication.class.isAssignableFrom(clazz)) {
|
||||||
|
collectedfromExpr = "array_contains(collectedfrom.key, '" + ModelConstants.CROSSREF_ID
|
||||||
|
+ "') AS collectedfrom";
|
||||||
|
} else if (eu.dnetlib.dhp.schema.oaf.Dataset.class.isAssignableFrom(clazz)) {
|
||||||
|
collectedfromExpr = "array_contains(collectedfrom.key, '" + ModelConstants.DATACITE_ID
|
||||||
|
+ "') AS collectedfrom";
|
||||||
|
}
|
||||||
|
|
||||||
|
dateExpr = "dateofacceptance.value AS date";
|
||||||
|
}
|
||||||
|
|
||||||
|
// cap pidType at w3id as from there on they are considered equal
|
||||||
|
UserDefinedFunction mapPid = udf(
|
||||||
|
(String s) -> Math.min(PidType.tryValueOf(s).ordinal(), PidType.w3id.ordinal()), DataTypes.IntegerType);
|
||||||
|
|
||||||
|
UserDefinedFunction validDate = udf((String date) -> {
|
||||||
|
if (StringUtils.isNotBlank(date)
|
||||||
|
&& date.matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date)) {
|
||||||
|
return date;
|
||||||
|
}
|
||||||
|
return LocalDate.now().plusWeeks(1).toString();
|
||||||
|
}, DataTypes.StringType);
|
||||||
|
|
||||||
|
Dataset<Row> pivotingData = spark
|
||||||
|
.read()
|
||||||
|
.schema(Encoders.bean(clazz).schema())
|
||||||
|
.json(DedupUtility.createEntityPath(graphBasePath, subEntity))
|
||||||
|
.selectExpr(
|
||||||
|
"id",
|
||||||
|
"regexp_extract(id, '^\\\\d+\\\\|([^_]+).*::', 1) AS pidType",
|
||||||
|
collectedfromExpr,
|
||||||
|
dateExpr)
|
||||||
|
.withColumn("pidType", mapPid.apply(col("pidType"))) // ordinal of pid type
|
||||||
|
.withColumn("date", validDate.apply(col("date")));
|
||||||
|
|
||||||
|
// ordering used to select the pivot id
|
||||||
|
WindowSpec w = Window
|
||||||
|
.partitionBy("groupId")
|
||||||
|
.orderBy(
|
||||||
|
col("lastUsage").desc_nulls_last(),
|
||||||
|
col("pidType").asc_nulls_last(),
|
||||||
|
col("collectedfrom").desc_nulls_last(),
|
||||||
|
col("date").asc_nulls_last(),
|
||||||
|
col("id").asc_nulls_last());
|
||||||
|
|
||||||
|
Dataset<Relation> output = rawMergeRels
|
||||||
|
.join(pivotHistory, JavaConversions.asScalaBuffer(Collections.singletonList("id")), "full")
|
||||||
|
.join(pivotingData, JavaConversions.asScalaBuffer(Collections.singletonList("id")), "left")
|
||||||
|
.withColumn("pivot", functions.first("id").over(w))
|
||||||
|
.withColumn("position", functions.row_number().over(w))
|
||||||
.flatMap(
|
.flatMap(
|
||||||
(FlatMapFunction<ConnectedComponent, Relation>) cc -> ccToMergeRel(cc, dedupConf),
|
(FlatMapFunction<Row, Tuple3<String, String, String>>) (Row r) -> {
|
||||||
Encoders.bean(Relation.class));
|
String id = r.getAs("id");
|
||||||
|
String dedupId = IdGenerator.generate(id);
|
||||||
|
|
||||||
saveParquet(mergeRels, mergeRelPath, SaveMode.Overwrite);
|
String pivot = r.getAs("pivot");
|
||||||
|
String pivotDedupId = IdGenerator.generate(pivot);
|
||||||
|
|
||||||
|
// filter out id == pivotDedupId
|
||||||
|
// those are caused by claim expressed on pivotDedupId
|
||||||
|
// information will be merged after creating deduprecord
|
||||||
|
if (id.equals(pivotDedupId)) {
|
||||||
|
return Collections.emptyIterator();
|
||||||
|
}
|
||||||
|
|
||||||
|
ArrayList<Tuple3<String, String, String>> res = new ArrayList<>();
|
||||||
|
|
||||||
|
// singleton pivots have null groupId as they do not match rawMergeRels
|
||||||
|
if (r.isNullAt(r.fieldIndex("groupId"))) {
|
||||||
|
// the record is existing if it matches pivotingData
|
||||||
|
if (!r.isNullAt(r.fieldIndex("collectedfrom"))) {
|
||||||
|
// create relation with old dedup id
|
||||||
|
res.add(new Tuple3<>(id, dedupId, null));
|
||||||
|
}
|
||||||
|
return res.iterator();
|
||||||
|
}
|
||||||
|
|
||||||
|
// this was a pivot in a previous graph but it has been merged into a new group with different
|
||||||
|
// pivot
|
||||||
|
if (!r.isNullAt(r.fieldIndex("lastUsage")) && !pivot.equals(id) && !dedupId.equals(pivotDedupId)) {
|
||||||
|
// materialize the previous dedup record as a merge relation with the new one
|
||||||
|
res.add(new Tuple3<>(dedupId, pivotDedupId, null));
|
||||||
|
}
|
||||||
|
|
||||||
|
// add merge relations
|
||||||
|
if (cut <=0 || r.<Integer>getAs("position") <= cut) {
|
||||||
|
res.add(new Tuple3<>(id, pivotDedupId, pivot));
|
||||||
|
}
|
||||||
|
|
||||||
|
return res.iterator();
|
||||||
|
}, Encoders.tuple(Encoders.STRING(), Encoders.STRING(), Encoders.STRING()))
|
||||||
|
.distinct()
|
||||||
|
.flatMap(
|
||||||
|
(FlatMapFunction<Tuple3<String, String, String>, Relation>) (Tuple3<String, String, String> r) -> {
|
||||||
|
String id = r._1();
|
||||||
|
String dedupId = r._2();
|
||||||
|
String pivot = r._3();
|
||||||
|
|
||||||
|
ArrayList<Relation> res = new ArrayList<>();
|
||||||
|
res.add(rel(pivot, dedupId, id, ModelConstants.MERGES, dedupConf));
|
||||||
|
res.add(rel(pivot, id, dedupId, ModelConstants.IS_MERGED_IN, dedupConf));
|
||||||
|
|
||||||
|
return res.iterator();
|
||||||
|
}, Encoders.bean(Relation.class));
|
||||||
|
|
||||||
|
saveParquet(output, mergeRelPath, SaveMode.Overwrite);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private <T extends OafEntity> ConnectedComponent generateID(String key, Iterator<Tuple2<String, T>> values) {
|
private static Relation rel(String pivot, String source, String target, String relClass, DedupConfig dedupConf) {
|
||||||
|
|
||||||
List<Identifier<T>> identifiers = Lists
|
|
||||||
.newArrayList(values)
|
|
||||||
.stream()
|
|
||||||
.map(v -> Identifier.newInstance(v._2()))
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
|
|
||||||
String rootID = IdGenerator.generate(identifiers, key);
|
|
||||||
|
|
||||||
if (Objects.equals(rootID, key))
|
|
||||||
throw new IllegalStateException("generated default ID: " + rootID);
|
|
||||||
|
|
||||||
return new ConnectedComponent(rootID,
|
|
||||||
identifiers.stream().map(i -> i.getEntity().getId()).collect(Collectors.toSet()));
|
|
||||||
}
|
|
||||||
|
|
||||||
private JavaPairRDD<Object, String> createVertexes(JavaSparkContext sc, String graphBasePath, String subEntity,
|
|
||||||
DedupConfig dedupConf) {
|
|
||||||
|
|
||||||
return sc
|
|
||||||
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
|
|
||||||
.mapToPair(json -> {
|
|
||||||
String id = MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), json);
|
|
||||||
return new Tuple2<>(hash(id), id);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
private Iterator<Tuple2<String, String>> ccToRels(ConnectedComponent cc) {
|
|
||||||
return cc
|
|
||||||
.getIds()
|
|
||||||
.stream()
|
|
||||||
.map(id -> new Tuple2<>(cc.getCcId(), id))
|
|
||||||
.iterator();
|
|
||||||
}
|
|
||||||
|
|
||||||
private Iterator<Relation> ccToMergeRel(ConnectedComponent cc, DedupConfig dedupConf) {
|
|
||||||
return cc
|
|
||||||
.getIds()
|
|
||||||
.stream()
|
|
||||||
.flatMap(
|
|
||||||
id -> {
|
|
||||||
List<Relation> tmp = new ArrayList<>();
|
|
||||||
|
|
||||||
tmp.add(rel(cc.getCcId(), id, ModelConstants.MERGES, dedupConf));
|
|
||||||
tmp.add(rel(id, cc.getCcId(), ModelConstants.IS_MERGED_IN, dedupConf));
|
|
||||||
|
|
||||||
return tmp.stream();
|
|
||||||
})
|
|
||||||
.iterator();
|
|
||||||
}
|
|
||||||
|
|
||||||
private Relation rel(String source, String target, String relClass, DedupConfig dedupConf) {
|
|
||||||
|
|
||||||
String entityType = dedupConf.getWf().getEntityType();
|
String entityType = dedupConf.getWf().getEntityType();
|
||||||
|
|
||||||
|
@ -238,6 +298,14 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
|
||||||
// TODO calculate the trust value based on the similarity score of the elements in the CC
|
// TODO calculate the trust value based on the similarity score of the elements in the CC
|
||||||
|
|
||||||
r.setDataInfo(info);
|
r.setDataInfo(info);
|
||||||
|
|
||||||
|
if (pivot != null) {
|
||||||
|
KeyValue pivotKV = new KeyValue();
|
||||||
|
pivotKV.setKey("pivot");
|
||||||
|
pivotKV.setValue(pivot);
|
||||||
|
|
||||||
|
r.setProperties(Arrays.asList(pivotKV));
|
||||||
|
}
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -91,18 +91,12 @@ public class SparkWhitelistSimRels extends AbstractSparkAction {
|
||||||
Dataset<Row> entities = spark
|
Dataset<Row> entities = spark
|
||||||
.read()
|
.read()
|
||||||
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
|
.textFile(DedupUtility.createEntityPath(graphBasePath, subEntity))
|
||||||
.repartition(numPartitions)
|
.select(functions.get_json_object(new Column("value"), dedupConf.getWf().getIdPath()).as("id"))
|
||||||
.withColumn("id", functions.get_json_object(new Column("value"), dedupConf.getWf().getIdPath()));
|
.distinct();
|
||||||
|
|
||||||
Dataset<Row> whiteListRels1 = whiteListRels
|
Dataset<Relation> whiteListSimRels = whiteListRels
|
||||||
.join(entities, entities.col("id").equalTo(whiteListRels.col("from")), "inner")
|
.join(entities, entities.col("id").equalTo(whiteListRels.col("from")), "leftsemi")
|
||||||
.select("from", "to");
|
.join(entities, functions.col("to").equalTo(entities.col("id")), "leftsemi")
|
||||||
|
|
||||||
Dataset<Row> whiteListRels2 = whiteListRels1
|
|
||||||
.join(entities, whiteListRels1.col("to").equalTo(entities.col("id")), "inner")
|
|
||||||
.select("from", "to");
|
|
||||||
|
|
||||||
Dataset<Relation> whiteListSimRels = whiteListRels2
|
|
||||||
.map(
|
.map(
|
||||||
(MapFunction<Row, Relation>) r -> DedupUtility
|
(MapFunction<Row, Relation>) r -> DedupUtility
|
||||||
.createSimRel(r.getString(0), r.getString(1), entity),
|
.createSimRel(r.getString(0), r.getString(1), entity),
|
||||||
|
|
|
@ -1,100 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup.graph;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.codehaus.jackson.annotate.JsonIgnore;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
|
||||||
import eu.dnetlib.pace.util.PaceException;
|
|
||||||
|
|
||||||
public class ConnectedComponent implements Serializable {
|
|
||||||
|
|
||||||
private String ccId;
|
|
||||||
private Set<String> ids;
|
|
||||||
|
|
||||||
private static final String CONNECTED_COMPONENT_ID_PREFIX = "connect_comp";
|
|
||||||
|
|
||||||
public ConnectedComponent(Set<String> ids, final int cut) {
|
|
||||||
this.ids = ids;
|
|
||||||
|
|
||||||
this.ccId = createDefaultID();
|
|
||||||
|
|
||||||
if (cut > 0 && ids.size() > cut) {
|
|
||||||
this.ids = ids
|
|
||||||
.stream()
|
|
||||||
.filter(id -> !ccId.equalsIgnoreCase(id))
|
|
||||||
.limit(cut - 1)
|
|
||||||
.collect(Collectors.toSet());
|
|
||||||
// this.ids.add(ccId); ??
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public ConnectedComponent(String ccId, Set<String> ids) {
|
|
||||||
this.ccId = ccId;
|
|
||||||
this.ids = ids;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String createDefaultID() {
|
|
||||||
if (ids.size() > 1) {
|
|
||||||
final String s = getMin();
|
|
||||||
String prefix = s.split("\\|")[0];
|
|
||||||
ccId = prefix + "|" + CONNECTED_COMPONENT_ID_PREFIX + "::" + DHPUtils.md5(s);
|
|
||||||
return ccId;
|
|
||||||
} else {
|
|
||||||
return ids.iterator().next();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@JsonIgnore
|
|
||||||
public String getMin() {
|
|
||||||
|
|
||||||
final StringBuilder min = new StringBuilder();
|
|
||||||
|
|
||||||
ids
|
|
||||||
.forEach(
|
|
||||||
id -> {
|
|
||||||
if (StringUtils.isBlank(min.toString())) {
|
|
||||||
min.append(id);
|
|
||||||
} else {
|
|
||||||
if (min.toString().compareTo(id) > 0) {
|
|
||||||
min.setLength(0);
|
|
||||||
min.append(id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
return min.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
ObjectMapper mapper = new ObjectMapper();
|
|
||||||
try {
|
|
||||||
return mapper.writeValueAsString(this);
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new PaceException("Failed to create Json: ", e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public Set<String> getIds() {
|
|
||||||
return ids;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setIds(Set<String> ids) {
|
|
||||||
this.ids = ids;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getCcId() {
|
|
||||||
return ccId;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setCcId(String ccId) {
|
|
||||||
this.ccId = ccId;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,37 +0,0 @@
|
||||||
package eu.dnetlib.dhp.oa.dedup.graph
|
|
||||||
|
|
||||||
import org.apache.spark.graphx._
|
|
||||||
import org.apache.spark.rdd.RDD
|
|
||||||
|
|
||||||
import scala.collection.JavaConversions;
|
|
||||||
|
|
||||||
/** Computes connected components with GraphX and wraps each of them in a ConnectedComponent. */
object GraphProcessor {

  /** Finds the connected components of the graph defined by the given vertices and edges.
    *
    * @param vertexes      (vertex id, identifier) pairs
    * @param edges         edges between vertex ids
    * @param maxIterations maximum number of iterations of the GraphX connected-components run
    * @param cut           forwarded to the ConnectedComponent constructor
    * @return one ConnectedComponent per group of vertices
    */
  def findCCs(vertexes: RDD[(VertexId, String)], edges: RDD[Edge[String]], maxIterations: Int, cut: Int): RDD[ConnectedComponent] = {
    val graph: Graph[String, String] =
      Graph(vertexes, edges).partitionBy(PartitionStrategy.RandomVertexCut) //TODO remember to remove partitionby
    val componentByVertex = graph.connectedComponents(maxIterations).vertices

    vertexes
      .leftOuterJoin(componentByVertex)
      .map { case (vertexId, (openaireId, componentId)) =>
        // a vertex without a component assignment is grouped under its own id
        (componentId.getOrElse(vertexId), openaireId)
      }
      .groupByKey()
      .map(group => asConnectedComponent(group, cut))
  }

  /** Wraps the identifiers of one group in a ConnectedComponent.
    *
    * @param group (group id, identifiers of the group); the group id itself is not used
    * @param cut   forwarded to the ConnectedComponent constructor
    */
  def asConnectedComponent(group: (VertexId, Iterable[String]), cut: Int): ConnectedComponent = {
    val (_, members) = group
    new ConnectedComponent(JavaConversions.setAsJavaSet[String](members.toSet[String]), cut)
  }

}
|
|
|
@ -3,21 +3,21 @@ package eu.dnetlib.dhp.oa.dedup.model;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
import java.util.*;
|
import java.time.LocalDate;
|
||||||
import java.util.stream.Collectors;
|
import java.util.Date;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.dedup.DatePicker;
|
import eu.dnetlib.dhp.oa.dedup.DatePicker;
|
||||||
import eu.dnetlib.dhp.oa.dedup.IdentifierComparator;
|
import eu.dnetlib.dhp.oa.dedup.IdentifierComparator;
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidComparator;
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||||
|
|
||||||
public class Identifier<T extends OafEntity> implements Serializable, Comparable<Identifier<T>> {
|
public class Identifier<T extends OafEntity> implements Serializable, Comparable<Identifier<T>> {
|
||||||
|
@ -50,7 +50,7 @@ public class Identifier<T extends OafEntity> implements Serializable, Comparable
|
||||||
if (Objects.nonNull(date)) {
|
if (Objects.nonNull(date)) {
|
||||||
return date;
|
return date;
|
||||||
} else {
|
} else {
|
||||||
String sDate = BASE_DATE;
|
String sDate = LocalDate.now().plusDays(1).toString();
|
||||||
if (ModelSupport.isSubClass(getEntity(), Result.class)) {
|
if (ModelSupport.isSubClass(getEntity(), Result.class)) {
|
||||||
Result result = (Result) getEntity();
|
Result result = (Result) getEntity();
|
||||||
if (isWellformed(result.getDateofacceptance())) {
|
if (isWellformed(result.getDateofacceptance())) {
|
||||||
|
|
|
@ -28,5 +28,17 @@
|
||||||
"paramLongName": "workingPath",
|
"paramLongName": "workingPath",
|
||||||
"paramDescription": "path for the working directory",
|
"paramDescription": "path for the working directory",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName":"h",
|
||||||
|
"paramLongName":"hiveMetastoreUris",
|
||||||
|
"paramDescription": "the hive metastore uris",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "p",
|
||||||
|
"paramLongName": "pivotHistoryDatabase",
|
||||||
|
"paramDescription": "Pivot history database",
|
||||||
|
"paramRequired": false
|
||||||
}
|
}
|
||||||
]
|
]
|
|
@ -15,4 +15,8 @@
|
||||||
<name>oozie.action.sharelib.for.spark</name>
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
<value>spark2</value>
|
<value>spark2</value>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>hiveMetastoreUris</name>
|
||||||
|
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||||
|
</property>
|
||||||
</configuration>
|
</configuration>
|
|
@ -188,6 +188,8 @@
|
||||||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||||
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
|
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
|
||||||
<arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
|
<arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
|
||||||
|
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||||
|
<arg>--pivotHistoryDatabase</arg><arg>${pivotHistoryDatabase}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="CreateDedupRecord"/>
|
<ok to="CreateDedupRecord"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
|
@ -0,0 +1,335 @@
|
||||||
|
/** Copyright (c) 2017 Kwartile, Inc., http://www.kwartile.com
|
||||||
|
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
* of this software and associated documentation files (the "Software"), to deal
|
||||||
|
* in the Software without restriction, including without limitation the rights
|
||||||
|
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
* copies of the Software, and to permit persons to whom the Software is
|
||||||
|
* furnished to do so, subject to the following conditions:
|
||||||
|
*
|
||||||
|
* The above copyright notice and this permission notice shall be included in all
|
||||||
|
* copies or substantial portions of the Software.
|
||||||
|
*
|
||||||
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
* SOFTWARE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/** Map-reduce implementation of Connected Component
|
||||||
|
* Given lists of subgraphs, returns all the nodes that are connected.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package com.kwartile.lib.cc
|
||||||
|
|
||||||
|
import org.apache.spark.rdd.RDD
|
||||||
|
import org.apache.spark.sql.{Dataset, Row, SparkSession}
|
||||||
|
import org.apache.spark.storage.StorageLevel
|
||||||
|
|
||||||
|
import scala.annotation.tailrec
|
||||||
|
import scala.collection.mutable
|
||||||
|
|
||||||
|
object ConnectedComponent extends Serializable {
|
||||||
|
|
||||||
|
/** Applies the Small Star operation on an RDD of nodePairs.
  *
  * Every pair is first oriented as (larger, smaller) and the pairs are grouped by the larger node.
  * Each node and its collected neighbors are then linked to the minimum node of that group.
  *
  * @param nodePairs on which to apply the Small Star operation
  * @return new nodePairs after the operation and the connectivity change count
  */
private def smallStar(nodePairs: RDD[(Long, Long)]): (RDD[(Long, Long)], Long) = {

  /** generate RDD of (self, neighbor) where self >= neighbor
    * E.g.: nodePairs (1, 4), (6, 1), (3, 2), (6, 5)
    * will result into (4, 1), (6, 1), (3, 2), (6, 5)
    */
  val neighbors = nodePairs.map { case (self, neighbor) =>
    if (self > neighbor) (self, neighbor) else (neighbor, self)
  }

  /** reduce on self to get the set of all its neighbors.
    * E.g: (4, 1), (6, 1), (3, 2), (6, 5)
    * will result into (4, Set(1)), (6, Set(1, 5)), (3, Set(2))
    * Note:
    * (1) you may need to tweak number of partitions.
    * (2) also, watch out for data skew. In that case, consider using rangePartitioner
    */
  val empty = mutable.HashSet[Long]()
  val allNeighbors = neighbors.aggregateByKey(empty)(
    (lb, v) => lb += v,
    // merge in place, as largeStar does, instead of allocating a new set for every merge
    (lb1, lb2) => lb1 ++= lb2
  )

  /** Apply Small Star operation on (self, Set(neighbor)) to get newNodePairs and count the change in connectivity
    */
  val newNodePairsWithChangeCount = allNeighbors
    .map { case (self, neighborSet) =>
      val neighbors = neighborSet.toList
      val minNode = argMin(self :: neighbors)
      val newNodePairs = (self :: neighbors)
        .map(neighbor => (neighbor, minNode))
        .filter { case (neighbor, min) =>
          (neighbor <= self && neighbor != min) || (self == neighbor)
        }
      val uniqueNewNodePairs = newNodePairs.toSet.toList

      /** We count the change by taking a diff of the new node pairs with the old node pairs
        */
      val connectivityChangeCount = (uniqueNewNodePairs diff neighbors.map((self, _))).length
      (uniqueNewNodePairs, connectivityChangeCount)
    }
    .persist(StorageLevel.MEMORY_AND_DISK_SER)

  /** Sum all the changeCounts. The counts are widened to Long and folded as a stream, so no
    * partition has to be buffered in memory and the total cannot overflow an Int.
    */
  val totalConnectivityCountChange = newNodePairsWithChangeCount
    .map(_._2.toLong)
    .fold(0L)(_ + _)

  val newNodePairs = newNodePairsWithChangeCount.flatMap(_._1)
  // NOTE(review): newNodePairs is still lazy at this point, so the cache appears to be released
  // before any downstream stage reads it; confirm the cache is only meant to serve the count above
  newNodePairsWithChangeCount.unpersist(false)
  (newNodePairs, totalConnectivityCountChange)
}
|
||||||
|
|
||||||
|
/** Applies the Large Star operation on an RDD of nodePairs.
  *
  * Every pair is emitted in both directions and grouped by node. Each node then links its strictly
  * larger neighbors, and the minimum node itself, to the minimum node of that group.
  *
  * @param nodePairs on which to apply the Large Star operation
  * @return new nodePairs after the operation and the connectivity change count
  */
private def largeStar(nodePairs: RDD[(Long, Long)]): (RDD[(Long, Long)], Long) = {

  /** generate RDD of (self, neighbor) in both directions
    * E.g.: nodePairs (1, 4), (6, 1), (3, 2), (6, 5)
    * will result into (1, 4), (4, 1), (6, 1), (1, 6), (3, 2), (2, 3), (6, 5), (5, 6)
    */
  val neighbors = nodePairs.flatMap { case (self, neighbor) =>
    if (self == neighbor) List((self, neighbor))
    else List((self, neighbor), (neighbor, self))
  }

  /** reduce on self to get the set of all its neighbors.
    * E.g: (4, 1), (1, 4), (6, 1), (1, 6), (3, 2), (2, 3), (6, 5), (5, 6)
    * will result into (4, Set(1)), (1, Set(4, 6)), (6, Set(1, 5)), (3, Set(2)), (2, Set(3)), (5, Set(6))
    * Note:
    * (1) you may need to tweak number of partitions.
    * (2) also, watch out for data skew. In that case, consider using rangePartitioner
    */
  val localAdd = (s: mutable.HashSet[Long], v: Long) => s += v
  val partitionAdd = (s1: mutable.HashSet[Long], s2: mutable.HashSet[Long]) => s1 ++= s2
  val allNeighbors =
    neighbors.aggregateByKey(mutable.HashSet.empty[Long] /*, rangePartitioner*/ )(localAdd, partitionAdd)

  /** Apply Large Star operation on (self, Set(neighbor)) to get newNodePairs and count the change in connectivity
    */
  val newNodePairsWithChangeCount = allNeighbors
    .map { case (self, neighborSet) =>
      val neighbors = neighborSet.toList
      val minNode = argMin(self :: neighbors)
      val newNodePairs = (self :: neighbors)
        .map(neighbor => (neighbor, minNode))
        .filter { case (neighbor, min) => neighbor > self || neighbor == min }

      val uniqueNewNodePairs = newNodePairs.toSet.toList
      // the change is the number of new pairs that were not among the old (self, neighbor) pairs
      val connectivityChangeCount = (uniqueNewNodePairs diff neighbors.map((self, _))).length
      (uniqueNewNodePairs, connectivityChangeCount)
    }
    .persist(StorageLevel.MEMORY_AND_DISK_SER)

  /** Sum all the changeCounts. The counts are widened to Long and folded as a stream, so no
    * partition has to be buffered in memory and the total cannot overflow an Int.
    */
  val totalConnectivityCountChange = newNodePairsWithChangeCount
    .map(_._2.toLong)
    .fold(0L)(_ + _)

  val newNodePairs = newNodePairsWithChangeCount.flatMap(_._1)
  // NOTE(review): newNodePairs is still lazy at this point, so the cache appears to be released
  // before any downstream stage reads it; confirm the cache is only meant to serve the count above
  newNodePairsWithChangeCount.unpersist(false)
  (newNodePairs, totalConnectivityCountChange)
}
|
||||||
|
|
||||||
|
/** Returns the smallest node id of a non-empty list (an empty list raises UnsupportedOperationException). */
private def argMin(nodes: List[Long]): Long =
  nodes.min
|
||||||
|
|
||||||
|
/** Build nodePairs given a list of nodes. A list of nodes represents a subgraph.
  *
  * Every pair is oriented as (larger, smaller); a single-node list yields the self pair
  * (node, node) and an empty list yields no pairs.
  *
  * @param nodes that are part of a subgraph
  * @return nodePairs for a subgraph
  */
private def buildPairs(nodes: List[Long]): List[(Long, Long)] =
  nodes match {
    case Nil          => Nil
    case head :: tail => buildPairs(head, tail, Nil)
  }

/** Accumulates the pairs between `node` and each of the remaining `neighbors`, then recurses on the
  * remaining nodes until at most one is left.
  *
  * @param node         node currently being paired
  * @param neighbors    nodes not yet processed
  * @param partialPairs pairs accumulated so far
  * @return all the accumulated nodePairs
  */
@tailrec
private def buildPairs(node: Long, neighbors: List[Long], partialPairs: List[(Long, Long)]): List[(Long, Long)] = {
  // orient a pair so that the larger node comes first
  def ordered(other: Long): (Long, Long) =
    if (node > other) (node, other) else (other, node)

  neighbors match {
    case Nil         => (node, node) :: partialPairs
    case only :: Nil => ordered(only) :: partialPairs
    case next :: rest =>
      buildPairs(next, rest, neighbors.map(ordered) ::: partialPairs)
  }
}
|
||||||
|
|
||||||
|
/** Alternates Large Star and Small Star until convergence or until the iteration budget is spent.
  * The run is considered converged when both change counts are 0 or when both are equal to the
  * counts of the previous iteration.
  *
  * @param nodePairs for a graph
  * @param largeStarConnectivityChangeCount change count that resulted from the previous iteration
  * @param smallStarConnectivityChangeCount change count that resulted from the previous iteration
  * @param didConverge flag to indicate the algorithm converged
  * @param currIterationCount counter to capture number of iterations
  * @param maxIterationCount maximum number iterations to try before giving up
  * @return the nodePairs, the convergence flag and the iteration counter
  */
@tailrec
private def alternatingAlgo(
  nodePairs: RDD[(Long, Long)],
  largeStarConnectivityChangeCount: Long,
  smallStarConnectivityChangeCount: Long,
  didConverge: Boolean,
  currIterationCount: Int,
  maxIterationCount: Int
): (RDD[(Long, Long)], Boolean, Long) = {

  if (didConverge) {
    (nodePairs, true, currIterationCount.toLong)
  } else if (currIterationCount >= maxIterationCount) {
    (nodePairs, false, currIterationCount.toLong)
  } else {
    val (afterLargeStar, largeStarChanges) = largeStar(nodePairs)
    val (afterSmallStar, smallStarChanges) = smallStar(afterLargeStar)

    val sameAsPreviousRound =
      largeStarChanges == largeStarConnectivityChangeCount &&
        smallStarChanges == smallStarConnectivityChangeCount
    val nothingChanged = smallStarChanges == 0 && largeStarChanges == 0

    // the convergence flag is consumed by the next call, which then returns immediately
    alternatingAlgo(
      afterSmallStar,
      largeStarChanges,
      smallStarChanges,
      sameAsPreviousRound || nothingChanged,
      currIterationCount + 1,
      maxIterationCount
    )
  }
}
|
||||||
|
|
||||||
|
/** Driver function: expands every clique into nodePairs and runs the alternating algorithm.
  *
  * @param cliques list of nodes representing subgraphs (or cliques)
  * @param maxIterationCount maximum number iterations to try before giving up
  * @return Connected Components as nodePairs (null when the algorithm did not converge), the
  *         convergence flag and the iteration counter
  */
def run(cliques: RDD[List[Long]], maxIterationCount: Int): (RDD[(Long, Long)], Boolean, Long) = {

  val nodePairs = cliques.flatMap(clique => buildPairs(clique))

  alternatingAlgo(nodePairs, 9999999L, 9999999L, false, 0, maxIterationCount) match {
    case converged @ (_, true, _) => converged
    case (_, didConverge, iterCount) =>
      // no result is exposed when the iteration budget ran out
      (null.asInstanceOf[RDD[(Long, Long)]], didConverge, iterCount)
  }
}
|
||||||
|
|
||||||
|
/** Runs the alternating algorithm on node pairs that have already been built.
  *
  * @param nodePairs         pairs of node ids handed to alternatingAlgo as-is
  * @param maxIterationCount maximum number of iterations forwarded to alternatingAlgo
  * @return a triple (components, converged, iteration count); the components RDD is null
  *         when the algorithm did not converge
  */
def runOnPairs(nodePairs: RDD[(Long, Long)], maxIterationCount: Int): (RDD[(Long, Long)], Boolean, Long) = {
  val (components, converged, iterations) =
    alternatingAlgo(nodePairs, 9999999L, 9999999L, false, 0, maxIterationCount)

  // Only expose the computed components when the algorithm reports convergence.
  val result = if (converged) components else null.asInstanceOf[RDD[(Long, Long)]]
  (result, converged, iterations)
}
|
||||||
|
|
||||||
|
/** Dataset flavour of runOnPairs.
  *
  * Reads the first two columns of every row as Long values, runs the alternating algorithm on the
  * resulting pairs and returns the outcome as a DataFrame with columns "vertexId" and "groupId".
  *
  * @param nodePairs         rows whose columns 0 and 1 are read with getLong
  * @param maxIterationCount maximum number of iterations forwarded to alternatingAlgo
  * @param spark             session providing the implicit encoders and conversions
  * @return the components as a DataFrame, or null when the algorithm did not converge
  */
def runOnPairs(nodePairs: Dataset[Row], maxIterationCount: Int)(implicit spark: SparkSession): Dataset[Row] = {
  import spark.implicits._

  val pairs = nodePairs.map(row => (row.getLong(0), row.getLong(1))).rdd

  alternatingAlgo(pairs, 9999999L, 9999999L, false, 0, maxIterationCount) match {
    case (components, true, _) => components.toDF("vertexId", "groupId")
    case _                     => null.asInstanceOf[Dataset[Row]]
  }
}
|
||||||
|
|
||||||
|
}
|
|
@ -41,9 +41,13 @@ import com.google.common.collect.Sets;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||||
|
import eu.dnetlib.dhp.schema.sx.OafUtils;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
@ExtendWith(MockitoExtension.class)
|
@ExtendWith(MockitoExtension.class)
|
||||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||||
|
@ -97,6 +101,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
conf.set("spark.sql.shuffle.partitions", "200");
|
conf.set("spark.sql.shuffle.partitions", "200");
|
||||||
|
conf.set("spark.sql.warehouse.dir", testOutputBasePath + "/spark-warehouse");
|
||||||
spark = SparkSession
|
spark = SparkSession
|
||||||
.builder()
|
.builder()
|
||||||
.appName(SparkDedupTest.class.getSimpleName())
|
.appName(SparkDedupTest.class.getSimpleName())
|
||||||
|
@ -186,11 +191,11 @@ public class SparkDedupTest implements Serializable {
|
||||||
System.out.println("ds_simrel = " + ds_simrel);
|
System.out.println("ds_simrel = " + ds_simrel);
|
||||||
System.out.println("orp_simrel = " + orp_simrel);
|
System.out.println("orp_simrel = " + orp_simrel);
|
||||||
|
|
||||||
assertEquals(1538, orgs_simrel);
|
assertEquals(751, orgs_simrel);
|
||||||
assertEquals(3523, pubs_simrel);
|
assertEquals(546, pubs_simrel);
|
||||||
assertEquals(168, sw_simrel);
|
assertEquals(113, sw_simrel);
|
||||||
assertEquals(221, ds_simrel);
|
assertEquals(148, ds_simrel);
|
||||||
assertEquals(3392, orp_simrel);
|
assertEquals(280, orp_simrel);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -235,10 +240,10 @@ public class SparkDedupTest implements Serializable {
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
|
// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
|
||||||
assertEquals(1538, orgs_simrel);
|
assertEquals(751, orgs_simrel);
|
||||||
assertEquals(3523, pubs_simrel);
|
assertEquals(546, pubs_simrel);
|
||||||
assertEquals(221, ds_simrel);
|
assertEquals(148, ds_simrel);
|
||||||
assertEquals(3392, orp_simrel);
|
assertEquals(280, orp_simrel);
|
||||||
// System.out.println("orgs_simrel = " + orgs_simrel);
|
// System.out.println("orgs_simrel = " + orgs_simrel);
|
||||||
// System.out.println("pubs_simrel = " + pubs_simrel);
|
// System.out.println("pubs_simrel = " + pubs_simrel);
|
||||||
// System.out.println("ds_simrel = " + ds_simrel);
|
// System.out.println("ds_simrel = " + ds_simrel);
|
||||||
|
@ -268,7 +273,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
&& rel.getTarget().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[1]))
|
&& rel.getTarget().equalsIgnoreCase(whiteList.get(1).split(WHITELIST_SEPARATOR)[1]))
|
||||||
.count() > 0);
|
.count() > 0);
|
||||||
|
|
||||||
assertEquals(170, sw_simrel.count());
|
assertEquals(115, sw_simrel.count());
|
||||||
// System.out.println("sw_simrel = " + sw_simrel.count());
|
// System.out.println("sw_simrel = " + sw_simrel.count());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -292,7 +297,9 @@ public class SparkDedupTest implements Serializable {
|
||||||
"-w",
|
"-w",
|
||||||
testOutputBasePath,
|
testOutputBasePath,
|
||||||
"-cc",
|
"-cc",
|
||||||
"3"
|
"3",
|
||||||
|
"-h",
|
||||||
|
""
|
||||||
});
|
});
|
||||||
|
|
||||||
new SparkCreateMergeRels(parser, spark).run(isLookUpService);
|
new SparkCreateMergeRels(parser, spark).run(isLookUpService);
|
||||||
|
@ -365,6 +372,113 @@ public class SparkDedupTest implements Serializable {
|
||||||
.deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel"));
|
.deleteDirectory(new File(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Order(3)
|
||||||
|
void createMergeRelsWithPivotHistoryTest() throws Exception {
|
||||||
|
|
||||||
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
classPathResourceAsString("/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json"));
|
||||||
|
|
||||||
|
spark.sql("CREATE DATABASE IF NOT EXISTS pivot_history_test");
|
||||||
|
ModelSupport.oafTypes.keySet().forEach(entityType -> {
|
||||||
|
try {
|
||||||
|
spark
|
||||||
|
.read()
|
||||||
|
.json(
|
||||||
|
Paths
|
||||||
|
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/pivot_history").toURI())
|
||||||
|
.toFile()
|
||||||
|
.getAbsolutePath())
|
||||||
|
.write()
|
||||||
|
.mode("overwrite")
|
||||||
|
.saveAsTable("pivot_history_test." + entityType);
|
||||||
|
} catch (URISyntaxException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
parser
|
||||||
|
.parseArgument(
|
||||||
|
new String[] {
|
||||||
|
"-i",
|
||||||
|
testGraphBasePath,
|
||||||
|
"-asi",
|
||||||
|
testActionSetId,
|
||||||
|
"-la",
|
||||||
|
"lookupurl",
|
||||||
|
"-w",
|
||||||
|
testOutputBasePath,
|
||||||
|
"-h",
|
||||||
|
"",
|
||||||
|
"-pivotHistoryDatabase",
|
||||||
|
"pivot_history_test"
|
||||||
|
|
||||||
|
});
|
||||||
|
|
||||||
|
new SparkCreateMergeRels(parser, spark).run(isLookUpService);
|
||||||
|
|
||||||
|
long orgs_mergerel = spark
|
||||||
|
.read()
|
||||||
|
.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
|
||||||
|
.count();
|
||||||
|
final Dataset<Relation> pubs = spark
|
||||||
|
.read()
|
||||||
|
.load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel")
|
||||||
|
.as(Encoders.bean(Relation.class));
|
||||||
|
long sw_mergerel = spark
|
||||||
|
.read()
|
||||||
|
.load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")
|
||||||
|
.count();
|
||||||
|
long ds_mergerel = spark
|
||||||
|
.read()
|
||||||
|
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_mergerel")
|
||||||
|
.count();
|
||||||
|
|
||||||
|
long orp_mergerel = spark
|
||||||
|
.read()
|
||||||
|
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
|
||||||
|
.count();
|
||||||
|
|
||||||
|
final List<Relation> merges = pubs
|
||||||
|
.filter("source == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'")
|
||||||
|
.collectAsList();
|
||||||
|
assertEquals(3, merges.size());
|
||||||
|
Set<String> dups = Sets
|
||||||
|
.newHashSet(
|
||||||
|
"50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73",
|
||||||
|
"50|doi_________::d5021b53204e4fdeab6ff5d5bc468032",
|
||||||
|
"50|arXiv_______::c93aeb433eb90ed7a86e29be00791b7c");
|
||||||
|
merges.forEach(r -> {
|
||||||
|
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
|
||||||
|
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
|
||||||
|
assertEquals(ModelConstants.MERGES, r.getRelClass());
|
||||||
|
assertTrue(dups.contains(r.getTarget()));
|
||||||
|
});
|
||||||
|
|
||||||
|
final List<Relation> mergedIn = pubs
|
||||||
|
.filter("target == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'")
|
||||||
|
.collectAsList();
|
||||||
|
assertEquals(3, mergedIn.size());
|
||||||
|
mergedIn.forEach(r -> {
|
||||||
|
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
|
||||||
|
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
|
||||||
|
assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
|
||||||
|
assertTrue(dups.contains(r.getSource()));
|
||||||
|
});
|
||||||
|
|
||||||
|
assertEquals(1268, orgs_mergerel);
|
||||||
|
assertEquals(1112, pubs.count());
|
||||||
|
assertEquals(292, sw_mergerel);
|
||||||
|
assertEquals(476, ds_mergerel);
|
||||||
|
assertEquals(742, orp_mergerel);
|
||||||
|
// System.out.println("orgs_mergerel = " + orgs_mergerel);
|
||||||
|
// System.out.println("pubs_mergerel = " + pubs_mergerel);
|
||||||
|
// System.out.println("sw_mergerel = " + sw_mergerel);
|
||||||
|
// System.out.println("ds_mergerel = " + ds_mergerel);
|
||||||
|
// System.out.println("orp_mergerel = " + orp_mergerel);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Order(4)
|
@Order(4)
|
||||||
void createMergeRelsTest() throws Exception {
|
void createMergeRelsTest() throws Exception {
|
||||||
|
@ -382,7 +496,9 @@ public class SparkDedupTest implements Serializable {
|
||||||
"-la",
|
"-la",
|
||||||
"lookupurl",
|
"lookupurl",
|
||||||
"-w",
|
"-w",
|
||||||
testOutputBasePath
|
testOutputBasePath,
|
||||||
|
"-h",
|
||||||
|
""
|
||||||
});
|
});
|
||||||
|
|
||||||
new SparkCreateMergeRels(parser, spark).run(isLookUpService);
|
new SparkCreateMergeRels(parser, spark).run(isLookUpService);
|
||||||
|
@ -437,10 +553,10 @@ public class SparkDedupTest implements Serializable {
|
||||||
});
|
});
|
||||||
|
|
||||||
assertEquals(1268, orgs_mergerel);
|
assertEquals(1268, orgs_mergerel);
|
||||||
assertEquals(1450, pubs.count());
|
assertEquals(1112, pubs.count());
|
||||||
assertEquals(286, sw_mergerel);
|
assertEquals(292, sw_mergerel);
|
||||||
assertEquals(472, ds_mergerel);
|
assertEquals(476, ds_mergerel);
|
||||||
assertEquals(738, orp_mergerel);
|
assertEquals(742, orp_mergerel);
|
||||||
// System.out.println("orgs_mergerel = " + orgs_mergerel);
|
// System.out.println("orgs_mergerel = " + orgs_mergerel);
|
||||||
// System.out.println("pubs_mergerel = " + pubs_mergerel);
|
// System.out.println("pubs_mergerel = " + pubs_mergerel);
|
||||||
// System.out.println("sw_mergerel = " + sw_mergerel);
|
// System.out.println("sw_mergerel = " + sw_mergerel);
|
||||||
|
@ -492,8 +608,8 @@ public class SparkDedupTest implements Serializable {
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(86, orgs_deduprecord);
|
assertEquals(86, orgs_deduprecord);
|
||||||
assertEquals(68, pubs.count());
|
assertEquals(91, pubs.count());
|
||||||
assertEquals(49, sw_deduprecord);
|
assertEquals(47, sw_deduprecord);
|
||||||
assertEquals(97, ds_deduprecord);
|
assertEquals(97, ds_deduprecord);
|
||||||
assertEquals(92, orp_deduprecord);
|
assertEquals(92, orp_deduprecord);
|
||||||
|
|
||||||
|
@ -629,11 +745,11 @@ public class SparkDedupTest implements Serializable {
|
||||||
.distinct()
|
.distinct()
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(902, publications);
|
assertEquals(925, publications);
|
||||||
assertEquals(839, organizations);
|
assertEquals(839, organizations);
|
||||||
assertEquals(100, projects);
|
assertEquals(100, projects);
|
||||||
assertEquals(100, datasource);
|
assertEquals(100, datasource);
|
||||||
assertEquals(198, softwares);
|
assertEquals(196, softwares);
|
||||||
assertEquals(389, dataset);
|
assertEquals(389, dataset);
|
||||||
assertEquals(520, otherresearchproduct);
|
assertEquals(520, otherresearchproduct);
|
||||||
|
|
||||||
|
|
|
@ -101,7 +101,8 @@
|
||||||
"type" : "String",
|
"type" : "String",
|
||||||
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
|
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
|
||||||
"length" : 250,
|
"length" : 250,
|
||||||
"size" : 5
|
"size" : 5,
|
||||||
|
"clean": "title"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name" : "authors",
|
"name" : "authors",
|
||||||
|
|
|
@ -101,7 +101,8 @@
|
||||||
"type" : "String",
|
"type" : "String",
|
||||||
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
|
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
|
||||||
"length" : 250,
|
"length" : 250,
|
||||||
"size" : 5
|
"size" : 5,
|
||||||
|
"clean": "title"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name" : "authors",
|
"name" : "authors",
|
||||||
|
|
|
@ -29,9 +29,8 @@
|
||||||
},
|
},
|
||||||
"pace": {
|
"pace": {
|
||||||
"clustering" : [
|
"clustering" : [
|
||||||
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
|
{ "name" : "numAuthorsTitleSuffixPrefixChain", "fields" : [ "num_authors", "title" ], "params" : { "mod" : "10" } },
|
||||||
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
|
{ "name" : "jsonlistclustering", "fields" : [ "pid" ], "params" : { "jpath_value": "$.value", "jpath_classid": "$.qualifier.classid"} }
|
||||||
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
|
|
||||||
],
|
],
|
||||||
"decisionTree": {
|
"decisionTree": {
|
||||||
"start": {
|
"start": {
|
||||||
|
@ -79,13 +78,37 @@
|
||||||
"ignoreUndefined": "false"
|
"ignoreUndefined": "false"
|
||||||
},
|
},
|
||||||
"layer3": {
|
"layer3": {
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"field": "authors",
|
||||||
|
"comparator": "authorsMatch",
|
||||||
|
"weight": 1.0,
|
||||||
|
"countIfUndefined": "false",
|
||||||
|
"params": {
|
||||||
|
"surname_th": 0.75,
|
||||||
|
"fullname_th": 0.75,
|
||||||
|
"threshold": 0.6,
|
||||||
|
"mode": "full"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"threshold": 0.6,
|
||||||
|
"aggregation": "MAX",
|
||||||
|
"positive": "layer4",
|
||||||
|
"negative": "NO_MATCH",
|
||||||
|
"undefined": "MATCH",
|
||||||
|
"ignoreUndefined": "true"
|
||||||
|
},
|
||||||
|
"layer4": {
|
||||||
"fields": [
|
"fields": [
|
||||||
{
|
{
|
||||||
"field": "title",
|
"field": "title",
|
||||||
"comparator": "levensteinTitle",
|
"comparator": "levensteinTitle",
|
||||||
"weight": 1.0,
|
"weight": 1.0,
|
||||||
"countIfUndefined": "true",
|
"countIfUndefined": "true",
|
||||||
"params": {}
|
"params": {
|
||||||
|
"threshold": "0.99"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"threshold": 0.99,
|
"threshold": 0.99,
|
||||||
|
@ -97,23 +120,25 @@
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"model": [
|
"model": [
|
||||||
{
|
|
||||||
"name": "doi",
|
|
||||||
"type": "String",
|
|
||||||
"path": "$.pid[?(@.qualifier.classid == 'doi')].value"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"name": "pid",
|
"name": "pid",
|
||||||
"type": "JSON",
|
"type": "JSON",
|
||||||
"path": "$.pid",
|
"path": "$.pid",
|
||||||
"overrideMatch": "true"
|
"overrideMatch": "true"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"name": "alternateid",
|
||||||
|
"type": "JSON",
|
||||||
|
"path": "$.instance[*].alternateIdentifier[*]",
|
||||||
|
"overrideMatch": "true"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"name": "title",
|
"name": "title",
|
||||||
"type": "String",
|
"type": "String",
|
||||||
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
|
"path": "$.title[?(@.qualifier.classid == 'main title')].value",
|
||||||
"length": 250,
|
"length": 250,
|
||||||
"size": 5
|
"size": 5,
|
||||||
|
"clean": "title"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "authors",
|
"name": "authors",
|
||||||
|
@ -122,9 +147,9 @@
|
||||||
"size": 200
|
"size": 200
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "resulttype",
|
"name": "num_authors",
|
||||||
"type": "String",
|
"type": "String",
|
||||||
"path": "$.resulttype.classid"
|
"path": "$.author.length()"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"blacklists": {
|
"blacklists": {
|
||||||
|
|
|
@ -75,7 +75,8 @@
|
||||||
"type" : "String",
|
"type" : "String",
|
||||||
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
|
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
|
||||||
"length" : 250,
|
"length" : 250,
|
||||||
"size" : 5
|
"size" : 5,
|
||||||
|
"clean": "title"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name" : "url",
|
"name" : "url",
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
{"id": "50|arXiv_______::c93aeb433eb90ed7a86e29be00791b7c", "firstUsage": "2022-01-01", "lastUsage": "2022-01-01", "dedupId": "50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c" }
|
|
@ -167,4 +167,11 @@ public class Utils implements Serializable {
|
||||||
});
|
});
|
||||||
return projectMap;
|
return projectMap;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns the identifiers of the communities provided by {@code getValidCommunities(baseURL)}.
 *
 * @param baseURL base URL forwarded unchanged to {@code getValidCommunities}
 * @return the result of {@code getId()} for every community, in the order of the underlying list
 * @throws IOException declared by this method; presumably propagated from {@code getValidCommunities} (not visible here)
 */
public static List<String> getCommunityIdList(String baseURL) throws IOException {
	return getValidCommunities(baseURL)
		.stream()
		.map(community -> community.getId())
		.collect(Collectors.toList());
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -45,7 +45,7 @@ public class SparkBulkTagJob {
|
||||||
.toString(
|
.toString(
|
||||||
SparkBulkTagJob.class
|
SparkBulkTagJob.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/bulktag/input_bulkTag_parameters.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/bulktag/input_bulkTag_parameters.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
@ -105,7 +105,6 @@ public class SparkBulkTagJob {
|
||||||
Map<String, List<Pair<String, SelectionConstraints>>> dsm = cc.getEoscDatasourceMap();
|
Map<String, List<Pair<String, SelectionConstraints>>> dsm = cc.getEoscDatasourceMap();
|
||||||
|
|
||||||
for (String ds : datasources.collectAsList()) {
|
for (String ds : datasources.collectAsList()) {
|
||||||
// final String dsId = ds.substring(3);
|
|
||||||
if (!dsm.containsKey(ds)) {
|
if (!dsm.containsKey(ds)) {
|
||||||
ArrayList<Pair<String, SelectionConstraints>> eoscList = new ArrayList<>();
|
ArrayList<Pair<String, SelectionConstraints>> eoscList = new ArrayList<>();
|
||||||
dsm.put(ds, eoscList);
|
dsm.put(ds, eoscList);
|
||||||
|
@ -116,13 +115,11 @@ public class SparkBulkTagJob {
|
||||||
|
|
||||||
private static boolean isOKDatasource(Datasource ds) {
|
private static boolean isOKDatasource(Datasource ds) {
|
||||||
final String compatibility = ds.getOpenairecompatibility().getClassid();
|
final String compatibility = ds.getOpenairecompatibility().getClassid();
|
||||||
boolean isOk = (compatibility.equalsIgnoreCase(OPENAIRE_3) ||
|
return (compatibility.equalsIgnoreCase(OPENAIRE_3) ||
|
||||||
compatibility.equalsIgnoreCase(OPENAIRE_4) ||
|
compatibility.equalsIgnoreCase(OPENAIRE_4) ||
|
||||||
compatibility.equalsIgnoreCase(OPENAIRE_CRIS) ||
|
compatibility.equalsIgnoreCase(OPENAIRE_CRIS) ||
|
||||||
compatibility.equalsIgnoreCase(OPENAIRE_DATA)) &&
|
compatibility.equalsIgnoreCase(OPENAIRE_DATA)) &&
|
||||||
ds.getCollectedfrom().stream().anyMatch(cf -> cf.getKey().equals(EOSC));
|
ds.getCollectedfrom().stream().anyMatch(cf -> cf.getKey().equals(EOSC));
|
||||||
|
|
||||||
return isOk;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <R extends Result> void execBulkTag(
|
private static <R extends Result> void execBulkTag(
|
||||||
|
@ -151,7 +148,13 @@ public class SparkBulkTagJob {
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(outputPath + e.name());
|
.json(outputPath + e.name());// writing the tagging in the working dir for entity
|
||||||
|
|
||||||
|
readPath(spark, outputPath + e.name(), resultClazz) // copy the tagging in the actual result output path
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(inputPath + e.name());
|
||||||
});
|
});
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -45,7 +45,7 @@ public class PrepareDatasourceCountryAssociation {
|
||||||
.toString(
|
.toString(
|
||||||
PrepareDatasourceCountryAssociation.class
|
PrepareDatasourceCountryAssociation.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/countrypropagation/input_prepareassoc_parameters.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareassoc_parameters.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
|
||||||
|
@ -66,7 +66,7 @@ public class PrepareDatasourceCountryAssociation {
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
removeOutputDir(spark, outputPath);
|
// removeOutputDir(spark, outputPath);
|
||||||
prepareDatasourceCountryAssociation(
|
prepareDatasourceCountryAssociation(
|
||||||
spark,
|
spark,
|
||||||
Arrays.asList(parser.get("whitelist").split(";")),
|
Arrays.asList(parser.get("whitelist").split(";")),
|
||||||
|
@ -90,7 +90,8 @@ public class PrepareDatasourceCountryAssociation {
|
||||||
(FilterFunction<Datasource>) ds -> !ds.getDataInfo().getDeletedbyinference() &&
|
(FilterFunction<Datasource>) ds -> !ds.getDataInfo().getDeletedbyinference() &&
|
||||||
Optional.ofNullable(ds.getDatasourcetype()).isPresent() &&
|
Optional.ofNullable(ds.getDatasourcetype()).isPresent() &&
|
||||||
Optional.ofNullable(ds.getDatasourcetype().getClassid()).isPresent() &&
|
Optional.ofNullable(ds.getDatasourcetype().getClassid()).isPresent() &&
|
||||||
(allowedtypes.contains(ds.getDatasourcetype().getClassid()) ||
|
((Optional.ofNullable(ds.getJurisdiction()).isPresent() &&
|
||||||
|
allowedtypes.contains(ds.getJurisdiction().getClassid())) ||
|
||||||
whitelist.contains(ds.getId())));
|
whitelist.contains(ds.getId())));
|
||||||
|
|
||||||
// filtering of the relations taking the non deleted by inference and those with IsProvidedBy as relclass
|
// filtering of the relations taking the non deleted by inference and those with IsProvidedBy as relclass
|
||||||
|
|
|
@ -32,7 +32,7 @@ public class PrepareResultCountrySet {
|
||||||
.toString(
|
.toString(
|
||||||
PrepareResultCountrySet.class
|
PrepareResultCountrySet.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/countrypropagation/input_prepareresultcountry_parameters.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_prepareresultcountry_parameters.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
|
||||||
|
|
|
@ -35,7 +35,7 @@ public class SparkCountryPropagationJob {
|
||||||
.toString(
|
.toString(
|
||||||
SparkCountryPropagationJob.class
|
SparkCountryPropagationJob.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/countrypropagation/input_countrypropagation_parameters.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/countrypropagation/input_countrypropagation_parameters.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
|
||||||
|
@ -97,6 +97,12 @@ public class SparkCountryPropagationJob {
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.json(outputPath);
|
.json(outputPath);
|
||||||
|
|
||||||
|
readPath(spark, outputPath, resultClazz)
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(sourcePath);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
|
private static <R extends Result> MapFunction<Tuple2<R, ResultCountrySet>, R> getCountryMergeFn() {
|
||||||
|
|
|
@ -60,7 +60,7 @@ public class PrepareInfo implements Serializable {
|
||||||
.toString(
|
.toString(
|
||||||
SparkResultToOrganizationFromIstRepoJob.class
|
SparkResultToOrganizationFromIstRepoJob.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_preparation_parameter.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/input_preparation_parameter.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
|
||||||
|
|
|
@ -27,8 +27,8 @@ import eu.dnetlib.dhp.resulttoorganizationfrominstrepo.SparkResultToOrganization
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
|
||||||
public class SparkResultToOrganizationFromSemRel implements Serializable {
|
public class SparkEntityToOrganizationFromSemRel implements Serializable {
|
||||||
private static final Logger log = LoggerFactory.getLogger(SparkResultToOrganizationFromSemRel.class);
|
private static final Logger log = LoggerFactory.getLogger(SparkEntityToOrganizationFromSemRel.class);
|
||||||
private static final int MAX_ITERATION = 5;
|
private static final int MAX_ITERATION = 5;
|
||||||
public static final String NEW_RESULT_RELATION_PATH = "/newResultRelation";
|
public static final String NEW_RESULT_RELATION_PATH = "/newResultRelation";
|
||||||
public static final String NEW_PROJECT_RELATION_PATH = "/newProjectRelation";
|
public static final String NEW_PROJECT_RELATION_PATH = "/newProjectRelation";
|
||||||
|
@ -39,7 +39,7 @@ public class SparkResultToOrganizationFromSemRel implements Serializable {
|
||||||
.toString(
|
.toString(
|
||||||
SparkResultToOrganizationFromIstRepoJob.class
|
SparkResultToOrganizationFromIstRepoJob.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/entitytoorganizationfromsemrel/input_propagation_parameter.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/input_propagation_parameter.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
|
|
@ -3,8 +3,8 @@ package eu.dnetlib.dhp.entitytoorganizationfromsemrel;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.PropagationConstant.*;
|
import static eu.dnetlib.dhp.PropagationConstant.*;
|
||||||
import static eu.dnetlib.dhp.PropagationConstant.readPath;
|
import static eu.dnetlib.dhp.PropagationConstant.readPath;
|
||||||
import static eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkResultToOrganizationFromSemRel.NEW_PROJECT_RELATION_PATH;
|
import static eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkEntityToOrganizationFromSemRel.NEW_PROJECT_RELATION_PATH;
|
||||||
import static eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkResultToOrganizationFromSemRel.NEW_RESULT_RELATION_PATH;
|
import static eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkEntityToOrganizationFromSemRel.NEW_RESULT_RELATION_PATH;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
@ -20,7 +20,6 @@ import org.jetbrains.annotations.NotNull;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.KeyValueSet;
|
import eu.dnetlib.dhp.KeyValueSet;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
|
|
@ -31,7 +31,7 @@ public class PrepareResultOrcidAssociationStep1 {
|
||||||
.toString(
|
.toString(
|
||||||
PrepareResultOrcidAssociationStep1.class
|
PrepareResultOrcidAssociationStep1.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConf);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConf);
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
|
@ -29,7 +29,7 @@ public class PrepareResultOrcidAssociationStep2 {
|
||||||
.toString(
|
.toString(
|
||||||
PrepareResultOrcidAssociationStep2.class
|
PrepareResultOrcidAssociationStep2.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/input_prepareorcidtoresult_parameters2.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
|
package eu.dnetlib.dhp.orcidtoresultfromsemrel;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.PropagationConstant.*;
|
import static eu.dnetlib.dhp.PropagationConstant.*;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
@ -36,7 +36,7 @@ public class SparkOrcidToResultFromSemRelJob {
|
||||||
.toString(
|
.toString(
|
||||||
SparkOrcidToResultFromSemRelJob.class
|
SparkOrcidToResultFromSemRelJob.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/input_orcidtoresult_parameters.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
@ -65,9 +65,8 @@ public class SparkOrcidToResultFromSemRelJob {
|
||||||
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
|
Class<? extends Result> resultClazz = (Class<? extends Result>) Class.forName(resultClassName);
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
conf.set("hive.metastore.uris", parser.get("hive_metastore_uris"));
|
|
||||||
|
|
||||||
runWithSparkHiveSession(
|
runWithSparkSession(
|
||||||
conf,
|
conf,
|
||||||
isSparkSessionManaged,
|
isSparkSessionManaged,
|
||||||
spark -> {
|
spark -> {
|
||||||
|
|
|
@ -28,7 +28,7 @@ public class PrepareProjectResultsAssociation {
|
||||||
.toString(
|
.toString(
|
||||||
PrepareProjectResultsAssociation.class
|
PrepareProjectResultsAssociation.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/projecttoresult/input_prepareprojecttoresult_parameters.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/projecttoresult/input_prepareprojecttoresult_parameters.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@ public class SparkResultToProjectThroughSemRelJob {
|
||||||
.toString(
|
.toString(
|
||||||
SparkResultToProjectThroughSemRelJob.class
|
SparkResultToProjectThroughSemRelJob.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/projecttoresult/input_projecttoresult_parameters.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/projecttoresult/input_projecttoresult_parameters.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
|
||||||
|
|
|
@ -34,7 +34,7 @@ public class PrepareResultCommunitySet {
|
||||||
.toString(
|
.toString(
|
||||||
PrepareResultCommunitySet.class
|
PrepareResultCommunitySet.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromorganization/input_preparecommunitytoresult_parameters.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
|
@ -36,7 +36,7 @@ public class SparkResultToCommunityFromOrganizationJob {
|
||||||
.toString(
|
.toString(
|
||||||
SparkResultToCommunityFromOrganizationJob.class
|
SparkResultToCommunityFromOrganizationJob.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/resulttocommunityfromorganization/input_communitytoresult_parameters.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromorganization/input_communitytoresult_parameters.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
|
||||||
|
@ -92,6 +92,12 @@ public class SparkResultToCommunityFromOrganizationJob {
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(outputPath + e.name());
|
.json(outputPath + e.name());
|
||||||
|
|
||||||
|
readPath(spark, outputPath + e.name(), resultClazz)
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(inputPath + e.name());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
|
@ -38,7 +38,7 @@ public class PrepareResultCommunitySet {
|
||||||
.toString(
|
.toString(
|
||||||
PrepareResultCommunitySet.class
|
PrepareResultCommunitySet.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/resulttocommunityfromproject/input_preparecommunitytoresult_parameters.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/input_preparecommunitytoresult_parameters.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
|
@ -44,7 +44,7 @@ public class SparkResultToCommunityFromProject implements Serializable {
|
||||||
.toString(
|
.toString(
|
||||||
SparkResultToCommunityFromProject.class
|
SparkResultToCommunityFromProject.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/resulttocommunityfromproject/input_communitytoresult_parameters.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/input_communitytoresult_parameters.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
|
||||||
|
@ -102,6 +102,12 @@ public class SparkResultToCommunityFromProject implements Serializable {
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(outputPath + e.name());
|
.json(outputPath + e.name());
|
||||||
|
|
||||||
|
readPath(spark, outputPath + e.name(), resultClazz)
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(inputPath + e.name());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
|
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.resulttocommunityfromsemrel;
|
||||||
import static eu.dnetlib.dhp.PropagationConstant.*;
|
import static eu.dnetlib.dhp.PropagationConstant.*;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkHiveSession;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
@ -15,6 +16,7 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.google.gson.Gson;
|
import com.google.gson.Gson;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.api.Utils;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
|
import eu.dnetlib.dhp.resulttocommunityfromorganization.ResultCommunityList;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
@ -26,11 +28,6 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
public class PrepareResultCommunitySetStep1 {
|
public class PrepareResultCommunitySetStep1 {
|
||||||
private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySetStep1.class);
|
private static final Logger log = LoggerFactory.getLogger(PrepareResultCommunitySetStep1.class);
|
||||||
|
|
||||||
private static final String COMMUNITY_LIST_XQUERY = "for $x in collection('/db/DRIVER/ContextDSResources/ContextDSResourceType')"
|
|
||||||
+ " where $x//CONFIGURATION/context[./@type='community' or ./@type='ri']"
|
|
||||||
+ " and $x//CONFIGURATION/context/param[./@name='status']/text() != 'hidden'"
|
|
||||||
+ " return $x//CONFIGURATION/context/@id/string()";
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* associates to each result the set of community contexts they are associated to; associates to each target of a
|
* associates to each result the set of community contexts they are associated to; associates to each target of a
|
||||||
* relation with allowed semantics the set of community context it could possibly inherit from the source of the
|
* relation with allowed semantics the set of community context it could possibly inherit from the source of the
|
||||||
|
@ -64,7 +61,7 @@ public class PrepareResultCommunitySetStep1 {
|
||||||
.toString(
|
.toString(
|
||||||
PrepareResultCommunitySetStep1.class
|
PrepareResultCommunitySetStep1.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/input_preparecommunitytoresult_parameters.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
|
||||||
|
@ -88,10 +85,10 @@ public class PrepareResultCommunitySetStep1 {
|
||||||
final List<String> allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
|
final List<String> allowedsemrel = Arrays.asList(parser.get("allowedsemrels").split(";"));
|
||||||
log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
|
log.info("allowedSemRel: {}", new Gson().toJson(allowedsemrel));
|
||||||
|
|
||||||
final String isLookupUrl = parser.get("isLookUpUrl");
|
final String baseURL = parser.get("baseURL");
|
||||||
log.info("isLookupUrl: {}", isLookupUrl);
|
log.info("baseURL: {}", baseURL);
|
||||||
|
|
||||||
final List<String> communityIdList = getCommunityList(isLookupUrl);
|
final List<String> communityIdList = getCommunityList(baseURL);
|
||||||
log.info("communityIdList: {}", new Gson().toJson(communityIdList));
|
log.info("communityIdList: {}", new Gson().toJson(communityIdList));
|
||||||
|
|
||||||
final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
|
final String resultType = resultClassName.substring(resultClassName.lastIndexOf(".") + 1).toLowerCase();
|
||||||
|
@ -159,9 +156,8 @@ public class PrepareResultCommunitySetStep1 {
|
||||||
.json(outputResultPath);
|
.json(outputResultPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static List<String> getCommunityList(final String isLookupUrl) throws ISLookUpException {
|
public static List<String> getCommunityList(final String baseURL) throws IOException {
|
||||||
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
|
return Utils.getCommunityIdList(baseURL);
|
||||||
return isLookUp.quickSearchProfile(COMMUNITY_LIST_XQUERY);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,7 +31,7 @@ public class PrepareResultCommunitySetStep2 {
|
||||||
.toString(
|
.toString(
|
||||||
PrepareResultCommunitySetStep2.class
|
PrepareResultCommunitySetStep2.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/input_preparecommunitytoresult2_parameters.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
|
||||||
|
|
|
@ -33,7 +33,7 @@ public class SparkResultToCommunityThroughSemRelJob {
|
||||||
.toString(
|
.toString(
|
||||||
SparkResultToCommunityThroughSemRelJob.class
|
SparkResultToCommunityThroughSemRelJob.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/resulttocommunityfromsemrel/input_communitytoresult_parameters.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/input_communitytoresult_parameters.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
|
||||||
|
@ -100,6 +100,12 @@ public class SparkResultToCommunityThroughSemRelJob {
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(outputPath);
|
.json(outputPath);
|
||||||
|
|
||||||
|
readPath(spark, outputPath, resultClazz)
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(inputPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static <R extends Result> MapFunction<Tuple2<R, ResultCommunityList>, R> contextUpdaterFn() {
|
private static <R extends Result> MapFunction<Tuple2<R, ResultCommunityList>, R> contextUpdaterFn() {
|
||||||
|
|
|
@ -0,0 +1,68 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.resulttoorganizationfrominstrepo;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.PropagationConstant.*;
|
||||||
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.sql.SaveMode;
|
||||||
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @author miriam.baglioni
|
||||||
|
* @Date 09/12/23
|
||||||
|
*/
|
||||||
|
public class AppendNewRelations implements Serializable {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(AppendNewRelations.class);
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
|
||||||
|
String jsonConfiguration = IOUtils
|
||||||
|
.toString(
|
||||||
|
AppendNewRelations.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/input_newrelation_parameters.json"));
|
||||||
|
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
|
||||||
|
parser.parseArgument(args);
|
||||||
|
|
||||||
|
Boolean isSparkSessionManaged = isSparkSessionManaged(parser);
|
||||||
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
|
String inputPath = parser.get("sourcePath");
|
||||||
|
log.info("inputPath: {}", inputPath);
|
||||||
|
|
||||||
|
final String outputPath = parser.get("outputPath");
|
||||||
|
log.info("outputPath: {}", outputPath);
|
||||||
|
|
||||||
|
SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
|
runWithSparkSession(
|
||||||
|
conf,
|
||||||
|
isSparkSessionManaged,
|
||||||
|
spark -> appendNewRelation(spark, inputPath, outputPath));
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void appendNewRelation(SparkSession spark, String inputPath, String outputPath) {
|
||||||
|
|
||||||
|
readPath(spark, inputPath + "publication/relation", Relation.class)
|
||||||
|
.union(readPath(spark, inputPath + "dataset/relation", Relation.class))
|
||||||
|
.union(readPath(spark, inputPath + "otherresearchproduct/relation", Relation.class))
|
||||||
|
.union(readPath(spark, inputPath + "software/relation", Relation.class))
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Append)
|
||||||
|
.option("compression", "gzip")
|
||||||
|
.json(outputPath);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -40,7 +40,7 @@ public class PrepareResultInstRepoAssociation {
|
||||||
.toString(
|
.toString(
|
||||||
PrepareResultInstRepoAssociation.class
|
PrepareResultInstRepoAssociation.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/input_prepareresultorg_parameters.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
|
||||||
|
@ -52,10 +52,13 @@ public class PrepareResultInstRepoAssociation {
|
||||||
String inputPath = parser.get("sourcePath");
|
String inputPath = parser.get("sourcePath");
|
||||||
log.info("inputPath: {}", inputPath);
|
log.info("inputPath: {}", inputPath);
|
||||||
|
|
||||||
final String datasourceOrganizationPath = parser.get("datasourceOrganizationPath");
|
final String workingPath = parser.get("workingPath");
|
||||||
|
log.info("workingPath : {}", workingPath);
|
||||||
|
|
||||||
|
final String datasourceOrganizationPath = workingPath + "/preparedInfo/datasourceOrganization";
|
||||||
log.info("datasourceOrganizationPath {}: ", datasourceOrganizationPath);
|
log.info("datasourceOrganizationPath {}: ", datasourceOrganizationPath);
|
||||||
|
|
||||||
final String alreadyLinkedPath = parser.get("alreadyLinkedPath");
|
final String alreadyLinkedPath = workingPath + "/preparedInfo/alreadyLinked";
|
||||||
log.info("alreadyLinkedPath {}: ", alreadyLinkedPath);
|
log.info("alreadyLinkedPath {}: ", alreadyLinkedPath);
|
||||||
|
|
||||||
List<String> blacklist = Optional
|
List<String> blacklist = Optional
|
||||||
|
|
|
@ -47,7 +47,7 @@ public class SparkResultToOrganizationFromIstRepoJob {
|
||||||
.toString(
|
.toString(
|
||||||
SparkResultToOrganizationFromIstRepoJob.class
|
SparkResultToOrganizationFromIstRepoJob.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json"));
|
"/eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/input_propagationresulaffiliationfrominstrepo_parameters.json"));
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
|
||||||
|
@ -119,7 +119,7 @@ public class SparkResultToOrganizationFromIstRepoJob {
|
||||||
"left_outer")
|
"left_outer")
|
||||||
.flatMap(createRelationFn(), Encoders.bean(Relation.class))
|
.flatMap(createRelationFn(), Encoders.bean(Relation.class))
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Append)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
.json(outputPath);
|
.json(outputPath);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,32 +0,0 @@
|
||||||
[
|
|
||||||
{
|
|
||||||
"paramName": "p",
|
|
||||||
"paramLongName": "hdfsPath",
|
|
||||||
"paramDescription": "the path where storing the sequential file",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "nn",
|
|
||||||
"paramLongName": "hdfsNameNode",
|
|
||||||
"paramDescription": "the name node on hdfs",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "pgurl",
|
|
||||||
"paramLongName": "postgresUrl",
|
|
||||||
"paramDescription": "postgres url, example: jdbc:postgresql://localhost:5432/testdb",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "pguser",
|
|
||||||
"paramLongName": "postgresUser",
|
|
||||||
"paramDescription": "postgres user",
|
|
||||||
"paramRequired": false
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "pgpasswd",
|
|
||||||
"paramLongName": "postgresPassword",
|
|
||||||
"paramDescription": "postgres password",
|
|
||||||
"paramRequired": false
|
|
||||||
}
|
|
||||||
]
|
|
|
@ -1,21 +0,0 @@
|
||||||
[
|
|
||||||
{
|
|
||||||
"paramName":"s",
|
|
||||||
"paramLongName":"sourcePath",
|
|
||||||
"paramDescription": "the path of the sequencial file to read",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "wp",
|
|
||||||
"paramLongName": "workingPath",
|
|
||||||
"paramDescription": "the path used to store temporary output files",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "ssm",
|
|
||||||
"paramLongName": "isSparkSessionManaged",
|
|
||||||
"paramDescription": "true if the spark session is managed, false otherwise",
|
|
||||||
"paramRequired": false
|
|
||||||
}
|
|
||||||
|
|
||||||
]
|
|
|
@ -1,41 +0,0 @@
|
||||||
[
|
|
||||||
|
|
||||||
{
|
|
||||||
"paramName":"s",
|
|
||||||
"paramLongName":"sourcePath",
|
|
||||||
"paramDescription": "the path of the sequencial file to read",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "dmp",
|
|
||||||
"paramLongName":"datasourceMapPath",
|
|
||||||
"paramDescription": "the path where the association datasource master has been stored",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName":"tn",
|
|
||||||
"paramLongName":"resultTableName",
|
|
||||||
"paramDescription": "the name of the result table we are currently working on",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "wp",
|
|
||||||
"paramLongName": "workingPath",
|
|
||||||
"paramDescription": "the path used to store temporary output files",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"paramName": "ssm",
|
|
||||||
"paramLongName": "isSparkSessionManaged",
|
|
||||||
"paramDescription": "true if the spark session is managed, false otherwise",
|
|
||||||
"paramRequired": false
|
|
||||||
},
|
|
||||||
{
|
|
||||||
|
|
||||||
"paramName": "rt",
|
|
||||||
"paramLongName": "resultType",
|
|
||||||
"paramDescription": "the result type",
|
|
||||||
"paramRequired": true
|
|
||||||
}
|
|
||||||
|
|
||||||
]
|
|
|
@ -1,197 +0,0 @@
|
||||||
<workflow-app name="affiliation_from_semrel_propagation" xmlns="uri:oozie:workflow:0.5">
|
|
||||||
<parameters>
|
|
||||||
<property>
|
|
||||||
<name>sourcePath</name>
|
|
||||||
<description>the source path</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>outputPath</name>
|
|
||||||
<description>sets the outputPath</description>
|
|
||||||
</property>
|
|
||||||
</parameters>
|
|
||||||
|
|
||||||
<global>
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<configuration>
|
|
||||||
<property>
|
|
||||||
<name>oozie.action.sharelib.for.spark</name>
|
|
||||||
<value>${oozieActionShareLibForSpark2}</value>
|
|
||||||
</property>
|
|
||||||
</configuration>
|
|
||||||
</global>
|
|
||||||
|
|
||||||
<start to="resume_from"/>
|
|
||||||
|
|
||||||
<kill name="Kill">
|
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
|
||||||
</kill>
|
|
||||||
|
|
||||||
<decision name="resume_from">
|
|
||||||
<switch>
|
|
||||||
<case to="prepare_info">${wf:conf('resumeFrom') eq 'PrepareInfo'}</case>
|
|
||||||
<default to="reset_outputpath"/> <!-- first action to be done when downloadDump is to be performed -->
|
|
||||||
</switch>
|
|
||||||
</decision>
|
|
||||||
|
|
||||||
<action name="reset_outputpath">
|
|
||||||
<fs>
|
|
||||||
<delete path="${outputPath}"/>
|
|
||||||
<mkdir path="${outputPath}"/>
|
|
||||||
</fs>
|
|
||||||
<ok to="copy_entities"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<fork name="copy_entities">
|
|
||||||
<path start="copy_relation"/>
|
|
||||||
<path start="copy_publication"/>
|
|
||||||
<path start="copy_dataset"/>
|
|
||||||
<path start="copy_orp"/>
|
|
||||||
<path start="copy_software"/>
|
|
||||||
<path start="copy_organization"/>
|
|
||||||
<path start="copy_projects"/>
|
|
||||||
<path start="copy_datasources"/>
|
|
||||||
</fork>
|
|
||||||
|
|
||||||
<action name="copy_relation">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>${nameNode}/${sourcePath}/relation</arg>
|
|
||||||
<arg>${nameNode}/${outputPath}/relation</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_publication">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>${nameNode}/${sourcePath}/publication</arg>
|
|
||||||
<arg>${nameNode}/${outputPath}/publication</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_dataset">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>${nameNode}/${sourcePath}/dataset</arg>
|
|
||||||
<arg>${nameNode}/${outputPath}/dataset</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_orp">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>${nameNode}/${sourcePath}/otherresearchproduct</arg>
|
|
||||||
<arg>${nameNode}/${outputPath}/otherresearchproduct</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_software">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>${nameNode}/${sourcePath}/software</arg>
|
|
||||||
<arg>${nameNode}/${outputPath}/software</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_organization">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>${nameNode}/${sourcePath}/organization</arg>
|
|
||||||
<arg>${nameNode}/${outputPath}/organization</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_projects">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>${nameNode}/${sourcePath}/project</arg>
|
|
||||||
<arg>${nameNode}/${outputPath}/project</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_datasources">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
|
||||||
<arg>${nameNode}/${outputPath}/datasource</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<join name="wait" to="prepare_info"/>
|
|
||||||
|
|
||||||
|
|
||||||
<action name="prepare_info">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>PrepareResultOrganizationAssociation</name>
|
|
||||||
<class>eu.dnetlib.dhp.entitytoorganizationfromsemrel.PrepareInfo</class>
|
|
||||||
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--graphPath</arg><arg>${sourcePath}</arg>
|
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--leavesPath</arg><arg>${workingDir}/preparedInfo/leavesPath</arg>
|
|
||||||
<arg>--childParentPath</arg><arg>${workingDir}/preparedInfo/childParentPath</arg>
|
|
||||||
<arg>--resultOrgPath</arg><arg>${workingDir}/preparedInfo/resultOrgPath</arg>
|
|
||||||
<arg>--projectOrganizationPath</arg><arg>${workingDir}/preparedInfo/projectOrganizationPath</arg>
|
|
||||||
<arg>--relationPath</arg><arg>${workingDir}/preparedInfo/relation</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="apply_resulttoorganization_propagation"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="apply_resulttoorganization_propagation">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>resultToOrganizationFromSemRel</name>
|
|
||||||
<class>eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkResultToOrganizationFromSemRel</class>
|
|
||||||
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--relationPath</arg><arg>${workingDir}/preparedInfo/relation</arg>
|
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/relation</arg>
|
|
||||||
<arg>--leavesPath</arg><arg>${workingDir}/preparedInfo/leavesPath</arg>
|
|
||||||
<arg>--childParentPath</arg><arg>${workingDir}/preparedInfo/childParentPath</arg>
|
|
||||||
<arg>--resultOrgPath</arg><arg>${workingDir}/preparedInfo/resultOrgPath</arg>
|
|
||||||
<arg>--projectOrganizationPath</arg><arg>${workingDir}/preparedInfo/projectOrganizationPath</arg>
|
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
|
||||||
<arg>--workingDir</arg><arg>${workingDir}/working</arg>
|
|
||||||
<arg>--iterations</arg><arg>${iterations}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="End"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<end name="End"/>
|
|
||||||
|
|
||||||
</workflow-app>
|
|
|
@ -0,0 +1,28 @@
|
||||||
|
sourcePath=/tmp/beta_provision/graph/09_graph_dedup_enriched
|
||||||
|
resumeFrom=CountryPropagation
|
||||||
|
allowedsemrelsorcidprop=isSupplementedBy;isSupplementTo
|
||||||
|
allowedsemrelsresultproject=isSupplementedBy;isSupplementTo
|
||||||
|
allowedsemrelscommunitysemrel=isSupplementedBy;isSupplementTo
|
||||||
|
datasourceWhitelistForCountryPropagation=10|opendoar____::16e6a3326dd7d868cbc926602a61e4d0;10|openaire____::fdb035c8b3e0540a8d9a561a6c44f4de;10|eurocrisdris::fe4903425d9040f680d8610d9079ea14;10|openaire____::5b76240cc27a58c6f7ceef7d8c36660e;10|openaire____::172bbccecf8fca44ab6a6653e84cb92a;10|openaire____::149c6590f8a06b46314eed77bfca693f;10|eurocrisdris::a6026877c1a174d60f81fd71f62df1c1;10|openaire____::4692342f0992d91f9e705c26959f09e0;10|openaire____::8d529dbb05ec0284662b391789e8ae2a;10|openaire____::345c9d171ef3c5d706d08041d506428c;10|opendoar____::1c1d4df596d01da60385f0bb17a4a9e0;10|opendoar____::7a614fd06c325499f1680b9896beedeb;10|opendoar____::1ee3dfcd8a0645a25a35977997223d22;10|opendoar____::d296c101daa88a51f6ca8cfc1ac79b50;10|opendoar____::798ed7d4ee7138d49b8828958048130a;10|openaire____::c9d2209ecc4d45ba7b4ca7597acb88a2;10|eurocrisdris::c49e0fe4b9ba7b7fab717d1f0f0a674d;10|eurocrisdris::9ae43d14471c4b33661fedda6f06b539;10|eurocrisdris::432ca599953ff50cd4eeffe22faf3e48
|
||||||
|
#allowedtypes=pubsrepository::institutional
|
||||||
|
allowedtypes=Institutional
|
||||||
|
outputPath=/tmp/miriam/enrichment_one_step
|
||||||
|
pathMap ={"author":"$['author'][*]['fullname']", \
|
||||||
|
"title":"$['title'][*]['value']",\
|
||||||
|
"orcid":"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid')]['value']" ,\
|
||||||
|
"orcid_pending":"$['author'][*]['pid'][*][?(@['qualifier']['classid']=='orcid_pending')]['value']" ,\
|
||||||
|
"contributor" : "$['contributor'][*]['value']",\
|
||||||
|
"description" : "$['description'][*]['value']",\
|
||||||
|
"subject" :"$['subject'][*]['value']" , \
|
||||||
|
"fos" : "$['subject'][?(@['qualifier']['classid']=='FOS')].value" ,\
|
||||||
|
"sdg" : "$['subject'][?(@['qualifier']['classid']=='SDG')].value",\
|
||||||
|
"journal":"$['journal'].name",\
|
||||||
|
"hostedby":"$['instance'][*]['hostedby']['key']",\
|
||||||
|
"collectedfrom":"$['instance'][*]['collectedfrom']['key']",\
|
||||||
|
"publisher":"$['publisher'].value",\
|
||||||
|
"publicationyear":"$['dateofacceptance'].value"}
|
||||||
|
blacklist=empty
|
||||||
|
allowedpids=orcid;orcid_pending
|
||||||
|
baseURL = https://services.openaire.eu/openaire/community/
|
||||||
|
iterations=1
|
||||||
|
|
|
@ -0,0 +1,30 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>hiveMetastoreUris</name>
|
||||||
|
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>hiveJdbcUrl</name>
|
||||||
|
<value>jdbc:hive2://iis-cdh5-test-m3.ocean.icm.edu.pl:10000</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>hiveDbName</name>
|
||||||
|
<value>openaire</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
|
@ -0,0 +1,10 @@
|
||||||
|
## This is a classpath-based import file (this header is required)
|
||||||
|
orcid_propagation classpath eu/dnetlib/dhp/wf/subworkflows/orcidtoresultfromsemrel/oozie_app
|
||||||
|
bulk_tagging classpath eu/dnetlib/dhp/wf/subworkflows/bulktag/oozie_app
|
||||||
|
affiliation_inst_repo classpath eu/dnetlib/dhp/wf/subworkflows/resulttoorganizationfrominstrepo/oozie_app
|
||||||
|
entity_semantic_relation classpath eu/dnetlib/dhp/wf/subworkflows/entitytoorganizationfromsemrel/oozie_app
|
||||||
|
community_organization classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromorganization/oozie_app
|
||||||
|
result_project classpath eu/dnetlib/dhp/wf/subworkflows/projecttoresult/oozie_app
|
||||||
|
community_project classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromproject/oozie_app
|
||||||
|
community_sem_rel classpath eu/dnetlib/dhp/wf/subworkflows/resulttocommunityfromsemrel/oozie_app
|
||||||
|
country_propagation classpath eu/dnetlib/dhp/wf/subworkflows/countrypropagation/oozie_app
|
|
@ -0,0 +1,324 @@
|
||||||
|
<workflow-app name="enrichment_main" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>sourcePath</name>
|
||||||
|
<description>the source path</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>allowedsemrelsorcidprop</name>
|
||||||
|
<description>the semantic relationships allowed for propagation</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>allowedsemrelsresultproject</name>
|
||||||
|
<description>the semantic relationships allowed for the result-to-project propagation</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>allowedsemrelscommunitysemrel</name>
|
||||||
|
<description>the semantic relationships allowed for propagation</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>datasourceWhitelistForCountryPropagation</name>
|
||||||
|
<description>the whitelist of datasources for the country propagation</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>allowedtypes</name>
|
||||||
|
<description>the allowed types</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>outputPath</name>
|
||||||
|
<description>the output path</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>pathMap</name>
|
||||||
|
<description>the json path associated to each selection field</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>blacklist</name>
|
||||||
|
<description>the blacklist of datasources for the affiliation propagation from institutional repositories</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>hiveDbName</name>
|
||||||
|
<description>the target hive database name</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>hiveJdbcUrl</name>
|
||||||
|
<description>hive server jdbc url</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>hiveMetastoreUris</name>
|
||||||
|
<description>hive server metastore URIs</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkDriverMemory</name>
|
||||||
|
<description>memory for driver process</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorMemory</name>
|
||||||
|
<description>memory for individual executor</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorCores</name>
|
||||||
|
<description>number of cores used by single executor</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozieActionShareLibForSpark2</name>
|
||||||
|
<description>oozie action sharelib for spark 2.*</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2ExtraListeners</name>
|
||||||
|
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||||
|
<description>spark 2.* extra listeners classname</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2SqlQueryExecutionListeners</name>
|
||||||
|
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||||
|
<description>spark 2.* sql query execution listeners classname</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2YarnHistoryServerAddress</name>
|
||||||
|
<description>spark 2.* yarn history server address</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2EventLogDir</name>
|
||||||
|
<description>spark 2.* event log dir location</description>
|
||||||
|
</property>
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>mapreduce.job.queuename</name>
|
||||||
|
<value>${queueName}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||||
|
<value>${oozieLauncherQueueName}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
|
<start to="resumeFrom"/>
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<decision name="resumeFrom">
|
||||||
|
<switch>
|
||||||
|
<case to="bulk_tagging">${wf:conf('resumeFrom') eq 'BulkTagging'}</case>
|
||||||
|
<case to="affiliation_inst_repo">${wf:conf('resumeFrom') eq 'AffiliationInstitutionalRepository'}</case>
|
||||||
|
<case to="entity_semantic_relation">${wf:conf('resumeFrom') eq 'AffiliationSemanticRelation'}</case>
|
||||||
|
<case to="community_organization">${wf:conf('resumeFrom') eq 'CommunityOrganization'}</case>
|
||||||
|
<case to="result_project">${wf:conf('resumeFrom') eq 'ResultProject'}</case>
|
||||||
|
<case to="community_project">${wf:conf('resumeFrom') eq 'CommunityProject'}</case>
|
||||||
|
<case to="community_sem_rel">${wf:conf('resumeFrom') eq 'CommunitySemanticRelation'}</case>
|
||||||
|
<case to="country_propagation">${wf:conf('resumeFrom') eq 'CountryPropagation'}</case>
|
||||||
|
<default to="orcid_propagation"/>
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
|
||||||
|
<action name="orcid_propagation">
|
||||||
|
<sub-workflow>
|
||||||
|
<app-path>${wf:appPath()}/orcid_propagation
|
||||||
|
</app-path>
|
||||||
|
<propagate-configuration/>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>sourcePath</name>
|
||||||
|
<value>${sourcePath}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>allowedsemrels</name>
|
||||||
|
<value>${allowedsemrelsorcidprop}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>outputPath</name>
|
||||||
|
<value>${outputPath}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</sub-workflow>
|
||||||
|
<ok to="bulk_tagging" />
|
||||||
|
<error to="Kill" />
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="bulk_tagging">
|
||||||
|
<sub-workflow>
|
||||||
|
<app-path>${wf:appPath()}/bulk_tagging
|
||||||
|
</app-path>
|
||||||
|
<propagate-configuration/>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>sourcePath</name>
|
||||||
|
<value>${outputPath}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>baseURL</name>
|
||||||
|
<value>${baseURL}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>pathMap</name>
|
||||||
|
<value>${pathMap}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</sub-workflow>
|
||||||
|
<ok to="affiliation_inst_repo" />
|
||||||
|
<error to="Kill" />
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="affiliation_inst_repo">
|
||||||
|
<sub-workflow>
|
||||||
|
<app-path>${wf:appPath()}/affiliation_inst_repo
|
||||||
|
</app-path>
|
||||||
|
<propagate-configuration/>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>sourcePath</name>
|
||||||
|
<value>${outputPath}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>blacklist</name>
|
||||||
|
<value>${blacklist}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</sub-workflow>
|
||||||
|
<ok to="entity_semantic_relation" />
|
||||||
|
<error to="Kill" />
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="entity_semantic_relation">
|
||||||
|
<sub-workflow>
|
||||||
|
<app-path>${wf:appPath()}/entity_semantic_relation
|
||||||
|
</app-path>
|
||||||
|
<propagate-configuration/>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>sourcePath</name>
|
||||||
|
<value>${outputPath}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>iterations</name>
|
||||||
|
<value>${iterations}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</sub-workflow>
|
||||||
|
<ok to="community_organization" />
|
||||||
|
<error to="Kill" />
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="community_organization">
|
||||||
|
<sub-workflow>
|
||||||
|
<app-path>${wf:appPath()}/community_organization
|
||||||
|
</app-path>
|
||||||
|
<propagate-configuration/>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>sourcePath</name>
|
||||||
|
<value>${outputPath}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>baseURL</name>
|
||||||
|
<value>${baseURL}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</sub-workflow>
|
||||||
|
<ok to="result_project" />
|
||||||
|
<error to="Kill" />
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="result_project">
|
||||||
|
<sub-workflow>
|
||||||
|
<app-path>${wf:appPath()}/result_project
|
||||||
|
</app-path>
|
||||||
|
<propagate-configuration/>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>sourcePath</name>
|
||||||
|
<value>${outputPath}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>allowedsemrels</name>
|
||||||
|
<value>${allowedsemrelsresultproject}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</sub-workflow>
|
||||||
|
<ok to="community_project" />
|
||||||
|
<error to="Kill" />
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="community_project">
|
||||||
|
<sub-workflow>
|
||||||
|
<app-path>${wf:appPath()}/community_project
|
||||||
|
</app-path>
|
||||||
|
<propagate-configuration/>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>sourcePath</name>
|
||||||
|
<value>${outputPath}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</sub-workflow>
|
||||||
|
<ok to="community_sem_rel" />
|
||||||
|
<error to="Kill" />
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="community_sem_rel">
|
||||||
|
<sub-workflow>
|
||||||
|
<app-path>${wf:appPath()}/community_sem_rel
|
||||||
|
</app-path>
|
||||||
|
<propagate-configuration/>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>sourcePath</name>
|
||||||
|
<value>${outputPath}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>allowedsemrels</name>
|
||||||
|
<value>${allowedsemrelscommunitysemrel}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>baseURL</name>
|
||||||
|
<value>${baseURL}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</sub-workflow>
|
||||||
|
<ok to="country_propagation" />
|
||||||
|
<error to="Kill" />
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="country_propagation">
|
||||||
|
<sub-workflow>
|
||||||
|
<app-path>${wf:appPath()}/country_propagation
|
||||||
|
</app-path>
|
||||||
|
<propagate-configuration/>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>sourcePath</name>
|
||||||
|
<value>${outputPath}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>whitelist</name>
|
||||||
|
<value>${datasourceWhitelistForCountryPropagation}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>allowedtypes</name>
|
||||||
|
<value>${allowedtypes}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</sub-workflow>
|
||||||
|
<ok to="End" />
|
||||||
|
<error to="Kill" />
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
|
||||||
|
</workflow-app>
|
|
@ -8,14 +8,11 @@
|
||||||
<name>pathMap</name>
|
<name>pathMap</name>
|
||||||
<description>the json path associated to each selection field</description>
|
<description>the json path associated to each selection field</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
|
||||||
<name>outputPath</name>
|
|
||||||
<description>the output path</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
<property>
|
||||||
<name>baseURL</name>
|
<name>baseURL</name>
|
||||||
<description>the community API base URL</description>
|
<description>The URL to access the community APIs</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<global>
|
<global>
|
||||||
|
@ -37,63 +34,18 @@
|
||||||
|
|
||||||
<action name="reset_outputpath">
|
<action name="reset_outputpath">
|
||||||
<fs>
|
<fs>
|
||||||
<delete path="${outputPath}"/>
|
<delete path="${workingDir}"/>
|
||||||
<mkdir path="${outputPath}"/>
|
<mkdir path="${workingDir}"/>
|
||||||
</fs>
|
</fs>
|
||||||
<ok to="copy_entities"/>
|
<ok to="exec_bulktag"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<fork name="copy_entities">
|
|
||||||
<path start="copy_relation"/>
|
|
||||||
<path start="copy_organization"/>
|
|
||||||
<path start="copy_projects"/>
|
|
||||||
<path start="copy_datasources"/>
|
|
||||||
</fork>
|
|
||||||
|
|
||||||
<action name="copy_relation">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>${nameNode}/${sourcePath}/relation</arg>
|
|
||||||
<arg>${nameNode}/${outputPath}/relation</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_organization">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>${nameNode}/${sourcePath}/organization</arg>
|
|
||||||
<arg>${nameNode}/${outputPath}/organization</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_projects">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>${nameNode}/${sourcePath}/project</arg>
|
|
||||||
<arg>${nameNode}/${outputPath}/project</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_datasources">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
|
||||||
<arg>${nameNode}/${outputPath}/datasource</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<join name="copy_wait" to="exec_bulktag"/>
|
|
||||||
|
|
||||||
<action name="exec_bulktag">
|
<action name="exec_bulktag">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn-cluster</master>
|
<master>yarn-cluster</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>bulkTagging-result</name>
|
<name>bulkTagging-publication</name>
|
||||||
<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
|
<class>eu.dnetlib.dhp.bulktag.SparkBulkTagJob</class>
|
||||||
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
|
@ -107,7 +59,7 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/bulktag/</arg>
|
||||||
<arg>--pathMap</arg><arg>${pathMap}</arg>
|
<arg>--pathMap</arg><arg>${pathMap}</arg>
|
||||||
<arg>--baseURL</arg><arg>${baseURL}</arg>
|
<arg>--baseURL</arg><arg>${baseURL}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
@ -115,6 +67,8 @@
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
|
|
||||||
</workflow-app>
|
</workflow-app>
|
|
@ -29,4 +29,4 @@
|
||||||
"paramDescription": "true if the spark session is managed, false otherwise",
|
"paramDescription": "true if the spark session is managed, false otherwise",
|
||||||
"paramRequired": false
|
"paramRequired": false
|
||||||
}
|
}
|
||||||
]
|
]
|
|
@ -12,11 +12,6 @@
|
||||||
<name>allowedtypes</name>
|
<name>allowedtypes</name>
|
||||||
<description>the allowed types</description>
|
<description>the allowed types</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
|
||||||
<name>outputPath</name>
|
|
||||||
<description>the output path</description>
|
|
||||||
</property>
|
|
||||||
|
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<global>
|
<global>
|
||||||
|
@ -38,57 +33,13 @@
|
||||||
|
|
||||||
<action name="reset_outputpath">
|
<action name="reset_outputpath">
|
||||||
<fs>
|
<fs>
|
||||||
<delete path="${outputPath}"/>
|
<delete path="${workingDir}"/>
|
||||||
<mkdir path="${outputPath}"/>
|
<mkdir path="${workingDir}"/>
|
||||||
</fs>
|
</fs>
|
||||||
<ok to="copy_entities"/>
|
<ok to="prepare_datasource_country_association"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<fork name="copy_entities">
|
|
||||||
<path start="copy_relation"/>
|
|
||||||
<path start="copy_organization"/>
|
|
||||||
<path start="copy_projects"/>
|
|
||||||
<path start="copy_datasources"/>
|
|
||||||
</fork>
|
|
||||||
|
|
||||||
<action name="copy_relation">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>${nameNode}/${sourcePath}/relation</arg>
|
|
||||||
<arg>${nameNode}/${outputPath}/relation</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_organization">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>${nameNode}/${sourcePath}/organization</arg>
|
|
||||||
<arg>${nameNode}/${outputPath}/organization</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_projects">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>${nameNode}/${sourcePath}/project</arg>
|
|
||||||
<arg>${nameNode}/${outputPath}/project</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copy_datasources">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>${nameNode}/${sourcePath}/datasource</arg>
|
|
||||||
<arg>${nameNode}/${outputPath}/datasource</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="copy_wait"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<join name="copy_wait" to="prepare_datasource_country_association"/>
|
|
||||||
|
|
||||||
<action name="prepare_datasource_country_association">
|
<action name="prepare_datasource_country_association">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
@ -112,18 +63,18 @@
|
||||||
<arg>--allowedtypes</arg><arg>${allowedtypes}</arg>
|
<arg>--allowedtypes</arg><arg>${allowedtypes}</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/preparedInfo</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="fork_join_prepare_result_country"/>
|
<ok to="fork_prepare_result_country"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<fork name="fork_join_prepare_result_country">
|
<fork name="fork_prepare_result_country">
|
||||||
<path start="join_prepareresult_publication"/>
|
<path start="prepareresult_publication"/>
|
||||||
<path start="join_prepareresult_dataset"/>
|
<path start="prepareresult_dataset"/>
|
||||||
<path start="join_prepareresult_otherresearchproduct"/>
|
<path start="prepareresult_otherresearchproduct"/>
|
||||||
<path start="join_prepareresult_software"/>
|
<path start="prepareresult_software"/>
|
||||||
</fork>
|
</fork>
|
||||||
|
|
||||||
<action name="join_prepareresult_publication">
|
<action name="prepareresult_publication">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
|
@ -153,7 +104,7 @@
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="join_prepareresult_dataset">
|
<action name="prepareresult_dataset">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
|
@ -183,7 +134,7 @@
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="join_prepareresult_otherresearchproduct">
|
<action name="prepareresult_otherresearchproduct">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
|
@ -213,7 +164,7 @@
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="join_prepareresult_software">
|
<action name="prepareresult_software">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
|
@ -243,16 +194,16 @@
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<join name="wait_prepare" to="fork_join_apply_country_propagation"/>
|
<join name="wait_prepare" to="fork_apply_country_propagation"/>
|
||||||
|
|
||||||
<fork name="fork_join_apply_country_propagation">
|
<fork name="fork_apply_country_propagation">
|
||||||
<path start="join_propagation_publication"/>
|
<path start="propagation_publication"/>
|
||||||
<path start="join_propagation_dataset"/>
|
<path start="propagation_dataset"/>
|
||||||
<path start="join_propagation_otherresearchproduct"/>
|
<path start="propagation_otherresearchproduct"/>
|
||||||
<path start="join_propagation_software"/>
|
<path start="propagation_software"/>
|
||||||
</fork>
|
</fork>
|
||||||
|
|
||||||
<action name="join_propagation_publication">
|
<action name="propagation_publication">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
|
@ -275,13 +226,13 @@
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/publication</arg>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/publication</arg>
|
<arg>--preparedInfoPath</arg><arg>${workingDir}/publication</arg>
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/publication</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/country/publication</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="join_propagation_dataset">
|
<action name="propagation_dataset">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
|
@ -304,13 +255,13 @@
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg>
|
<arg>--preparedInfoPath</arg><arg>${workingDir}/dataset</arg>
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/dataset</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/country/dataset</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="join_propagation_otherresearchproduct">
|
<action name="propagation_otherresearchproduct">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
|
@ -333,13 +284,13 @@
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/otherresearchproduct</arg>
|
<arg>--preparedInfoPath</arg><arg>${workingDir}/otherresearchproduct</arg>
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/otherresearchproduct</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/country/otherresearchproduct</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="join_propagation_software">
|
<action name="propagation_software">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<master>yarn</master>
|
<master>yarn</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
|
@ -362,14 +313,21 @@
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
||||||
<arg>--preparedInfoPath</arg><arg>${workingDir}/software</arg>
|
<arg>--preparedInfoPath</arg><arg>${workingDir}/software</arg>
|
||||||
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
<arg>--resultTableName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||||
<arg>--outputPath</arg><arg>${outputPath}/software</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/country/software</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait"/>
|
<ok to="wait"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<join name="wait" to="End"/>
|
<join name="wait" to="reset_workingDir"/>
|
||||||
|
<action name="reset_workingDir">
|
||||||
|
<fs>
|
||||||
|
<delete path="${workingDir}"/>
|
||||||
|
<mkdir path="${workingDir}"/>
|
||||||
|
</fs>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
|
|
||||||
</workflow-app>
|
</workflow-app>
|
|
@ -0,0 +1,101 @@
|
||||||
|
<workflow-app name="affiliation_from_semrel_propagation" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>sourcePath</name>
|
||||||
|
<description>the source path</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
|
<start to="reset_outputpath"/>
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<action name="reset_outputpath">
|
||||||
|
<fs>
|
||||||
|
<delete path="${workingDir}"/>
|
||||||
|
<mkdir path="${workingDir}"/>
|
||||||
|
</fs>
|
||||||
|
<ok to="prepare_info"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="prepare_info">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>PrepareResultProjectOrganizationAssociation</name>
|
||||||
|
<class>eu.dnetlib.dhp.entitytoorganizationfromsemrel.PrepareInfo</class>
|
||||||
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--graphPath</arg><arg>${sourcePath}</arg>
|
||||||
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
<arg>--leavesPath</arg><arg>${workingDir}/entitiesSemanticRelation/preparedInfo/leavesPath</arg>
|
||||||
|
<arg>--childParentPath</arg><arg>${workingDir}/entitiesSemanticRelation/preparedInfo/childParentPath</arg>
|
||||||
|
<arg>--resultOrgPath</arg><arg>${workingDir}/entitiesSemanticRelation/preparedInfo/resultOrgPath</arg>
|
||||||
|
<arg>--projectOrganizationPath</arg><arg>${workingDir}/entitiesSemanticRelation/preparedInfo/projectOrganizationPath</arg>
|
||||||
|
<arg>--relationPath</arg><arg>${workingDir}/entitiesSemanticRelation/preparedInfo/relation</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="apply_resulttoorganization_propagation"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="apply_resulttoorganization_propagation">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>entityToOrganizationFromSemRel</name>
|
||||||
|
<class>eu.dnetlib.dhp.entitytoorganizationfromsemrel.SparkEntityToOrganizationFromSemRel</class>
|
||||||
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.dynamicAllocation.enabled=true
|
||||||
|
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--relationPath</arg><arg>${workingDir}/entitiesSemanticRelation/preparedInfo/relation</arg>
|
||||||
|
<arg>--outputPath</arg><arg>${sourcePath}/relation</arg>
|
||||||
|
<arg>--leavesPath</arg><arg>${workingDir}/entitiesSemanticRelation/preparedInfo/leavesPath</arg>
|
||||||
|
<arg>--childParentPath</arg><arg>${workingDir}/entitiesSemanticRelation/preparedInfo/childParentPath</arg>
|
||||||
|
<arg>--resultOrgPath</arg><arg>${workingDir}/entitiesSemanticRelation/preparedInfo/resultOrgPath</arg>
|
||||||
|
<arg>--projectOrganizationPath</arg><arg>${workingDir}/entitiesSemanticRelation/preparedInfo/projectOrganizationPath</arg>
|
||||||
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
<arg>--workingDir</arg><arg>${workingDir}/entitiesSemanticRelation/working</arg>
|
||||||
|
<arg>--iterations</arg><arg>${iterations}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
|
||||||
|
</workflow-app>
|
|
@ -11,12 +11,6 @@
|
||||||
"paramDescription": "true if the new version of the graph must be saved",
|
"paramDescription": "true if the new version of the graph must be saved",
|
||||||
"paramRequired": false
|
"paramRequired": false
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"paramName":"h",
|
|
||||||
"paramLongName":"hive_metastore_uris",
|
|
||||||
"paramDescription": "the hive metastore uris",
|
|
||||||
"paramRequired": true
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"paramName": "out",
|
"paramName": "out",
|
||||||
"paramLongName": "outputPath",
|
"paramLongName": "outputPath",
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue