forked from D-Net/dnet-hadoop
merged from beta
This commit is contained in:
commit
26bf8e763a
|
@ -66,13 +66,14 @@
|
||||||
<groupId>edu.cmu</groupId>
|
<groupId>edu.cmu</groupId>
|
||||||
<artifactId>secondstring</artifactId>
|
<artifactId>secondstring</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>eu.dnetlib.dhp</groupId>
|
<groupId>com.ibm.icu</groupId>
|
||||||
<artifactId>dhp-pace-core</artifactId>
|
<artifactId>icu4j</artifactId>
|
||||||
<version>${project.version}</version>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.hadoop</groupId>
|
||||||
|
<artifactId>hadoop-common</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.github.sisyphsu</groupId>
|
<groupId>com.github.sisyphsu</groupId>
|
||||||
<artifactId>dateparser</artifactId>
|
<artifactId>dateparser</artifactId>
|
||||||
|
@ -119,6 +120,10 @@
|
||||||
<groupId>net.sf.saxon</groupId>
|
<groupId>net.sf.saxon</groupId>
|
||||||
<artifactId>Saxon-HE</artifactId>
|
<artifactId>Saxon-HE</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.slf4j</groupId>
|
||||||
|
<artifactId>jcl-over-slf4j</artifactId>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.cxf</groupId>
|
<groupId>org.apache.cxf</groupId>
|
||||||
<artifactId>cxf-rt-transports-http</artifactId>
|
<artifactId>cxf-rt-transports-http</artifactId>
|
||||||
|
|
|
@ -135,7 +135,7 @@ public class GroupEntitiesSparkJob {
|
||||||
.applyCoarVocabularies(entity, vocs),
|
.applyCoarVocabularies(entity, vocs),
|
||||||
OAFENTITY_KRYO_ENC)
|
OAFENTITY_KRYO_ENC)
|
||||||
.groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING())
|
.groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING())
|
||||||
.mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeGroup, OAFENTITY_KRYO_ENC)
|
.mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeById, OAFENTITY_KRYO_ENC)
|
||||||
.map(
|
.map(
|
||||||
(MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
|
(MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
|
||||||
t.getClass().getName(), t),
|
t.getClass().getName(), t),
|
||||||
|
|
|
@ -30,8 +30,16 @@ import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
|
||||||
public class MergeUtils {
|
public class MergeUtils {
|
||||||
|
public static <T extends Oaf> T mergeById(String s, Iterator<T> oafEntityIterator) {
|
||||||
|
return mergeGroup(s, oafEntityIterator, true);
|
||||||
|
}
|
||||||
|
|
||||||
public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator) {
|
public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator) {
|
||||||
|
return mergeGroup(s, oafEntityIterator, false);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator,
|
||||||
|
boolean checkDelegateAuthority) {
|
||||||
TreeSet<T> sortedEntities = new TreeSet<>((o1, o2) -> {
|
TreeSet<T> sortedEntities = new TreeSet<>((o1, o2) -> {
|
||||||
int res = 0;
|
int res = 0;
|
||||||
|
|
||||||
|
@ -52,18 +60,22 @@ public class MergeUtils {
|
||||||
sortedEntities.add(oafEntityIterator.next());
|
sortedEntities.add(oafEntityIterator.next());
|
||||||
}
|
}
|
||||||
|
|
||||||
T merged = sortedEntities.descendingIterator().next();
|
|
||||||
|
|
||||||
Iterator<T> it = sortedEntities.descendingIterator();
|
Iterator<T> it = sortedEntities.descendingIterator();
|
||||||
|
T merged = it.next();
|
||||||
|
|
||||||
while (it.hasNext()) {
|
while (it.hasNext()) {
|
||||||
merged = checkedMerge(merged, it.next());
|
merged = checkedMerge(merged, it.next(), checkDelegateAuthority);
|
||||||
}
|
}
|
||||||
|
|
||||||
return merged;
|
return merged;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T extends Oaf> T checkedMerge(final T left, final T right) {
|
public static <T extends Oaf> T checkedMerge(final T left, final T right, boolean checkDelegateAuthority) {
|
||||||
return (T) merge(left, right, false);
|
return (T) merge(left, right, checkDelegateAuthority);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T extends Result, E extends Result> Result mergeResult(final T left, final E right) {
|
||||||
|
return (Result) merge(left, right, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Oaf merge(final Oaf left, final Oaf right) {
|
public static Oaf merge(final Oaf left, final Oaf right) {
|
||||||
|
@ -108,7 +120,7 @@ public class MergeUtils {
|
||||||
return mergeSoftware((Software) left, (Software) right);
|
return mergeSoftware((Software) left, (Software) right);
|
||||||
}
|
}
|
||||||
|
|
||||||
return mergeResult((Result) left, (Result) right);
|
return mergeResultFields((Result) left, (Result) right);
|
||||||
} else if (sameClass(left, right, Datasource.class)) {
|
} else if (sameClass(left, right, Datasource.class)) {
|
||||||
// TODO
|
// TODO
|
||||||
final int trust = compareTrust(left, right);
|
final int trust = compareTrust(left, right);
|
||||||
|
@ -151,9 +163,9 @@ public class MergeUtils {
|
||||||
}
|
}
|
||||||
// TODO: raise trust to have preferred fields from one or the other??
|
// TODO: raise trust to have preferred fields from one or the other??
|
||||||
if (new ResultTypeComparator().compare(left, right) < 0) {
|
if (new ResultTypeComparator().compare(left, right) < 0) {
|
||||||
return mergeResult(left, right);
|
return mergeResultFields(left, right);
|
||||||
} else {
|
} else {
|
||||||
return mergeResult(right, left);
|
return mergeResultFields(right, left);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -263,6 +275,12 @@ public class MergeUtils {
|
||||||
|
|
||||||
// TODO review
|
// TODO review
|
||||||
private static List<KeyValue> mergeByKey(List<KeyValue> left, List<KeyValue> right, int trust) {
|
private static List<KeyValue> mergeByKey(List<KeyValue> left, List<KeyValue> right, int trust) {
|
||||||
|
if (left == null) {
|
||||||
|
return right;
|
||||||
|
} else if (right == null) {
|
||||||
|
return left;
|
||||||
|
}
|
||||||
|
|
||||||
if (trust < 0) {
|
if (trust < 0) {
|
||||||
List<KeyValue> s = left;
|
List<KeyValue> s = left;
|
||||||
left = right;
|
left = right;
|
||||||
|
@ -368,7 +386,7 @@ public class MergeUtils {
|
||||||
return merge;
|
return merge;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T extends Result> T mergeResult(T original, T enrich) {
|
private static <T extends Result> T mergeResultFields(T original, T enrich) {
|
||||||
final int trust = compareTrust(original, enrich);
|
final int trust = compareTrust(original, enrich);
|
||||||
T merge = mergeOafEntityFields(original, enrich, trust);
|
T merge = mergeOafEntityFields(original, enrich, trust);
|
||||||
|
|
||||||
|
@ -694,7 +712,7 @@ public class MergeUtils {
|
||||||
|
|
||||||
private static <T extends OtherResearchProduct> T mergeORP(T original, T enrich) {
|
private static <T extends OtherResearchProduct> T mergeORP(T original, T enrich) {
|
||||||
int trust = compareTrust(original, enrich);
|
int trust = compareTrust(original, enrich);
|
||||||
final T merge = mergeResult(original, enrich);
|
final T merge = mergeResultFields(original, enrich);
|
||||||
|
|
||||||
merge.setContactperson(unionDistinctLists(merge.getContactperson(), enrich.getContactperson(), trust));
|
merge.setContactperson(unionDistinctLists(merge.getContactperson(), enrich.getContactperson(), trust));
|
||||||
merge.setContactgroup(unionDistinctLists(merge.getContactgroup(), enrich.getContactgroup(), trust));
|
merge.setContactgroup(unionDistinctLists(merge.getContactgroup(), enrich.getContactgroup(), trust));
|
||||||
|
@ -705,7 +723,7 @@ public class MergeUtils {
|
||||||
|
|
||||||
private static <T extends Software> T mergeSoftware(T original, T enrich) {
|
private static <T extends Software> T mergeSoftware(T original, T enrich) {
|
||||||
int trust = compareTrust(original, enrich);
|
int trust = compareTrust(original, enrich);
|
||||||
final T merge = mergeResult(original, enrich);
|
final T merge = mergeResultFields(original, enrich);
|
||||||
|
|
||||||
merge.setDocumentationUrl(unionDistinctLists(merge.getDocumentationUrl(), enrich.getDocumentationUrl(), trust));
|
merge.setDocumentationUrl(unionDistinctLists(merge.getDocumentationUrl(), enrich.getDocumentationUrl(), trust));
|
||||||
merge.setLicense(unionDistinctLists(merge.getLicense(), enrich.getLicense(), trust));
|
merge.setLicense(unionDistinctLists(merge.getLicense(), enrich.getLicense(), trust));
|
||||||
|
@ -719,7 +737,7 @@ public class MergeUtils {
|
||||||
|
|
||||||
private static <T extends Dataset> T mergeDataset(T original, T enrich) {
|
private static <T extends Dataset> T mergeDataset(T original, T enrich) {
|
||||||
int trust = compareTrust(original, enrich);
|
int trust = compareTrust(original, enrich);
|
||||||
T merge = mergeResult(original, enrich);
|
T merge = mergeResultFields(original, enrich);
|
||||||
|
|
||||||
merge.setStoragedate(chooseReference(merge.getStoragedate(), enrich.getStoragedate(), trust));
|
merge.setStoragedate(chooseReference(merge.getStoragedate(), enrich.getStoragedate(), trust));
|
||||||
merge.setDevice(chooseReference(merge.getDevice(), enrich.getDevice(), trust));
|
merge.setDevice(chooseReference(merge.getDevice(), enrich.getDevice(), trust));
|
||||||
|
@ -738,7 +756,7 @@ public class MergeUtils {
|
||||||
|
|
||||||
public static <T extends Publication> T mergePublication(T original, T enrich) {
|
public static <T extends Publication> T mergePublication(T original, T enrich) {
|
||||||
final int trust = compareTrust(original, enrich);
|
final int trust = compareTrust(original, enrich);
|
||||||
T merged = mergeResult(original, enrich);
|
T merged = mergeResultFields(original, enrich);
|
||||||
|
|
||||||
merged.setJournal(chooseReference(merged.getJournal(), enrich.getJournal(), trust));
|
merged.setJournal(chooseReference(merged.getJournal(), enrich.getJournal(), trust));
|
||||||
|
|
||||||
|
|
|
@ -36,6 +36,15 @@ public class ResultTypeComparator implements Comparator<Result> {
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (left.getResulttype() == null || left.getResulttype().getClassid() == null) {
|
||||||
|
if (right.getResulttype() == null || right.getResulttype().getClassid() == null) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
} else if (right.getResulttype() == null || right.getResulttype().getClassid() == null) {
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
String lClass = left.getResulttype().getClassid();
|
String lClass = left.getResulttype().getClassid();
|
||||||
String rClass = right.getResulttype().getClassid();
|
String rClass = right.getResulttype().getClassid();
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,100 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.pace.common;
|
||||||
|
|
||||||
|
import com.google.common.base.Splitter;
|
||||||
|
import com.google.common.collect.Iterables;
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
|
import com.ibm.icu.text.Transliterator;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.text.Normalizer;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.regex.Matcher;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Set of common functions for the framework
|
||||||
|
*
|
||||||
|
* @author claudio
|
||||||
|
*/
|
||||||
|
public class PaceCommonUtils {
|
||||||
|
|
||||||
|
// transliterator
|
||||||
|
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||||
|
|
||||||
|
protected static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
||||||
|
protected static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
||||||
|
|
||||||
|
protected static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
|
||||||
|
|
||||||
|
protected static String fixAliases(final String s) {
|
||||||
|
final StringBuilder sb = new StringBuilder();
|
||||||
|
|
||||||
|
s.chars().forEach(ch -> {
|
||||||
|
final int i = StringUtils.indexOf(aliases_from, ch);
|
||||||
|
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
|
||||||
|
});
|
||||||
|
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static String transliterate(final String s) {
|
||||||
|
try {
|
||||||
|
return transliterator.transliterate(s);
|
||||||
|
} catch (Exception e) {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String normalize(final String s) {
|
||||||
|
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
||||||
|
.toLowerCase()
|
||||||
|
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
||||||
|
// strings
|
||||||
|
.replaceAll("[^ \\w]+", "")
|
||||||
|
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
||||||
|
.replaceAll("(\\p{Punct})+", " ")
|
||||||
|
.replaceAll("(\\d)+", " ")
|
||||||
|
.replaceAll("(\\n)+", " ")
|
||||||
|
.trim();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String nfd(final String s) {
|
||||||
|
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String unicodeNormalization(final String s) {
|
||||||
|
|
||||||
|
Matcher m = hexUnicodePattern.matcher(s);
|
||||||
|
StringBuffer buf = new StringBuffer(s.length());
|
||||||
|
while (m.find()) {
|
||||||
|
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
|
||||||
|
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
|
||||||
|
}
|
||||||
|
m.appendTail(buf);
|
||||||
|
return buf.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public static Set<String> loadFromClasspath(final String classpath) {
|
||||||
|
|
||||||
|
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||||
|
|
||||||
|
final Set<String> h = Sets.newHashSet();
|
||||||
|
try {
|
||||||
|
for (final String s : IOUtils
|
||||||
|
.readLines(PaceCommonUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
||||||
|
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
|
||||||
|
}
|
||||||
|
} catch (final Throwable e) {
|
||||||
|
return Sets.newHashSet();
|
||||||
|
}
|
||||||
|
return h;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static Iterable<String> tokens(final String s, final int maxTokens) {
|
||||||
|
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -1,21 +1,20 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.model;
|
package eu.dnetlib.pace.model;
|
||||||
|
|
||||||
import java.nio.charset.Charset;
|
|
||||||
import java.text.Normalizer;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import com.google.common.base.Joiner;
|
import com.google.common.base.Joiner;
|
||||||
import com.google.common.base.Splitter;
|
import com.google.common.base.Splitter;
|
||||||
import com.google.common.collect.Iterables;
|
import com.google.common.collect.Iterables;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.hash.Hashing;
|
import com.google.common.hash.Hashing;
|
||||||
|
import eu.dnetlib.pace.common.PaceCommonUtils;
|
||||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
|
||||||
import eu.dnetlib.pace.util.Capitalise;
|
import eu.dnetlib.pace.util.Capitalise;
|
||||||
import eu.dnetlib.pace.util.DotAbbreviations;
|
import eu.dnetlib.pace.util.DotAbbreviations;
|
||||||
|
|
||||||
|
import java.nio.charset.Charset;
|
||||||
|
import java.text.Normalizer;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Set;
|
||||||
|
|
||||||
public class Person {
|
public class Person {
|
||||||
|
|
||||||
private static final String UTF8 = "UTF-8";
|
private static final String UTF8 = "UTF-8";
|
||||||
|
@ -86,7 +85,7 @@ public class Person {
|
||||||
|
|
||||||
private List<String> splitTerms(final String s) {
|
private List<String> splitTerms(final String s) {
|
||||||
if (particles == null) {
|
if (particles == null) {
|
||||||
particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
|
particles = PaceCommonUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
|
||||||
}
|
}
|
||||||
|
|
||||||
final List<String> list = Lists.newArrayList();
|
final List<String> list = Lists.newArrayList();
|
|
@ -1,9 +1,8 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.util;
|
package eu.dnetlib.pace.util;
|
||||||
|
|
||||||
import org.apache.commons.lang3.text.WordUtils;
|
|
||||||
|
|
||||||
import com.google.common.base.Function;
|
import com.google.common.base.Function;
|
||||||
|
import org.apache.commons.lang3.text.WordUtils;
|
||||||
|
|
||||||
public class Capitalise implements Function<String, String> {
|
public class Capitalise implements Function<String, String> {
|
||||||
|
|
||||||
|
@ -15,4 +14,4 @@ public class Capitalise implements Function<String, String> {
|
||||||
public String apply(final String s) {
|
public String apply(final String s) {
|
||||||
return WordUtils.capitalize(s.toLowerCase(), DELIM);
|
return WordUtils.capitalize(s.toLowerCase(), DELIM);
|
||||||
}
|
}
|
||||||
};
|
}
|
|
@ -8,4 +8,4 @@ public class DotAbbreviations implements Function<String, String> {
|
||||||
public String apply(String s) {
|
public String apply(String s) {
|
||||||
return s.length() == 1 ? s + "." : s;
|
return s.length() == 1 ? s + "." : s;
|
||||||
}
|
}
|
||||||
};
|
}
|
|
@ -63,7 +63,7 @@ public class MergeUtilsTest {
|
||||||
assertEquals(1, d1.getCollectedfrom().size());
|
assertEquals(1, d1.getCollectedfrom().size());
|
||||||
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
||||||
|
|
||||||
final Result p1d2 = MergeUtils.checkedMerge(p1, d2);
|
final Result p1d2 = MergeUtils.checkedMerge(p1, d2, true);
|
||||||
assertEquals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, p1d2.getResulttype().getClassid());
|
assertEquals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, p1d2.getResulttype().getClassid());
|
||||||
assertTrue(p1d2 instanceof Publication);
|
assertTrue(p1d2 instanceof Publication);
|
||||||
assertEquals(p1.getId(), p1d2.getId());
|
assertEquals(p1.getId(), p1d2.getId());
|
||||||
|
@ -74,7 +74,7 @@ public class MergeUtilsTest {
|
||||||
Publication p2 = read("publication_2.json", Publication.class);
|
Publication p2 = read("publication_2.json", Publication.class);
|
||||||
Dataset d1 = read("dataset_1.json", Dataset.class);
|
Dataset d1 = read("dataset_1.json", Dataset.class);
|
||||||
|
|
||||||
final Result p2d1 = MergeUtils.checkedMerge(p2, d1);
|
final Result p2d1 = MergeUtils.checkedMerge(p2, d1, true);
|
||||||
assertEquals((ModelConstants.DATASET_RESULTTYPE_CLASSID), p2d1.getResulttype().getClassid());
|
assertEquals((ModelConstants.DATASET_RESULTTYPE_CLASSID), p2d1.getResulttype().getClassid());
|
||||||
assertTrue(p2d1 instanceof Dataset);
|
assertTrue(p2d1 instanceof Dataset);
|
||||||
assertEquals(d1.getId(), p2d1.getId());
|
assertEquals(d1.getId(), p2d1.getId());
|
||||||
|
@ -86,7 +86,7 @@ public class MergeUtilsTest {
|
||||||
Publication p1 = read("publication_1.json", Publication.class);
|
Publication p1 = read("publication_1.json", Publication.class);
|
||||||
Publication p2 = read("publication_2.json", Publication.class);
|
Publication p2 = read("publication_2.json", Publication.class);
|
||||||
|
|
||||||
Result p1p2 = MergeUtils.checkedMerge(p1, p2);
|
Result p1p2 = MergeUtils.checkedMerge(p1, p2, true);
|
||||||
assertTrue(p1p2 instanceof Publication);
|
assertTrue(p1p2 instanceof Publication);
|
||||||
assertEquals(p1.getId(), p1p2.getId());
|
assertEquals(p1.getId(), p1p2.getId());
|
||||||
assertEquals(2, p1p2.getCollectedfrom().size());
|
assertEquals(2, p1p2.getCollectedfrom().size());
|
||||||
|
|
|
@ -49,6 +49,12 @@
|
||||||
</build>
|
</build>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-common</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>edu.cmu</groupId>
|
<groupId>edu.cmu</groupId>
|
||||||
<artifactId>secondstring</artifactId>
|
<artifactId>secondstring</artifactId>
|
||||||
|
|
|
@ -1,32 +1,26 @@
|
||||||
|
|
||||||
package eu.dnetlib.pace.common;
|
package eu.dnetlib.pace.common;
|
||||||
|
|
||||||
|
import com.google.common.base.Joiner;
|
||||||
|
import com.google.common.collect.Sets;
|
||||||
|
import com.ibm.icu.text.Transliterator;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.text.Normalizer;
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
|
|
||||||
import com.google.common.base.Joiner;
|
|
||||||
import com.google.common.base.Splitter;
|
|
||||||
import com.google.common.collect.Iterables;
|
|
||||||
import com.google.common.collect.Sets;
|
|
||||||
import com.ibm.icu.text.Transliterator;
|
|
||||||
|
|
||||||
import eu.dnetlib.pace.clustering.NGramUtils;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Set of common functions for the framework
|
* Set of common functions for the framework
|
||||||
*
|
*
|
||||||
* @author claudio
|
* @author claudio
|
||||||
*/
|
*/
|
||||||
public class AbstractPaceFunctions {
|
public class AbstractPaceFunctions extends PaceCommonUtils {
|
||||||
|
|
||||||
// city map to be used when translating the city names into codes
|
// city map to be used when translating the city names into codes
|
||||||
private static Map<String, String> cityMap = AbstractPaceFunctions
|
private static Map<String, String> cityMap = AbstractPaceFunctions
|
||||||
|
@ -41,9 +35,6 @@ public class AbstractPaceFunctions {
|
||||||
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
|
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
|
||||||
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
|
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
|
||||||
|
|
||||||
// transliterator
|
|
||||||
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
|
||||||
|
|
||||||
// blacklist of ngrams: to avoid generic keys
|
// blacklist of ngrams: to avoid generic keys
|
||||||
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
|
||||||
|
|
||||||
|
@ -51,8 +42,6 @@ public class AbstractPaceFunctions {
|
||||||
public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>");
|
public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>");
|
||||||
|
|
||||||
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
|
||||||
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
|
|
||||||
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
|
|
||||||
|
|
||||||
// doi prefix for normalization
|
// doi prefix for normalization
|
||||||
public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)");
|
public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)");
|
||||||
|
@ -129,25 +118,6 @@ public class AbstractPaceFunctions {
|
||||||
return numberPattern.matcher(strNum).matches();
|
return numberPattern.matcher(strNum).matches();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static String fixAliases(final String s) {
|
|
||||||
final StringBuilder sb = new StringBuilder();
|
|
||||||
|
|
||||||
s.chars().forEach(ch -> {
|
|
||||||
final int i = StringUtils.indexOf(aliases_from, ch);
|
|
||||||
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
|
|
||||||
});
|
|
||||||
|
|
||||||
return sb.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
protected static String transliterate(final String s) {
|
|
||||||
try {
|
|
||||||
return transliterator.transliterate(s);
|
|
||||||
} catch (Exception e) {
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
protected static String removeSymbols(final String s) {
|
protected static String removeSymbols(final String s) {
|
||||||
final StringBuilder sb = new StringBuilder();
|
final StringBuilder sb = new StringBuilder();
|
||||||
|
|
||||||
|
@ -162,23 +132,6 @@ public class AbstractPaceFunctions {
|
||||||
return s != null;
|
return s != null;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String normalize(final String s) {
|
|
||||||
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
|
|
||||||
.toLowerCase()
|
|
||||||
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
|
|
||||||
// strings
|
|
||||||
.replaceAll("[^ \\w]+", "")
|
|
||||||
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
|
|
||||||
.replaceAll("(\\p{Punct})+", " ")
|
|
||||||
.replaceAll("(\\d)+", " ")
|
|
||||||
.replaceAll("(\\n)+", " ")
|
|
||||||
.trim();
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String nfd(final String s) {
|
|
||||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String utf8(final String s) {
|
public static String utf8(final String s) {
|
||||||
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
|
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
|
||||||
return new String(bytes, StandardCharsets.UTF_8);
|
return new String(bytes, StandardCharsets.UTF_8);
|
||||||
|
@ -233,22 +186,6 @@ public class AbstractPaceFunctions {
|
||||||
return newset;
|
return newset;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Set<String> loadFromClasspath(final String classpath) {
|
|
||||||
|
|
||||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
|
||||||
|
|
||||||
final Set<String> h = Sets.newHashSet();
|
|
||||||
try {
|
|
||||||
for (final String s : IOUtils
|
|
||||||
.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
|
||||||
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
|
|
||||||
}
|
|
||||||
} catch (final Throwable e) {
|
|
||||||
return Sets.newHashSet();
|
|
||||||
}
|
|
||||||
return h;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Map<String, String> loadMapFromClasspath(final String classpath) {
|
public static Map<String, String> loadMapFromClasspath(final String classpath) {
|
||||||
|
|
||||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||||
|
@ -303,10 +240,6 @@ public class AbstractPaceFunctions {
|
||||||
return StringUtils.substring(s, 0, 1).toLowerCase();
|
return StringUtils.substring(s, 0, 1).toLowerCase();
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static Iterable<String> tokens(final String s, final int maxTokens) {
|
|
||||||
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String normalizePid(String pid) {
|
public static String normalizePid(String pid) {
|
||||||
return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll("");
|
return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll("");
|
||||||
}
|
}
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -38,7 +38,6 @@
|
||||||
</configuration>
|
</configuration>
|
||||||
</plugin>
|
</plugin>
|
||||||
</plugins>
|
</plugins>
|
||||||
|
|
||||||
</build>
|
</build>
|
||||||
|
|
||||||
<dependencies>
|
<dependencies>
|
||||||
|
|
|
@ -189,7 +189,7 @@ public class DedupRecordFactory {
|
||||||
entity = swap;
|
entity = swap;
|
||||||
}
|
}
|
||||||
|
|
||||||
entity = MergeUtils.checkedMerge(entity, duplicate);
|
entity = MergeUtils.checkedMerge(entity, duplicate, false);
|
||||||
|
|
||||||
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
||||||
Result re = (Result) entity;
|
Result re = (Result) entity;
|
||||||
|
|
|
@ -175,6 +175,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
|
||||||
}
|
}
|
||||||
|
|
||||||
// cap pidType at w3id as from there on they are considered equal
|
// cap pidType at w3id as from there on they are considered equal
|
||||||
|
|
||||||
UserDefinedFunction mapPid = udf(
|
UserDefinedFunction mapPid = udf(
|
||||||
(String s) -> Math.min(PidType.tryValueOf(s).ordinal(), PidType.w3id.ordinal()), DataTypes.IntegerType);
|
(String s) -> Math.min(PidType.tryValueOf(s).ordinal(), PidType.w3id.ordinal()), DataTypes.IntegerType);
|
||||||
|
|
||||||
|
|
|
@ -44,9 +44,11 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
new SparkCreateSimRels(parser, getSparkSession(conf))
|
try (SparkSession session = getSparkSession(conf)) {
|
||||||
|
new SparkCreateSimRels(parser, session)
|
||||||
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
|
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void run(ISLookUpService isLookUpService)
|
public void run(ISLookUpService isLookUpService)
|
||||||
|
|
|
@ -102,6 +102,8 @@
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=15000
|
--conf spark.sql.shuffle.partitions=15000
|
||||||
|
--conf spark.network.timeout=300s
|
||||||
|
--conf spark.shuffle.registration.timeout=50000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
|
||||||
<arg>--graphOutputPath</arg><arg>${graphOutputPath}</arg>
|
<arg>--graphOutputPath</arg><arg>${graphOutputPath}</arg>
|
||||||
|
|
|
@ -33,16 +33,14 @@
|
||||||
<description>max number of elements in a connected component</description>
|
<description>max number of elements in a connected component</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>sparkDriverMemory</name>
|
<name>sparkResourceOpts</name>
|
||||||
<description>memory for driver process</description>
|
<value>--executor-memory=6G --conf spark.executor.memoryOverhead=4G --executor-cores=6 --driver-memory=8G --driver-cores=4</value>
|
||||||
|
<description>spark resource options</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>sparkExecutorMemory</name>
|
<name>sparkResourceOptsCreateMergeRel</name>
|
||||||
<description>memory for individual executor</description>
|
<value>--executor-memory=6G --conf spark.executor.memoryOverhead=4G --executor-cores=6 --driver-memory=8G --driver-cores=4</value>
|
||||||
</property>
|
<description>spark resource options</description>
|
||||||
<property>
|
|
||||||
<name>sparkExecutorCores</name>
|
|
||||||
<description>number of cores used by single executor</description>
|
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>oozieActionShareLibForSpark2</name>
|
<name>oozieActionShareLibForSpark2</name>
|
||||||
|
@ -119,9 +117,7 @@
|
||||||
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</class>
|
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</class>
|
||||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
${sparkResourceOpts}
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
@ -146,9 +142,7 @@
|
||||||
<class>eu.dnetlib.dhp.oa.dedup.SparkWhitelistSimRels</class>
|
<class>eu.dnetlib.dhp.oa.dedup.SparkWhitelistSimRels</class>
|
||||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
${sparkResourceOpts}
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
@ -174,9 +168,7 @@
|
||||||
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels</class>
|
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels</class>
|
||||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
${sparkResourceOptsCreateMergeRel}
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
@ -203,9 +195,7 @@
|
||||||
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord</class>
|
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord</class>
|
||||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
${sparkResourceOpts}
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
@ -230,9 +220,7 @@
|
||||||
<class>eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgsMergeRels</class>
|
<class>eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgsMergeRels</class>
|
||||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
${sparkResourceOpts}
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
@ -257,9 +245,7 @@
|
||||||
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord</class>
|
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord</class>
|
||||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
${sparkResourceOpts}
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
@ -283,9 +269,7 @@
|
||||||
<class>eu.dnetlib.dhp.oa.dedup.SparkUpdateEntity</class>
|
<class>eu.dnetlib.dhp.oa.dedup.SparkUpdateEntity</class>
|
||||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
${sparkResourceOpts}
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
@ -309,9 +293,7 @@
|
||||||
<class>eu.dnetlib.dhp.oa.dedup.SparkCopyRelationsNoOpenorgs</class>
|
<class>eu.dnetlib.dhp.oa.dedup.SparkCopyRelationsNoOpenorgs</class>
|
||||||
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory=${sparkExecutorMemory}
|
${sparkResourceOpts}
|
||||||
--executor-cores=${sparkExecutorCores}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
|
|
@ -123,7 +123,7 @@ class EntityMergerTest implements Serializable {
|
||||||
assertEquals(dataInfo, pub_merged.getDataInfo());
|
assertEquals(dataInfo, pub_merged.getDataInfo());
|
||||||
|
|
||||||
// verify datepicker
|
// verify datepicker
|
||||||
assertEquals("2018-09-30", pub_merged.getDateofacceptance().getValue());
|
assertEquals("2016-01-01", pub_merged.getDateofacceptance().getValue());
|
||||||
|
|
||||||
// verify authors
|
// verify authors
|
||||||
assertEquals(13, pub_merged.getAuthor().size());
|
assertEquals(13, pub_merged.getAuthor().size());
|
||||||
|
|
|
@ -78,7 +78,7 @@ public class IdGeneratorTest {
|
||||||
System.out.println("winner 3 = " + id2);
|
System.out.println("winner 3 = " + id2);
|
||||||
|
|
||||||
assertEquals("50|doi_dedup___::1a77a3bba737f8b669dcf330ad3b37e2", id1);
|
assertEquals("50|doi_dedup___::1a77a3bba737f8b669dcf330ad3b37e2", id1);
|
||||||
assertEquals("50|dedup_wf_001::0829b5191605bdbea36d6502b8c1ce1g", id2);
|
assertEquals("50|dedup_wf_002::345e5d1b80537b0d0e0a49241ae9e516", id2);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -143,7 +143,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
|
||||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(145, orgs_simrel);
|
assertEquals(86, orgs_simrel);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -172,7 +172,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
|
||||||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(181, orgs_simrel);
|
assertEquals(122, orgs_simrel);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -196,7 +196,9 @@ public class SparkOpenorgsDedupTest implements Serializable {
|
||||||
"-la",
|
"-la",
|
||||||
"lookupurl",
|
"lookupurl",
|
||||||
"-w",
|
"-w",
|
||||||
testOutputBasePath
|
testOutputBasePath,
|
||||||
|
"-h",
|
||||||
|
""
|
||||||
});
|
});
|
||||||
|
|
||||||
new SparkCreateMergeRels(parser, spark).run(isLookUpService);
|
new SparkCreateMergeRels(parser, spark).run(isLookUpService);
|
||||||
|
|
|
@ -13,14 +13,16 @@ import java.io.Serializable;
|
||||||
import java.net.URISyntaxException;
|
import java.net.URISyntaxException;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
import java.util.*;
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.cli.ParseException;
|
import org.apache.commons.cli.ParseException;
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
|
@ -129,7 +131,7 @@ public class SparkPublicationRootsTest implements Serializable {
|
||||||
.load(DedupUtility.createSimRelPath(workingPath, testActionSetId, "publication"))
|
.load(DedupUtility.createSimRelPath(workingPath, testActionSetId, "publication"))
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(37, pubs_simrel);
|
assertEquals(9, pubs_simrel);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -142,7 +144,8 @@ public class SparkPublicationRootsTest implements Serializable {
|
||||||
"--actionSetId", testActionSetId,
|
"--actionSetId", testActionSetId,
|
||||||
"--isLookUpUrl", "lookupurl",
|
"--isLookUpUrl", "lookupurl",
|
||||||
"--workingPath", workingPath,
|
"--workingPath", workingPath,
|
||||||
"--cutConnectedComponent", "3"
|
"--cutConnectedComponent", "3",
|
||||||
|
"-h", ""
|
||||||
}), spark)
|
}), spark)
|
||||||
.run(isLookUpService);
|
.run(isLookUpService);
|
||||||
|
|
||||||
|
@ -171,7 +174,8 @@ public class SparkPublicationRootsTest implements Serializable {
|
||||||
"--graphBasePath", graphInputPath,
|
"--graphBasePath", graphInputPath,
|
||||||
"--actionSetId", testActionSetId,
|
"--actionSetId", testActionSetId,
|
||||||
"--isLookUpUrl", "lookupurl",
|
"--isLookUpUrl", "lookupurl",
|
||||||
"--workingPath", workingPath
|
"--workingPath", workingPath,
|
||||||
|
"-h", ""
|
||||||
}), spark)
|
}), spark)
|
||||||
.run(isLookUpService);
|
.run(isLookUpService);
|
||||||
|
|
||||||
|
@ -207,7 +211,7 @@ public class SparkPublicationRootsTest implements Serializable {
|
||||||
assertTrue(dups.contains(r.getSource()));
|
assertTrue(dups.contains(r.getSource()));
|
||||||
});
|
});
|
||||||
|
|
||||||
assertEquals(32, merges.count());
|
assertEquals(26, merges.count());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -228,7 +232,7 @@ public class SparkPublicationRootsTest implements Serializable {
|
||||||
.textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord")
|
.textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord")
|
||||||
.map(asEntity(Publication.class), Encoders.bean(Publication.class));
|
.map(asEntity(Publication.class), Encoders.bean(Publication.class));
|
||||||
|
|
||||||
assertEquals(3, roots.count());
|
assertEquals(4, roots.count());
|
||||||
|
|
||||||
final Dataset<Publication> pubs = spark
|
final Dataset<Publication> pubs = spark
|
||||||
.read()
|
.read()
|
||||||
|
@ -369,7 +373,7 @@ public class SparkPublicationRootsTest implements Serializable {
|
||||||
.distinct()
|
.distinct()
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(19, publications); // 16 originals + 3 roots
|
assertEquals(20, publications); // 16 originals + 3 roots
|
||||||
|
|
||||||
long deletedPubs = spark
|
long deletedPubs = spark
|
||||||
.read()
|
.read()
|
||||||
|
@ -380,7 +384,7 @@ public class SparkPublicationRootsTest implements Serializable {
|
||||||
.distinct()
|
.distinct()
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(mergedPubs, deletedPubs);
|
// assertEquals(mergedPubs, deletedPubs);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String classPathResourceAsString(String path) throws IOException {
|
private static String classPathResourceAsString(String path) throws IOException {
|
||||||
|
|
|
@ -169,10 +169,10 @@ public class SparkStatsTest implements Serializable {
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(414, orgs_blocks);
|
assertEquals(414, orgs_blocks);
|
||||||
assertEquals(187, pubs_blocks);
|
assertEquals(221, pubs_blocks);
|
||||||
assertEquals(128, sw_blocks);
|
assertEquals(134, sw_blocks);
|
||||||
assertEquals(192, ds_blocks);
|
assertEquals(196, ds_blocks);
|
||||||
assertEquals(194, orp_blocks);
|
assertEquals(198, orp_blocks);
|
||||||
}
|
}
|
||||||
|
|
||||||
@AfterAll
|
@AfterAll
|
||||||
|
|
|
@ -161,7 +161,7 @@ public class SparkResultToCommunityFromProject implements Serializable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
res.setContext(propagatedContexts);
|
res.setContext(propagatedContexts);
|
||||||
return MergeUtils.checkedMerge(ret, res);
|
return MergeUtils.checkedMerge(ret, res, true);
|
||||||
}
|
}
|
||||||
return ret;
|
return ret;
|
||||||
};
|
};
|
||||||
|
|
|
@ -100,16 +100,12 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
--conf spark.sql.shuffle.partitions=8000
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
|
||||||
--conf spark.speculation=false
|
|
||||||
--conf spark.hadoop.mapreduce.map.speculative=false
|
|
||||||
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
@ -132,12 +128,11 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
@ -160,12 +155,11 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
@ -188,12 +182,11 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
|
||||||
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
|
||||||
|
@ -218,12 +211,11 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--sourcePath</arg><arg>${workingDir}/orcid/targetOrcidAssoc</arg>
|
<arg>--sourcePath</arg><arg>${workingDir}/orcid/targetOrcidAssoc</arg>
|
||||||
<arg>--outputPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
<arg>--outputPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
||||||
|
@ -247,19 +239,14 @@
|
||||||
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
|
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
|
||||||
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
<jar>dhp-enrichment-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-cores=4
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=4G
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
--conf spark.executor.memoryOverhead=5G
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
--conf spark.speculation=false
|
|
||||||
--conf spark.hadoop.mapreduce.map.speculative=false
|
|
||||||
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
|
||||||
--conf spark.sql.shuffle.partitions=15000
|
--conf spark.sql.shuffle.partitions=15000
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
||||||
|
@ -282,15 +269,12 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
--conf spark.sql.shuffle.partitions=8000
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
--conf spark.speculation=false
|
|
||||||
--conf spark.hadoop.mapreduce.map.speculative=false
|
|
||||||
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
|
||||||
|
@ -312,15 +296,12 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
--conf spark.sql.shuffle.partitions=8000
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
--conf spark.speculation=false
|
|
||||||
--conf spark.hadoop.mapreduce.map.speculative=false
|
|
||||||
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
|
||||||
|
@ -342,15 +323,12 @@
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--executor-memory=${sparkExecutorMemory}
|
--executor-memory=${sparkExecutorMemory}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.dynamicAllocation.enabled=true
|
--conf spark.sql.shuffle.partitions=4000
|
||||||
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
|
|
||||||
--conf spark.speculation=false
|
|
||||||
--conf spark.hadoop.mapreduce.map.speculative=false
|
|
||||||
--conf spark.hadoop.mapreduce.reduce.speculative=false
|
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
|
||||||
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
|
||||||
|
@ -363,15 +341,6 @@
|
||||||
|
|
||||||
<join name="wait2" to="End"/>
|
<join name="wait2" to="End"/>
|
||||||
|
|
||||||
<!-- <action name="reset_workingDir">-->
|
|
||||||
<!-- <fs>-->
|
|
||||||
<!-- <delete path="${workingDir}"/>-->
|
|
||||||
<!-- <mkdir path="${workingDir}"/>-->
|
|
||||||
<!-- </fs>-->
|
|
||||||
<!-- <ok to="End"/>-->
|
|
||||||
<!-- <error to="Kill"/>-->
|
|
||||||
<!-- </action>-->
|
|
||||||
|
|
||||||
<end name="End"/>
|
<end name="End"/>
|
||||||
|
|
||||||
</workflow-app>
|
</workflow-app>
|
|
@ -90,6 +90,12 @@
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-pace-core</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.jayway.jsonpath</groupId>
|
<groupId>com.jayway.jsonpath</groupId>
|
||||||
<artifactId>json-path</artifactId>
|
<artifactId>json-path</artifactId>
|
||||||
|
|
|
@ -71,7 +71,7 @@ class GenerateEntitiesApplicationTest {
|
||||||
|
|
||||||
protected <T extends Result> void verifyMerge(Result publication, Result dataset, Class<T> clazz,
|
protected <T extends Result> void verifyMerge(Result publication, Result dataset, Class<T> clazz,
|
||||||
String resultType) {
|
String resultType) {
|
||||||
final Result merge = MergeUtils.mergeResult(publication, dataset);
|
final Result merge = (Result) MergeUtils.merge(publication, dataset);
|
||||||
assertTrue(clazz.isAssignableFrom(merge.getClass()));
|
assertTrue(clazz.isAssignableFrom(merge.getClass()));
|
||||||
assertEquals(resultType, merge.getResulttype().getClassid());
|
assertEquals(resultType, merge.getResulttype().getClassid());
|
||||||
}
|
}
|
||||||
|
|
|
@ -71,6 +71,7 @@
|
||||||
--executor-memory=${sparkHighExecutorMemory}
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkHighDriverMemory}
|
--driver-memory=${sparkHighDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
|
||||||
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
@ -108,6 +109,7 @@
|
||||||
--executor-memory=${sparkHighExecutorMemory}
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkNormalDriverMemory}
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
|
||||||
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
@ -141,6 +143,7 @@
|
||||||
--executor-memory=${sparkHighExecutorMemory}
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkNormalDriverMemory}
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
|
||||||
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
@ -176,6 +179,7 @@
|
||||||
--executor-memory=${sparkHighExecutorMemory}
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkNormalDriverMemory}
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
|
||||||
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
@ -209,6 +213,7 @@
|
||||||
--executor-memory=${sparkHighExecutorMemory}
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkNormalDriverMemory}
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
|
||||||
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
@ -245,6 +250,7 @@
|
||||||
--executor-memory=${sparkHighExecutorMemory}
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkNormalDriverMemory}
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
|
||||||
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
@ -315,6 +321,7 @@
|
||||||
--executor-memory=${sparkNormalExecutorMemory}
|
--executor-memory=${sparkNormalExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkNormalDriverMemory}
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkNormalExecutorMemory}
|
||||||
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
@ -361,6 +368,7 @@
|
||||||
--executor-memory=${sparkNormalExecutorMemory}
|
--executor-memory=${sparkNormalExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkNormalDriverMemory}
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkNormalExecutorMemory}
|
||||||
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
@ -409,6 +417,7 @@
|
||||||
--executor-memory=${sparkHighExecutorMemory}
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkHighDriverMemory}
|
--driver-memory=${sparkHighDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
|
||||||
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
@ -444,6 +453,7 @@
|
||||||
--executor-memory=${sparkHighExecutorMemory}
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkHighDriverMemory}
|
--driver-memory=${sparkHighDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
|
||||||
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
@ -482,6 +492,7 @@
|
||||||
--executor-memory=${sparkHighExecutorMemory}
|
--executor-memory=${sparkHighExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkNormalDriverMemory}
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
|
||||||
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
@ -533,6 +544,7 @@
|
||||||
--executor-memory=${sparkNormalExecutorMemory}
|
--executor-memory=${sparkNormalExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkNormalDriverMemory}
|
--driver-memory=${sparkNormalDriverMemory}
|
||||||
|
--conf spark.executor.memoryOverhead=${sparkNormalExecutorMemory}
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
|
Loading…
Reference in New Issue