merged again from beta (I hope for the last time)

This commit is contained in:
Sandro La Bruzzo 2024-05-22 11:02:46 +02:00
commit 66c1ffc866
65 changed files with 874 additions and 3529 deletions

View File

@ -66,11 +66,9 @@
<groupId>edu.cmu</groupId> <groupId>edu.cmu</groupId>
<artifactId>secondstring</artifactId> <artifactId>secondstring</artifactId>
</dependency> </dependency>
<dependency> <dependency>
<groupId>eu.dnetlib.dhp</groupId> <groupId>com.ibm.icu</groupId>
<artifactId>dhp-pace-core</artifactId> <artifactId>icu4j</artifactId>
<version>${project.version}</version>
</dependency> </dependency>
<dependency> <dependency>
@ -119,6 +117,10 @@
<groupId>net.sf.saxon</groupId> <groupId>net.sf.saxon</groupId>
<artifactId>Saxon-HE</artifactId> <artifactId>Saxon-HE</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>jcl-over-slf4j</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.apache.cxf</groupId> <groupId>org.apache.cxf</groupId>
<artifactId>cxf-rt-transports-http</artifactId> <artifactId>cxf-rt-transports-http</artifactId>

View File

@ -135,7 +135,7 @@ public class GroupEntitiesSparkJob {
.applyCoarVocabularies(entity, vocs), .applyCoarVocabularies(entity, vocs),
OAFENTITY_KRYO_ENC) OAFENTITY_KRYO_ENC)
.groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING()) .groupByKey((MapFunction<OafEntity, String>) OafEntity::getId, Encoders.STRING())
.mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeGroup, OAFENTITY_KRYO_ENC) .mapGroups((MapGroupsFunction<String, OafEntity, OafEntity>) MergeUtils::mergeById, OAFENTITY_KRYO_ENC)
.map( .map(
(MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>( (MapFunction<OafEntity, Tuple2<String, OafEntity>>) t -> new Tuple2<>(
t.getClass().getName(), t), t.getClass().getName(), t),

View File

@ -30,8 +30,16 @@ import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
public class MergeUtils { public class MergeUtils {
public static <T extends Oaf> T mergeById(String s, Iterator<T> oafEntityIterator) {
return mergeGroup(s, oafEntityIterator, true);
}
public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator) { public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator) {
return mergeGroup(s, oafEntityIterator, false);
}
public static <T extends Oaf> T mergeGroup(String s, Iterator<T> oafEntityIterator,
boolean checkDelegateAuthority) {
TreeSet<T> sortedEntities = new TreeSet<>((o1, o2) -> { TreeSet<T> sortedEntities = new TreeSet<>((o1, o2) -> {
int res = 0; int res = 0;
@ -52,18 +60,22 @@ public class MergeUtils {
sortedEntities.add(oafEntityIterator.next()); sortedEntities.add(oafEntityIterator.next());
} }
T merged = sortedEntities.descendingIterator().next();
Iterator<T> it = sortedEntities.descendingIterator(); Iterator<T> it = sortedEntities.descendingIterator();
T merged = it.next();
while (it.hasNext()) { while (it.hasNext()) {
merged = checkedMerge(merged, it.next()); merged = checkedMerge(merged, it.next(), checkDelegateAuthority);
} }
return merged; return merged;
} }
public static <T extends Oaf> T checkedMerge(final T left, final T right) { public static <T extends Oaf> T checkedMerge(final T left, final T right, boolean checkDelegateAuthority) {
return (T) merge(left, right, false); return (T) merge(left, right, checkDelegateAuthority);
}
public static <T extends Result, E extends Result> Result mergeResult(final T left, final E right) {
return (Result) merge(left, right, false);
} }
public static Oaf merge(final Oaf left, final Oaf right) { public static Oaf merge(final Oaf left, final Oaf right) {
@ -108,7 +120,7 @@ public class MergeUtils {
return mergeSoftware((Software) left, (Software) right); return mergeSoftware((Software) left, (Software) right);
} }
return mergeResult((Result) left, (Result) right); return mergeResultFields((Result) left, (Result) right);
} else if (sameClass(left, right, Datasource.class)) { } else if (sameClass(left, right, Datasource.class)) {
// TODO // TODO
final int trust = compareTrust(left, right); final int trust = compareTrust(left, right);
@ -151,9 +163,9 @@ public class MergeUtils {
} }
// TODO: raise trust to have preferred fields from one or the other?? // TODO: raise trust to have preferred fields from one or the other??
if (new ResultTypeComparator().compare(left, right) < 0) { if (new ResultTypeComparator().compare(left, right) < 0) {
return mergeResult(left, right); return mergeResultFields(left, right);
} else { } else {
return mergeResult(right, left); return mergeResultFields(right, left);
} }
} }
@ -263,6 +275,12 @@ public class MergeUtils {
// TODO review // TODO review
private static List<KeyValue> mergeByKey(List<KeyValue> left, List<KeyValue> right, int trust) { private static List<KeyValue> mergeByKey(List<KeyValue> left, List<KeyValue> right, int trust) {
if (left == null) {
return right;
} else if (right == null) {
return left;
}
if (trust < 0) { if (trust < 0) {
List<KeyValue> s = left; List<KeyValue> s = left;
left = right; left = right;
@ -368,7 +386,7 @@ public class MergeUtils {
return merge; return merge;
} }
public static <T extends Result> T mergeResult(T original, T enrich) { private static <T extends Result> T mergeResultFields(T original, T enrich) {
final int trust = compareTrust(original, enrich); final int trust = compareTrust(original, enrich);
T merge = mergeOafEntityFields(original, enrich, trust); T merge = mergeOafEntityFields(original, enrich, trust);
@ -694,7 +712,7 @@ public class MergeUtils {
private static <T extends OtherResearchProduct> T mergeORP(T original, T enrich) { private static <T extends OtherResearchProduct> T mergeORP(T original, T enrich) {
int trust = compareTrust(original, enrich); int trust = compareTrust(original, enrich);
final T merge = mergeResult(original, enrich); final T merge = mergeResultFields(original, enrich);
merge.setContactperson(unionDistinctLists(merge.getContactperson(), enrich.getContactperson(), trust)); merge.setContactperson(unionDistinctLists(merge.getContactperson(), enrich.getContactperson(), trust));
merge.setContactgroup(unionDistinctLists(merge.getContactgroup(), enrich.getContactgroup(), trust)); merge.setContactgroup(unionDistinctLists(merge.getContactgroup(), enrich.getContactgroup(), trust));
@ -705,7 +723,7 @@ public class MergeUtils {
private static <T extends Software> T mergeSoftware(T original, T enrich) { private static <T extends Software> T mergeSoftware(T original, T enrich) {
int trust = compareTrust(original, enrich); int trust = compareTrust(original, enrich);
final T merge = mergeResult(original, enrich); final T merge = mergeResultFields(original, enrich);
merge.setDocumentationUrl(unionDistinctLists(merge.getDocumentationUrl(), enrich.getDocumentationUrl(), trust)); merge.setDocumentationUrl(unionDistinctLists(merge.getDocumentationUrl(), enrich.getDocumentationUrl(), trust));
merge.setLicense(unionDistinctLists(merge.getLicense(), enrich.getLicense(), trust)); merge.setLicense(unionDistinctLists(merge.getLicense(), enrich.getLicense(), trust));
@ -719,7 +737,7 @@ public class MergeUtils {
private static <T extends Dataset> T mergeDataset(T original, T enrich) { private static <T extends Dataset> T mergeDataset(T original, T enrich) {
int trust = compareTrust(original, enrich); int trust = compareTrust(original, enrich);
T merge = mergeResult(original, enrich); T merge = mergeResultFields(original, enrich);
merge.setStoragedate(chooseReference(merge.getStoragedate(), enrich.getStoragedate(), trust)); merge.setStoragedate(chooseReference(merge.getStoragedate(), enrich.getStoragedate(), trust));
merge.setDevice(chooseReference(merge.getDevice(), enrich.getDevice(), trust)); merge.setDevice(chooseReference(merge.getDevice(), enrich.getDevice(), trust));
@ -738,7 +756,7 @@ public class MergeUtils {
public static <T extends Publication> T mergePublication(T original, T enrich) { public static <T extends Publication> T mergePublication(T original, T enrich) {
final int trust = compareTrust(original, enrich); final int trust = compareTrust(original, enrich);
T merged = mergeResult(original, enrich); T merged = mergeResultFields(original, enrich);
merged.setJournal(chooseReference(merged.getJournal(), enrich.getJournal(), trust)); merged.setJournal(chooseReference(merged.getJournal(), enrich.getJournal(), trust));
@ -856,9 +874,11 @@ public class MergeUtils {
if (toEnrichInstances == null) { if (toEnrichInstances == null) {
return enrichmentResult; return enrichmentResult;
} }
if (enrichmentInstances == null) {
return enrichmentResult; if (enrichmentInstances == null || enrichmentInstances.isEmpty()) {
return toEnrichInstances;
} }
Map<String, Instance> ri = toInstanceMap(enrichmentInstances); Map<String, Instance> ri = toInstanceMap(enrichmentInstances);
toEnrichInstances.forEach(i -> { toEnrichInstances.forEach(i -> {

View File

@ -36,6 +36,15 @@ public class ResultTypeComparator implements Comparator<Result> {
return 1; return 1;
} }
if (left.getResulttype() == null || left.getResulttype().getClassid() == null) {
if (right.getResulttype() == null || right.getResulttype().getClassid() == null) {
return 0;
}
return 1;
} else if (right.getResulttype() == null || right.getResulttype().getClassid() == null) {
return -1;
}
String lClass = left.getResulttype().getClassid(); String lClass = left.getResulttype().getClassid();
String rClass = right.getResulttype().getClassid(); String rClass = right.getResulttype().getClassid();

View File

@ -0,0 +1,101 @@
package eu.dnetlib.pace.common;
import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator;
/**
* Set of common functions for the framework
*
* @author claudio
*/
public class PaceCommonUtils {
// transliterator
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
protected static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
protected static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
protected static Pattern hexUnicodePattern = Pattern.compile("\\\\u(\\p{XDigit}{4})");
protected static String fixAliases(final String s) {
final StringBuilder sb = new StringBuilder();
s.chars().forEach(ch -> {
final int i = StringUtils.indexOf(aliases_from, ch);
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
});
return sb.toString();
}
protected static String transliterate(final String s) {
try {
return transliterator.transliterate(s);
} catch (Exception e) {
return s;
}
}
public static String normalize(final String s) {
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
// strings
.replaceAll("[^ \\w]+", "")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
public static String nfd(final String s) {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
public static String unicodeNormalization(final String s) {
Matcher m = hexUnicodePattern.matcher(s);
StringBuffer buf = new StringBuffer(s.length());
while (m.find()) {
String ch = String.valueOf((char) Integer.parseInt(m.group(1), 16));
m.appendReplacement(buf, Matcher.quoteReplacement(ch));
}
m.appendTail(buf);
return buf.toString();
}
public static Set<String> loadFromClasspath(final String classpath) {
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
final Set<String> h = Sets.newHashSet();
try {
for (final String s : IOUtils
.readLines(PaceCommonUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
}
} catch (final Throwable e) {
return Sets.newHashSet();
}
return h;
}
protected static Iterable<String> tokens(final String s, final int maxTokens) {
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
}
}

View File

@ -12,7 +12,7 @@ import com.google.common.collect.Iterables;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.hash.Hashing; import com.google.common.hash.Hashing;
import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.common.PaceCommonUtils;
import eu.dnetlib.pace.util.Capitalise; import eu.dnetlib.pace.util.Capitalise;
import eu.dnetlib.pace.util.DotAbbreviations; import eu.dnetlib.pace.util.DotAbbreviations;
@ -86,7 +86,7 @@ public class Person {
private List<String> splitTerms(final String s) { private List<String> splitTerms(final String s) {
if (particles == null) { if (particles == null) {
particles = AbstractPaceFunctions.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt"); particles = PaceCommonUtils.loadFromClasspath("/eu/dnetlib/pace/config/name_particles.txt");
} }
final List<String> list = Lists.newArrayList(); final List<String> list = Lists.newArrayList();

View File

@ -15,4 +15,4 @@ public class Capitalise implements Function<String, String> {
public String apply(final String s) { public String apply(final String s) {
return WordUtils.capitalize(s.toLowerCase(), DELIM); return WordUtils.capitalize(s.toLowerCase(), DELIM);
} }
}; }

View File

@ -8,4 +8,4 @@ public class DotAbbreviations implements Function<String, String> {
public String apply(String s) { public String apply(String s) {
return s.length() == 1 ? s + "." : s; return s.length() == 1 ? s + "." : s;
} }
}; }

View File

@ -389,7 +389,7 @@ object ScholixUtils extends Serializable {
if (persistentIdentifiers.isEmpty) if (persistentIdentifiers.isEmpty)
return null return null
s.setLocalIdentifier(persistentIdentifiers.asJava) s.setLocalIdentifier(persistentIdentifiers.asJava)
s.setTypology(r.getResulttype.getClassid) //s.setTypology(r.getResulttype.getClassid)
s.setSubType(r.getInstance().get(0).getInstancetype.getClassname) s.setSubType(r.getInstance().get(0).getInstancetype.getClassname)

View File

@ -63,7 +63,7 @@ public class MergeUtilsTest {
assertEquals(1, d1.getCollectedfrom().size()); assertEquals(1, d1.getCollectedfrom().size());
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
final Result p1d2 = MergeUtils.checkedMerge(p1, d2); final Result p1d2 = MergeUtils.checkedMerge(p1, d2, true);
assertEquals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, p1d2.getResulttype().getClassid()); assertEquals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, p1d2.getResulttype().getClassid());
assertTrue(p1d2 instanceof Publication); assertTrue(p1d2 instanceof Publication);
assertEquals(p1.getId(), p1d2.getId()); assertEquals(p1.getId(), p1d2.getId());
@ -74,7 +74,7 @@ public class MergeUtilsTest {
Publication p2 = read("publication_2.json", Publication.class); Publication p2 = read("publication_2.json", Publication.class);
Dataset d1 = read("dataset_1.json", Dataset.class); Dataset d1 = read("dataset_1.json", Dataset.class);
final Result p2d1 = MergeUtils.checkedMerge(p2, d1); final Result p2d1 = MergeUtils.checkedMerge(p2, d1, true);
assertEquals((ModelConstants.DATASET_RESULTTYPE_CLASSID), p2d1.getResulttype().getClassid()); assertEquals((ModelConstants.DATASET_RESULTTYPE_CLASSID), p2d1.getResulttype().getClassid());
assertTrue(p2d1 instanceof Dataset); assertTrue(p2d1 instanceof Dataset);
assertEquals(d1.getId(), p2d1.getId()); assertEquals(d1.getId(), p2d1.getId());
@ -86,7 +86,7 @@ public class MergeUtilsTest {
Publication p1 = read("publication_1.json", Publication.class); Publication p1 = read("publication_1.json", Publication.class);
Publication p2 = read("publication_2.json", Publication.class); Publication p2 = read("publication_2.json", Publication.class);
Result p1p2 = MergeUtils.checkedMerge(p1, p2); Result p1p2 = MergeUtils.checkedMerge(p1, p2, true);
assertTrue(p1p2 instanceof Publication); assertTrue(p1p2 instanceof Publication);
assertEquals(p1.getId(), p1p2.getId()); assertEquals(p1.getId(), p1p2.getId());
assertEquals(2, p1p2.getCollectedfrom().size()); assertEquals(2, p1p2.getCollectedfrom().size());

View File

@ -49,6 +49,12 @@
</build> </build>
<dependencies> <dependencies>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-common</artifactId>
<version>${project.version}</version>
</dependency>
<dependency> <dependency>
<groupId>edu.cmu</groupId> <groupId>edu.cmu</groupId>
<artifactId>secondstring</artifactId> <artifactId>secondstring</artifactId>

View File

@ -4,7 +4,6 @@ package eu.dnetlib.pace.common;
import java.io.IOException; import java.io.IOException;
import java.io.StringWriter; import java.io.StringWriter;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.text.Normalizer;
import java.util.*; import java.util.*;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -14,19 +13,15 @@ import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import com.google.common.base.Joiner; import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import com.ibm.icu.text.Transliterator; import com.ibm.icu.text.Transliterator;
import eu.dnetlib.pace.clustering.NGramUtils;
/** /**
* Set of common functions for the framework * Set of common functions for the framework
* *
* @author claudio * @author claudio
*/ */
public class AbstractPaceFunctions { public class AbstractPaceFunctions extends PaceCommonUtils {
// city map to be used when translating the city names into codes // city map to be used when translating the city names into codes
private static Map<String, String> cityMap = AbstractPaceFunctions private static Map<String, String> cityMap = AbstractPaceFunctions
@ -41,9 +36,6 @@ public class AbstractPaceFunctions {
protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt"); protected static Set<String> stopwords_it = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_it.txt");
protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt"); protected static Set<String> stopwords_pt = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_pt.txt");
// transliterator
protected static Transliterator transliterator = Transliterator.getInstance("Any-Eng");
// blacklist of ngrams: to avoid generic keys // blacklist of ngrams: to avoid generic keys
protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt"); protected static Set<String> ngramBlacklist = loadFromClasspath("/eu/dnetlib/pace/config/ngram_blacklist.txt");
@ -51,8 +43,6 @@ public class AbstractPaceFunctions {
public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>"); public static final Pattern HTML_REGEX = Pattern.compile("<[^>]*>");
private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "; private static final String alpha = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
private static final String aliases_from = "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾ⁿ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎àáâäæãåāèéêëēėęəîïíīįìôöòóœøōõûüùúūßśšłžźżçćčñń";
private static final String aliases_to = "0123456789+-=()n0123456789+-=()aaaaaaaaeeeeeeeeiiiiiioooooooouuuuussslzzzcccnn";
// doi prefix for normalization // doi prefix for normalization
public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)"); public static final Pattern DOI_PREFIX = Pattern.compile("(https?:\\/\\/dx\\.doi\\.org\\/)|(doi:)");
@ -129,25 +119,6 @@ public class AbstractPaceFunctions {
return numberPattern.matcher(strNum).matches(); return numberPattern.matcher(strNum).matches();
} }
protected static String fixAliases(final String s) {
final StringBuilder sb = new StringBuilder();
s.chars().forEach(ch -> {
final int i = StringUtils.indexOf(aliases_from, ch);
sb.append(i >= 0 ? aliases_to.charAt(i) : (char) ch);
});
return sb.toString();
}
protected static String transliterate(final String s) {
try {
return transliterator.transliterate(s);
} catch (Exception e) {
return s;
}
}
protected static String removeSymbols(final String s) { protected static String removeSymbols(final String s) {
final StringBuilder sb = new StringBuilder(); final StringBuilder sb = new StringBuilder();
@ -162,23 +133,6 @@ public class AbstractPaceFunctions {
return s != null; return s != null;
} }
public static String normalize(final String s) {
return fixAliases(transliterate(nfd(unicodeNormalization(s))))
.toLowerCase()
// do not compact the regexes in a single expression, would cause StackOverflowError in case of large input
// strings
.replaceAll("[^ \\w]+", "")
.replaceAll("(\\p{InCombiningDiacriticalMarks})+", "")
.replaceAll("(\\p{Punct})+", " ")
.replaceAll("(\\d)+", " ")
.replaceAll("(\\n)+", " ")
.trim();
}
public static String nfd(final String s) {
return Normalizer.normalize(s, Normalizer.Form.NFD);
}
public static String utf8(final String s) { public static String utf8(final String s) {
byte[] bytes = s.getBytes(StandardCharsets.UTF_8); byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
return new String(bytes, StandardCharsets.UTF_8); return new String(bytes, StandardCharsets.UTF_8);
@ -233,22 +187,6 @@ public class AbstractPaceFunctions {
return newset; return newset;
} }
public static Set<String> loadFromClasspath(final String classpath) {
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
final Set<String> h = Sets.newHashSet();
try {
for (final String s : IOUtils
.readLines(NGramUtils.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
h.add(fixAliases(transliterator.transliterate(s))); // transliteration of the stopwords
}
} catch (final Throwable e) {
return Sets.newHashSet();
}
return h;
}
public static Map<String, String> loadMapFromClasspath(final String classpath) { public static Map<String, String> loadMapFromClasspath(final String classpath) {
Transliterator transliterator = Transliterator.getInstance("Any-Eng"); Transliterator transliterator = Transliterator.getInstance("Any-Eng");
@ -303,10 +241,6 @@ public class AbstractPaceFunctions {
return StringUtils.substring(s, 0, 1).toLowerCase(); return StringUtils.substring(s, 0, 1).toLowerCase();
} }
protected static Iterable<String> tokens(final String s, final int maxTokens) {
return Iterables.limit(Splitter.on(" ").omitEmptyStrings().trimResults().split(s), maxTokens);
}
public static String normalizePid(String pid) { public static String normalizePid(String pid) {
return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll(""); return DOI_PREFIX.matcher(pid.toLowerCase()).replaceAll("");
} }

View File

@ -80,9 +80,11 @@ public class PrepareFOSSparkJob implements Serializable {
fosDataset fosDataset
.groupByKey((MapFunction<FOSDataModel, String>) v -> v.getOaid().toLowerCase(), Encoders.STRING()) .groupByKey((MapFunction<FOSDataModel, String>) v -> v.getOaid().toLowerCase(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, FOSDataModel, Result>) (k, it) -> { .mapGroups(
return getResult(ModelSupport.getIdPrefix(Result.class) + "|" + k, it); (MapGroupsFunction<String, FOSDataModel, Result>) (k,
}, Encoders.bean(Result.class)) it) -> getResult(
ModelSupport.entityIdPrefix.get(Result.class.getSimpleName().toLowerCase()) + "|" + k, it),
Encoders.bean(Result.class))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")

View File

@ -18,7 +18,11 @@ import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory; import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource; import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.*; import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -35,7 +39,7 @@ import eu.dnetlib.dhp.common.collection.CollectorException;
import eu.dnetlib.dhp.common.collection.HttpClientParams; import eu.dnetlib.dhp.common.collection.HttpClientParams;
/** /**
* log.info(...) equal to log.trace(...) in the application-logs * log.info(...) equal to log.trace(...) in the application-logs
* <p> * <p>
* known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue * known bug: at resumptionType 'discover' if the (resultTotal % resultSizeValue) == 0 the collecting fails -> change the resultSizeValue
* *
@ -47,6 +51,7 @@ public class RestIterator implements Iterator<String> {
private static final Logger log = LoggerFactory.getLogger(RestIterator.class); private static final Logger log = LoggerFactory.getLogger(RestIterator.class);
public static final String UTF_8 = "UTF-8"; public static final String UTF_8 = "UTF-8";
private static final int MAX_ATTEMPTS = 5;
private final HttpClientParams clientParams; private final HttpClientParams clientParams;
@ -60,8 +65,9 @@ public class RestIterator implements Iterator<String> {
private final int resultSizeValue; private final int resultSizeValue;
private int resumptionInt = 0; // integer resumption token (first record to harvest) private int resumptionInt = 0; // integer resumption token (first record to harvest)
private int resultTotal = -1; private int resultTotal = -1;
private String resumptionStr = Integer.toString(resumptionInt); // string resumption token (first record to harvest private String resumptionStr = Integer.toString(this.resumptionInt); // string resumption token (first record to
// or token scanned from results) // harvest
// or token scanned from results)
private InputStream resultStream; private InputStream resultStream;
private Transformer transformer; private Transformer transformer;
private XPath xpath; private XPath xpath;
@ -73,7 +79,7 @@ public class RestIterator implements Iterator<String> {
private final String querySize; private final String querySize;
private final String authMethod; private final String authMethod;
private final String authToken; private final String authToken;
private final Queue<String> recordQueue = new PriorityBlockingQueue<String>(); private final Queue<String> recordQueue = new PriorityBlockingQueue<>();
private int discoverResultSize = 0; private int discoverResultSize = 0;
private int pagination = 1; private int pagination = 1;
/* /*
@ -83,8 +89,8 @@ public class RestIterator implements Iterator<String> {
*/ */
private final String resultOutputFormat; private final String resultOutputFormat;
/** RestIterator class /**
* compatible to version 1.3.33 * RestIterator class compatible to version 1.3.33
*/ */
public RestIterator( public RestIterator(
final HttpClientParams clientParams, final HttpClientParams clientParams,
@ -108,40 +114,42 @@ public class RestIterator implements Iterator<String> {
this.resumptionType = resumptionType; this.resumptionType = resumptionType;
this.resumptionParam = resumptionParam; this.resumptionParam = resumptionParam;
this.resultFormatValue = resultFormatValue; this.resultFormatValue = resultFormatValue;
this.resultSizeValue = Integer.valueOf(resultSizeValueStr); this.resultSizeValue = Integer.parseInt(resultSizeValueStr);
this.queryParams = queryParams; this.queryParams = queryParams;
this.authMethod = authMethod; this.authMethod = authMethod;
this.authToken = authToken; this.authToken = authToken;
this.resultOutputFormat = resultOutputFormat; this.resultOutputFormat = resultOutputFormat;
queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue this.queryFormat = StringUtils.isNotBlank(resultFormatParam) ? "&" + resultFormatParam + "=" + resultFormatValue
: "";
this.querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr
: ""; : "";
querySize = StringUtils.isNotBlank(resultSizeParam) ? "&" + resultSizeParam + "=" + resultSizeValueStr : "";
try { try {
initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath); initXmlTransformation(resultTotalXpath, resumptionXpath, entityXpath);
} catch (Exception e) { } catch (final Exception e) {
throw new IllegalStateException("xml transformation init failed: " + e.getMessage()); throw new IllegalStateException("xml transformation init failed: " + e.getMessage());
} }
initQueue(); initQueue();
} }
private void initXmlTransformation(String resultTotalXpath, String resumptionXpath, String entityXpath) private void initXmlTransformation(final String resultTotalXpath, final String resumptionXpath,
final String entityXpath)
throws TransformerConfigurationException, XPathExpressionException { throws TransformerConfigurationException, XPathExpressionException {
final TransformerFactory factory = TransformerFactory.newInstance(); final TransformerFactory factory = TransformerFactory.newInstance();
transformer = factory.newTransformer(); this.transformer = factory.newTransformer();
transformer.setOutputProperty(OutputKeys.INDENT, "yes"); this.transformer.setOutputProperty(OutputKeys.INDENT, "yes");
transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3"); this.transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "3");
xpath = XPathFactory.newInstance().newXPath(); this.xpath = XPathFactory.newInstance().newXPath();
xprResultTotalPath = xpath.compile(resultTotalXpath); this.xprResultTotalPath = this.xpath.compile(resultTotalXpath);
xprResumptionPath = xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath); this.xprResumptionPath = this.xpath.compile(StringUtils.isBlank(resumptionXpath) ? "/" : resumptionXpath);
xprEntity = xpath.compile(entityXpath); this.xprEntity = this.xpath.compile(entityXpath);
} }
private void initQueue() { private void initQueue() {
query = baseUrl + "?" + queryParams + querySize + queryFormat; this.query = this.baseUrl + "?" + this.queryParams + this.querySize + this.queryFormat;
log.info("REST calls starting with {}", query); log.info("REST calls starting with {}", this.query);
} }
private void disconnect() { private void disconnect() {
@ -154,12 +162,11 @@ public class RestIterator implements Iterator<String> {
*/ */
@Override @Override
public boolean hasNext() { public boolean hasNext() {
if (recordQueue.isEmpty() && query.isEmpty()) { if (this.recordQueue.isEmpty() && this.query.isEmpty()) {
disconnect(); disconnect();
return false; return false;
} else {
return true;
} }
return true;
} }
/* /*
@ -168,214 +175,241 @@ public class RestIterator implements Iterator<String> {
*/ */
@Override @Override
public String next() { public String next() {
synchronized (recordQueue) { synchronized (this.recordQueue) {
while (recordQueue.isEmpty() && !query.isEmpty()) { while (this.recordQueue.isEmpty() && !this.query.isEmpty()) {
try { try {
query = downloadPage(query); this.query = downloadPage(this.query, 0);
} catch (CollectorException e) { } catch (final CollectorException e) {
log.debug("CollectorPlugin.next()-Exception: {}", e); log.debug("CollectorPlugin.next()-Exception: {}", e);
throw new RuntimeException(e); throw new RuntimeException(e);
} }
} }
return recordQueue.poll(); return this.recordQueue.poll();
} }
} }
/* /*
* download page and return nextQuery * download page and return nextQuery (with number of attempt)
*/ */
private String downloadPage(String query) throws CollectorException { private String downloadPage(String query, final int attempt) throws CollectorException {
String resultJson;
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
String nextQuery = "";
String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
Node resultNode = null;
NodeList nodeList = null;
String qUrlArgument = "";
int urlOldResumptionSize = 0;
InputStream theHttpInputStream;
// check if cursor=* is initial set otherwise add it to the queryParam URL if (attempt > MAX_ATTEMPTS) {
if (resumptionType.equalsIgnoreCase("deep-cursor")) { throw new CollectorException("Max Number of attempts reached, query:" + query);
log.debug("check resumptionType deep-cursor and check cursor=*?{}", query); }
if (!query.contains("&cursor=")) {
query += "&cursor=*"; if (attempt > 0) {
final int delay = (attempt * 5000);
log.debug("Attempt {} with delay {}", attempt, delay);
try {
Thread.sleep(delay);
} catch (final InterruptedException e) {
new CollectorException(e);
} }
} }
try { try {
log.info("requestig URL [{}]", query); String resultJson;
String resultXml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>";
String nextQuery = "";
final String emptyXml = resultXml + "<" + JsonUtils.XML_WRAP_TAG + "></" + JsonUtils.XML_WRAP_TAG + ">";
Node resultNode = null;
NodeList nodeList = null;
String qUrlArgument = "";
int urlOldResumptionSize = 0;
InputStream theHttpInputStream;
URL qUrl = new URL(query); // check if cursor=* is initial set otherwise add it to the queryParam URL
log.debug("authMethod: {}", authMethod); if ("deep-cursor".equalsIgnoreCase(this.resumptionType)) {
if ("bearer".equalsIgnoreCase(this.authMethod)) { log.debug("check resumptionType deep-cursor and check cursor=*?{}", query);
log.trace("authMethod before inputStream: {}", resultXml); if (!query.contains("&cursor=")) {
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection(); query += "&cursor=*";
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + authToken);
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
conn.setRequestMethod("GET");
theHttpInputStream = conn.getInputStream();
} else if (BASIC.equalsIgnoreCase(this.authMethod)) {
log.trace("authMethod before inputStream: {}", resultXml);
HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + authToken);
conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
conn.setRequestMethod("GET");
theHttpInputStream = conn.getInputStream();
} else {
theHttpInputStream = qUrl.openStream();
}
resultStream = theHttpInputStream;
if ("json".equals(resultOutputFormat)) {
resultJson = IOUtils.toString(resultStream, StandardCharsets.UTF_8);
resultXml = JsonUtils.convertToXML(resultJson);
resultStream = IOUtils.toInputStream(resultXml, UTF_8);
}
if (!(emptyXml).equalsIgnoreCase(resultXml)) {
resultNode = (Node) xpath.evaluate("/", new InputSource(resultStream), XPathConstants.NODE);
nodeList = (NodeList) xprEntity.evaluate(resultNode, XPathConstants.NODESET);
log.debug("nodeList.length: {}", nodeList.getLength());
for (int i = 0; i < nodeList.getLength(); i++) {
StringWriter sw = new StringWriter();
transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
String toEnqueue = sw.toString();
if (toEnqueue == null || StringUtils.isBlank(toEnqueue) || emptyXml.equalsIgnoreCase(toEnqueue)) {
log.warn("The following record resulted in empty item for the feeding queue: {}", resultXml);
} else {
recordQueue.add(sw.toString());
}
} }
} else {
log.warn("resultXml is equal with emptyXml");
} }
resumptionInt += resultSizeValue; try {
log.info("requesting URL [{}]", query);
switch (resumptionType.toLowerCase()) { final URL qUrl = new URL(query);
case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items log.debug("authMethod: {}", this.authMethod);
resumptionStr = xprResumptionPath.evaluate(resultNode); if ("bearer".equalsIgnoreCase(this.authMethod)) {
break; log.trace("authMethod before inputStream: {}", resultXml);
final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Bearer " + this.authToken);
conn.setRequestProperty(HttpHeaders.CONTENT_TYPE, ContentType.APPLICATION_JSON.getMimeType());
conn.setRequestMethod("GET");
theHttpInputStream = conn.getInputStream();
} else if (this.BASIC.equalsIgnoreCase(this.authMethod)) {
log.trace("authMethod before inputStream: {}", resultXml);
final HttpURLConnection conn = (HttpURLConnection) qUrl.openConnection();
conn.setRequestProperty(HttpHeaders.AUTHORIZATION, "Basic " + this.authToken);
conn.setRequestProperty(HttpHeaders.ACCEPT, ContentType.APPLICATION_XML.getMimeType());
conn.setRequestMethod("GET");
theHttpInputStream = conn.getInputStream();
} else {
theHttpInputStream = qUrl.openStream();
}
case "count": // begin at one step for all records, iterate over items this.resultStream = theHttpInputStream;
resumptionStr = Integer.toString(resumptionInt); if ("json".equals(this.resultOutputFormat)) {
break; resultJson = IOUtils.toString(this.resultStream, StandardCharsets.UTF_8);
resultXml = JsonUtils.convertToXML(resultJson);
this.resultStream = IOUtils.toInputStream(resultXml, UTF_8);
}
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808) if (!(emptyXml).equalsIgnoreCase(resultXml)) {
if (resultSizeValue < 2) { resultNode = (Node) this.xpath
throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2"); .evaluate("/", new InputSource(this.resultStream), XPathConstants.NODE);
nodeList = (NodeList) this.xprEntity.evaluate(resultNode, XPathConstants.NODESET);
log.debug("nodeList.length: {}", nodeList.getLength());
for (int i = 0; i < nodeList.getLength(); i++) {
final StringWriter sw = new StringWriter();
this.transformer.transform(new DOMSource(nodeList.item(i)), new StreamResult(sw));
final String toEnqueue = sw.toString();
if ((toEnqueue == null) || StringUtils.isBlank(toEnqueue)
|| emptyXml.equalsIgnoreCase(toEnqueue)) {
log
.warn(
"The following record resulted in empty item for the feeding queue: {}", resultXml);
} else {
this.recordQueue.add(sw.toString());
}
} }
qUrlArgument = qUrl.getQuery(); } else {
String[] arrayQUrlArgument = qUrlArgument.split("&"); log.warn("resultXml is equal with emptyXml");
for (String arrayUrlArgStr : arrayQUrlArgument) { }
if (arrayUrlArgStr.startsWith(resumptionParam)) {
String[] resumptionKeyValue = arrayUrlArgStr.split("="); this.resumptionInt += this.resultSizeValue;
if (isInteger(resumptionKeyValue[1])) {
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]); switch (this.resumptionType.toLowerCase()) {
log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize); case "scan": // read of resumptionToken , evaluate next results, e.g. OAI, iterate over items
} else { this.resumptionStr = this.xprResumptionPath.evaluate(resultNode);
log.debug("discover OldResumptionSize from Url (str): {}", resumptionKeyValue[1]); break;
case "count": // begin at one step for all records, iterate over items
this.resumptionStr = Integer.toString(this.resumptionInt);
break;
case "discover": // size of result items unknown, iterate over items (for openDOAR - 201808)
if (this.resultSizeValue < 2) {
throw new CollectorException("Mode: discover, Param 'resultSizeValue' is less than 2");
}
qUrlArgument = qUrl.getQuery();
final String[] arrayQUrlArgument = qUrlArgument.split("&");
for (final String arrayUrlArgStr : arrayQUrlArgument) {
if (arrayUrlArgStr.startsWith(this.resumptionParam)) {
final String[] resumptionKeyValue = arrayUrlArgStr.split("=");
if (isInteger(resumptionKeyValue[1])) {
urlOldResumptionSize = Integer.parseInt(resumptionKeyValue[1]);
log.debug("discover OldResumptionSize from Url (int): {}", urlOldResumptionSize);
} else {
log.debug("discover OldResumptionSize from Url (str): {}", resumptionKeyValue[1]);
}
} }
} }
}
if (((emptyXml).equalsIgnoreCase(resultXml)) if (((emptyXml).equalsIgnoreCase(resultXml))
|| ((nodeList != null) && (nodeList.getLength() < resultSizeValue))) { || ((nodeList != null) && (nodeList.getLength() < this.resultSizeValue))) {
// resumptionStr = ""; // resumptionStr = "";
if (nodeList != null) { if (nodeList != null) {
discoverResultSize += nodeList.getLength(); this.discoverResultSize += nodeList.getLength();
}
this.resultTotal = this.discoverResultSize;
} else {
this.resumptionStr = Integer.toString(this.resumptionInt);
this.resultTotal = this.resumptionInt + 1;
if (nodeList != null) {
this.discoverResultSize += nodeList.getLength();
}
} }
resultTotal = discoverResultSize; log.info("discoverResultSize: {}", this.discoverResultSize);
} else { break;
resumptionStr = Integer.toString(resumptionInt);
resultTotal = resumptionInt + 1; case "pagination":
case "page": // pagination, iterate over page numbers
this.pagination += 1;
if (nodeList != null) { if (nodeList != null) {
discoverResultSize += nodeList.getLength(); this.discoverResultSize += nodeList.getLength();
} else {
this.resultTotal = this.discoverResultSize;
this.pagination = this.discoverResultSize;
} }
} this.resumptionInt = this.pagination;
log.info("discoverResultSize: {}", discoverResultSize); this.resumptionStr = Integer.toString(this.resumptionInt);
break; break;
case "pagination": case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor
case "page": // pagination, iterate over page numbers // in
pagination += 1; // solr)
if (nodeList != null) { // isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
discoverResultSize += nodeList.getLength(); // deep-cursor, Param 'resultSizeValue' is less than 2");}
} else {
resultTotal = discoverResultSize;
pagination = discoverResultSize;
}
resumptionInt = pagination;
resumptionStr = Integer.toString(resumptionInt);
break;
case "deep-cursor": // size of result items unknown, iterate over items (for supporting deep cursor in this.resumptionStr = encodeValue(this.xprResumptionPath.evaluate(resultNode));
// solr) this.queryParams = this.queryParams.replace("&cursor=*", "");
// isn't relevant -- if (resultSizeValue < 2) {throw new CollectorServiceException("Mode:
// deep-cursor, Param 'resultSizeValue' is less than 2");}
resumptionStr = encodeValue(xprResumptionPath.evaluate(resultNode)); // terminating if length of nodeList is 0
queryParams = queryParams.replace("&cursor=*", ""); if ((nodeList != null) && (nodeList.getLength() < this.discoverResultSize)) {
this.resumptionInt += ((nodeList.getLength() + 1) - this.resultSizeValue);
} else {
this.resumptionInt += (nodeList.getLength() - this.resultSizeValue); // subtract the
// resultSizeValue
// because the iteration is over
// real length and the
// resultSizeValue is added before
// the switch()
}
// terminating if length of nodeList is 0 this.discoverResultSize = nodeList.getLength();
if ((nodeList != null) && (nodeList.getLength() < discoverResultSize)) {
resumptionInt += (nodeList.getLength() + 1 - resultSizeValue);
} else {
resumptionInt += (nodeList.getLength() - resultSizeValue); // subtract the resultSizeValue
// because the iteration is over
// real length and the
// resultSizeValue is added before
// the switch()
}
discoverResultSize = nodeList.getLength(); log
.debug(
"downloadPage().deep-cursor: resumptionStr=" + this.resumptionStr + " ; queryParams="
+ this.queryParams + " resumptionLengthIncreased: " + this.resumptionInt);
log break;
.debug(
"downloadPage().deep-cursor: resumptionStr=" + resumptionStr + " ; queryParams="
+ queryParams + " resumptionLengthIncreased: " + resumptionInt);
break; default: // otherwise: abort
// resultTotal = resumptionInt;
break;
}
default: // otherwise: abort } catch (final Exception e) {
// resultTotal = resumptionInt; log.error(e.getMessage(), e);
break; throw new IllegalStateException("collection failed: " + e.getMessage());
} }
} catch (Exception e) { try {
log.error(e.getMessage(), e); if (this.resultTotal == -1) {
throw new IllegalStateException("collection failed: " + e.getMessage()); this.resultTotal = Integer.parseInt(this.xprResultTotalPath.evaluate(resultNode));
} if ("page".equalsIgnoreCase(this.resumptionType) && !this.BASIC.equalsIgnoreCase(this.authMethod)) {
this.resultTotal += 1;
try { } // to correct the upper bound
if (resultTotal == -1) { log.info("resultTotal was -1 is now: " + this.resultTotal);
resultTotal = Integer.parseInt(xprResultTotalPath.evaluate(resultNode)); }
if (resumptionType.equalsIgnoreCase("page") && !BASIC.equalsIgnoreCase(authMethod)) { } catch (final Exception e) {
resultTotal += 1; log.error(e.getMessage(), e);
} // to correct the upper bound throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage());
log.info("resultTotal was -1 is now: " + resultTotal);
} }
} catch (Exception e) { log.debug("resultTotal: " + this.resultTotal);
log.error(e.getMessage(), e); log.debug("resInt: " + this.resumptionInt);
throw new IllegalStateException("downloadPage resultTotal couldn't parse: " + e.getMessage()); if (this.resumptionInt <= this.resultTotal) {
nextQuery = this.baseUrl + "?" + this.queryParams + this.querySize + "&" + this.resumptionParam + "="
+ this.resumptionStr
+ this.queryFormat;
} else {
nextQuery = "";
// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
// resumptionInt and prevent a NullPointer Exception at mdStore
}
log.debug("nextQueryUrl: " + nextQuery);
return nextQuery;
} catch (final Throwable e) {
log.warn(e.getMessage(), e);
return downloadPage(query, attempt + 1);
} }
log.debug("resultTotal: " + resultTotal);
log.debug("resInt: " + resumptionInt);
if (resumptionInt <= resultTotal) {
nextQuery = baseUrl + "?" + queryParams + querySize + "&" + resumptionParam + "=" + resumptionStr
+ queryFormat;
} else {
nextQuery = "";
// if (resumptionType.toLowerCase().equals("deep-cursor")) { resumptionInt -= 1; } // correct the
// resumptionInt and prevent a NullPointer Exception at mdStore
}
log.debug("nextQueryUrl: " + nextQuery);
return nextQuery;
} }
private boolean isInteger(String s) { private boolean isInteger(final String s) {
boolean isValidInteger = false; boolean isValidInteger = false;
try { try {
Integer.parseInt(s); Integer.parseInt(s);
@ -383,7 +417,7 @@ public class RestIterator implements Iterator<String> {
// s is a valid integer // s is a valid integer
isValidInteger = true; isValidInteger = true;
} catch (NumberFormatException ex) { } catch (final NumberFormatException ex) {
// s is not an integer // s is not an integer
} }
@ -391,20 +425,20 @@ public class RestIterator implements Iterator<String> {
} }
// Method to encode a string value using `UTF-8` encoding scheme // Method to encode a string value using `UTF-8` encoding scheme
private String encodeValue(String value) { private String encodeValue(final String value) {
try { try {
return URLEncoder.encode(value, StandardCharsets.UTF_8.toString()); return URLEncoder.encode(value, StandardCharsets.UTF_8.toString());
} catch (UnsupportedEncodingException ex) { } catch (final UnsupportedEncodingException ex) {
throw new RuntimeException(ex.getCause()); throw new RuntimeException(ex.getCause());
} }
} }
public String getResultFormatValue() { public String getResultFormatValue() {
return resultFormatValue; return this.resultFormatValue;
} }
public String getResultOutputFormat() { public String getResultOutputFormat() {
return resultOutputFormat; return this.resultOutputFormat;
} }
} }

View File

@ -79,23 +79,6 @@ object MagUtility extends Serializable {
private val MAGCollectedFrom = keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME) private val MAGCollectedFrom = keyValue(ModelConstants.MAG_ID, ModelConstants.MAG_NAME)
private val MAGDataInfo: DataInfo = { private val MAGDataInfo: DataInfo = {
val di = new DataInfo
di.setDeletedbyinference(false)
di.setInferred(false)
di.setInvisible(false)
di.setTrust("0.9")
di.setProvenanceaction(
OafMapperUtils.qualifier(
ModelConstants.SYSIMPORT_ACTIONSET,
ModelConstants.SYSIMPORT_ACTIONSET,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS
)
)
di
}
private val MAGDataInfoInvisible: DataInfo = {
val di = new DataInfo val di = new DataInfo
di.setDeletedbyinference(false) di.setDeletedbyinference(false)
di.setInferred(false) di.setInferred(false)
@ -453,7 +436,6 @@ object MagUtility extends Serializable {
case "repository" => case "repository" =>
result = new Publication() result = new Publication()
result.setDataInfo(MAGDataInfoInvisible)
qualifier( qualifier(
"0038", "0038",
"Other literature type", "Other literature type",
@ -488,8 +470,7 @@ object MagUtility extends Serializable {
} }
if (result != null) { if (result != null) {
if (result.getDataInfo == null) result.setDataInfo(MAGDataInfo)
result.setDataInfo(MAGDataInfo)
val i = new Instance val i = new Instance
i.setInstancetype(tp) i.setInstancetype(tp)
i.setInstanceTypeMapping( i.setInstanceTypeMapping(
@ -512,7 +493,7 @@ object MagUtility extends Serializable {
return null return null
result.setCollectedfrom(List(MAGCollectedFrom).asJava) result.setCollectedfrom(List(MAGCollectedFrom).asJava)
val pidList = List( var pidList = List(
structuredProperty( structuredProperty(
paper.paperId.get.toString, paper.paperId.get.toString,
qualifier( qualifier(
@ -525,8 +506,6 @@ object MagUtility extends Serializable {
) )
) )
result.setPid(pidList.asJava)
result.setOriginalId(pidList.map(s => s.getValue).asJava) result.setOriginalId(pidList.map(s => s.getValue).asJava)
result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}") result.setId(s"50|mag_________::${DHPUtils.md5(paper.paperId.get.toString)}")
@ -618,22 +597,23 @@ object MagUtility extends Serializable {
} }
val instance = result.getInstance().get(0) val instance = result.getInstance().get(0)
instance.setPid(pidList.asJava)
if (paper.doi.orNull != null) if (paper.doi.orNull != null) {
instance.setAlternateIdentifier( pidList = pidList ::: List(
List( structuredProperty(
structuredProperty( paper.doi.get,
paper.doi.get, qualifier(
qualifier( PidType.doi.toString,
PidType.doi.toString, PidType.doi.toString,
PidType.doi.toString, ModelConstants.DNET_PID_TYPES,
ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES
ModelConstants.DNET_PID_TYPES ),
), null
null )
)
).asJava
) )
}
instance.setPid(pidList.asJava)
result.setPid(pidList.asJava)
instance.setUrl(paper.urls.get.asJava) instance.setUrl(paper.urls.get.asJava)
instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY) instance.setHostedby(ModelConstants.UNKNOWN_REPOSITORY)
instance.setCollectedfrom(MAGCollectedFrom) instance.setCollectedfrom(MAGCollectedFrom)

View File

@ -38,6 +38,7 @@ class SparkMAGtoOAF(propertyPath: String, args: Array[String], log: Logger)
spark.read spark.read
.load(s"$magBasePath/mag_denormalized") .load(s"$magBasePath/mag_denormalized")
.as[MAGPaper] .as[MAGPaper]
.filter(col("doi").isNotNull)
.map(s => MagUtility.convertMAGtoOAF(s)) .map(s => MagUtility.convertMAGtoOAF(s))
.filter(s => s != null) .filter(s => s != null)
.write .write

View File

@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.plugin.rest;
import java.util.HashMap; import java.util.HashMap;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
@ -69,7 +70,7 @@ public class OsfPreprintCollectorTest {
@Test @Test
@Disabled @Disabled
void test() throws CollectorException { void test_limited() throws CollectorException {
final AtomicInteger i = new AtomicInteger(0); final AtomicInteger i = new AtomicInteger(0);
final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport()); final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());
@ -82,4 +83,23 @@ public class OsfPreprintCollectorTest {
log.info("{}", i.intValue()); log.info("{}", i.intValue());
Assertions.assertTrue(i.intValue() > 0); Assertions.assertTrue(i.intValue() > 0);
} }
@Test
@Disabled
void test_all() throws CollectorException {
final AtomicLong i = new AtomicLong(0);
final Stream<String> stream = this.rcp.collect(this.api, new AggregatorReport());
stream.forEach(s -> {
Assertions.assertTrue(s.length() > 0);
if ((i.incrementAndGet() % 1000) == 0) {
log.info("COLLECTED: {}", i.get());
}
});
log.info("TOTAL: {}", i.get());
Assertions.assertTrue(i.get() > 0);
}
} }

View File

@ -3,6 +3,7 @@ package eu.dnetlib.dhp.collection.mag
import com.fasterxml.jackson.databind.ObjectMapper import com.fasterxml.jackson.databind.ObjectMapper
import eu.dnetlib.dhp.schema.oaf.{Dataset, Publication, Result} import eu.dnetlib.dhp.schema.oaf.{Dataset, Publication, Result}
import org.apache.spark.sql.SparkSession import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.junit.jupiter.api.Assertions._ import org.junit.jupiter.api.Assertions._
import org.junit.jupiter.api.Test import org.junit.jupiter.api.Test
@ -18,10 +19,8 @@ class MAGMappingTest {
.master("local[*]") .master("local[*]")
.getOrCreate() .getOrCreate()
val s = new SparkMagOrganizationAS(null, null, null) val s = new SparkMAGtoOAF(null, null, null)
s.convertMAG(spark, "/Users/sandro/Downloads/", "/Users/sandro/Downloads/mag_OAF")
s.generateAS(spark, "/home/sandro/Downloads/mag_test", "/home/sandro/Downloads/mag_AS")
} }
@Test @Test

View File

@ -38,7 +38,6 @@
</configuration> </configuration>
</plugin> </plugin>
</plugins> </plugins>
</build> </build>
<dependencies> <dependencies>

View File

@ -189,7 +189,7 @@ public class DedupRecordFactory {
entity = swap; entity = swap;
} }
entity = MergeUtils.checkedMerge(entity, duplicate); entity = MergeUtils.checkedMerge(entity, duplicate, false);
if (ModelSupport.isSubClass(duplicate, Result.class)) { if (ModelSupport.isSubClass(duplicate, Result.class)) {
Result re = (Result) entity; Result re = (Result) entity;

View File

@ -175,6 +175,7 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
} }
// cap pidType at w3id as from there on they are considered equal // cap pidType at w3id as from there on they are considered equal
UserDefinedFunction mapPid = udf( UserDefinedFunction mapPid = udf(
(String s) -> Math.min(PidType.tryValueOf(s).ordinal(), PidType.w3id.ordinal()), DataTypes.IntegerType); (String s) -> Math.min(PidType.tryValueOf(s).ordinal(), PidType.w3id.ordinal()), DataTypes.IntegerType);

View File

@ -44,8 +44,10 @@ public class SparkCreateSimRels extends AbstractSparkAction {
parser.parseArgument(args); parser.parseArgument(args);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
new SparkCreateSimRels(parser, getSparkSession(conf)) try (SparkSession session = getSparkSession(conf)) {
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); new SparkCreateSimRels(parser, session)
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
}
} }
@Override @Override

View File

@ -102,6 +102,8 @@
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=15000 --conf spark.sql.shuffle.partitions=15000
--conf spark.network.timeout=300s
--conf spark.shuffle.registration.timeout=50000
</spark-opts> </spark-opts>
<arg>--graphBasePath</arg><arg>${graphBasePath}</arg> <arg>--graphBasePath</arg><arg>${graphBasePath}</arg>
<arg>--graphOutputPath</arg><arg>${graphOutputPath}</arg> <arg>--graphOutputPath</arg><arg>${graphOutputPath}</arg>

View File

@ -33,16 +33,14 @@
<description>max number of elements in a connected component</description> <description>max number of elements in a connected component</description>
</property> </property>
<property> <property>
<name>sparkDriverMemory</name> <name>sparkResourceOpts</name>
<description>memory for driver process</description> <value>--executor-memory=6G --conf spark.executor.memoryOverhead=4G --executor-cores=6 --driver-memory=8G --driver-cores=4</value>
<description>spark resource options</description>
</property> </property>
<property> <property>
<name>sparkExecutorMemory</name> <name>sparkResourceOptsCreateMergeRel</name>
<description>memory for individual executor</description> <value>--executor-memory=6G --conf spark.executor.memoryOverhead=4G --executor-cores=6 --driver-memory=8G --driver-cores=4</value>
</property> <description>spark resource options</description>
<property>
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property> </property>
<property> <property>
<name>oozieActionShareLibForSpark2</name> <name>oozieActionShareLibForSpark2</name>
@ -119,9 +117,7 @@
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</class> <class>eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar> <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} ${sparkResourceOpts}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -146,9 +142,7 @@
<class>eu.dnetlib.dhp.oa.dedup.SparkWhitelistSimRels</class> <class>eu.dnetlib.dhp.oa.dedup.SparkWhitelistSimRels</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar> <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} ${sparkResourceOpts}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -174,9 +168,7 @@
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels</class> <class>eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar> <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} ${sparkResourceOptsCreateMergeRel}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -203,9 +195,7 @@
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord</class> <class>eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar> <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} ${sparkResourceOpts}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -230,9 +220,7 @@
<class>eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgsMergeRels</class> <class>eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgsMergeRels</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar> <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} ${sparkResourceOpts}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -257,9 +245,7 @@
<class>eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord</class> <class>eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar> <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} ${sparkResourceOpts}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -283,9 +269,7 @@
<class>eu.dnetlib.dhp.oa.dedup.SparkUpdateEntity</class> <class>eu.dnetlib.dhp.oa.dedup.SparkUpdateEntity</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar> <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} ${sparkResourceOpts}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -309,9 +293,7 @@
<class>eu.dnetlib.dhp.oa.dedup.SparkCopyRelationsNoOpenorgs</class> <class>eu.dnetlib.dhp.oa.dedup.SparkCopyRelationsNoOpenorgs</class>
<jar>dhp-dedup-openaire-${projectVersion}.jar</jar> <jar>dhp-dedup-openaire-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} ${sparkResourceOpts}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@ -123,7 +123,7 @@ class EntityMergerTest implements Serializable {
assertEquals(dataInfo, pub_merged.getDataInfo()); assertEquals(dataInfo, pub_merged.getDataInfo());
// verify datepicker // verify datepicker
assertEquals("2018-09-30", pub_merged.getDateofacceptance().getValue()); assertEquals("2016-01-01", pub_merged.getDateofacceptance().getValue());
// verify authors // verify authors
assertEquals(13, pub_merged.getAuthor().size()); assertEquals(13, pub_merged.getAuthor().size());

View File

@ -78,7 +78,7 @@ public class IdGeneratorTest {
System.out.println("winner 3 = " + id2); System.out.println("winner 3 = " + id2);
assertEquals("50|doi_dedup___::1a77a3bba737f8b669dcf330ad3b37e2", id1); assertEquals("50|doi_dedup___::1a77a3bba737f8b669dcf330ad3b37e2", id1);
assertEquals("50|dedup_wf_001::0829b5191605bdbea36d6502b8c1ce1g", id2); assertEquals("50|dedup_wf_002::345e5d1b80537b0d0e0a49241ae9e516", id2);
} }
@Test @Test

View File

@ -143,7 +143,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization")) .load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
.count(); .count();
assertEquals(145, orgs_simrel); assertEquals(86, orgs_simrel);
} }
@Test @Test
@ -172,7 +172,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization")) .load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
.count(); .count();
assertEquals(181, orgs_simrel); assertEquals(122, orgs_simrel);
} }
@Test @Test
@ -196,7 +196,9 @@ public class SparkOpenorgsDedupTest implements Serializable {
"-la", "-la",
"lookupurl", "lookupurl",
"-w", "-w",
testOutputBasePath testOutputBasePath,
"-h",
""
}); });
new SparkCreateMergeRels(parser, spark).run(isLookUpService); new SparkCreateMergeRels(parser, spark).run(isLookUpService);

View File

@ -13,14 +13,16 @@ import java.io.Serializable;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.util.*; import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.cli.ParseException; import org.apache.commons.cli.ParseException;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction; import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
@ -129,7 +131,7 @@ public class SparkPublicationRootsTest implements Serializable {
.load(DedupUtility.createSimRelPath(workingPath, testActionSetId, "publication")) .load(DedupUtility.createSimRelPath(workingPath, testActionSetId, "publication"))
.count(); .count();
assertEquals(37, pubs_simrel); assertEquals(9, pubs_simrel);
} }
@Test @Test
@ -142,7 +144,8 @@ public class SparkPublicationRootsTest implements Serializable {
"--actionSetId", testActionSetId, "--actionSetId", testActionSetId,
"--isLookUpUrl", "lookupurl", "--isLookUpUrl", "lookupurl",
"--workingPath", workingPath, "--workingPath", workingPath,
"--cutConnectedComponent", "3" "--cutConnectedComponent", "3",
"-h", ""
}), spark) }), spark)
.run(isLookUpService); .run(isLookUpService);
@ -171,7 +174,8 @@ public class SparkPublicationRootsTest implements Serializable {
"--graphBasePath", graphInputPath, "--graphBasePath", graphInputPath,
"--actionSetId", testActionSetId, "--actionSetId", testActionSetId,
"--isLookUpUrl", "lookupurl", "--isLookUpUrl", "lookupurl",
"--workingPath", workingPath "--workingPath", workingPath,
"-h", ""
}), spark) }), spark)
.run(isLookUpService); .run(isLookUpService);
@ -207,7 +211,7 @@ public class SparkPublicationRootsTest implements Serializable {
assertTrue(dups.contains(r.getSource())); assertTrue(dups.contains(r.getSource()));
}); });
assertEquals(32, merges.count()); assertEquals(26, merges.count());
} }
@Test @Test
@ -228,7 +232,7 @@ public class SparkPublicationRootsTest implements Serializable {
.textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord") .textFile(workingPath + "/" + testActionSetId + "/publication_deduprecord")
.map(asEntity(Publication.class), Encoders.bean(Publication.class)); .map(asEntity(Publication.class), Encoders.bean(Publication.class));
assertEquals(3, roots.count()); assertEquals(4, roots.count());
final Dataset<Publication> pubs = spark final Dataset<Publication> pubs = spark
.read() .read()
@ -369,7 +373,7 @@ public class SparkPublicationRootsTest implements Serializable {
.distinct() .distinct()
.count(); .count();
assertEquals(19, publications); // 16 originals + 3 roots assertEquals(20, publications); // 16 originals + 3 roots
long deletedPubs = spark long deletedPubs = spark
.read() .read()
@ -380,7 +384,7 @@ public class SparkPublicationRootsTest implements Serializable {
.distinct() .distinct()
.count(); .count();
assertEquals(mergedPubs, deletedPubs); // assertEquals(mergedPubs, deletedPubs);
} }
private static String classPathResourceAsString(String path) throws IOException { private static String classPathResourceAsString(String path) throws IOException {

View File

@ -169,10 +169,10 @@ public class SparkStatsTest implements Serializable {
.count(); .count();
assertEquals(414, orgs_blocks); assertEquals(414, orgs_blocks);
assertEquals(187, pubs_blocks); assertEquals(221, pubs_blocks);
assertEquals(128, sw_blocks); assertEquals(134, sw_blocks);
assertEquals(192, ds_blocks); assertEquals(196, ds_blocks);
assertEquals(194, orp_blocks); assertEquals(198, orp_blocks);
} }
@AfterAll @AfterAll

View File

@ -161,7 +161,7 @@ public class SparkResultToCommunityFromProject implements Serializable {
} }
} }
res.setContext(propagatedContexts); res.setContext(propagatedContexts);
return MergeUtils.checkedMerge(ret, res); return MergeUtils.checkedMerge(ret, res, true);
} }
return ret; return ret;
}; };

View File

@ -100,16 +100,12 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true --conf spark.sql.shuffle.partitions=8000
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.sql.shuffle.partitions=3840
--conf spark.speculation=false
--conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@ -132,12 +128,11 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@ -160,12 +155,11 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@ -188,12 +182,11 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${sourcePath}</arg> <arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg> <arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>
@ -218,12 +211,11 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
</spark-opts> </spark-opts>
<arg>--sourcePath</arg><arg>${workingDir}/orcid/targetOrcidAssoc</arg> <arg>--sourcePath</arg><arg>${workingDir}/orcid/targetOrcidAssoc</arg>
<arg>--outputPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg> <arg>--outputPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
@ -247,19 +239,14 @@
<class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class> <class>eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkOrcidToResultFromSemRelJob</class>
<jar>dhp-enrichment-${projectVersion}.jar</jar> <jar>dhp-enrichment-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-cores=4 --executor-cores=${sparkExecutorCores}
--executor-memory=4G --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=5G --conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.speculation=false
--conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false
--conf spark.sql.shuffle.partitions=15000 --conf spark.sql.shuffle.partitions=15000
</spark-opts> </spark-opts>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg> <arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
@ -282,15 +269,12 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true --conf spark.sql.shuffle.partitions=8000
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.speculation=false
--conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts> </spark-opts>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg> <arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg> <arg>--sourcePath</arg><arg>${sourcePath}/dataset</arg>
@ -312,15 +296,12 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true --conf spark.sql.shuffle.partitions=8000
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.speculation=false
--conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts> </spark-opts>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg> <arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg> <arg>--sourcePath</arg><arg>${sourcePath}/otherresearchproduct</arg>
@ -342,15 +323,12 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.dynamicAllocation.enabled=true --conf spark.sql.shuffle.partitions=4000
--conf spark.dynamicAllocation.maxExecutors=${spark2MaxExecutors}
--conf spark.speculation=false
--conf spark.hadoop.mapreduce.map.speculative=false
--conf spark.hadoop.mapreduce.reduce.speculative=false
</spark-opts> </spark-opts>
<arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg> <arg>--possibleUpdatesPath</arg><arg>${workingDir}/orcid/mergedOrcidAssoc</arg>
<arg>--sourcePath</arg><arg>${sourcePath}/software</arg> <arg>--sourcePath</arg><arg>${sourcePath}/software</arg>
@ -362,15 +340,6 @@
</action> </action>
<join name="wait2" to="End"/> <join name="wait2" to="End"/>
<!-- <action name="reset_workingDir">-->
<!-- <fs>-->
<!-- <delete path="${workingDir}"/>-->
<!-- <mkdir path="${workingDir}"/>-->
<!-- </fs>-->
<!-- <ok to="End"/>-->
<!-- <error to="Kill"/>-->
<!-- </action>-->
<end name="End"/> <end name="End"/>

View File

@ -90,6 +90,12 @@
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-pace-core</artifactId>
<version>${project.version}</version>
</dependency>
<dependency> <dependency>
<groupId>com.jayway.jsonpath</groupId> <groupId>com.jayway.jsonpath</groupId>
<artifactId>json-path</artifactId> <artifactId>json-path</artifactId>

View File

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.graph.clean;
import java.io.Serializable; import java.io.Serializable;
import java.util.HashMap; import java.util.HashMap;
import java.util.Objects; import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.atomic.AtomicReference;
import org.apache.commons.lang3.SerializationUtils; import org.apache.commons.lang3.SerializationUtils;
@ -29,7 +30,10 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
mapping.put(AccessRight.class, o -> cleanQualifier(vocabularies, (AccessRight) o)); mapping.put(AccessRight.class, o -> cleanQualifier(vocabularies, (AccessRight) o));
mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o)); mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o));
mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o)); mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o));
mapping.put(Subject.class, o -> cleanSubject(vocabularies, (Subject) o));
// commenting out the subject cleaning until we decide if we want to it or not and the implementation will
// be completed. At the moment it is not capable of expanding the whole hierarchy.
// mapping.put(Subject.class, o -> cleanSubject(vocabularies, (Subject) o));
return mapping; return mapping;
} }
@ -38,6 +42,13 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
// TODO cleaning based on different subject vocabs can be added here // TODO cleaning based on different subject vocabs can be added here
} }
/**
* The procedure cleans out the subject values, using a vocabulary identified by the field subject.qualifier.classid.
*
* @param vocabularyId
* @param vocabularies
* @param subject
*/
private static void cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies, private static void cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies,
Subject subject) { Subject subject) {
@ -49,14 +60,22 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
subject.getQualifier().setClassid(vocabularyId); subject.getQualifier().setClassid(vocabularyId);
subject.getQualifier().setClassname(vocabulary.getName()); subject.getQualifier().setClassname(vocabulary.getName());
} }
} else if (vocabularyId.equals(subject.getQualifier().getClassid()) && } else {
Objects.nonNull(subject.getDataInfo()) && final String provenanceActionClassId = Optional
!"subject:fos".equals(subject.getDataInfo().getProvenanceaction())) { .ofNullable(subject.getDataInfo())
Qualifier syn = vocabulary.getSynonymAsQualifier(subject.getValue()); .map(DataInfo::getProvenanceaction)
VocabularyTerm term = vocabulary.getTerm(subject.getValue()); .map(Qualifier::getClassid)
if (Objects.isNull(syn) && Objects.isNull(term)) { .orElse(null);
subject.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_KEYWORD);
subject.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_KEYWORD); if (vocabularyId.equals(subject.getQualifier().getClassid()) &&
!"subject:fos".equals(provenanceActionClassId)) {
Qualifier syn = vocabulary.getSynonymAsQualifier(subject.getValue());
VocabularyTerm term = vocabulary.getTerm(subject.getValue());
if (Objects.isNull(syn) && Objects.isNull(term)) {
subject.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_KEYWORD);
subject.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_KEYWORD);
}
} }
} }
}); });

View File

@ -71,7 +71,7 @@ class GenerateEntitiesApplicationTest {
protected <T extends Result> void verifyMerge(Result publication, Result dataset, Class<T> clazz, protected <T extends Result> void verifyMerge(Result publication, Result dataset, Class<T> clazz,
String resultType) { String resultType) {
final Result merge = MergeUtils.mergeResult(publication, dataset); final Result merge = (Result) MergeUtils.merge(publication, dataset);
assertTrue(clazz.isAssignableFrom(merge.getClass())); assertTrue(clazz.isAssignableFrom(merge.getClass()));
assertEquals(resultType, merge.getResulttype().getClassid()); assertEquals(resultType, merge.getResulttype().getClassid());
} }

View File

@ -153,10 +153,15 @@ public class CreateRelatedEntitiesJob_phase1 {
result result
.getTitle() .getTitle()
.stream() .stream()
.filter(t -> StringUtils.isNotBlank(t.getValue()))
.findFirst() .findFirst()
.map(StructuredProperty::getValue)
.ifPresent( .ifPresent(
title -> re.getTitle().setValue(StringUtils.left(title, ModelHardLimits.MAX_TITLE_LENGTH))); title -> {
re.setTitle(title);
re
.getTitle()
.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
});
} }
if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) { if (Objects.nonNull(result.getDescription()) && !result.getDescription().isEmpty()) {
result result

View File

@ -3,24 +3,16 @@ package eu.dnetlib.dhp.oa.provision;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.utils.DHPUtils.toSeq; import static eu.dnetlib.dhp.utils.DHPUtils.toSeq;
import static org.apache.spark.sql.functions.*;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Optional; import java.util.Optional;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext; import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.*; import org.apache.spark.sql.*;
import org.apache.spark.sql.expressions.UserDefinedFunction;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.util.LongAccumulator; import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -45,9 +37,9 @@ import scala.Tuple2;
/** /**
* XmlConverterJob converts the JoinedEntities as XML records * XmlConverterJob converts the JoinedEntities as XML records
*/ */
public class XmlConverterJob { public class PayloadConverterJob {
private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class); private static final Logger log = LoggerFactory.getLogger(PayloadConverterJob.class);
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd"; public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
@ -56,8 +48,8 @@ public class XmlConverterJob {
final ArgumentApplicationParser parser = new ArgumentApplicationParser( final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils IOUtils
.toString( .toString(
XmlConverterJob.class PayloadConverterJob.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json"))); .getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_payload_converter.json")));
parser.parseArgument(args); parser.parseArgument(args);
final Boolean isSparkSessionManaged = Optional final Boolean isSparkSessionManaged = Optional
@ -72,6 +64,12 @@ public class XmlConverterJob {
final String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath); log.info("outputPath: {}", outputPath);
final Boolean validateXML = Optional
.ofNullable(parser.get("validateXML"))
.map(Boolean::valueOf)
.orElse(Boolean.FALSE);
log.info("validateXML: {}", validateXML);
final String contextApiBaseUrl = parser.get("contextApiBaseUrl"); final String contextApiBaseUrl = parser.get("contextApiBaseUrl");
log.info("contextApiBaseUrl: {}", contextApiBaseUrl); log.info("contextApiBaseUrl: {}", contextApiBaseUrl);
@ -86,18 +84,19 @@ public class XmlConverterJob {
runWithSparkSession(conf, isSparkSessionManaged, spark -> { runWithSparkSession(conf, isSparkSessionManaged, spark -> {
removeOutputDir(spark, outputPath); removeOutputDir(spark, outputPath);
convertToXml( createPayloads(
spark, inputPath, outputPath, ContextMapper.fromAPI(contextApiBaseUrl), spark, inputPath, outputPath, ContextMapper.fromAPI(contextApiBaseUrl),
VocabularyGroup.loadVocsFromIS(isLookup)); VocabularyGroup.loadVocsFromIS(isLookup), validateXML);
}); });
} }
private static void convertToXml( private static void createPayloads(
final SparkSession spark, final SparkSession spark,
final String inputPath, final String inputPath,
final String outputPath, final String outputPath,
final ContextMapper contextMapper, final ContextMapper contextMapper,
final VocabularyGroup vocabularies) { final VocabularyGroup vocabularies,
final Boolean validateXML) {
final XmlRecordFactory recordFactory = new XmlRecordFactory( final XmlRecordFactory recordFactory = new XmlRecordFactory(
prepareAccumulators(spark.sparkContext()), prepareAccumulators(spark.sparkContext()),
@ -118,7 +117,7 @@ public class XmlConverterJob {
.as(Encoders.kryo(JoinedEntity.class)) .as(Encoders.kryo(JoinedEntity.class))
.map( .map(
(MapFunction<JoinedEntity, Tuple2<String, SolrRecord>>) je -> new Tuple2<>( (MapFunction<JoinedEntity, Tuple2<String, SolrRecord>>) je -> new Tuple2<>(
recordFactory.build(je), recordFactory.build(je, validateXML),
ProvisionModelSupport.transform(je, contextMapper, vocabularies)), ProvisionModelSupport.transform(je, contextMapper, vocabularies)),
Encoders.tuple(Encoders.STRING(), Encoders.bean(SolrRecord.class))) Encoders.tuple(Encoders.STRING(), Encoders.bean(SolrRecord.class)))
.map( .map(

View File

@ -2,42 +2,34 @@
package eu.dnetlib.dhp.oa.provision; package eu.dnetlib.dhp.oa.provision;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static org.apache.spark.sql.functions.col;
import java.util.HashSet; import java.util.HashSet;
import java.util.Optional; import java.util.Optional;
import java.util.PriorityQueue;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode; import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.expressions.Aggregator; import org.apache.spark.sql.expressions.Window;
import org.apache.spark.sql.expressions.WindowSpec;
import org.apache.spark.sql.functions;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter; import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport; import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey;
import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;
import scala.Tuple2;
/** /**
* PrepareRelationsJob prunes the relationships: only consider relationships that are not virtually deleted * PrepareRelationsJob prunes the relationships: only consider relationships that are not virtually deleted
@ -130,132 +122,36 @@ public class PrepareRelationsJob {
private static void prepareRelationsRDD(SparkSession spark, String inputRelationsPath, String outputPath, private static void prepareRelationsRDD(SparkSession spark, String inputRelationsPath, String outputPath,
Set<String> relationFilter, int sourceMaxRelations, int targetMaxRelations, int relPartitions) { Set<String> relationFilter, int sourceMaxRelations, int targetMaxRelations, int relPartitions) {
JavaRDD<Relation> rels = readPathRelationRDD(spark, inputRelationsPath) WindowSpec source_w = Window
.filter(rel -> !(rel.getSource().startsWith("unresolved") || rel.getTarget().startsWith("unresolved"))) .partitionBy("source", "subRelType")
.filter(rel -> !rel.getDataInfo().getDeletedbyinference()) .orderBy(col("target").desc_nulls_last());
.filter(rel -> !relationFilter.contains(StringUtils.lowerCase(rel.getRelClass())));
JavaRDD<Relation> pruned = pruneRels( WindowSpec target_w = Window
pruneRels( .partitionBy("target", "subRelType")
rels, .orderBy(col("source").desc_nulls_last());
sourceMaxRelations, relPartitions, (Function<Relation, String>) Relation::getSource),
targetMaxRelations, relPartitions, (Function<Relation, String>) Relation::getTarget);
spark
.createDataset(pruned.rdd(), Encoders.bean(Relation.class))
.repartition(relPartitions)
.write()
.mode(SaveMode.Overwrite)
.parquet(outputPath);
}
private static JavaRDD<Relation> pruneRels(JavaRDD<Relation> rels, int maxRelations,
int relPartitions, Function<Relation, String> idFn) {
return rels
.mapToPair(r -> new Tuple2<>(SortableRelationKey.create(r, idFn.call(r)), r))
.repartitionAndSortWithinPartitions(new RelationPartitioner(relPartitions))
.groupBy(Tuple2::_1)
.map(Tuple2::_2)
.map(t -> Iterables.limit(t, maxRelations))
.flatMap(Iterable::iterator)
.map(Tuple2::_2);
}
// experimental
private static void prepareRelationsDataset(
SparkSession spark, String inputRelationsPath, String outputPath, Set<String> relationFilter, int maxRelations,
int relPartitions) {
spark spark
.read() .read()
.textFile(inputRelationsPath) .schema(Encoders.bean(Relation.class).schema())
.repartition(relPartitions) .json(inputRelationsPath)
.map( .where("source NOT LIKE 'unresolved%' AND target NOT LIKE 'unresolved%'")
(MapFunction<String, Relation>) s -> OBJECT_MAPPER.readValue(s, Relation.class), .where("datainfo.deletedbyinference != true")
Encoders.kryo(Relation.class)) .where(
.filter((FilterFunction<Relation>) rel -> !rel.getDataInfo().getDeletedbyinference()) relationFilter.isEmpty() ? ""
.filter((FilterFunction<Relation>) rel -> !relationFilter.contains(rel.getRelClass())) : "lower(relClass) NOT IN ("
.groupByKey( + relationFilter.stream().map(s -> "'" + s + "'").collect(Collectors.joining(",")) + ")")
(MapFunction<Relation, String>) Relation::getSource, .withColumn("source_w_pos", functions.row_number().over(source_w))
Encoders.STRING()) .where("source_w_pos < " + sourceMaxRelations)
.agg(new RelationAggregator(maxRelations).toColumn()) .drop("source_w_pos")
.flatMap( .withColumn("target_w_pos", functions.row_number().over(target_w))
(FlatMapFunction<Tuple2<String, RelationList>, Relation>) t -> Iterables .where("target_w_pos < " + targetMaxRelations)
.limit(t._2().getRelations(), maxRelations) .drop("target_w_pos")
.iterator(), .coalesce(relPartitions)
Encoders.bean(Relation.class))
.repartition(relPartitions)
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.parquet(outputPath); .parquet(outputPath);
} }
public static class RelationAggregator
extends Aggregator<Relation, RelationList, RelationList> {
private final int maxRelations;
public RelationAggregator(int maxRelations) {
this.maxRelations = maxRelations;
}
@Override
public RelationList zero() {
return new RelationList();
}
@Override
public RelationList reduce(RelationList b, Relation a) {
b.getRelations().add(a);
return getSortableRelationList(b);
}
@Override
public RelationList merge(RelationList b1, RelationList b2) {
b1.getRelations().addAll(b2.getRelations());
return getSortableRelationList(b1);
}
@Override
public RelationList finish(RelationList r) {
return getSortableRelationList(r);
}
private RelationList getSortableRelationList(RelationList b1) {
RelationList sr = new RelationList();
sr
.setRelations(
b1
.getRelations()
.stream()
.limit(maxRelations)
.collect(Collectors.toCollection(() -> new PriorityQueue<>(new RelationComparator()))));
return sr;
}
@Override
public Encoder<RelationList> bufferEncoder() {
return Encoders.kryo(RelationList.class);
}
@Override
public Encoder<RelationList> outputEncoder() {
return Encoders.kryo(RelationList.class);
}
}
/**
* Reads a JavaRDD of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text
* file,
*
* @param spark
* @param inputPath
* @return the JavaRDD<SortableRelation> containing all the relationships
*/
private static JavaRDD<Relation> readPathRelationRDD(
SparkSession spark, final String inputPath) {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
return sc.textFile(inputPath).map(s -> OBJECT_MAPPER.readValue(s, Relation.class));
}
private static void removeOutputDir(SparkSession spark, String path) { private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
} }

View File

@ -1,44 +0,0 @@
package eu.dnetlib.dhp.oa.provision;
import java.util.Comparator;
import java.util.Map;
import java.util.Optional;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class RelationComparator implements Comparator<Relation> {
private static final Map<String, Integer> weights = Maps.newHashMap();
static {
weights.put(ModelConstants.OUTCOME, 0);
weights.put(ModelConstants.SUPPLEMENT, 1);
weights.put(ModelConstants.REVIEW, 2);
weights.put(ModelConstants.CITATION, 3);
weights.put(ModelConstants.AFFILIATION, 4);
weights.put(ModelConstants.RELATIONSHIP, 5);
weights.put(ModelConstants.PUBLICATION_DATASET, 6);
weights.put(ModelConstants.SIMILARITY, 7);
weights.put(ModelConstants.PROVISION, 8);
weights.put(ModelConstants.PARTICIPATION, 9);
weights.put(ModelConstants.DEDUP, 10);
}
private Integer getWeight(Relation o) {
return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE);
}
@Override
public int compare(Relation o1, Relation o2) {
return ComparisonChain
.start()
.compare(getWeight(o1), getWeight(o2))
.result();
}
}

View File

@ -1,25 +0,0 @@
package eu.dnetlib.dhp.oa.provision;
import java.io.Serializable;
import java.util.PriorityQueue;
import java.util.Queue;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class RelationList implements Serializable {
private Queue<Relation> relations;
public RelationList() {
this.relations = new PriorityQueue<>(new RelationComparator());
}
public Queue<Relation> getRelations() {
return relations;
}
public void setRelations(Queue<Relation> relations) {
this.relations = relations;
}
}

View File

@ -1,81 +0,0 @@
package eu.dnetlib.dhp.oa.provision;
import java.io.Serializable;
import java.util.Map;
import java.util.Optional;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class SortableRelation extends Relation implements Comparable<SortableRelation>, Serializable {
private static final Map<String, Integer> weights = Maps.newHashMap();
static {
weights.put(ModelConstants.OUTCOME, 0);
weights.put(ModelConstants.SUPPLEMENT, 1);
weights.put(ModelConstants.REVIEW, 2);
weights.put(ModelConstants.CITATION, 3);
weights.put(ModelConstants.AFFILIATION, 4);
weights.put(ModelConstants.RELATIONSHIP, 5);
weights.put(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, 6);
weights.put(ModelConstants.SIMILARITY, 7);
weights.put(ModelConstants.PROVISION, 8);
weights.put(ModelConstants.PARTICIPATION, 9);
weights.put(ModelConstants.DEDUP, 10);
}
private static final long serialVersionUID = 34753984579L;
private String groupingKey;
public static SortableRelation create(Relation r, String groupingKey) {
SortableRelation sr = new SortableRelation();
sr.setGroupingKey(groupingKey);
sr.setSource(r.getSource());
sr.setTarget(r.getTarget());
sr.setRelType(r.getRelType());
sr.setSubRelType(r.getSubRelType());
sr.setRelClass(r.getRelClass());
sr.setDataInfo(r.getDataInfo());
sr.setCollectedfrom(r.getCollectedfrom());
sr.setLastupdatetimestamp(r.getLastupdatetimestamp());
sr.setProperties(r.getProperties());
sr.setValidated(r.getValidated());
sr.setValidationDate(r.getValidationDate());
return sr;
}
@JsonIgnore
public Relation asRelation() {
return this;
}
@Override
public int compareTo(SortableRelation o) {
return ComparisonChain
.start()
.compare(getGroupingKey(), o.getGroupingKey())
.compare(getWeight(this), getWeight(o))
.result();
}
private Integer getWeight(SortableRelation o) {
return Optional.ofNullable(weights.get(o.getSubRelType())).orElse(Integer.MAX_VALUE);
}
public String getGroupingKey() {
return groupingKey;
}
public void setGroupingKey(String groupingKey) {
this.groupingKey = groupingKey;
}
}

View File

@ -1,8 +1,6 @@
package eu.dnetlib.dhp.oa.provision.model; package eu.dnetlib.dhp.oa.provision.model;
import static org.apache.commons.lang3.StringUtils.substringBefore;
import java.io.StringReader; import java.io.StringReader;
import java.util.*; import java.util.*;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -16,16 +14,15 @@ import org.jetbrains.annotations.Nullable;
import com.google.common.base.Splitter; import com.google.common.base.Splitter;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm; import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm;
import eu.dnetlib.dhp.oa.provision.RelationList;
import eu.dnetlib.dhp.oa.provision.SortableRelation;
import eu.dnetlib.dhp.oa.provision.utils.ContextDef; import eu.dnetlib.dhp.oa.provision.utils.ContextDef;
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.solr.*; import eu.dnetlib.dhp.schema.solr.*;
import eu.dnetlib.dhp.schema.solr.AccessRight; import eu.dnetlib.dhp.schema.solr.AccessRight;
import eu.dnetlib.dhp.schema.solr.Author; import eu.dnetlib.dhp.schema.solr.Author;
@ -55,10 +52,7 @@ public class ProvisionModelSupport {
.newArrayList( .newArrayList(
RelatedEntityWrapper.class, RelatedEntityWrapper.class,
JoinedEntity.class, JoinedEntity.class,
RelatedEntity.class, RelatedEntity.class));
SortableRelationKey.class,
SortableRelation.class,
RelationList.class));
return modelClasses.toArray(new Class[] {}); return modelClasses.toArray(new Class[] {});
} }
@ -74,7 +68,11 @@ public class ProvisionModelSupport {
.setHeader( .setHeader(
SolrRecordHeader SolrRecordHeader
.newInstance( .newInstance(
e.getId(), e.getOriginalId(), type, deletedbyinference)); StringUtils
.substringAfter(
e.getId(),
IdentifierFactory.ID_PREFIX_SEPARATOR),
e.getOriginalId(), type, deletedbyinference));
r.setCollectedfrom(asProvenance(e.getCollectedfrom())); r.setCollectedfrom(asProvenance(e.getCollectedfrom()));
r.setContext(asContext(e.getContext(), contextMapper)); r.setContext(asContext(e.getContext(), contextMapper));
r.setPid(asPid(e.getPid())); r.setPid(asPid(e.getPid()));
@ -114,7 +112,8 @@ public class ProvisionModelSupport {
.newInstance( .newInstance(
relation.getRelType(), relation.getRelType(),
relation.getRelClass(), relation.getRelClass(),
relation.getTarget(), relatedRecordType)); StringUtils.substringAfter(relation.getTarget(), IdentifierFactory.ID_PREFIX_SEPARATOR),
relatedRecordType));
rr.setAcronym(re.getAcronym()); rr.setAcronym(re.getAcronym());
rr.setCode(re.getCode()); rr.setCode(re.getCode());
@ -147,6 +146,7 @@ public class ProvisionModelSupport {
ps.setContracttype(mapCodeLabel(p.getContracttype())); ps.setContracttype(mapCodeLabel(p.getContracttype()));
ps.setCurrency(mapField(p.getCurrency())); ps.setCurrency(mapField(p.getCurrency()));
ps.setDuration(mapField(p.getDuration())); ps.setDuration(mapField(p.getDuration()));
ps.setOamandatepublications(mapField(p.getOamandatepublications()));
ps.setCallidentifier(mapField(p.getCallidentifier())); ps.setCallidentifier(mapField(p.getCallidentifier()));
ps.setEcarticle29_3(mapField(p.getEcarticle29_3())); ps.setEcarticle29_3(mapField(p.getEcarticle29_3()));
ps.setEnddate(mapField(p.getEnddate())); ps.setEnddate(mapField(p.getEnddate()));
@ -387,7 +387,7 @@ public class ProvisionModelSupport {
.equals( .equals(
Optional Optional
.ofNullable(t.getQualifier()) .ofNullable(t.getQualifier())
.map(Qualifier::getClassid) .map(Qualifier::getClassname)
.orElse(null))) .orElse(null)))
.map(StructuredProperty::getValue) .map(StructuredProperty::getValue)
.collect(Collectors.toList())) .collect(Collectors.toList()))
@ -405,7 +405,7 @@ public class ProvisionModelSupport {
.equals( .equals(
Optional Optional
.ofNullable(t.getQualifier()) .ofNullable(t.getQualifier())
.map(Qualifier::getClassid) .map(Qualifier::getClassname)
.orElse(null))) .orElse(null)))
.map(StructuredProperty::getValue) .map(StructuredProperty::getValue)
.findFirst()) .findFirst())
@ -472,7 +472,7 @@ public class ProvisionModelSupport {
} }
private static String mapQualifier(eu.dnetlib.dhp.schema.oaf.Qualifier q) { private static String mapQualifier(eu.dnetlib.dhp.schema.oaf.Qualifier q) {
return Optional.ofNullable(q).map(Qualifier::getClassid).orElse(null); return Optional.ofNullable(q).map(Qualifier::getClassname).orElse(null);
} }
private static Journal mapJournal(eu.dnetlib.dhp.schema.oaf.Journal joaf) { private static Journal mapJournal(eu.dnetlib.dhp.schema.oaf.Journal joaf) {
@ -581,7 +581,7 @@ public class ProvisionModelSupport {
.map( .map(
pids -> pids pids -> pids
.stream() .stream()
.map(p -> Pid.newInstance(p.getQualifier().getClassid(), p.getValue())) .map(p -> Pid.newInstance(p.getQualifier().getClassname(), p.getValue()))
.collect(Collectors.toList())) .collect(Collectors.toList()))
.orElse(null); .orElse(null);
} }
@ -606,8 +606,8 @@ public class ProvisionModelSupport {
subjects -> subjects subjects -> subjects
.stream() .stream()
.filter(s -> Objects.nonNull(s.getQualifier())) .filter(s -> Objects.nonNull(s.getQualifier()))
.filter(s -> Objects.nonNull(s.getQualifier().getClassid())) .filter(s -> Objects.nonNull(s.getQualifier().getClassname()))
.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassid())) .map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassname()))
.collect(Collectors.toList())) .collect(Collectors.toList()))
.orElse(null); .orElse(null);
} }
@ -619,8 +619,8 @@ public class ProvisionModelSupport {
subjects -> subjects subjects -> subjects
.stream() .stream()
.filter(s -> Objects.nonNull(s.getQualifier())) .filter(s -> Objects.nonNull(s.getQualifier()))
.filter(s -> Objects.nonNull(s.getQualifier().getClassid())) .filter(s -> Objects.nonNull(s.getQualifier().getClassname()))
.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassid())) .map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassname()))
.collect(Collectors.toList())) .collect(Collectors.toList()))
.orElse(null); .orElse(null);
} }

View File

@ -93,10 +93,13 @@ public class XmlRecordFactory implements Serializable {
} }
public String build(final JoinedEntity je) { public String build(final JoinedEntity je) {
return build(je, false);
}
public String build(final JoinedEntity je, final Boolean validate) {
final Set<String> contexts = Sets.newHashSet(); final Set<String> contexts = Sets.newHashSet();
// final OafEntity entity = toOafEntity(je.getEntity());
final OafEntity entity = je.getEntity(); final OafEntity entity = je.getEntity();
final TemplateFactory templateFactory = new TemplateFactory(); final TemplateFactory templateFactory = new TemplateFactory();
try { try {
@ -122,7 +125,13 @@ public class XmlRecordFactory implements Serializable {
.buildBody( .buildBody(
mainType, metadata, relations, listChildren(entity, je, templateFactory), listExtraInfo(entity)); mainType, metadata, relations, listChildren(entity, je, templateFactory), listExtraInfo(entity));
return templateFactory.buildRecord(entity, schemaLocation, body); String xmlRecord = templateFactory.buildRecord(entity, schemaLocation, body);
if (Boolean.TRUE.equals(validate)) {
// rise an exception when an invalid record was built
new SAXReader().read(new StringReader(xmlRecord));
}
return xmlRecord;
// return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent); // return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent);
} catch (final Throwable e) { } catch (final Throwable e) {
throw new RuntimeException(String.format("error building record '%s'", entity.getId()), e); throw new RuntimeException(String.format("error building record '%s'", entity.getId()), e);
@ -1038,13 +1047,21 @@ public class XmlRecordFactory implements Serializable {
} }
private List<String> measuresAsXml(List<Measure> measures) { private List<String> measuresAsXml(List<Measure> measures) {
return measures return Stream
.stream() .concat(
.map(m -> { measures
List<Tuple2<String, String>> l = Lists.newArrayList(new Tuple2<>("id", m.getId())); .stream()
m.getUnit().forEach(kv -> l.add(new Tuple2<>(kv.getKey(), kv.getValue()))); .filter(m -> !"downloads".equals(m.getId()) && !"views".equals(m.getId()))
return XmlSerializationUtils.asXmlElement("measure", l); .map(m -> {
}) List<Tuple2<String, String>> l = Lists.newArrayList(new Tuple2<>("id", m.getId()));
m.getUnit().forEach(kv -> l.add(new Tuple2<>(kv.getKey(), kv.getValue())));
return XmlSerializationUtils.asXmlElement("measure", l);
}),
measures
.stream()
.filter(m -> "downloads".equals(m.getId()) || "views".equals(m.getId()))
.filter(m -> m.getUnit().stream().anyMatch(u -> Integer.parseInt(u.getValue()) > 0))
.map(m -> XmlSerializationUtils.usageMeasureAsXmlElement("measure", m)))
.collect(Collectors.toList()); .collect(Collectors.toList());
} }

View File

@ -5,7 +5,11 @@ import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.removePrefix;
import static org.apache.commons.lang3.StringUtils.isBlank; import static org.apache.commons.lang3.StringUtils.isBlank;
import static org.apache.commons.lang3.StringUtils.isNotBlank; import static org.apache.commons.lang3.StringUtils.isNotBlank;
import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -166,6 +170,33 @@ public class XmlSerializationUtils {
return sb.toString(); return sb.toString();
} }
// <measure downloads="0" views="0">infrastruct_::f66f1bd369679b5b077dcdf006089556||OpenAIRE</measure>
public static String usageMeasureAsXmlElement(String name, Measure measure) {
HashSet<String> dsIds = Optional
.ofNullable(measure.getUnit())
.map(
m -> m
.stream()
.map(KeyValue::getKey)
.collect(Collectors.toCollection(HashSet::new)))
.orElse(new HashSet<>());
StringBuilder sb = new StringBuilder();
dsIds.forEach(dsId -> {
sb
.append("<")
.append(name);
for (KeyValue kv : measure.getUnit()) {
sb.append(" ").append(attr(measure.getId(), kv.getValue()));
}
sb
.append(" ")
.append(attr("datasource", dsId))
.append("/>");
});
return sb.toString();
}
public static String mapEoscIf(EoscIfGuidelines e) { public static String mapEoscIf(EoscIfGuidelines e) {
return asXmlElement( return asXmlElement(
"eoscifguidelines", Lists "eoscifguidelines", Lists

View File

@ -22,5 +22,11 @@
"paramLongName": "isLookupUrl", "paramLongName": "isLookupUrl",
"paramDescription": "URL of the context ISLookup Service", "paramDescription": "URL of the context ISLookup Service",
"paramRequired": true "paramRequired": true
},
{
"paramName": "val",
"paramLongName": "validateXML",
"paramDescription": "should the process check the XML validity",
"paramRequired": false
} }
] ]

View File

@ -13,6 +13,11 @@
<name>contextApiBaseUrl</name> <name>contextApiBaseUrl</name>
<description>context API URL</description> <description>context API URL</description>
</property> </property>
<property>
<name>validateXML</name>
<description>should the payload converter validate the XMLs</description>
<value>false</value>
</property>
<property> <property>
<name>relPartitions</name> <name>relPartitions</name>
<description>number or partitions for the relations Dataset</description> <description>number or partitions for the relations Dataset</description>
@ -125,7 +130,7 @@
<case to="prepare_relations">${wf:conf('resumeFrom') eq 'prepare_relations'}</case> <case to="prepare_relations">${wf:conf('resumeFrom') eq 'prepare_relations'}</case>
<case to="fork_join_related_entities">${wf:conf('resumeFrom') eq 'fork_join_related_entities'}</case> <case to="fork_join_related_entities">${wf:conf('resumeFrom') eq 'fork_join_related_entities'}</case>
<case to="fork_join_all_entities">${wf:conf('resumeFrom') eq 'fork_join_all_entities'}</case> <case to="fork_join_all_entities">${wf:conf('resumeFrom') eq 'fork_join_all_entities'}</case>
<case to="convert_to_xml">${wf:conf('resumeFrom') eq 'convert_to_xml'}</case> <case to="create_payloads">${wf:conf('resumeFrom') eq 'create_payloads'}</case>
<case to="drop_solr_collection">${wf:conf('resumeFrom') eq 'drop_solr_collection'}</case> <case to="drop_solr_collection">${wf:conf('resumeFrom') eq 'drop_solr_collection'}</case>
<case to="to_solr_index">${wf:conf('resumeFrom') eq 'to_solr_index'}</case> <case to="to_solr_index">${wf:conf('resumeFrom') eq 'to_solr_index'}</case>
<default to="prepare_relations"/> <default to="prepare_relations"/>
@ -144,21 +149,23 @@
<class>eu.dnetlib.dhp.oa.provision.PrepareRelationsJob</class> <class>eu.dnetlib.dhp.oa.provision.PrepareRelationsJob</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar> <jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-cores=${sparkExecutorCoresForJoining} --executor-cores=4
--executor-memory=${sparkExecutorMemoryForJoining} --executor-memory=6G
--driver-memory=${sparkDriverMemoryForJoining} --driver-memory=${sparkDriverMemoryForJoining}
--conf spark.executor.memoryOverhead=6G
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=15000
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts> </spark-opts>
<arg>--inputRelationsPath</arg><arg>${inputGraphRootPath}/relation</arg> <arg>--inputRelationsPath</arg><arg>${inputGraphRootPath}/relation</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation</arg> <arg>--outputPath</arg><arg>${workingDir}/relation</arg>
<arg>--sourceMaxRelations</arg><arg>${sourceMaxRelations}</arg> <arg>--sourceMaxRelations</arg><arg>${sourceMaxRelations}</arg>
<arg>--targetMaxRelations</arg><arg>${targetMaxRelations}</arg> <arg>--targetMaxRelations</arg><arg>${targetMaxRelations}</arg>
<arg>--relationFilter</arg><arg>${relationFilter}</arg> <arg>--relationFilter</arg><arg>${relationFilter}</arg>
<arg>--relPartitions</arg><arg>5000</arg> <arg>--relPartitions</arg><arg>15000</arg>
</spark> </spark>
<ok to="fork_join_related_entities"/> <ok to="fork_join_related_entities"/>
<error to="Kill"/> <error to="Kill"/>
@ -585,19 +592,20 @@
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<join name="wait_join_phase2" to="convert_to_xml"/> <join name="wait_join_phase2" to="create_payloads"/>
<action name="convert_to_xml"> <action name="create_payloads">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>convert_to_xml</name> <name>create_payloads</name>
<class>eu.dnetlib.dhp.oa.provision.XmlConverterJob</class> <class>eu.dnetlib.dhp.oa.provision.PayloadConverterJob</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar> <jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -607,6 +615,7 @@
</spark-opts> </spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/join_entities</arg> <arg>--inputPath</arg><arg>${workingDir}/join_entities</arg>
<arg>--outputPath</arg><arg>${workingDir}/xml_json</arg> <arg>--outputPath</arg><arg>${workingDir}/xml_json</arg>
<arg>--validateXML</arg><arg>${validateXML}</arg>
<arg>--contextApiBaseUrl</arg><arg>${contextApiBaseUrl}</arg> <arg>--contextApiBaseUrl</arg><arg>${contextApiBaseUrl}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg> <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
</spark> </spark>

View File

@ -50,7 +50,7 @@ public class EOSCFuture_Test {
final ContextMapper contextMapper = new ContextMapper(); final ContextMapper contextMapper = new ContextMapper();
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation); PayloadConverterJob.schemaLocation);
final OtherResearchProduct p = OBJECT_MAPPER final OtherResearchProduct p = OBJECT_MAPPER
.readValue( .readValue(

View File

@ -57,7 +57,7 @@ public class IndexRecordTransformerTest {
public void testPublicationRecordTransformation() throws IOException, TransformerException { public void testPublicationRecordTransformation() throws IOException, TransformerException {
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation); PayloadConverterJob.schemaLocation);
final Publication p = load("publication.json", Publication.class); final Publication p = load("publication.json", Publication.class);
final Project pj = load("project.json", Project.class); final Project pj = load("project.json", Project.class);
@ -82,7 +82,7 @@ public class IndexRecordTransformerTest {
void testPeerReviewed() throws IOException, TransformerException { void testPeerReviewed() throws IOException, TransformerException {
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation); PayloadConverterJob.schemaLocation);
final Publication p = load("publication.json", Publication.class); final Publication p = load("publication.json", Publication.class);
@ -98,7 +98,7 @@ public class IndexRecordTransformerTest {
public void testRiunet() throws IOException, TransformerException { public void testRiunet() throws IOException, TransformerException {
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation); PayloadConverterJob.schemaLocation);
final Publication p = load("riunet.json", Publication.class); final Publication p = load("riunet.json", Publication.class);

View File

@ -37,7 +37,7 @@ public class XmlRecordFactoryTest {
final ContextMapper contextMapper = new ContextMapper(); final ContextMapper contextMapper = new ContextMapper();
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation); PayloadConverterJob.schemaLocation);
final Publication p = OBJECT_MAPPER final Publication p = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class); .readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class);
@ -105,7 +105,7 @@ public class XmlRecordFactoryTest {
final ContextMapper contextMapper = new ContextMapper(); final ContextMapper contextMapper = new ContextMapper();
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation); PayloadConverterJob.schemaLocation);
final Publication p = OBJECT_MAPPER final Publication p = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class); .readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class);
@ -136,7 +136,7 @@ public class XmlRecordFactoryTest {
final ContextMapper contextMapper = new ContextMapper(); final ContextMapper contextMapper = new ContextMapper();
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation); PayloadConverterJob.schemaLocation);
final Publication p = OBJECT_MAPPER final Publication p = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class); .readValue(IOUtils.toString(getClass().getResourceAsStream("publication.json")), Publication.class);
@ -166,7 +166,7 @@ public class XmlRecordFactoryTest {
final ContextMapper contextMapper = new ContextMapper(); final ContextMapper contextMapper = new ContextMapper();
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation); PayloadConverterJob.schemaLocation);
final Datasource d = OBJECT_MAPPER final Datasource d = OBJECT_MAPPER
.readValue(IOUtils.toString(getClass().getResourceAsStream("datasource.json")), Datasource.class); .readValue(IOUtils.toString(getClass().getResourceAsStream("datasource.json")), Datasource.class);
@ -203,7 +203,7 @@ public class XmlRecordFactoryTest {
final ContextMapper contextMapper = new ContextMapper(); final ContextMapper contextMapper = new ContextMapper();
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation); PayloadConverterJob.schemaLocation);
final OtherResearchProduct p = OBJECT_MAPPER final OtherResearchProduct p = OBJECT_MAPPER
.readValue( .readValue(
@ -226,7 +226,7 @@ public class XmlRecordFactoryTest {
final ContextMapper contextMapper = new ContextMapper(); final ContextMapper contextMapper = new ContextMapper();
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation); PayloadConverterJob.schemaLocation);
final OtherResearchProduct p = OBJECT_MAPPER final OtherResearchProduct p = OBJECT_MAPPER
.readValue( .readValue(
@ -249,7 +249,7 @@ public class XmlRecordFactoryTest {
final ContextMapper contextMapper = new ContextMapper(); final ContextMapper contextMapper = new ContextMapper();
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
XmlConverterJob.schemaLocation); PayloadConverterJob.schemaLocation);
final Publication p = OBJECT_MAPPER final Publication p = OBJECT_MAPPER
.readValue( .readValue(

View File

@ -71,6 +71,7 @@
--executor-memory=${sparkHighExecutorMemory} --executor-memory=${sparkHighExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkHighDriverMemory} --driver-memory=${sparkHighDriverMemory}
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions} --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@ -108,6 +109,7 @@
--executor-memory=${sparkHighExecutorMemory} --executor-memory=${sparkHighExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkNormalDriverMemory} --driver-memory=${sparkNormalDriverMemory}
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions} --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@ -141,6 +143,7 @@
--executor-memory=${sparkHighExecutorMemory} --executor-memory=${sparkHighExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkNormalDriverMemory} --driver-memory=${sparkNormalDriverMemory}
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions} --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@ -176,6 +179,7 @@
--executor-memory=${sparkHighExecutorMemory} --executor-memory=${sparkHighExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkNormalDriverMemory} --driver-memory=${sparkNormalDriverMemory}
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions} --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@ -209,6 +213,7 @@
--executor-memory=${sparkHighExecutorMemory} --executor-memory=${sparkHighExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkNormalDriverMemory} --driver-memory=${sparkNormalDriverMemory}
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions} --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@ -245,6 +250,7 @@
--executor-memory=${sparkHighExecutorMemory} --executor-memory=${sparkHighExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkNormalDriverMemory} --driver-memory=${sparkNormalDriverMemory}
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions} --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@ -315,6 +321,7 @@
--executor-memory=${sparkNormalExecutorMemory} --executor-memory=${sparkNormalExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkNormalDriverMemory} --driver-memory=${sparkNormalDriverMemory}
--conf spark.executor.memoryOverhead=${sparkNormalExecutorMemory}
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions} --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@ -361,6 +368,7 @@
--executor-memory=${sparkNormalExecutorMemory} --executor-memory=${sparkNormalExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkNormalDriverMemory} --driver-memory=${sparkNormalDriverMemory}
--conf spark.executor.memoryOverhead=${sparkNormalExecutorMemory}
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions} --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@ -409,6 +417,7 @@
--executor-memory=${sparkHighExecutorMemory} --executor-memory=${sparkHighExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkHighDriverMemory} --driver-memory=${sparkHighDriverMemory}
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions} --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@ -444,6 +453,7 @@
--executor-memory=${sparkHighExecutorMemory} --executor-memory=${sparkHighExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkHighDriverMemory} --driver-memory=${sparkHighDriverMemory}
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions} --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@ -482,6 +492,7 @@
--executor-memory=${sparkHighExecutorMemory} --executor-memory=${sparkHighExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkNormalDriverMemory} --driver-memory=${sparkNormalDriverMemory}
--conf spark.executor.memoryOverhead=${sparkHighExecutorMemory}
--conf spark.sql.shuffle.partitions=${sparkShufflePartitions} --conf spark.sql.shuffle.partitions=${sparkShufflePartitions}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@ -533,6 +544,7 @@
--executor-memory=${sparkNormalExecutorMemory} --executor-memory=${sparkNormalExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkNormalDriverMemory} --driver-memory=${sparkNormalDriverMemory}
--conf spark.executor.memoryOverhead=${sparkNormalExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}

View File

@ -67,24 +67,21 @@ function copydb() {
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log rm -f error.log
return 1 exit 2
fi fi
# Make Impala aware of the deletion of the old DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
# Using max memory of: 50 * 6144 = 300 Gb # Using max memory of: 70 * 6144 = 430 Gb
# Using 1MB as a buffer-size. # Using 1MB as a buffer-size.
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop # The " -Ddistcp.dynamic.recordsPerChunk=N" arg is not available in our version of hadoop
# The "ug" args cannot be used as we get a "User does not belong to hive" error. # The "ug" args cannot be used as we get a "User does not belong to hive" error.
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files. # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \ hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
-numListstatusThreads 40 \ -numListstatusThreads 40 \
-copybuffersize 1048576 \ -copybuffersize 1048576 \
-strategy dynamic \ -strategy dynamic \
-blocksperchunk 8 \
-pb \ -pb \
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
@ -92,9 +89,9 @@ function copydb() {
if [ $? -eq 0 ]; then if [ $? -eq 0 ]; then
echo -e "\nSuccessfully copied the files of '${db}'.\n" echo -e "\nSuccessfully copied the files of '${db}'.\n"
else else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n" echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log rm -f error.log
return 2 exit 3
fi fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
@ -105,14 +102,11 @@ function copydb() {
# create the new database (with the same name) # create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Make Impala aware of the creation of the new DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala. # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
all_create_view_statements=() all_create_view_statements=()
num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
@ -129,9 +123,11 @@ function copydb() {
all_create_view_statements+=("$create_view_statement") all_create_view_statements+=("$create_view_statement")
else else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
((num_tables++))
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
@ -142,74 +138,73 @@ function copydb() {
fi fi
done done
echo -e "\nAll tables have been created, going to create the views..\n" previous_num_of_views_to_retry=${#all_create_view_statements[@]}
if [[ $num_tables -gt 0 ]]; then
echo -e "\nAll ${num_tables} tables have been created, for db '${db}', going to create the ${previous_num_of_views_to_retry} views..\n"
else
echo -e "\nDB '${db}' does not have any tables, moving on to create the ${previous_num_of_views_to_retry} views..\n"
fi
# Time to loop through the views and create them.
# At this point all table-schemas should have been created.
previous_num_of_views_to_retry=${#all_create_view_statements}
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG echo -e "\nAll_create_view_statements (${previous_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" # DEBUG
# Make Impala aware of the new tables, so it knows them when creating the views.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
else else
echo -e "\nDB '${db}' does not contain any views.\n" echo -e "\nDB '${db}' does not contain any views.\n"
fi fi
level_counter=0 level_counter=0
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do while [[ $previous_num_of_views_to_retry -gt 0 ]]; do
((level_counter++)) ((level_counter++))
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
# In this case, we should retry creating this particular view again. # In this case, we should retry creating this particular view again.
should_retry_create_view_statements=() new_num_of_views_to_retry=0
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nspecific_errors: ${specific_errors}\n"
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
should_retry_create_view_statements+=("$create_view_statement") ((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason.
else else
all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list.
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
fi fi
done done
new_num_of_views_to_retry=${#should_retry_create_view_statements} all_create_view_statements=("$(echo "${all_create_view_statements[@]}" | grep -v '^[\s]*$')") # Re-index the array, filtering-out any empty elements.
# Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
return 3 exit 5
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n" echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
previous_num_of_views_to_retry=$new_num_of_views_to_retry
else else
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n" echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
fi fi
all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. previous_num_of_views_to_retry=$new_num_of_views_to_retry
done done
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
echo -e "\nComputing stats for tables..\n" echo -e "\nComputing stats for tables..\n"
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
fi fi
done done
# Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala)
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
echo -e "\nAll entities have been copied to Impala cluster.\n" echo -e "\nAll entities have been copied to Impala cluster.\n"
else else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log rm -f error.log
return 4 exit 6
fi fi
rm -f error.log rm -f error.log

View File

@ -66,24 +66,21 @@ function copydb() {
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log rm -f error.log
return 1 exit 2
fi fi
# Make Impala aware of the deletion of the old DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
# Using max memory of: 50 * 6144 = 300 Gb # Using max memory of: 70 * 6144 = 430 Gb
# Using 1MB as a buffer-size. # Using 1MB as a buffer-size.
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop # The " -Ddistcp.dynamic.recordsPerChunk=N" arg is not available in our version of hadoop
# The "ug" args cannot be used as we get a "User does not belong to hive" error. # The "ug" args cannot be used as we get a "User does not belong to hive" error.
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files. # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \ hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
-numListstatusThreads 40 \ -numListstatusThreads 40 \
-copybuffersize 1048576 \ -copybuffersize 1048576 \
-strategy dynamic \ -strategy dynamic \
-blocksperchunk 8 \
-pb \ -pb \
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
@ -91,9 +88,9 @@ function copydb() {
if [ $? -eq 0 ]; then if [ $? -eq 0 ]; then
echo -e "\nSuccessfully copied the files of '${db}'.\n" echo -e "\nSuccessfully copied the files of '${db}'.\n"
else else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n" echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log rm -f error.log
return 2 exit 3
fi fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
@ -104,14 +101,11 @@ function copydb() {
# create the new database (with the same name) # create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Make Impala aware of the creation of the new DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala. # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
all_create_view_statements=() all_create_view_statements=()
num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
@ -128,9 +122,11 @@ function copydb() {
all_create_view_statements+=("$create_view_statement") all_create_view_statements+=("$create_view_statement")
else else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
((num_tables++))
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
@ -141,74 +137,73 @@ function copydb() {
fi fi
done done
echo -e "\nAll tables have been created, going to create the views..\n" previous_num_of_views_to_retry=${#all_create_view_statements[@]}
if [[ $num_tables -gt 0 ]]; then
echo -e "\nAll ${num_tables} tables have been created, for db '${db}', going to create the ${previous_num_of_views_to_retry} views..\n"
else
echo -e "\nDB '${db}' does not have any tables, moving on to create the ${previous_num_of_views_to_retry} views..\n"
fi
# Time to loop through the views and create them.
# At this point all table-schemas should have been created.
previous_num_of_views_to_retry=${#all_create_view_statements}
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG echo -e "\nAll_create_view_statements (${previous_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" # DEBUG
# Make Impala aware of the new tables, so it knows them when creating the views.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
else else
echo -e "\nDB '${db}' does not contain any views.\n" echo -e "\nDB '${db}' does not contain any views.\n"
fi fi
level_counter=0 level_counter=0
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do while [[ $previous_num_of_views_to_retry -gt 0 ]]; do
((level_counter++)) ((level_counter++))
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
# In this case, we should retry creating this particular view again. # In this case, we should retry creating this particular view again.
should_retry_create_view_statements=() new_num_of_views_to_retry=0
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nspecific_errors: ${specific_errors}\n"
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
should_retry_create_view_statements+=("$create_view_statement") ((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason.
else else
all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list.
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
fi fi
done done
new_num_of_views_to_retry=${#should_retry_create_view_statements} all_create_view_statements=("$(echo "${all_create_view_statements[@]}" | grep -v '^[\s]*$')") # Re-index the array, filtering-out any empty elements.
# Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
return 3 exit 5
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n" echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
previous_num_of_views_to_retry=$new_num_of_views_to_retry
else else
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n" echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
fi fi
all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. previous_num_of_views_to_retry=$new_num_of_views_to_retry
done done
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
echo -e "\nComputing stats for tables..\n" echo -e "\nComputing stats for tables..\n"
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
fi fi
done done
# Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala)
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
echo -e "\nAll entities have been copied to Impala cluster.\n" echo -e "\nAll entities have been copied to Impala cluster.\n"
else else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log rm -f error.log
return 4 exit 6
fi fi
rm -f error.log rm -f error.log

View File

@ -66,24 +66,21 @@ function copydb() {
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log rm -f error.log
return 1 exit 2
fi fi
# Make Impala aware of the deletion of the old DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
# Using max memory of: 50 * 6144 = 300 Gb # Using max memory of: 70 * 6144 = 430 Gb
# Using 1MB as a buffer-size. # Using 1MB as a buffer-size.
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop # The " -Ddistcp.dynamic.recordsPerChunk=N" arg is not available in our version of hadoop
# The "ug" args cannot be used as we get a "User does not belong to hive" error. # The "ug" args cannot be used as we get a "User does not belong to hive" error.
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files. # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \ hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
-numListstatusThreads 40 \ -numListstatusThreads 40 \
-copybuffersize 1048576 \ -copybuffersize 1048576 \
-strategy dynamic \ -strategy dynamic \
-blocksperchunk 8 \
-pb \ -pb \
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
@ -91,9 +88,9 @@ function copydb() {
if [ $? -eq 0 ]; then if [ $? -eq 0 ]; then
echo -e "\nSuccessfully copied the files of '${db}'.\n" echo -e "\nSuccessfully copied the files of '${db}'.\n"
else else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n" echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log rm -f error.log
return 2 exit 3
fi fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
@ -104,14 +101,11 @@ function copydb() {
# create the new database (with the same name) # create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Make Impala aware of the creation of the new DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala. # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
all_create_view_statements=() all_create_view_statements=()
num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
@ -128,9 +122,11 @@ function copydb() {
all_create_view_statements+=("$create_view_statement") all_create_view_statements+=("$create_view_statement")
else else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
((num_tables++))
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
@ -141,74 +137,73 @@ function copydb() {
fi fi
done done
echo -e "\nAll tables have been created, going to create the views..\n" previous_num_of_views_to_retry=${#all_create_view_statements[@]}
if [[ $num_tables -gt 0 ]]; then
echo -e "\nAll ${num_tables} tables have been created, for db '${db}', going to create the ${previous_num_of_views_to_retry} views..\n"
else
echo -e "\nDB '${db}' does not have any tables, moving on to create the ${previous_num_of_views_to_retry} views..\n"
fi
# Time to loop through the views and create them.
# At this point all table-schemas should have been created.
previous_num_of_views_to_retry=${#all_create_view_statements}
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG echo -e "\nAll_create_view_statements (${previous_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" # DEBUG
# Make Impala aware of the new tables, so it knows them when creating the views.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
else else
echo -e "\nDB '${db}' does not contain any views.\n" echo -e "\nDB '${db}' does not contain any views.\n"
fi fi
level_counter=0 level_counter=0
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do while [[ $previous_num_of_views_to_retry -gt 0 ]]; do
((level_counter++)) ((level_counter++))
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
# In this case, we should retry creating this particular view again. # In this case, we should retry creating this particular view again.
should_retry_create_view_statements=() new_num_of_views_to_retry=0
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nspecific_errors: ${specific_errors}\n"
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
should_retry_create_view_statements+=("$create_view_statement") ((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason.
else else
all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list.
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
fi fi
done done
new_num_of_views_to_retry=${#should_retry_create_view_statements} all_create_view_statements=("$(echo "${all_create_view_statements[@]}" | grep -v '^[\s]*$')") # Re-index the array, filtering-out any empty elements.
# Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
return 3 exit 5
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n" echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
previous_num_of_views_to_retry=$new_num_of_views_to_retry
else else
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n" echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
fi fi
all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. previous_num_of_views_to_retry=$new_num_of_views_to_retry
done done
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
echo -e "\nComputing stats for tables..\n" echo -e "\nComputing stats for tables..\n"
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
fi fi
done done
# Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala)
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
echo -e "\nAll entities have been copied to Impala cluster.\n" echo -e "\nAll entities have been copied to Impala cluster.\n"
else else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log rm -f error.log
return 4 exit 6
fi fi
rm -f error.log rm -f error.log

View File

@ -68,24 +68,21 @@ function copydb() {
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE! EXITING...\n\n"
rm -f error.log rm -f error.log
return 1 exit 2
fi fi
# Make Impala aware of the deletion of the old DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n" echo -e "\n\nCopying files of '${db}', from Ocean to Impala cluster..\n"
# Using max-bandwidth of: 50 * 100 Mb/s = 5 Gb/s # Using max-bandwidth of: 70 * 150 Mb/s = 10.5 Gb/s
# Using max memory of: 50 * 6144 = 300 Gb # Using max memory of: 70 * 6144 = 430 Gb
# Using 1MB as a buffer-size. # Using 1MB as a buffer-size.
# The " -Ddistcp.dynamic.recordsPerChunk=50" arg is not available in our version of hadoop # The " -Ddistcp.dynamic.recordsPerChunk=N" arg is not available in our version of hadoop
# The "ug" args cannot be used as we get a "User does not belong to hive" error. # The "ug" args cannot be used as we get a "User does not belong to hive" error.
# The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files. # The "p" argument cannot be used, as it blocks the files from being used, giving a "sticky bit"-error, even after applying chmod and chown onm the files.
hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \ hadoop distcp -Dmapreduce.map.memory.mb=6144 -m 70 -bandwidth 150 \
-numListstatusThreads 40 \ -numListstatusThreads 40 \
-copybuffersize 1048576 \ -copybuffersize 1048576 \
-strategy dynamic \ -strategy dynamic \
-blocksperchunk 8 \
-pb \ -pb \
${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH} ${OCEAN_HDFS_NODE}/user/hive/warehouse/${db}.db ${IMPALA_HDFS_DB_BASE_PATH}
@ -93,9 +90,9 @@ function copydb() {
if [ $? -eq 0 ]; then if [ $? -eq 0 ]; then
echo -e "\nSuccessfully copied the files of '${db}'.\n" echo -e "\nSuccessfully copied the files of '${db}'.\n"
else else
echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT WITH EXIT STATUS: $?\n\n" echo -e "\n\nERROR: FAILED TO TRANSFER THE FILES OF '${db}', WITH 'hadoop distcp'. GOT EXIT STATUS: $?\n\n"
rm -f error.log rm -f error.log
return 2 exit 3
fi fi
# In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well.. # In case we ever use this script for a writable DB (using inserts/updates), we should perform the following costly operation as well..
@ -106,14 +103,11 @@ function copydb() {
# create the new database (with the same name) # create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Make Impala aware of the creation of the new DB immediately.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala. # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
all_create_view_statements=() all_create_view_statements=()
num_tables=0
entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs. entities_on_ocean=`hive -e "show tables in ${db};" | sed 's/WARN:.*//g'` # Get the tables and views without any potential the "WARN" logs.
for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_ocean[@]}; do # Use un-quoted values, as the elemetns are single-words.
@ -130,9 +124,11 @@ function copydb() {
all_create_view_statements+=("$create_view_statement") all_create_view_statements+=("$create_view_statement")
else else
echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n" echo -e "\n'${i}' is a table, so we will check for its parquet files and create the table on Impala cluster.\n"
((num_tables++))
CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1` CURRENT_PRQ_FILE=`hdfs dfs -conf ${IMPALA_CONFIG_FILE} -ls -C "${IMPALA_HDFS_DB_BASE_PATH}/${db}.db/${i}/" | grep -v 'Found' | grep -v '_impala_insert_staging' | head -1`
if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside. if [ -z "$CURRENT_PRQ_FILE" ]; then # If there is not parquet-file inside.
echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n" echo -e "\nERROR: THE TABLE \"${i}\" HAD NO FILES TO GET THE SCHEMA FROM! IT'S EMPTY!\n\n"
exit 4 # Comment out when testing a DB which has such a table, just for performing this exact test-check.
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
@ -143,74 +139,73 @@ function copydb() {
fi fi
done done
echo -e "\nAll tables have been created, going to create the views..\n" previous_num_of_views_to_retry=${#all_create_view_statements[@]}
if [[ $num_tables -gt 0 ]]; then
echo -e "\nAll ${num_tables} tables have been created, for db '${db}', going to create the ${previous_num_of_views_to_retry} views..\n"
else
echo -e "\nDB '${db}' does not have any tables, moving on to create the ${previous_num_of_views_to_retry} views..\n"
fi
# Time to loop through the views and create them.
# At this point all table-schemas should have been created.
previous_num_of_views_to_retry=${#all_create_view_statements}
if [[ $previous_num_of_views_to_retry -gt 0 ]]; then if [[ $previous_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nAll_create_view_statements:\n\n${all_create_view_statements[@]}\n" # DEBUG echo -e "\nAll_create_view_statements (${previous_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n" # DEBUG
# Make Impala aware of the new tables, so it knows them when creating the views.
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
else else
echo -e "\nDB '${db}' does not contain any views.\n" echo -e "\nDB '${db}' does not contain any views.\n"
fi fi
level_counter=0 level_counter=0
while [[ ${#all_create_view_statements[@]} -gt 0 ]]; do while [[ $previous_num_of_views_to_retry -gt 0 ]]; do
((level_counter++)) ((level_counter++))
# The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet. # The only accepted reason for a view to not be created, is if it depends on another view, which has not been created yet.
# In this case, we should retry creating this particular view again. # In this case, we should retry creating this particular view again.
should_retry_create_view_statements=() new_num_of_views_to_retry=0
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nspecific_errors: ${specific_errors}\n"
echo -e "\nView '$(cat error.log | grep "CREATE VIEW " | sed 's/CREATE VIEW //g' | sed 's/ as select .*//g')' failed to be created, possibly because it depends on another view.\n" echo -e "\nView '$(cat error.log | grep -Eo "Query: CREATE VIEW ([^\s]+)" | sed 's/Query: CREATE VIEW //g')' failed to be created, possibly because it depends on another view.\n"
should_retry_create_view_statements+=("$create_view_statement") ((new_num_of_views_to_retry++)) # Increment it here, instead of acquiring the array's size in the end, as that doesn't work for some reason.
else else
all_create_view_statements=("${all_create_view_statements[@]/$create_view_statement}") # Remove the current successful statement from the list.
sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view. sleep 1 # Wait a bit for Impala to register that the view was created, before possibly referencing it by another view.
fi fi
done done
new_num_of_views_to_retry=${#should_retry_create_view_statements} all_create_view_statements=("$(echo "${all_create_view_statements[@]}" | grep -v '^[\s]*$')") # Re-index the array, filtering-out any empty elements.
# Although the above command reduces the "active" elements to just the few to-be-retried, it does not manage to make the array return the its true size through the "${#all_create_view_statements[@]}" statement. So we use counters.
if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then if [[ $new_num_of_views_to_retry -eq $previous_num_of_views_to_retry ]]; then
echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n" echo -e "\n\nERROR: THE NUMBER OF VIEWS TO RETRY HAS NOT BEEN REDUCED! THE SCRIPT IS LIKELY GOING TO AN INFINITE-LOOP! EXITING..\n\n"
return 3 exit 5
elif [[ $new_num_of_views_to_retry -gt 0 ]]; then elif [[ $new_num_of_views_to_retry -gt 0 ]]; then
echo -e "\nTo be retried \"create_view_statements\":\n\n${should_retry_create_view_statements[@]}\n" echo -e "\nTo be retried \"create_view_statements\" (${new_num_of_views_to_retry}):\n\n${all_create_view_statements[@]}\n"
previous_num_of_views_to_retry=$new_num_of_views_to_retry
else else
echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n" echo -e "\nFinished creating views, for db: '${db}', in level-${level_counter}.\n"
fi fi
all_create_view_statements=("${should_retry_create_view_statement[@]}") # This is needed in any case to either move forward with the rest of the views or stop at 0 remaining views. previous_num_of_views_to_retry=$new_num_of_views_to_retry
done done
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA"
sleep 1
echo -e "\nComputing stats for tables..\n" echo -e "\nComputing stats for tables..\n"
entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"` entities_on_impala=`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`
for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words. for i in ${entities_on_impala[@]}; do # Use un-quoted values, as the elemetns are single-words.
# Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster. # Taking the create table statement from the Ocean cluster, just to check if its a view, as the output is easier than using impala-shell from Impala cluster.
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}"; impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}";
fi fi
done done
# Check if the entities in both clusters are the same, down to the exact names, not just the counts. (they are sorted in the same way both in hive and impala)
if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then if [ "${entities_on_impala[@]}" == "${entities_on_ocean[@]}" ]; then
echo -e "\nAll entities have been copied to Impala cluster.\n" echo -e "\nAll entities have been copied to Impala cluster.\n"
else else
echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n" echo -e "\n\nERROR: 1 OR MORE ENTITIES OF DB '${db}' FAILED TO BE COPIED TO IMPALA CLUSTER!\n\n"
rm -f error.log rm -f error.log
return 4 exit 6
fi fi
rm -f error.log rm -f error.log

View File

@ -1,3 +1,4 @@
set mapred.job.queue.name=analytics;
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
-- Additional relations -- Additional relations

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics;
------------------------------------------------------ ------------------------------------------------------
------------------------------------------------------ ------------------------------------------------------
-- Additional relations -- Additional relations
@ -104,4 +106,4 @@ rel.properties[1].value apc_currency
from ${openaire_db_name}.relation rel from ${openaire_db_name}.relation rel
join ${openaire_db_name}.organization o on o.id=rel.source join ${openaire_db_name}.organization o on o.id=rel.source
join ${openaire_db_name}.result r on r.id=rel.target join ${openaire_db_name}.result r on r.id=rel.target
where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0; where rel.subreltype = 'affiliation' and rel.datainfo.deletedbyinference = false and size(rel.properties)>0;

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics;
------------------------------------------- -------------------------------------------
--- Extra tables, mostly used by indicators --- Extra tables, mostly used by indicators
@ -63,4 +65,4 @@ from (
join ${stats_db_name}.result res on res.id=r.id join ${stats_db_name}.result res on res.id=r.id
where r.amount is not null; where r.amount is not null;
create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset; create or replace view ${stats_db_name}.issn_gold_oa_dataset as select * from ${external_stats_db_name}.issn_gold_oa_dataset;

View File

@ -249,7 +249,7 @@ create table if not exists ${stats_db_name}.indi_pub_gold_oa stored as parquet a
left semi join dd on dd.id=pd.datasource left semi join dd on dd.id=pd.datasource
union all union all
select ra.id, 1 as is_gold select ra.id, 1 as is_gold
from ${stats_db_name}.result_accessroute ra on ra.id = pd.id where ra.accessroute = 'gold') tmp on tmp.id=pd.id; /*EOS*/ from ${stats_db_name}.result_accessroute ra where ra.accessroute = 'gold') tmp on tmp.id=pd.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as parquet as create table if not exists ${stats_db_name}.indi_pub_hybrid_oa_with_cc stored as parquet as
@ -294,7 +294,7 @@ left outer join (
join ${stats_db_name}.indi_pub_gold_oa indi_gold on indi_gold.id=p.id join ${stats_db_name}.indi_pub_gold_oa indi_gold on indi_gold.id=p.id
left outer join ${stats_db_name}.result_accessroute ra on ra.id=p.id left outer join ${stats_db_name}.result_accessroute ra on ra.id=p.id
where indi_gold.is_gold=0 and where indi_gold.is_gold=0 and
((d.type like '%Journal%' and ri.accessright not in ('Closed Access', 'Restricted', 'Not Available') and ri.license is not null) or ra.accessroute='hybrid')) tmp on pd.i=tmp.id; /*EOS*/ ((d.type like '%Journal%' and ri.accessright not in ('Closed Access', 'Restricted', 'Not Available') and ri.license is not null) or ra.accessroute='hybrid')) tmp on p.id=tmp.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_org_fairness purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_org_fairness purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_fairness stored as parquet as create table if not exists ${stats_db_name}.indi_org_fairness stored as parquet as
@ -380,7 +380,7 @@ CREATE TEMPORARY VIEW allresults as
drop table if exists ${stats_db_name}.indi_org_fairness_pub purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_org_fairness_pub purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_org_fairness_pub as create table if not exists ${stats_db_name}.indi_org_fairness_pub stored as parquet as
select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness select ar.organization, rf.no_result_fair/ar.no_allresults org_fairness
from allresults ar join result_fair rf from allresults ar join result_fair rf
on rf.organization=ar.organization; /*EOS*/ on rf.organization=ar.organization; /*EOS*/
@ -639,7 +639,7 @@ from ${stats_db_name}.publication p
drop table if exists ${stats_db_name}.indi_result_with_pid purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_result_with_pid purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_result_with_pid as create table if not exists ${stats_db_name}.indi_result_with_pid stored as parquet as
select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid select distinct p.id, coalesce(result_with_pid, 0) as result_with_pid
from ${stats_db_name}.result p from ${stats_db_name}.result p
left outer join ( left outer join (
@ -653,7 +653,7 @@ group by rf.id; /*EOS*/
drop table if exists ${stats_db_name}.indi_pub_interdisciplinarity purge; /*EOS*/ drop table if exists ${stats_db_name}.indi_pub_interdisciplinarity purge; /*EOS*/
create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity as create table if not exists ${stats_db_name}.indi_pub_interdisciplinarity stored as parquet as
select distinct p.id as id, coalesce(is_interdisciplinary, 0) select distinct p.id as id, coalesce(is_interdisciplinary, 0)
as is_interdisciplinary as is_interdisciplinary
from pub_fos_totals p from pub_fos_totals p
@ -1006,14 +1006,14 @@ left outer join (
drop table if exists ${stats_db_name}.result_country purge; /*EOS*/ drop table if exists ${stats_db_name}.result_country purge; /*EOS*/
create table ${stats_db_name}.result_country stored as parquet as create table ${stats_db_name}.result_country stored as parquet as
select distinct * select distinct id, country
from ( from (
select ro.id, o.country select ro.id, o.country
from ${stats_db_name}.result_organization ro from ${stats_db_name}.result_organization ro
left outer join ${stats_db_name}.organization o on o.id=ro.organization left outer join ${stats_db_name}.organization o on o.id=ro.organization
union all union all
select rp.id, f.country select rp.id, f.country
from ${stats_db_name}.result_projects from ${stats_db_name}.result_projects rp
left outer join ${stats_db_name}.project p on p.id=rp.project left outer join ${stats_db_name}.project p on p.id=rp.project
left outer join ${stats_db_name}.funder f on f.name=p.funder left outer join ${stats_db_name}.funder f on f.name=p.funder
) rc ) rc

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics;
---------------------------------------------------- ----------------------------------------------------
-- Shortcuts for various definitions in stats db --- -- Shortcuts for various definitions in stats db ---
---------------------------------------------------- ----------------------------------------------------
@ -25,4 +27,4 @@ drop table if exists ${stats_db_name}.result_gold purge;
create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as create table IF NOT EXISTS ${stats_db_name}.result_gold STORED AS PARQUET as
select r.id, case when gold.is_gold=1 then true else false end as gold select r.id, case when gold.is_gold=1 then true else false end as gold
from ${stats_db_name}.result r from ${stats_db_name}.result r
left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id; left outer join ${stats_db_name}.indi_pub_gold_oa gold on gold.id=r.id;

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics;
-- replace the creation of the result view to include the boolean fields from the previous tables (green, gold, -- replace the creation of the result view to include the boolean fields from the previous tables (green, gold,
-- peer reviewed) -- peer reviewed)
drop table if exists ${stats_db_name}.result_tmp; drop table if exists ${stats_db_name}.result_tmp;
@ -53,4 +55,4 @@ LEFT OUTER JOIN ${stats_db_name}.result_gold gold on gold.id=r.id;
drop table if exists ${stats_db_name}.result; drop table if exists ${stats_db_name}.result;
drop view if exists ${stats_db_name}.result; drop view if exists ${stats_db_name}.result;
create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp; create table ${stats_db_name}.result stored as parquet as select * from ${stats_db_name}.result_tmp;
drop table ${stats_db_name}.result_tmp; drop table ${stats_db_name}.result_tmp;

View File

@ -1,3 +1,5 @@
set mapred.job.queue.name=analytics;
-------------------------------------------------------------- --------------------------------------------------------------
-------------------------------------------------------------- --------------------------------------------------------------
-- Publication table/view and Publication related tables/views -- Publication table/view and Publication related tables/views
@ -111,4 +113,4 @@ SELECT substr(p.id, 4) AS id, xpath_string(citation.value, "//citation/id[@type=
FROM ${openaire_db_name}.publication p FROM ${openaire_db_name}.publication p
lateral view explode(p.extrainfo) citations AS citation lateral view explode(p.extrainfo) citations AS citation
WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != "" WHERE xpath_string(citation.value, "//citation/id[@type='openaire']/@value") != ""
and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false; and p.datainfo.deletedbyinference = false and p.datainfo.invisible=false;

View File

@ -368,6 +368,7 @@
${sparkClusterOpts} ${sparkClusterOpts}
${sparkResourceOpts} ${sparkResourceOpts}
${sparkApplicationOpts} ${sparkApplicationOpts}
--queue analytics
</spark-opts> </spark-opts>
<arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg> <arg>--hiveMetastoreUris</arg><arg>${hive_metastore_uris}</arg>
<arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql</arg> <arg>--sql</arg><arg>eu/dnetlib/dhp/oa/graph/stats/oozie_app/scripts/step16-createIndicatorsTables.sql</arg>
@ -551,4 +552,4 @@
</action> </action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -30,6 +30,10 @@
<name>oozie.launcher.mapred.job.queue.name</name> <name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value> <value>${oozieLauncherQueueName}</value>
</property> </property>
<property>
<name>mapred.child.java.opts</name>
<value>-Xmx16g</value>
</property>
</configuration> </configuration>
</global> </global>

View File

@ -960,7 +960,7 @@
<commons.logging.version>1.1.3</commons.logging.version> <commons.logging.version>1.1.3</commons.logging.version>
<commons-validator.version>1.7</commons-validator.version> <commons-validator.version>1.7</commons-validator.version>
<dateparser.version>1.0.7</dateparser.version> <dateparser.version>1.0.7</dateparser.version>
<dhp-schemas.version>[6.1.2-SNAPSHOT]</dhp-schemas.version> <dhp-schemas.version>[6.1.2]</dhp-schemas.version>
<dhp.cdh.version>cdh5.9.2</dhp.cdh.version> <dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
<dhp.commons.lang.version>3.5</dhp.commons.lang.version> <dhp.commons.lang.version>3.5</dhp.commons.lang.version>
<dhp.guava.version>11.0.2</dhp.guava.version> <dhp.guava.version>11.0.2</dhp.guava.version>