1
0
Fork 0

merged from beta

This commit is contained in:
Claudio Atzori 2024-07-17 12:01:40 +02:00
commit 06e3985b77
83 changed files with 2230 additions and 1109 deletions

View File

@ -328,7 +328,7 @@ public class MergeUtils {
final T merged = mergeOafFields(original, enrich, trust); final T merged = mergeOafFields(original, enrich, trust);
merged.setOriginalId(unionDistinctListOfString(merged.getOriginalId(), enrich.getOriginalId())); merged.setOriginalId(unionDistinctListOfString(merged.getOriginalId(), enrich.getOriginalId()));
merged.setPid(unionDistinctLists(merged.getPid(), enrich.getPid(), trust)); merged.setPid(mergeLists(merged.getPid(), enrich.getPid(), trust, MergeUtils::spKeyExtractor, (p1, p2) -> p1));
merged.setDateofcollection(LocalDateTime.now().toString()); merged.setDateofcollection(LocalDateTime.now().toString());
merged merged
.setDateoftransformation( .setDateoftransformation(
@ -464,6 +464,10 @@ public class MergeUtils {
merge.setIsInDiamondJournal(booleanOR(merge.getIsInDiamondJournal(), enrich.getIsInDiamondJournal())); merge.setIsInDiamondJournal(booleanOR(merge.getIsInDiamondJournal(), enrich.getIsInDiamondJournal()));
merge.setPubliclyFunded(booleanOR(merge.getPubliclyFunded(), enrich.getPubliclyFunded())); merge.setPubliclyFunded(booleanOR(merge.getPubliclyFunded(), enrich.getPubliclyFunded()));
if (StringUtils.isBlank(merge.getTransformativeAgreement())) {
merge.setTransformativeAgreement(enrich.getTransformativeAgreement());
}
return merge; return merge;
} }
@ -655,6 +659,13 @@ public class MergeUtils {
return d1; return d1;
} }
if (StringUtils.contains(d1.getValue(), "null")) {
return d2;
}
if (StringUtils.contains(d2.getValue(), "null")) {
return d1;
}
return Stream return Stream
.of(d1, d2) .of(d1, d2)
.min( .min(

View File

@ -2,31 +2,41 @@
package eu.dnetlib.pace.clustering; package eu.dnetlib.pace.clustering;
import java.util.*; import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.config.Config; import eu.dnetlib.pace.config.Config;
@ClusteringClass("keywordsclustering") @ClusteringClass("legalnameclustering")
public class KeywordsClustering extends AbstractClusteringFunction { public class LegalnameClustering extends AbstractClusteringFunction {
public KeywordsClustering(Map<String, Object> params) { private static final Pattern CITY_CODE_PATTERN = Pattern.compile("city::\\d+");
private static final Pattern KEYWORD_CODE_PATTERN = Pattern.compile("key::\\d+");
public LegalnameClustering(Map<String, Object> params) {
super(params); super(params);
} }
/**
 * Collects every distinct substring of {@code input} that matches {@code codeRegex}.
 *
 * @param input the text to scan
 * @param codeRegex the compiled pattern identifying a code
 * @return the distinct matches found; empty when the pattern never matches
 */
public Set<String> getRegexList(String input, Pattern codeRegex) {
    final Set<String> matches = new HashSet<>();
    for (Matcher m = codeRegex.matcher(input); m.find();) {
        matches.add(m.group());
    }
    return matches;
}
@Override @Override
protected Collection<String> doApply(final Config conf, String s) { protected Collection<String> doApply(final Config conf, String s) {
// takes city codes and keywords codes without duplicates
Set<String> keywords = getKeywords(s, conf.translationMap(), paramOrDefault("windowSize", 4));
Set<String> cities = getCities(s, paramOrDefault("windowSize", 4));
// list of combination to return as result // list of combination to return as result
final Collection<String> combinations = new LinkedHashSet<String>(); final Collection<String> combinations = new LinkedHashSet<String>();
for (String keyword : keywordsToCodes(keywords, conf.translationMap())) { for (String keyword : getRegexList(s, KEYWORD_CODE_PATTERN)) {
for (String city : citiesToCodes(cities)) { for (String city : getRegexList(s, CITY_CODE_PATTERN)) {
combinations.add(keyword + "-" + city); combinations.add(keyword + "-" + city);
if (combinations.size() >= paramOrDefault("max", 2)) { if (combinations.size() >= paramOrDefault("max", 2)) {
return combinations; return combinations;
@ -42,9 +52,6 @@ public class KeywordsClustering extends AbstractClusteringFunction {
return fields return fields
.stream() .stream()
.filter(f -> !f.isEmpty()) .filter(f -> !f.isEmpty())
.map(KeywordsClustering::cleanup)
.map(KeywordsClustering::normalize)
.map(s -> filterAllStopWords(s))
.map(s -> doApply(conf, s)) .map(s -> doApply(conf, s))
.map(c -> filterBlacklisted(c, ngramBlacklist)) .map(c -> filterBlacklisted(c, ngramBlacklist))
.flatMap(c -> c.stream()) .flatMap(c -> c.stream())

View File

@ -27,6 +27,14 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
private static Map<String, String> cityMap = AbstractPaceFunctions private static Map<String, String> cityMap = AbstractPaceFunctions
.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv"); .loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
// keywords map to be used when translating the keyword names into codes
private static Map<String, String> keywordMap = AbstractPaceFunctions
.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
// country map to be used when inferring the country from the city name
private static Map<String, String> countryMap = AbstractPaceFunctions
.loadCountryMapFromClasspath("/eu/dnetlib/pace/config/country_map.csv");
// list of stopwords in different languages // list of stopwords in different languages
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt"); protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt"); protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
@ -74,6 +82,64 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
return s12; return s12;
} }
/**
 * Returns {@code original} unless it equals "unknown" (case-insensitively); in that case
 * tries to derive a country from the cities detected in {@code inferFrom}.
 *
 * @param original the country value already present on the record
 * @param inferFrom the text in which city names are searched (window size 4)
 * @return {@code original} when it is not "unknown"; otherwise the first country obtained
 *         from the detected cities, or {@code "UNKNOWN"} when none is found
 */
public static String countryInference(final String original, String inferFrom) {
    final boolean countryIsKnown = !original.equalsIgnoreCase("unknown");
    if (countryIsKnown) {
        return original;
    }

    // same preprocessing chain as the other inference helpers: cleanup -> normalize -> stopword removal
    final String preprocessed = filterAllStopWords(normalize(cleanup(inferFrom)));

    final Set<String> inferredCountries = citiesToCountry(getCities(preprocessed, 4));

    // NOTE(review): when several countries are inferred, which one comes first depends on the
    // iteration order of the returned set - confirm this is acceptable
    return inferredCountries.stream().findFirst().orElse("UNKNOWN");
}
/**
 * Preprocesses {@code original} (cleanup, normalization, stopword removal) and replaces every
 * detected city name with the code registered for it in {@code cityMap}.
 *
 * @param original the raw text
 * @return the preprocessed text with the detected city names replaced by their codes
 */
public static String cityInference(String original) {
    original = cleanup(original);
    original = normalize(original);
    original = filterAllStopWords(original);

    for (final String city : getCities(original, 4)) {
        final String code = cityMap.get(city);
        if (code == null) {
            // no code registered for this name: leave the text untouched instead of failing
            continue;
        }
        // literal replacement: the city name is data, not a regular expression, and the code
        // must not be interpreted as a replacement template ('$' / '\')
        original = original.replace(city, code);
    }

    return original;
}
/**
 * Preprocesses {@code original} (cleanup, normalization, stopword removal) and replaces every
 * detected keyword with the code registered for it in {@code keywordMap}.
 *
 * @param original the raw text
 * @return the preprocessed text with the detected keywords replaced by their codes
 */
public static String keywordInference(String original) {
    original = cleanup(original);
    original = normalize(original);
    original = filterAllStopWords(original);

    for (final String keyword : getKeywords(original, keywordMap, 4)) {
        final String code = keywordMap.get(keyword);
        if (code == null) {
            // no code registered for this keyword: leave the text untouched instead of failing
            continue;
        }
        // literal replacement: the keyword is data, not a regular expression, and the code
        // must not be interpreted as a replacement template ('$' / '\')
        original = original.replace(keyword, code);
    }

    return original;
}
/**
 * Preprocesses {@code original} (cleanup, normalization, stopword removal) and replaces the
 * detected keywords and city names with the codes registered in {@code keywordMap} and
 * {@code cityMap} respectively.
 *
 * @param original the raw text
 * @return the preprocessed text with keywords and cities replaced by their codes
 */
public static String cityKeywordInference(String original) {
    original = cleanup(original);
    original = normalize(original);
    original = filterAllStopWords(original);

    // both detections run on the preprocessed text, before any replacement is applied
    final Set<String> keywords = getKeywords(original, keywordMap, 4);
    final Set<String> cities = getCities(original, 4);

    // keywords are replaced first, then cities; replacements are literal so that names are
    // never interpreted as regular expressions nor codes as replacement templates
    for (final String keyword : keywords) {
        final String code = keywordMap.get(keyword);
        if (code != null) {
            original = original.replace(keyword, code);
        }
    }

    for (final String city : cities) {
        final String code = cityMap.get(city);
        if (code != null) {
            original = original.replace(city, code);
        }
    }

    return original;
}
protected static String fixXML(final String a) { protected static String fixXML(final String a) {
return a return a
@ -208,6 +274,30 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
return m; return m;
} }
/**
 * Loads a mapping from city code to country code out of a classpath resource.
 *
 * <p>Each line of the resource has the form {@code country_code;city1;city2;...}. Every city
 * name is lower-cased, transliterated, passed through {@code fixAliases} and translated to its
 * code through {@code cityMap}; cities without a code in {@code cityMap} are skipped.
 *
 * @param classpath the classpath location of the resource
 * @return the city-code to country-code map; an empty map when the resource cannot be loaded
 */
public static Map<String, String> loadCountryMapFromClasspath(final String classpath) {

    final Transliterator transliterator = Transliterator.getInstance("Any-Eng");

    final Map<String, String> m = new HashMap<>();

    // try-with-resources so the resource stream is always released
    try (java.io.InputStream in = AbstractPaceFunctions.class.getResourceAsStream(classpath)) {
        for (final String s : IOUtils.readLines(in, StandardCharsets.UTF_8)) {
            // string is like this: country_code;city1;city2;city3
            final String[] line = s.split(";");
            final String value = line[0];
            for (int i = 1; i < line.length; i++) {
                final String city = fixAliases(transliterator.transliterate(line[i].toLowerCase()));
                final String code = cityMap.get(city);
                if (code != null) {
                    // only register cities that have a code; otherwise a null key would be stored
                    m.put(code, value);
                }
            }
        }
    } catch (final Throwable e) {
        // best effort, as before: any failure yields an empty map
        // NOTE(review): the failure is silent - consider logging it once a logger is available here
        return new HashMap<>();
    }
    return m;
}
public static String removeKeywords(String s, Set<String> keywords) { public static String removeKeywords(String s, Set<String> keywords) {
s = " " + s + " "; s = " " + s + " ";
@ -237,6 +327,10 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
return toCodes(keywords, cityMap); return toCodes(keywords, cityMap);
} }
/**
 * Translates city names into country values in two steps: names to city codes through
 * {@code cityMap}, then city codes to countries through {@code countryMap}.
 *
 * @param cities the city names to translate
 * @return the result of applying {@code toCodes} with {@code countryMap} to the city codes
 */
public static Set<String> citiesToCountry(Set<String> cities) {
    final Set<String> cityCodes = toCodes(cities, cityMap);
    return toCodes(cityCodes, countryMap);
}
protected static String firstLC(final String s) { protected static String firstLC(final String s) {
return StringUtils.substring(s, 0, 1).toLowerCase(); return StringUtils.substring(s, 0, 1).toLowerCase();
} }

View File

@ -47,9 +47,21 @@ public class FieldDef implements Serializable {
private String clean; private String clean;
private String infer;
private String inferenceFrom;
public FieldDef() { public FieldDef() {
} }
/**
 * Returns the path from which the value of this field is inferred.
 *
 * @return the configured {@code inferenceFrom} value, possibly {@code null} when not configured
 */
public String getInferenceFrom() {
return inferenceFrom;
}
/**
 * Sets the path from which the value of this field is inferred.
 *
 * @param inferenceFrom the new {@code inferenceFrom} value
 */
public void setInferenceFrom(final String inferenceFrom) {
this.inferenceFrom = inferenceFrom;
}
public String getName() { public String getName() {
return name; return name;
} }
@ -126,6 +138,14 @@ public class FieldDef implements Serializable {
this.clean = clean; this.clean = clean;
} }
/**
 * Returns the kind of inference configured for this field.
 *
 * @return the configured {@code infer} value, possibly {@code null} when not configured
 */
public String getInfer() {
return infer;
}
/**
 * Sets the kind of inference to apply to this field.
 *
 * @param infer the new {@code infer} value
 */
public void setInfer(String infer) {
this.infer = infer;
}
@Override @Override
public String toString() { public String toString() {
try { try {

View File

@ -123,9 +123,19 @@ case class SparkModel(conf: DedupConfig) {
case _ => res(index) case _ => res(index)
} }
} }
if (StringUtils.isNotBlank(fdef.getInfer)) {
val inferFrom : String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
res(index) = res(index) match {
case x: Seq[String] => x.map(inference(_, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer))
case _ => inference(res(index).toString, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer)
}
}
} }
res res
} }
new GenericRowWithSchema(values, schema) new GenericRowWithSchema(values, schema)
@ -146,5 +156,17 @@ case class SparkModel(conf: DedupConfig) {
res res
} }
/** Applies the inference selected by `infertype` to `value`.
  *
  * Only the "country" inference reads `inferfrom`; every other recognised type works on
  * `value` alone, and an unrecognised type returns `value` unchanged.
  */
def inference(value: String, inferfrom: String, infertype: String): String =
  infertype match {
    case "country"      => AbstractPaceFunctions.countryInference(value, inferfrom)
    case "city"         => AbstractPaceFunctions.cityInference(value)
    case "keyword"      => AbstractPaceFunctions.keywordInference(value)
    case "city_keyword" => AbstractPaceFunctions.cityKeywordInference(value)
    case _              => value
  }
} }

View File

@ -1,48 +0,0 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("cityMatch")
public class CityMatch extends AbstractStringComparator {
private Map<String, String> params;
public CityMatch(Map<String, String> params) {
super(params);
this.params = params;
}
@Override
public double distance(final String a, final String b, final Config conf) {
String ca = cleanup(a);
String cb = cleanup(b);
ca = normalize(ca);
cb = normalize(cb);
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> codes1 = citiesToCodes(cities1);
Set<String> codes2 = citiesToCodes(cities2);
// if no cities are detected, the comparator gives 1.0
if (codes1.isEmpty() && codes2.isEmpty())
return 1.0;
else {
if (codes1.isEmpty() ^ codes2.isEmpty())
return -1; // undefined if one of the two has no cities
return commonElementsPercentage(codes1, codes2);
}
}
}

View File

@ -0,0 +1,51 @@
package eu.dnetlib.pace.tree;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
 * Comparator that extracts codes from the two input strings with a regular expression and
 * scores the overlap between the two code sets.
 *
 * <p>The expression is read from the {@code codeRegex} parameter. The default matches a run of
 * letters followed by {@code ::} and digits (e.g. {@code key::12}, {@code city::345}).
 */
@ComparatorClass("codeMatch")
public class CodeMatch extends AbstractStringComparator {

    // the prefix must be matched as a whole word: a single-letter prefix would reduce both
    // "key::1" and "city::1" to "y::1", making codes of different kinds collide
    private static final String DEFAULT_CODE_REGEX = "[a-zA-Z]+::\\d+";

    private final Map<String, String> params;

    private final Pattern codeRegex;

    /**
     * @param params comparator parameters; {@code codeRegex} optionally overrides the expression
     *        used to detect codes
     */
    public CodeMatch(Map<String, String> params) {
        super(params);
        this.params = params;
        this.codeRegex = Pattern.compile(params.getOrDefault("codeRegex", DEFAULT_CODE_REGEX));
    }

    /**
     * Extracts the distinct codes contained in {@code input}.
     *
     * @param input the text to scan; {@code null} is treated as containing no codes
     * @return the distinct matches of the configured expression, empty when there are none
     */
    public Set<String> getRegexList(String input) {
        final Set<String> codes = new HashSet<>();
        if (input == null) {
            return codes;
        }
        final Matcher matcher = this.codeRegex.matcher(input);
        while (matcher.find()) {
            codes.add(matcher.group());
        }
        return codes;
    }

    /**
     * @return 1.0 when neither string contains codes, -1 when exactly one of them does
     *         (undefined), otherwise the value of {@code commonElementsPercentage} on the two
     *         code sets
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {

        final Set<String> codes1 = getRegexList(a);
        final Set<String> codes2 = getRegexList(b);

        // if no codes are detected, the comparator gives 1.0
        if (codes1.isEmpty() && codes2.isEmpty()) {
            return 1.0;
        }

        // undefined if only one of the two has codes
        if (codes1.isEmpty() || codes2.isEmpty()) {
            return -1;
        }

        return commonElementsPercentage(codes1, codes2);
    }
}

View File

@ -0,0 +1,54 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
 * Comparator for country values: two known countries match only when they are equal; a missing
 * or unknown country on either side makes the comparison undefined.
 */
@ComparatorClass("countryMatch")
public class CountryMatch extends AbstractStringComparator {

    private Map<String, String> params;

    public CountryMatch(Map<String, String> params) {
        super(params, new com.wcohen.ss.JaroWinkler());
        this.params = params;
    }

    public CountryMatch(final double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    protected CountryMatch(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * @return -1.0 when either value is {@code null}, empty or "unknown" (case-insensitive);
     *         otherwise 1.0 when the two values are equal (case-sensitive) and 0.0 when they differ
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {

        // undefined when a country is missing or UNKNOWN on either side
        if (isMissing(a) || isMissing(b)) {
            return -1.0;
        }

        return a.equals(b) ? 1.0 : 0.0;
    }

    /** Tells whether a country value carries no usable information. */
    private static boolean isMissing(final String country) {
        return country == null || country.isEmpty() || country.equalsIgnoreCase("unknown");
    }

    @Override
    public double getWeight() {
        return super.weight;
    }

    /** The score is already in its final form: returned unchanged. */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -0,0 +1,59 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import com.wcohen.ss.AbstractStringDistance;

import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
 * Jaro-Winkler comparator for legal names: city codes ({@code city::N}) and keyword codes
 * ({@code key::N}) are stripped from both names before the remaining text is scored.
 */
@ComparatorClass("jaroWinklerLegalname")
public class JaroWinklerLegalname extends AbstractStringComparator {

    // compiled once: distance() runs for every compared pair
    private static final Pattern CITY_CODE_PATTERN = Pattern.compile("city::\\d+");
    private static final Pattern KEYWORD_CODE_PATTERN = Pattern.compile("key::\\d+");
    private static final Pattern MULTIPLE_SPACES_PATTERN = Pattern.compile("[ ]{2,}");

    private Map<String, String> params;

    public JaroWinklerLegalname(Map<String, String> params) {
        super(params, new com.wcohen.ss.JaroWinkler());
        this.params = params;
    }

    public JaroWinklerLegalname(double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    protected JaroWinklerLegalname(double weight, AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * @return 1.0 when both names are empty once the codes are removed; otherwise the score
     *         given by the configured string distance on the stripped names
     */
    @Override
    public double distance(String a, String b, final Config conf) {

        final String ca = stripCodes(a);
        final String cb = stripCodes(b);

        if (ca.isEmpty() && cb.isEmpty()) {
            return 1.0;
        }

        return normalize(ssalgo.score(ca, cb));
    }

    /**
     * Removes city and keyword codes, collapses runs of spaces and trims the result. Trimming
     * matters: a name made only of codes must become empty, and a leading code must not leave
     * a leading blank that would distort the prefix-sensitive score.
     */
    private static String stripCodes(final String name) {
        String stripped = CITY_CODE_PATTERN.matcher(name).replaceAll("");
        stripped = KEYWORD_CODE_PATTERN.matcher(stripped).replaceAll(" ");
        return MULTIPLE_SPACES_PATTERN.matcher(stripped).replaceAll(" ").trim();
    }

    @Override
    public double getWeight() {
        return super.weight;
    }

    /** The score is already in its final form: returned unchanged. */
    @Override
    protected double normalize(double d) {
        return d;
    }
}

View File

@ -1,74 +0,0 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("jaroWinklerNormalizedName")
public class JaroWinklerNormalizedName extends AbstractStringComparator {
private Map<String, String> params;
public JaroWinklerNormalizedName(Map<String, String> params) {
super(params, new com.wcohen.ss.JaroWinkler());
this.params = params;
}
public JaroWinklerNormalizedName(double weight) {
super(weight, new com.wcohen.ss.JaroWinkler());
}
protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
super(weight, ssalgo);
}
@Override
public double distance(String a, String b, final Config conf) {
String ca = cleanup(a);
String cb = cleanup(b);
ca = normalize(ca);
cb = normalize(cb);
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
Set<String> keywords1 = getKeywords(
ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> keywords2 = getKeywords(
cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
ca = removeKeywords(ca, keywords1);
ca = removeKeywords(ca, cities1);
cb = removeKeywords(cb, keywords2);
cb = removeKeywords(cb, cities2);
ca = ca.replaceAll("[ ]{2,}", " ");
cb = cb.replaceAll("[ ]{2,}", " ");
if (ca.isEmpty() && cb.isEmpty())
return 1.0;
else
return normalize(ssalgo.score(ca, cb));
}
@Override
public double getWeight() {
return super.weight;
}
@Override
protected double normalize(double d) {
return d;
}
}

View File

@ -1,50 +0,0 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
@ComparatorClass("keywordMatch")
public class KeywordMatch extends AbstractStringComparator {
Map<String, String> params;
public KeywordMatch(Map<String, String> params) {
super(params);
this.params = params;
}
@Override
public double distance(final String a, final String b, final Config conf) {
String ca = cleanup(a);
String cb = cleanup(b);
ca = normalize(ca);
cb = normalize(cb);
ca = filterAllStopWords(ca);
cb = filterAllStopWords(cb);
Set<String> keywords1 = getKeywords(
ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> keywords2 = getKeywords(
cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
Set<String> codes1 = toCodes(keywords1, conf.translationMap());
Set<String> codes2 = toCodes(keywords2, conf.translationMap());
// if no cities are detected, the comparator gives 1.0
if (codes1.isEmpty() && codes2.isEmpty())
return 1.0;
else {
if (codes1.isEmpty() ^ codes2.isEmpty())
return -1.0; // undefined if one of the two has no keywords
return commonElementsPercentage(codes1, codes2);
}
}
}

View File

@ -48,7 +48,7 @@ public class TreeNodeDef implements Serializable {
// function for the evaluation of the node // function for the evaluation of the node
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) { public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
TreeNodeStats stats = new TreeNodeStats(); TreeNodeStats stats = new TreeNodeStats(ignoreUndefined);
// for each field in the node, it computes the // for each field in the node, it computes the
for (FieldConf fieldConf : fields) { for (FieldConf fieldConf : fields) {

View File

@ -9,8 +9,11 @@ public class TreeNodeStats implements Serializable {
private Map<String, FieldStats> results; // this is an accumulator for the results of the node private Map<String, FieldStats> results; // this is an accumulator for the results of the node
public TreeNodeStats() { private final boolean ignoreUndefined;
public TreeNodeStats(boolean ignoreUndefined) {
this.results = new HashMap<>(); this.results = new HashMap<>();
this.ignoreUndefined = ignoreUndefined;
} }
public Map<String, FieldStats> getResults() { public Map<String, FieldStats> getResults() {
@ -22,7 +25,10 @@ public class TreeNodeStats implements Serializable {
} }
public int fieldsCount() { public int fieldsCount() {
return this.results.size(); if (ignoreUndefined)
return this.results.size();
else
return this.results.size() - undefinedCount(); // do not count undefined
} }
public int undefinedCount() { public int undefinedCount() {
@ -78,11 +84,22 @@ public class TreeNodeStats implements Serializable {
double min = 100.0; // random high value double min = 100.0; // random high value
for (FieldStats fs : this.results.values()) { for (FieldStats fs : this.results.values()) {
if (fs.getResult() < min) { if (fs.getResult() < min) {
if (fs.getResult() >= 0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined())) if (fs.getResult() == -1) {
if (fs.isCountIfUndefined()) {
min = 0.0;
} else {
min = -1;
}
} else {
min = fs.getResult(); min = fs.getResult();
}
} }
} }
return min; if (ignoreUndefined) {
return min == -1.0 ? 0.0 : min;
} else {
return min;
}
} }
// if at least one is true, return 1.0 // if at least one is true, return 1.0
@ -91,7 +108,11 @@ public class TreeNodeStats implements Serializable {
if (fieldStats.getResult() >= fieldStats.getThreshold()) if (fieldStats.getResult() >= fieldStats.getThreshold())
return 1.0; return 1.0;
} }
return 0.0; if (!ignoreUndefined && undefinedCount() > 0) {
return -1.0;
} else {
return 0.0;
}
} }
// if at least one is false, return 0.0 // if at least one is false, return 0.0
@ -100,7 +121,7 @@ public class TreeNodeStats implements Serializable {
if (fieldStats.getResult() == -1) { if (fieldStats.getResult() == -1) {
if (fieldStats.isCountIfUndefined()) if (fieldStats.isCountIfUndefined())
return 0.0; return ignoreUndefined ? 0.0 : -1.0;
} else { } else {
if (fieldStats.getResult() < fieldStats.getThreshold()) if (fieldStats.getResult() < fieldStats.getThreshold())
return 0.0; return 0.0;

View File

@ -44,12 +44,10 @@ public class TreeProcessor {
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config); TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
treeStats.addNodeStats(nextNodeName, stats); treeStats.addNodeStats(nextNodeName, stats);
// if ignoreUndefined=false the miss is considered as undefined double finalScore = stats.getFinalScore(currentNode.getAggregation());
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) { if (finalScore == -1.0)
nextNodeName = currentNode.getUndefined(); nextNodeName = currentNode.getUndefined();
} else if (finalScore >= currentNode.getThreshold()) {
// if ignoreUndefined=true the miss is ignored and the score computed anyway
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
nextNodeName = currentNode.getPositive(); nextNodeName = currentNode.getPositive();
} else { } else {
nextNodeName = currentNode.getNegative(); nextNodeName = currentNode.getNegative();

File diff suppressed because one or more lines are too long

View File

@ -8,6 +8,7 @@ import org.junit.jupiter.api.Test;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import com.mongodb.connection.Cluster;
import eu.dnetlib.pace.AbstractPaceTest; import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.common.AbstractPaceFunctions; import eu.dnetlib.pace.common.AbstractPaceFunctions;
@ -177,41 +178,16 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
} }
@Test @Test
public void testKeywordsClustering() { public void legalnameClustering() {
final ClusteringFunction cf = new KeywordsClustering(params); final ClusteringFunction cf = new LegalnameClustering(params);
final String s = "Polytechnic University of Turin"; String s = "key::1 key::2 city::1";
System.out.println(s); System.out.println(s);
System.out.println(cf.apply(conf, Lists.newArrayList(s))); System.out.println(cf.apply(conf, Lists.newArrayList(s)));
final String s1 = "POLITECNICO DI TORINO"; s = "key::1 key::2 city::1 city::2";
System.out.println(s1); System.out.println(s);
System.out.println(cf.apply(conf, Lists.newArrayList(s1))); System.out.println(cf.apply(conf, Lists.newArrayList(s)));
final String s2 = "Universita farmaceutica culturale di milano bergamo";
System.out.println("s2 = " + s2);
System.out.println(cf.apply(conf, Lists.newArrayList(s2)));
final String s3 = "universita universita milano milano";
System.out.println("s3 = " + s3);
System.out.println(cf.apply(conf, Lists.newArrayList(s3)));
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
System.out.println("s4 = " + s4);
System.out.println(cf.apply(conf, Lists.newArrayList(s4)));
final String s5 = "İstanbul Ticarət Universiteti";
System.out.println("s5 = " + s5);
System.out.println(cf.apply(conf, Lists.newArrayList(s5)));
final String s6 = "National and Kapodistrian University of Athens";
System.out.println("s6 = " + s6);
System.out.println(cf.apply(conf, Lists.newArrayList(s6)));
final String s7 = "Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών";
System.out.println("s7 = " + s7);
System.out.println(cf.apply(conf, Lists.newArrayList(s7)));
} }
@Test @Test

View File

@ -54,4 +54,47 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING)); System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
} }
@Test
public void countryInferenceTest() {
assertEquals("IT", countryInference("UNKNOWN", "Università di Bologna"));
assertEquals("UK", countryInference("UK", "Università di Bologna"));
assertEquals("IT", countryInference("UNKNOWN", "Universiteé de Naples"));
assertEquals("UNKNOWN", countryInference("UNKNOWN", "Università del Lavoro"));
}
@Test
public void cityInferenceTest() {
assertEquals("universita city::3181928", cityInference("Università di Bologna"));
assertEquals("university city::3170647", cityInference("University of Pisa"));
assertEquals("universita", cityInference("Università del lavoro"));
assertEquals("universita city::3173331 city::3169522", cityInference("Università di Modena e Reggio Emilia"));
}
@Test
public void keywordInferenceTest() {
assertEquals("key::41 turin", keywordInference("Polytechnic University of Turin"));
assertEquals("key::41 torino", keywordInference("POLITECNICO DI TORINO"));
assertEquals(
"key::1 key::60 key::81 milano bergamo",
keywordInference("Universita farmaceutica culturale di milano bergamo"));
assertEquals("key::1 key::1 milano milano", keywordInference("universita universita milano milano"));
assertEquals(
"key::10 kapodistriako panepistemio athenon",
keywordInference("Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών"));
}
@Test
public void cityKeywordInferenceTest() {
assertEquals("key::41 city::3165524", cityKeywordInference("Polytechnic University of Turin"));
assertEquals("key::41 city::3165524", cityKeywordInference("POLITECNICO DI TORINO"));
assertEquals(
"key::1 key::60 key::81 city::3173435 city::3182164",
cityKeywordInference("Universita farmaceutica culturale di milano bergamo"));
assertEquals(
"key::1 key::1 city::3173435 city::3173435", cityKeywordInference("universita universita milano milano"));
assertEquals(
"key::10 kapodistriako panepistemio city::264371",
cityKeywordInference("Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών"));
}
} }

View File

@ -35,6 +35,7 @@ public class ComparatorTest extends AbstractPaceTest {
params.put("name_th", "0.95"); params.put("name_th", "0.95");
params.put("jpath_value", "$.value"); params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid"); params.put("jpath_classid", "$.qualifier.classid");
params.put("codeRegex", "key::\\d+");
} }
@Test @Test
@ -44,52 +45,23 @@ public class ComparatorTest extends AbstractPaceTest {
} }
@Test @Test
public void cityMatchTest() { public void codeMatchTest() {
final CityMatch cityMatch = new CityMatch(params); CodeMatch codeMatch = new CodeMatch(params);
// both names with no cities // both names with no codes
assertEquals(1.0, cityMatch.distance("Università", "Centro di ricerca", conf)); assertEquals(1.0, codeMatch.distance("testing1", "testing2", conf));
// one of the two names with no cities // one of the two names with no codes
assertEquals(-1.0, cityMatch.distance("Università di Bologna", "Centro di ricerca", conf)); assertEquals(-1.0, codeMatch.distance("testing1 key::1", "testing", conf));
// both names with cities (same) // both names with codes (same)
assertEquals(1.0, cityMatch.distance("Universita di Bologna", "Biblioteca di Bologna", conf)); assertEquals(1.0, codeMatch.distance("testing1 key::1", "testing2 key::1", conf));
// both names with cities (different) // both names with codes (different)
assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf)); assertEquals(0.0, codeMatch.distance("testing1 key::1", "testing2 key::2", conf));
assertEquals(0.0, cityMatch.distance("Franklin College", "Concordia College", conf));
// particular cases // both names with codes (1 same, 1 different)
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf)); assertEquals(0.5, codeMatch.distance("key::1 key::2 testing1", "key::1 testing", conf));
assertEquals(
1.0,
cityMatch
.distance(
"Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology",
conf));
// failing because 'Allen' is a transliterated Greek stopword
// assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
assertEquals(-1.0, cityMatch.distance("Washington (United States)", "United States Military Academy", conf));
}
@Test
public void keywordMatchTest() {
params.put("threshold", "0.5");
final KeywordMatch keywordMatch = new KeywordMatch(params);
assertEquals(
0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf));
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
assertEquals(2.0 / 3.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf));
assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf));
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
} }
@ -155,15 +127,15 @@ public class ComparatorTest extends AbstractPaceTest {
} }
@Test @Test
public void jaroWinklerNormalizedNameTest() { public void jaroWinklerLegalnameTest() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params); final JaroWinklerLegalname jaroWinklerLegalname = new JaroWinklerLegalname(params);
double result = jaroWinklerNormalizedName double result = jaroWinklerLegalname
.distance("AT&T (United States)", "United States Military Academy", conf); .distance("AT&T (United States)", "United States key::2 key::1", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf); result = jaroWinklerLegalname.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
System.out.println("result = " + result); System.out.println("result = " + result);
} }
@ -336,4 +308,23 @@ public class ComparatorTest extends AbstractPaceTest {
System.out.println("compare = " + compare); System.out.println("compare = " + compare);
} }
/**
 * Exercises {@code CountryMatch#distance} on the four country-code combinations
 * covered by this test and pins the score expected for each of them.
 */
@Test
public void countryMatch() {
	final CountryMatch matcher = new CountryMatch(params);

	// both sides carry the "UNKNOWN" placeholder
	final double bothUnknown = matcher.distance("UNKNOWN", "UNKNOWN", conf);
	assertEquals(-1.0, bothUnknown);

	// only one side carries the "UNKNOWN" placeholder
	final double oneUnknown = matcher.distance("CL", "UNKNOWN", conf);
	assertEquals(-1.0, oneUnknown);

	// two different country codes
	final double differentCodes = matcher.distance("CL", "IT", conf);
	assertEquals(0.0, differentCodes);

	// the same country code on both sides
	final double sameCode = matcher.distance("CL", "CL", conf);
	assertEquals(1.0, sameCode);
}
} }

View File

@ -51,48 +51,5 @@
<artifactId>hadoop-distcp</artifactId> <artifactId>hadoop-distcp</artifactId>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-api</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-common</artifactId>
<exclusions>
<exclusion>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
</exclusion>
<exclusion>
<groupId>saxonica</groupId>
<artifactId>saxon</artifactId>
</exclusion>
<exclusion>
<groupId>saxonica</groupId>
<artifactId>saxon-dom</artifactId>
</exclusion>
<exclusion>
<groupId>jgrapht</groupId>
<artifactId>jgrapht</artifactId>
</exclusion>
<exclusion>
<groupId>net.sf.ehcache</groupId>
<artifactId>ehcache</artifactId>
</exclusion>
<exclusion>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.*</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>apache</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies> </dependencies>
</project> </project>

View File

@ -4,7 +4,6 @@ package eu.dnetlib.dhp.actionmanager;
import java.io.Serializable; import java.io.Serializable;
import java.io.StringReader; import java.io.StringReader;
import java.util.List; import java.util.List;
import java.util.NoSuchElementException;
import java.util.Optional; import java.util.Optional;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -22,7 +21,6 @@ import com.google.common.base.Splitter;
import com.google.common.collect.Iterables; import com.google.common.collect.Iterables;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.actionmanager.rmi.ActionManagerException;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -65,7 +63,7 @@ public class ISClient implements Serializable {
.map(t -> buildDirectory(basePath, t)) .map(t -> buildDirectory(basePath, t))
.collect(Collectors.toList())) .collect(Collectors.toList()))
.orElseThrow(() -> new IllegalStateException("empty set list")); .orElseThrow(() -> new IllegalStateException("empty set list"));
} catch (ActionManagerException | ISLookUpException e) { } catch (ISLookUpException e) {
throw new IllegalStateException("unable to query ActionSets info from the IS"); throw new IllegalStateException("unable to query ActionSets info from the IS");
} }
} }
@ -89,31 +87,18 @@ public class ISClient implements Serializable {
return Joiner.on("/").join(basePath, t.getMiddle(), t.getRight()); return Joiner.on("/").join(basePath, t.getMiddle(), t.getRight());
} }
private String getBasePathHDFS(ISLookUpService isLookup) throws ActionManagerException { private String getBasePathHDFS(ISLookUpService isLookup) throws ISLookUpException {
return queryServiceProperty(isLookup, "basePath"); return queryServiceProperty(isLookup, "basePath");
} }
private String queryServiceProperty(ISLookUpService isLookup, final String propertyName) private String queryServiceProperty(ISLookUpService isLookup, final String propertyName)
throws ActionManagerException { throws ISLookUpException {
final String q = "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='" final String q = "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='"
+ propertyName + propertyName
+ "']/@value/string()"; + "']/@value/string()";
log.debug("quering for service property: {}", q); log.debug("quering for service property: {}", q);
try {
final List<String> value = isLookup.quickSearchProfile(q); final List<String> value = isLookup.quickSearchProfile(q);
return Iterables.getOnlyElement(value); return Iterables.getOnlyElement(value);
} catch (ISLookUpException e) {
String msg = "Error accessing service profile, using query: " + q;
log.error(msg, e);
throw new ActionManagerException(msg, e);
} catch (NoSuchElementException e) {
String msg = "missing service property: " + propertyName;
log.error(msg, e);
throw new ActionManagerException(msg, e);
} catch (IllegalArgumentException e) {
String msg = "found more than one service property: " + propertyName;
log.error(msg, e);
throw new ActionManagerException(msg, e);
}
} }
} }

View File

@ -42,6 +42,9 @@ public class Constants {
public static final String NULL = "NULL"; public static final String NULL = "NULL";
public static final String NA = "N/A"; public static final String NA = "N/A";
public static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
public static final String WEB_CRAWL_NAME = "Web Crawl";
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private Constants() { private Constants() {

View File

@ -41,9 +41,9 @@ public class PrepareAffiliationRelations implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelations.class); private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelations.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String ID_PREFIX = "50|doi_________::"; private static final String ID_PREFIX = "50|doi_________::";
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:bipinference"; public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference";
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by BIP!"; public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE";
public static final String BIP_INFERENCE_PROVENANCE = "bip:affiliation:crossref"; public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation";
public static <I extends Result> void main(String[] args) throws Exception { public static <I extends Result> void main(String[] args) throws Exception {
@ -71,6 +71,9 @@ public class PrepareAffiliationRelations implements Serializable {
final String dataciteInputPath = parser.get("dataciteInputPath"); final String dataciteInputPath = parser.get("dataciteInputPath");
log.info("dataciteInputPath: {}", dataciteInputPath); log.info("dataciteInputPath: {}", dataciteInputPath);
final String webcrawlInputPath = parser.get("webCrawlInputPath");
log.info("webcrawlInputPath: {}", webcrawlInputPath);
final String outputPath = parser.get("outputPath"); final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath); log.info("outputPath: {}", outputPath);
@ -102,10 +105,16 @@ public class PrepareAffiliationRelations implements Serializable {
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations( JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
spark, dataciteInputPath, collectedFromDatacite); spark, dataciteInputPath, collectedFromDatacite);
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
.listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME);
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
spark, webcrawlInputPath, collectedFromWebCrawl);
crossrefRelations crossrefRelations
.union(pubmedRelations) .union(pubmedRelations)
.union(openAPCRelations) .union(openAPCRelations)
.union(dataciteRelations) .union(dataciteRelations)
.union(webCrawlRelations)
.saveAsHadoopFile( .saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class); outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);

View File

@ -5,7 +5,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable; import java.io.Serializable;
import java.util.*; import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
@ -21,6 +20,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.Constants;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
@ -44,8 +44,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
private static final String PMID_PREFIX = "50|pmid________::"; private static final String PMID_PREFIX = "50|pmid________::";
private static final String PMCID_PREFIX = "50|pmc_________::"; private static final String PMCID_PREFIX = "50|pmc_________::";
private static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
private static final String WEB_CRAWL_NAME = "Web Crawl";
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
@ -104,8 +103,6 @@ public class CreateActionSetFromWebEntries implements Serializable {
final String ror = ROR_PREFIX final String ror = ROR_PREFIX
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror"))); + IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror)); ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
// ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
// ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
return ret return ret
.iterator(); .iterator();
@ -145,11 +142,6 @@ public class CreateActionSetFromWebEntries implements Serializable {
"institution.country_code as country_code", "publication_year") "institution.country_code as country_code", "publication_year")
.distinct(); .distinct();
// .selectExpr(
// "id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
// "institution.country_code as country_code", "publication_year")
// .distinct();
} }
private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) { private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {
@ -220,7 +212,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
ModelConstants.IS_AUTHOR_INSTITUTION_OF, ModelConstants.IS_AUTHOR_INSTITUTION_OF,
Arrays Arrays
.asList( .asList(
OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)), OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)),
OafMapperUtils OafMapperUtils
.dataInfo( .dataInfo(
false, null, false, false, false, null, false, false,
@ -239,7 +231,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
ModelConstants.HAS_AUTHOR_INSTITUTION, ModelConstants.HAS_AUTHOR_INSTITUTION,
Arrays Arrays
.asList( .asList(
OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)), OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)),
OafMapperUtils OafMapperUtils
.dataInfo( .dataInfo(
false, null, false, false, false, null, false, false,

View File

@ -0,0 +1,76 @@
package eu.dnetlib.dhp.collection.plugin.researchfi;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
public class ResearchFiCollectorPlugin implements CollectorPlugin {
private static final Logger log = LoggerFactory.getLogger(ResearchFiCollectorPlugin.class);
@Override
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report)
throws CollectorException {
final String authUrl = api.getParams().get("auth_url");
final String clientId = api.getParams().get("auth_client_id");
final String clientSecret = api.getParams().get("auth_client_secret");
final String authToken = authenticate(authUrl, clientId, clientSecret);
final Iterator<String> iter = new ResearchFiIterator(api.getBaseUrl(), authToken);
return StreamSupport.stream(Spliterators.spliteratorUnknownSize(iter, Spliterator.ORDERED), false);
}
private String authenticate(final String authUrl, final String clientId, final String clientSecret)
throws CollectorException {
try (final CloseableHttpClient client = HttpClients.createDefault()) {
final HttpPost req = new HttpPost(authUrl);
final List<NameValuePair> params = new ArrayList<>();
params.add(new BasicNameValuePair("grant_type", "client_credentials"));
params.add(new BasicNameValuePair("client_id", clientId));
params.add(new BasicNameValuePair("client_secret", clientSecret));
req.setEntity(new UrlEncodedFormEntity(params, "UTF-8"));
try (final CloseableHttpResponse response = client.execute(req)) {
final String content = IOUtils.toString(response.getEntity().getContent());
final JSONObject obj = new JSONObject(content);
final String token = obj.getString("access_token");
if (StringUtils.isNotBlank(token)) {
return token;
}
}
} catch (final Throwable e) {
log.warn("Error obtaining access token", e);
throw new CollectorException("Error obtaining access token", e);
}
throw new CollectorException("Access token is missing");
}
}

View File

@ -0,0 +1,117 @@
package eu.dnetlib.dhp.collection.plugin.researchfi;
import java.util.Iterator;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Header;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.json.JSONArray;
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
import eu.dnetlib.dhp.common.collection.CollectorException;
/**
 * Iterator over the records of a paginated REST endpoint.
 *
 * <p>
 * Pages are requested lazily, starting from {@code PageNumber=1}, with the bearer token passed
 * in the {@code Authorization} header. The total number of pages is read from the
 * {@code x-page-count} response header. Each page is expected to be a JSON array: every element
 * is converted with {@link JsonUtils#convertToXML(String)} and buffered in an internal queue.
 *
 * <p>
 * NOTE(review): the buffer is a {@link PriorityBlockingQueue} without a comparator, so the
 * buffered records are polled in natural {@link String} order rather than in the order
 * returned by the API - confirm this is acceptable for the consumers.
 */
public class ResearchFiIterator implements Iterator<String> {

	private static final Log log = LogFactory.getLog(ResearchFiIterator.class);

	private static final int PAGE_SIZE = 100;

	private final String baseUrl;
	private final String authToken;

	/** Number of the last requested page (0 means that no call has been performed yet). */
	private int currPage;

	/** Total number of pages, as declared by the last "x-page-count" header received. */
	private int nPages;

	private final Queue<String> queue = new PriorityBlockingQueue<>();

	/**
	 * @param baseUrl   the url of the endpoint, with or without a query string
	 * @param authToken the token sent as "Bearer" credential on every call
	 */
	public ResearchFiIterator(final String baseUrl, final String authToken) {
		this.baseUrl = baseUrl;
		this.authToken = authToken;
		this.currPage = 0;
		this.nPages = 0;
	}

	/**
	 * Performs the first call if needed, then keeps requesting pages while the buffer is empty
	 * and there are pages left, so that an empty page never ends the iteration prematurely.
	 *
	 * @throws IllegalStateException if a call fails
	 */
	private void fillQueue() {
		try {
			if (this.currPage == 0) {
				nextCall();
			}
			while (this.queue.isEmpty() && (this.currPage < this.nPages)) {
				nextCall();
			}
		} catch (final CollectorException e) {
			throw new IllegalStateException(e);
		}
	}

	@Override
	public boolean hasNext() {
		synchronized (this.queue) {
			fillQueue();
			return !this.queue.isEmpty();
		}
	}

	/**
	 * @return the next buffered record
	 * @throws java.util.NoSuchElementException if no more records are available
	 * @throws IllegalStateException            if a call to the remote API fails
	 */
	@Override
	public String next() {
		synchronized (this.queue) {
			fillQueue();
			final String res = this.queue.poll();
			if (res == null) {
				// Iterator contract: signal exhaustion instead of returning null
				throw new java.util.NoSuchElementException("No more records available from " + this.baseUrl);
			}
			return res;
		}
	}

	/**
	 * Requests the next page and appends its records to the buffer.
	 *
	 * @throws CollectorException if the call or the parsing of the response fails
	 */
	private void nextCall() throws CollectorException {

		this.currPage += 1;

		final String url;
		if (!this.baseUrl.contains("?")) {
			url = String.format("%s?PageNumber=%d&PageSize=%d", this.baseUrl, this.currPage, PAGE_SIZE);
		} else if (!this.baseUrl.contains("PageSize=")) {
			url = String.format("%s&PageNumber=%d&PageSize=%d", this.baseUrl, this.currPage, PAGE_SIZE);
		} else {
			// the base url already declares its own page size
			url = String.format("%s&PageNumber=%d", this.baseUrl, this.currPage);
		}
		log.info("Calling url: " + url);

		try (final CloseableHttpClient client = HttpClients.createDefault()) {

			final HttpGet req = new HttpGet(url);
			req.addHeader("Authorization", "Bearer " + this.authToken);
			try (final CloseableHttpResponse response = client.execute(req)) {
				for (final Header header : response.getAllHeaders()) {
					log.debug("HEADER: " + header.getName() + " = " + header.getValue());
					// HTTP header names are case-insensitive: accept e.g. "X-Page-Count" as well
					if ("x-page-count".equalsIgnoreCase(header.getName())) {
						final int totalPages = NumberUtils.toInt(header.getValue());
						if (this.nPages != totalPages) {
							this.nPages = totalPages;
							log.info("Total pages: " + totalPages);
						}
					}
				}

				// decode explicitly as UTF-8 rather than with the platform default charset
				final String content = IOUtils
					.toString(response.getEntity().getContent(), java.nio.charset.StandardCharsets.UTF_8);
				final JSONArray jsonArray = new JSONArray(content);

				jsonArray.forEach(obj -> this.queue.add(JsonUtils.convertToXML(obj.toString())));
			}
		} catch (final Exception e) {
			// Exception (not Throwable): JVM errors such as OutOfMemoryError must not be masked
			log.warn("Error calling url: " + url, e);
			throw new CollectorException("Error calling url: " + url, e);
		}
	}
}

View File

@ -28,7 +28,13 @@
"paramLongName": "dataciteInputPath", "paramLongName": "dataciteInputPath",
"paramDescription": "the path to get the input data from Datacite", "paramDescription": "the path to get the input data from Datacite",
"paramRequired": true "paramRequired": true
}, },{
"paramName": "wip",
"paramLongName": "webCrawlInputPath",
"paramDescription": "the path to get the input data from Web Crawl",
"paramRequired": true
}
,
{ {
"paramName": "o", "paramName": "o",
"paramLongName": "outputPath", "paramLongName": "outputPath",

View File

@ -17,6 +17,10 @@
<name>dataciteInputPath</name> <name>dataciteInputPath</name>
<description>the path where to find the inferred affiliation relations from Datacite</description> <description>the path where to find the inferred affiliation relations from Datacite</description>
</property> </property>
<property>
<name>webCrawlInputPath</name>
<description>the path where to find the inferred affiliation relations from webCrawl</description>
</property>
<property> <property>
<name>outputPath</name> <name>outputPath</name>
<description>the path where to store the actionset</description> <description>the path where to store the actionset</description>
@ -112,7 +116,7 @@
<arg>--pubmedInputPath</arg><arg>${pubmedInputPath}</arg> <arg>--pubmedInputPath</arg><arg>${pubmedInputPath}</arg>
<arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg> <arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
<arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg> <arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
<arg>--webCrawlInputPath</arg><arg>${webCrawlInputPath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg> <arg>--outputPath</arg><arg>${outputPath}</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>

View File

@ -1,10 +1,5 @@
[ [
{
"id": "100007630",
"uri": "http://dx.doi.org/10.13039/100007630",
"name": "College of Engineering and Informatics, National University of Ireland, Galway",
"synonym": []
},
{ {
"id": "100007731", "id": "100007731",
"uri": "http://dx.doi.org/10.13039/100007731", "uri": "http://dx.doi.org/10.13039/100007731",
@ -432,13 +427,13 @@
"id": "501100001634", "id": "501100001634",
"uri": "http://dx.doi.org/10.13039/501100001634", "uri": "http://dx.doi.org/10.13039/501100001634",
"name": "University of Galway", "name": "University of Galway",
"synonym": [] "synonym": ["501100019905", "100007630", "501100020570", "501100023852"]
}, },
{ {
"id": "501100001635", "id": "501100001635",
"uri": "http://dx.doi.org/10.13039/501100001635", "uri": "http://dx.doi.org/10.13039/501100001635",
"name": "University of Limerick", "name": "University of Limerick",
"synonym": [] "synonym": ["501100014531"]
}, },
{ {
"id": "501100001636", "id": "501100001636",
@ -468,7 +463,7 @@
"id": "501100002736", "id": "501100002736",
"uri": "http://dx.doi.org/10.13039/501100002736", "uri": "http://dx.doi.org/10.13039/501100002736",
"name": "Covidien", "name": "Covidien",
"synonym": [] "synonym": ["501100003956"]
}, },
{ {
"id": "501100002755", "id": "501100002755",
@ -518,12 +513,6 @@
"name": "Irish Institute of Clinical Neuroscience", "name": "Irish Institute of Clinical Neuroscience",
"synonym": [] "synonym": []
}, },
{
"id": "501100003956",
"uri": "http://dx.doi.org/10.13039/501100003956",
"name": "Aspect Medical Systems",
"synonym": []
},
{ {
"id": "501100004162", "id": "501100004162",
"uri": "http://dx.doi.org/10.13039/501100004162", "uri": "http://dx.doi.org/10.13039/501100004162",
@ -644,12 +633,7 @@
"name": "Irish Centre for High-End Computing", "name": "Irish Centre for High-End Computing",
"synonym": [] "synonym": []
}, },
{
"id": "501100019905",
"uri": "http://dx.doi.org/10.13039/501100019905",
"name": "Galway University Foundation",
"synonym": []
},
{ {
"id": "501100020036", "id": "501100020036",
"uri": "http://dx.doi.org/10.13039/501100020036", "uri": "http://dx.doi.org/10.13039/501100020036",
@ -824,12 +808,7 @@
"name": "Energy Policy Research Centre, Economic and Social Research Institute", "name": "Energy Policy Research Centre, Economic and Social Research Institute",
"synonym": [] "synonym": []
}, },
{
"id": "501100014531",
"uri": "http://dx.doi.org/10.13039/501100014531",
"name": "Physical Education and Sport Sciences Department, University of Limerick",
"synonym": []
},
{ {
"id": "501100014745", "id": "501100014745",
"uri": "http://dx.doi.org/10.13039/501100014745", "uri": "http://dx.doi.org/10.13039/501100014745",
@ -842,22 +821,11 @@
"name": "ADAPT - Centre for Digital Content Technology", "name": "ADAPT - Centre for Digital Content Technology",
"synonym": [] "synonym": []
}, },
{
"id": "501100020570",
"uri": "http://dx.doi.org/10.13039/501100020570",
"name": "College of Medicine, Nursing and Health Sciences, National University of Ireland, Galway",
"synonym": []
},
{ {
"id": "501100020871", "id": "501100020871",
"uri": "http://dx.doi.org/10.13039/501100020871", "uri": "http://dx.doi.org/10.13039/501100020871",
"name": "Bernal Institute, University of Limerick", "name": "Bernal Institute, University of Limerick",
"synonym": [] "synonym": []
},
{
"id": "501100023852",
"uri": "http://dx.doi.org/10.13039/501100023852",
"name": "Moore Institute for Research in the Humanities and Social Studies, University of Galway",
"synonym": []
} }
] ]

View File

@ -48,12 +48,37 @@
<description>Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb.</description> <description>Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb.</description>
</property> </property>
<property>
<name>JAVA_HOME</name>
<value>/srv/java/openjdk-17</value>
<description>Used to configure the Java home location for oozie.launcher.mapreduce.map.env</description>
</property>
<property>
<name>JAVA_OPTS</name>
<value>-Dcom.sun.security.enableAIAcaIssuers=true</value>
<description>Used to configure the JAVA_OPTS parameter</description>
</property>
</parameters> </parameters>
<global> <global>
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.launcher.mapreduce.map.env</name>
<value>JAVA_HOME=${JAVA_HOME}</value>
</property>
</configuration>
</global> </global>
<start to="collection_mode"/> <start to="collection_mode"/>
@ -99,7 +124,7 @@
<action name="CollectionWorker"> <action name="CollectionWorker">
<java> <java>
<main-class>eu.dnetlib.dhp.collection.CollectorWorkerApplication</main-class> <main-class>eu.dnetlib.dhp.collection.CollectorWorkerApplication</main-class>
<java-opts>${collection_java_xmx}</java-opts> <java-opts>${JAVA_OPTS} ${collection_java_xmx}</java-opts>
<arg>--apidescriptor</arg><arg>${apiDescription}</arg> <arg>--apidescriptor</arg><arg>${apiDescription}</arg>
<arg>--namenode</arg><arg>${nameNode}</arg> <arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--workflowId</arg><arg>${workflowId}</arg> <arg>--workflowId</arg><arg>${workflowId}</arg>

View File

@ -93,7 +93,7 @@ case object Crossref2Oaf {
val cf = new KeyValue val cf = new KeyValue
cf.setValue("UnpayWall") cf.setValue("UnpayWall")
cf.setKey(s"10|openaire____:${DHPUtils.md5("UnpayWall".toLowerCase)}") cf.setKey(s"10|openaire____::${DHPUtils.md5("UnpayWall".toLowerCase)}")
cf cf
} }

View File

@ -88,6 +88,7 @@ public class PrepareAffiliationRelationsTest {
"-pubmedInputPath", crossrefAffiliationRelationPath, "-pubmedInputPath", crossrefAffiliationRelationPath,
"-openapcInputPath", crossrefAffiliationRelationPath, "-openapcInputPath", crossrefAffiliationRelationPath,
"-dataciteInputPath", crossrefAffiliationRelationPath, "-dataciteInputPath", crossrefAffiliationRelationPath,
"-webCrawlInputPath", crossrefAffiliationRelationPath,
"-outputPath", outputPath "-outputPath", outputPath
}); });
@ -104,7 +105,7 @@ public class PrepareAffiliationRelationsTest {
// ); // );
// } // }
// count the number of relations // count the number of relations
assertEquals(80, tmp.count()); assertEquals(120, tmp.count());
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class)); Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
dataset.createOrReplaceTempView("result"); dataset.createOrReplaceTempView("result");
@ -115,7 +116,7 @@ public class PrepareAffiliationRelationsTest {
// verify that we have equal number of bi-directional relations // verify that we have equal number of bi-directional relations
Assertions Assertions
.assertEquals( .assertEquals(
40, execVerification 60, execVerification
.filter( .filter(
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'") "relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
.collectAsList() .collectAsList()
@ -123,7 +124,7 @@ public class PrepareAffiliationRelationsTest {
Assertions Assertions
.assertEquals( .assertEquals(
40, execVerification 60, execVerification
.filter( .filter(
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'") "relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
.collectAsList() .collectAsList()

View File

@ -0,0 +1,58 @@
package eu.dnetlib.dhp.collection.plugin.researchfi;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
/**
 * Manual integration test for {@link ResearchFiCollectorPlugin}: it is disabled by default
 * because it calls the remote API and needs real client credentials to be filled in.
 */
public class ResearchFiCollectorPluginTest {

	private final ResearchFiCollectorPlugin plugin = new ResearchFiCollectorPlugin();

	/**
	 * Collects all the records, prints the first one, reports every duplicated
	 * funderProjectNumber and finally prints the number of records and of distinct identifiers.
	 */
	@Disabled
	@Test
	void testCollect() throws CollectorException {
		final ApiDescriptor descriptor = new ApiDescriptor();
		descriptor.setProtocol("research_fi");
		descriptor
			.setBaseUrl("https://research.fi/api/rest/v1/funding-decisions?FunderName=AKA&FundingStartYearFrom=2022");
		descriptor
			.getParams()
			.put("auth_url", "https://researchfi-auth.2.rahtiapp.fi/realms/publicapi/protocol/openid-connect/token");
		// credentials are intentionally left empty: fill them in before enabling the test
		descriptor.getParams().put("auth_client_id", "");
		descriptor.getParams().put("auth_client_secret", "");

		final Set<String> seenIds = new HashSet<>();
		final AtomicLong total = new AtomicLong(0);

		this.plugin.collect(descriptor, new AggregatorReport()).forEach(xml -> {
			final boolean isFirst = total.getAndIncrement() == 0;
			if (isFirst) {
				System.out.println("First: " + xml);
			}

			final String projectNumber;
			try {
				projectNumber = DocumentHelper.parseText(xml).valueOf("/recordWrap/funderProjectNumber");
			} catch (final DocumentException e) {
				throw new RuntimeException(e);
			}

			// Set#add returns false when the identifier was already collected
			if (!seenIds.add(projectNumber)) {
				System.out.println("Id already present: " + projectNumber);
			}
		});

		System.out.println("Total records: " + total.get());
		System.out.println("Total identifiers: " + seenIds.size());
	}
}

View File

@ -5,3 +5,5 @@
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]} {"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]} {"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]} {"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
{"DOI": "10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}

View File

@ -26,15 +26,15 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software; import eu.dnetlib.dhp.schema.oaf.Software;
public class PrepareSimpleEntititiesJob { public class PrepareSimpleEntitiesJob {
private static final Logger log = LoggerFactory.getLogger(PrepareSimpleEntititiesJob.class); private static final Logger log = LoggerFactory.getLogger(PrepareSimpleEntitiesJob.class);
public static void main(final String[] args) throws Exception { public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser( final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils IOUtils
.toString( .toString(
PrepareSimpleEntititiesJob.class PrepareSimpleEntitiesJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json"))); .getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
parser.parseArgument(args); parser.parseArgument(args);

View File

@ -160,8 +160,7 @@ public class ConversionUtils {
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.filter(pid -> pid.getQualifier() != null) .filter(pid -> pid.getQualifier() != null)
.filter(pid -> pid.getQualifier().getClassid() != null) .filter(pid -> StringUtils.startsWithIgnoreCase(pid.getQualifier().getClassid(), ModelConstants.ORCID))
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase(ModelConstants.ORCID))
.map(StructuredProperty::getValue) .map(StructuredProperty::getValue)
.map(ConversionUtils::cleanOrcid) .map(ConversionUtils::cleanOrcid)
.filter(StringUtils::isNotBlank) .filter(StringUtils::isNotBlank)

View File

@ -7,7 +7,7 @@
</property> </property>
<property> <property>
<name>outputDir</name> <name>outputDir</name>
<description>the path where the the generated data will be stored</description> <description>the path where the generated data will be stored</description>
</property> </property>
<property> <property>
<name>datasourceIdWhitelist</name> <name>datasourceIdWhitelist</name>
@ -179,17 +179,18 @@
<master>yarn</master> <master>yarn</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>PrepareSimpleEntititiesJob</name> <name>PrepareSimpleEntititiesJob</name>
<class>eu.dnetlib.dhp.broker.oa.PrepareSimpleEntititiesJob</class> <class>eu.dnetlib.dhp.broker.oa.PrepareSimpleEntitiesJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar> <jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=5000
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -209,11 +210,12 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=8000
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -234,11 +236,12 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=8000
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -258,11 +261,12 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=5000
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -282,11 +286,12 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=10000
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -306,11 +311,12 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=2000
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -332,11 +338,12 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=8000
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -356,11 +363,12 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=8000
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -380,11 +388,12 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=8000
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -404,11 +413,12 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=8000
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -428,11 +438,12 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=8000
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -452,11 +463,12 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=8000
</spark-opts> </spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg> <arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
@ -476,11 +488,12 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=8000
</spark-opts> </spark-opts>
<arg>--workingDir</arg><arg>${workingDir}</arg> <arg>--workingDir</arg><arg>${workingDir}</arg>
<arg>--outputDir</arg><arg>${outputDir}</arg> <arg>--outputDir</arg><arg>${outputDir}</arg>
@ -503,6 +516,7 @@
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing} --conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@ -535,6 +549,7 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -562,6 +577,7 @@
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -585,6 +601,7 @@
<spark-opts> <spark-opts>
--executor-memory=${sparkExecutorMemory} --executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing} --conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
--conf spark.extraListeners=${spark2ExtraListeners} --conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}

View File

@ -0,0 +1,66 @@
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.List;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import eu.dnetlib.broker.objects.OaBrokerAuthor;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
/**
 * Tests for {@link EnrichMissingAuthorOrcid#findDifferences}: each case builds a source and a target
 * entity and checks how many authors the matcher reports.
 */
class EnrichMissingAuthorOrcidTest {

	final EnrichMissingAuthorOrcid matcher = new EnrichMissingAuthorOrcid();

	@BeforeEach
	void setUp() throws Exception {
	}

	/** Neither entity has creators: nothing is reported. */
	@Test
	void testFindDifferences_1() {
		final List<OaBrokerAuthor> differences = differencesBetween(
			new OaBrokerMainEntity(), new OaBrokerMainEntity());

		assertTrue(differences.isEmpty());
	}

	/** Source creator has an ORCID, target creator has none: exactly one author is reported. */
	@Test
	void testFindDifferences_2() {
		final List<OaBrokerAuthor> differences = differencesBetween(
			entityWithCreator("Claudio Atzori", "0000-0001-9613-6639"),
			entityWithCreator("Claudio Atzori", null));

		assertEquals(1, differences.size());
	}

	/** Source creator has no ORCID, target creator has one: nothing is reported. */
	@Test
	void testFindDifferences_3() {
		final List<OaBrokerAuthor> differences = differencesBetween(
			entityWithCreator("Claudio Atzori", null),
			entityWithCreator("Claudio Atzori", "0000-0001-9613-6639"));

		assertTrue(differences.isEmpty());
	}

	/** Both creators carry the same ORCID: nothing is reported. */
	@Test
	void testFindDifferences_4() {
		final List<OaBrokerAuthor> differences = differencesBetween(
			entityWithCreator("Claudio Atzori", "0000-0001-9613-6639"),
			entityWithCreator("Claudio Atzori", "0000-0001-9613-6639"));

		assertTrue(differences.isEmpty());
	}

	/** Runs the matcher under test on the given pair of entities. */
	private List<OaBrokerAuthor> differencesBetween(final OaBrokerMainEntity source,
		final OaBrokerMainEntity target) {
		return this.matcher.findDifferences(source, target);
	}

	/** Creates an entity whose creators list holds a single author with the given name and ORCID. */
	private static OaBrokerMainEntity entityWithCreator(final String fullname, final String orcid) {
		final OaBrokerMainEntity entity = new OaBrokerMainEntity();
		entity.getCreators().add(new OaBrokerAuthor(fullname, orcid));
		return entity;
	}
}

View File

@ -2,27 +2,32 @@
package eu.dnetlib.dhp.broker.oa.util; package eu.dnetlib.dhp.broker.oa.util;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays;
import java.util.List; import java.util.List;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerTypedValue; import eu.dnetlib.broker.objects.OaBrokerTypedValue;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Instance; import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
class ConversionUtilsTest { public class ConversionUtilsTest {
@BeforeEach @BeforeEach
void setUp() throws Exception { public void setUp() throws Exception {
} }
@Test @Test
void testAllResultPids() { public void testAllResultPids() {
final Qualifier qf = new Qualifier(); final Qualifier qf = new Qualifier();
qf.setClassid("test"); qf.setClassid("test");
qf.setClassname("test"); qf.setClassname("test");
@ -91,4 +96,42 @@ class ConversionUtilsTest {
assertEquals(6, list.size()); assertEquals(6, list.size());
} }
/**
 * Converts a {@link Result} with three authors and checks the ORCIDs of the resulting creators:
 * a bare ORCID is expected unchanged, an ORCID given as an {@code http://orcid.org/} URL is
 * expected without the URL prefix, and an author without pid is expected to have a null ORCID.
 */
// The @Test annotation was missing, so JUnit never discovered or executed this method.
@Test
public void testOafResultToBrokerResult() {
	final Author a1 = createAuthor("Michele Artini", "0000-0002-4406-428X");
	final Author a2 = createAuthor("Claudio Atzori", "http://orcid.org/0000-0001-9613-6639");
	final Author a3 = createAuthor("Alessia Bardi", null);

	final Result r = new Result();
	r.setAuthor(Arrays.asList(a1, a2, a3));

	final OaBrokerMainEntity br = ConversionUtils.oafResultToBrokerResult(r);

	assertEquals(3, br.getCreators().size());
	assertEquals("0000-0002-4406-428X", br.getCreators().get(0).getOrcid());
	assertEquals("0000-0001-9613-6639", br.getCreators().get(1).getOrcid());
	assertNull(br.getCreators().get(2).getOrcid());
}
/**
 * Builds an {@link Author} fixture.
 *
 * @param name the author's full name
 * @param orcid the ORCID value to attach as the author's only pid, or {@code null} to leave the pid unset
 * @return the new author
 */
private Author createAuthor(final String name, final String orcid) {
	final Author a = new Author();
	// Use the supplied name: previously every author was named "Michele Artini" regardless of the argument.
	a.setFullname(name);

	if (orcid != null) {
		final Qualifier q = new Qualifier();
		q.setClassid(ModelConstants.ORCID);
		q.setClassname(ModelConstants.ORCID);
		q.setSchemeid("dnet:pids");
		q.setSchemename("dnet:pids");

		final StructuredProperty pid = new StructuredProperty();
		pid.setQualifier(q);
		pid.setValue(orcid);

		a.setPid(Arrays.asList(pid));
	}

	return a;
}
} }

View File

@ -203,8 +203,8 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
WindowSpec w = Window WindowSpec w = Window
.partitionBy("groupId") .partitionBy("groupId")
.orderBy( .orderBy(
col("lastUsage").desc_nulls_last(),
col("pidType").asc_nulls_last(), col("pidType").asc_nulls_last(),
col("lastUsage").desc_nulls_last(),
col("collectedfrom").desc_nulls_last(), col("collectedfrom").desc_nulls_last(),
col("date").asc_nulls_last(), col("date").asc_nulls_last(),
col("id").asc_nulls_last()); col("id").asc_nulls_last());

View File

@ -15,4 +15,12 @@
<name>oozie.action.sharelib.for.spark</name> <name>oozie.action.sharelib.for.spark</name>
<value>spark2</value> <value>spark2</value>
</property> </property>
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>pivotHistoryDatabase</name>
<value>&#x200B;</value>
</property>
</configuration> </configuration>

View File

@ -198,6 +198,8 @@
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg> <arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
<arg>--actionSetId</arg><arg>${actionSetId}</arg> <arg>--actionSetId</arg><arg>${actionSetId}</arg>
<arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg> <arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--pivotHistoryDatabase</arg><arg>${pivotHistoryDatabase}</arg>
</spark> </spark>
<ok to="PrepareOrgRels"/> <ok to="PrepareOrgRels"/>
<error to="Kill"/> <error to="Kill"/>

File diff suppressed because one or more lines are too long

View File

@ -190,7 +190,7 @@ public class SparkDedupTest implements Serializable {
System.out.println("orp_simrel = " + orp_simrel); System.out.println("orp_simrel = " + orp_simrel);
if (CHECK_CARDINALITIES) { if (CHECK_CARDINALITIES) {
assertEquals(751, orgs_simrel); assertEquals(742, orgs_simrel);
assertEquals(566, pubs_simrel); assertEquals(566, pubs_simrel);
assertEquals(113, sw_simrel); assertEquals(113, sw_simrel);
assertEquals(148, ds_simrel); assertEquals(148, ds_simrel);
@ -251,7 +251,7 @@ public class SparkDedupTest implements Serializable {
// entities simrels supposed to be equal to the number of previous step (no rels in whitelist) // entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
if (CHECK_CARDINALITIES) { if (CHECK_CARDINALITIES) {
assertEquals(751, orgs_simrel); assertEquals(742, orgs_simrel);
assertEquals(566, pubs_simrel); assertEquals(566, pubs_simrel);
assertEquals(148, ds_simrel); assertEquals(148, ds_simrel);
assertEquals(280, orp_simrel); assertEquals(280, orp_simrel);
@ -442,7 +442,7 @@ public class SparkDedupTest implements Serializable {
final List<Relation> merges = pubs final List<Relation> merges = pubs
.filter("source == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'") .filter("source == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'")
.collectAsList(); .collectAsList();
assertEquals(3, merges.size()); assertEquals(1, merges.size());
Set<String> dups = Sets Set<String> dups = Sets
.newHashSet( .newHashSet(
"50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73", "50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73",
@ -451,7 +451,7 @@ public class SparkDedupTest implements Serializable {
merges.forEach(r -> { merges.forEach(r -> {
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType()); assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
assertEquals(ModelConstants.DEDUP, r.getSubRelType()); assertEquals(ModelConstants.DEDUP, r.getSubRelType());
assertEquals(ModelConstants.MERGES, r.getRelClass()); assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
assertTrue(dups.contains(r.getTarget())); assertTrue(dups.contains(r.getTarget()));
}); });
@ -561,7 +561,7 @@ public class SparkDedupTest implements Serializable {
System.out.println("orp_mergerel = " + orp_mergerel); System.out.println("orp_mergerel = " + orp_mergerel);
if (CHECK_CARDINALITIES) { if (CHECK_CARDINALITIES) {
assertEquals(1268, orgs_mergerel); assertEquals(1278, orgs_mergerel);
assertEquals(1156, pubs.count()); assertEquals(1156, pubs.count());
assertEquals(292, sw_mergerel); assertEquals(292, sw_mergerel);
assertEquals(476, ds_mergerel); assertEquals(476, ds_mergerel);
@ -618,7 +618,7 @@ public class SparkDedupTest implements Serializable {
System.out.println("orp_deduprecord = " + orp_deduprecord); System.out.println("orp_deduprecord = " + orp_deduprecord);
if (CHECK_CARDINALITIES) { if (CHECK_CARDINALITIES) {
assertEquals(86, orgs_deduprecord); assertEquals(78, orgs_deduprecord);
assertEquals(96, pubs.count()); assertEquals(96, pubs.count());
assertEquals(47, sw_deduprecord); assertEquals(47, sw_deduprecord);
assertEquals(97, ds_deduprecord); assertEquals(97, ds_deduprecord);
@ -761,7 +761,7 @@ public class SparkDedupTest implements Serializable {
if (CHECK_CARDINALITIES) { if (CHECK_CARDINALITIES) {
assertEquals(930, publications); assertEquals(930, publications);
assertEquals(839, organizations); assertEquals(831, organizations);
assertEquals(100, projects); assertEquals(100, projects);
assertEquals(100, datasource); assertEquals(100, datasource);
assertEquals(196, softwares); assertEquals(196, softwares);

View File

@ -22,8 +22,11 @@ import java.util.Properties;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*; import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith; import org.junit.jupiter.api.extension.ExtendWith;
@ -143,7 +146,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization")) .load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
.count(); .count();
assertEquals(86, orgs_simrel); assertEquals(92, orgs_simrel);
} }
@Test @Test
@ -172,7 +175,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization")) .load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
.count(); .count();
assertEquals(122, orgs_simrel); assertEquals(128, orgs_simrel);
} }
@Test @Test
@ -207,7 +210,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
.read() .read()
.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel") .load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
.count(); .count();
assertEquals(132, orgs_mergerel); assertEquals(128, orgs_mergerel);
// verify that a DiffRel is in the mergerels (to be sure that the job supposed to remove them has something to // verify that a DiffRel is in the mergerels (to be sure that the job supposed to remove them has something to
// do) // do)

View File

@ -9,6 +9,7 @@ import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.junit.platform.commons.util.StringUtils; import org.junit.platform.commons.util.StringUtils;
import eu.dnetlib.dhp.oa.dedup.SparkOpenorgsDedupTest;
import eu.dnetlib.pace.config.DedupConfig; import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.SparkModel; import eu.dnetlib.pace.model.SparkModel;
@ -24,6 +25,31 @@ class JsonPathTest {
Row row = SparkModel.apply(conf).rowFromJson(org); Row row = SparkModel.apply(conf).rowFromJson(org);
System.out.println("row = " + row);
Assertions.assertNotNull(row);
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
System.out.println("row = " + row.getAs("countrytitle"));
}
@Test
void jsonToModelTest() throws IOException {
DedupConfig conf = DedupConfig
.load(
IOUtils
.toString(
SparkOpenorgsDedupTest.class
.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
Row row = SparkModel.apply(conf).rowFromJson(org);
// to check that the same parsing returns the same row
Row row1 = SparkModel.apply(conf).rowFromJson(org);
Assertions.assertEquals(row, row1);
System.out.println("row = " + row);
Assertions.assertNotNull(row); Assertions.assertNotNull(row);
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier"))); Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
} }

View File

@ -4,8 +4,8 @@
"dedupRun" : "001", "dedupRun" : "001",
"entityType" : "organization", "entityType" : "organization",
"subEntityValue": "organization", "subEntityValue": "organization",
"orderField" : "legalname", "orderField" : "original_legalname",
"queueMaxSize" : "2000", "queueMaxSize" : "100000",
"groupMaxSize" : "50", "groupMaxSize" : "50",
"slidingWindowSize" : "200", "slidingWindowSize" : "200",
"idPath":"$.id", "idPath":"$.id",
@ -15,10 +15,10 @@
}, },
"pace" : { "pace" : {
"clustering" : [ "clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} }, { "name" : "sortedngrampairs", "fields" : [ "original_legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } }, { "name" : "suffixprefix", "fields" : [ "original_legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } }, { "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} } { "name" : "legalnameclustering", "fields" : [ "legalname" ], "params" : { "max": 2} }
], ],
"decisionTree" : { "decisionTree" : {
"start": { "start": {
@ -29,16 +29,23 @@
"weight": 1, "weight": 1,
"countIfUndefined": "false", "countIfUndefined": "false",
"params": {} "params": {}
},
{
"field": "rorid",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
} }
], ],
"threshold": 1, "threshold": 1,
"aggregation": "AVG", "aggregation": "OR",
"positive": "MATCH", "positive": "MATCH",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "layer2", "undefined": "necessaryConditions",
"ignoreUndefined": "false" "ignoreUndefined": "false"
}, },
"layer2": { "necessaryConditions": {
"fields": [ "fields": [
{ {
"field": "websiteurl", "field": "websiteurl",
@ -49,20 +56,20 @@
}, },
{ {
"field": "country", "field": "country",
"comparator": "exactMatch", "comparator": "countryMatch",
"weight": 1, "weight": 1,
"countIfUndefined": "true", "countIfUndefined": "true",
"params": {} "params": {}
}, },
{ {
"field": "legalname", "field": "original_legalname",
"comparator": "numbersMatch", "comparator": "numbersMatch",
"weight": 1, "weight": 1,
"countIfUndefined": "true", "countIfUndefined": "true",
"params": {} "params": {}
}, },
{ {
"field": "legalname", "field": "original_legalname",
"comparator": "romansMatch", "comparator": "romansMatch",
"weight": 1, "weight": 1,
"countIfUndefined": "true", "countIfUndefined": "true",
@ -71,68 +78,64 @@
], ],
"threshold": 1, "threshold": 1,
"aggregation": "AND", "aggregation": "AND",
"positive": "layer3", "positive": "cityCheck",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "layer3", "undefined": "cityCheck",
"ignoreUndefined": "true" "ignoreUndefined": "true"
}, },
"layer3": { "cityCheck": {
"fields": [ "fields": [
{ {
"field": "legalname", "field": "legalname",
"comparator": "cityMatch", "comparator": "codeMatch",
"weight": 1.0, "weight": 1.0,
"countIfUndefined": "true", "countIfUndefined": "true",
"params": { "params": {
"windowSize": "4" "codeRegex": "city::\\d+"
} }
} }
], ],
"threshold": 0.1, "threshold": 0.1,
"aggregation": "AVG", "aggregation": "AVG",
"positive": "layer4", "positive": "keywordCheck",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "NO_MATCH", "undefined": "NO_MATCH",
"ignoreUndefined": "true" "ignoreUndefined": "true"
}, },
"layer4": { "keywordCheck": {
"fields": [ "fields": [
{ {
"field": "legalname", "field": "legalname",
"comparator": "keywordMatch", "comparator": "codeMatch",
"weight": 1.0, "weight": 1.0,
"countIfUndefined": "true", "countIfUndefined": "true",
"params": { "params": {
"windowSize": "4" "codeRegex": "key::\\d+"
} }
} }
], ],
"threshold": 0.7, "threshold": 0.7,
"aggregation": "AVG", "aggregation": "AVG",
"positive": "layer5", "positive": "nameCheck",
"negative": "NO_MATCH", "negative": "NO_MATCH",
"undefined": "layer5", "undefined": "nameCheck",
"ignoreUndefined": "true" "ignoreUndefined": "true"
}, },
"layer5": { "nameCheck": {
"fields": [ "fields": [
{ {
"field": "legalname", "field": "legalname",
"comparator": "jaroWinklerNormalizedName", "comparator": "jaroWinklerLegalname",
"weight": 0.9, "weight": 0.9,
"countIfUndefined": "true", "countIfUndefined": "true",
"params": { "params": {}
"windowSize": "4"
}
}, },
{ {
"field": "legalshortname", "field": "legalshortname",
"comparator": "jaroWinklerNormalizedName", "comparator": "jaroWinklerLegalname",
"weight": 0.1, "weight": 0.1,
"countIfUndefined": "false", "countIfUndefined": "false",
"params": { "params": {}
"windowSize": 4
}
} }
], ],
"threshold": 0.9, "threshold": 0.9,
@ -144,126 +147,16 @@
} }
}, },
"model" : [ "model" : [
{ "name" : "country", "type" : "String", "path" : "$.country.classid"}, { "name" : "country", "type" : "String", "path" : "$.country.classid", "infer" : "country", "inferenceFrom" : "$.legalname.value"},
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"}, { "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value", "infer" : "city_keyword"},
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value" }, { "name" : "original_legalname", "type" : "String", "path" : "$.legalname.value" },
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value", "infer" : "city_keyword"},
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" }, { "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"}, { "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"},
{ "name" : "rorid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='ROR')].value"},
{ "name" : "originalId", "type" : "String", "path" : "$.id" } { "name" : "originalId", "type" : "String", "path" : "$.id" }
], ],
"blacklists" : { "blacklists" : {},
"legalname" : [] "synonyms": {}
},
"synonyms": {
"key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
"key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],
"key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],
"key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"],
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
"key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
"key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
"key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
"key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],
"key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],
"key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],
"key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],
"key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],
"key::19": ["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"],
"key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],
"key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],
"key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],
"key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],
"key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],
"key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],
"key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],
"key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],
"key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],
"key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],
"key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],
"key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],
"key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"],
"key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],
"key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],
"key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],
"key::36": ["authority","autorità","autorité","авторитет","autoriteit"],
"key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],
"key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],
"key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"],
"key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],
"key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],
"key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],
"key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],
"key::44": ["academic","accademico","académique","universitaire","акадеческий academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],
"key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],
"key::46": ["division","divisione","division","отделение","divisie","τμήμα"],
"key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"],
"key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
"key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
"key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
"key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""],
"key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""],
"key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""],
"key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""],
"key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""],
"key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""],
"key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""],
"key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""],
"key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""],
"key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""],
"key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""],
"key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""],
"key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""],
"key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""],
"key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""],
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""],
"key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""],
"key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""],
"key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""],
"key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""],
"key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""],
"key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""],
"key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""],
"key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""],
"key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline",""],
"key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""],
"key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""],
"key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""],
"key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""],
"key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""],
"key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""],
"key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""],
"key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""],
"key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""],
"key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""],
"key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""],
"key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""],
"key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""],
"key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""],
"key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""],
"key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""],
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
"key::95": ["mechanics", "mechanical", "meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
"key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""],
"key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""],
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""],
"key::102": ["informatics","informatica","informática","informática","informatica",""],
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"],
"key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"],
"key::106" : ["seminary", "seminario", "seminaire", "seminar"],
"key::107" : ["agricultural forestry", "af", "a f"],
"key::108" : ["agricultural mechanical", "am", "a m"],
"key::109" : ["catholic", "catholique", "katholische", "catolica", "cattolica", "catolico"]
}
} }
} }

View File

@ -33,10 +33,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.bulktag.community.*; import eu.dnetlib.dhp.bulktag.community.*;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Context; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import scala.Tuple2; import scala.Tuple2;
@ -114,27 +111,35 @@ public class SparkBulkTagJob {
extendCommunityConfigurationForEOSC(spark, inputPath, cc); extendCommunityConfigurationForEOSC(spark, inputPath, cc);
execBulkTag( execBulkTag(
spark, inputPath, outputPath, protoMap, cc); spark, inputPath, outputPath, protoMap, cc);
execEntityTag(
spark, inputPath + "organization", outputPath + "organization",
Utils.getCommunityOrganization(baseURL), Organization.class, TaggingConstants.CLASS_ID_ORGANIZATION,
TaggingConstants.CLASS_NAME_BULKTAG_ORGANIZATION);
execEntityTag(
spark, inputPath + "project", outputPath + "project", Utils.getCommunityProjects(baseURL),
Project.class, TaggingConstants.CLASS_ID_PROJECT, TaggingConstants.CLASS_NAME_BULKTAG_PROJECT);
execDatasourceTag(spark, inputPath, outputPath, Utils.getDatasourceCommunities(baseURL)); execDatasourceTag(spark, inputPath, outputPath, Utils.getDatasourceCommunities(baseURL));
execProjectTag(spark, inputPath, outputPath, Utils.getCommunityProjects(baseURL));
}); });
} }
private static void execProjectTag(SparkSession spark, String inputPath, String outputPath, private static <E extends OafEntity> void execEntityTag(SparkSession spark, String inputPath, String outputPath,
CommunityEntityMap communityProjects) { CommunityEntityMap communityEntity, Class<E> entityClass,
Dataset<Project> projects = readPath(spark, inputPath + "project", Project.class); String classID, String calssName) {
Dataset<E> entity = readPath(spark, inputPath, entityClass);
Dataset<EntityCommunities> pc = spark Dataset<EntityCommunities> pc = spark
.createDataset( .createDataset(
communityProjects communityEntity
.keySet() .keySet()
.stream() .stream()
.map(k -> EntityCommunities.newInstance(k, communityProjects.get(k))) .map(k -> EntityCommunities.newInstance(k, communityEntity.get(k)))
.collect(Collectors.toList()), .collect(Collectors.toList()),
Encoders.bean(EntityCommunities.class)); Encoders.bean(EntityCommunities.class));
projects entity
.joinWith(pc, projects.col("id").equalTo(pc.col("entityId")), "left") .joinWith(pc, entity.col("id").equalTo(pc.col("entityId")), "left")
.map((MapFunction<Tuple2<Project, EntityCommunities>, Project>) t2 -> { .map((MapFunction<Tuple2<E, EntityCommunities>, E>) t2 -> {
Project ds = t2._1(); E ds = t2._1();
if (t2._2() != null) { if (t2._2() != null) {
List<String> context = Optional List<String> context = Optional
.ofNullable(ds.getContext()) .ofNullable(ds.getContext())
@ -156,8 +161,8 @@ public class SparkBulkTagJob {
false, TaggingConstants.BULKTAG_DATA_INFO_TYPE, true, false, false, TaggingConstants.BULKTAG_DATA_INFO_TYPE, true, false,
OafMapperUtils OafMapperUtils
.qualifier( .qualifier(
TaggingConstants.CLASS_ID_DATASOURCE, classID,
TaggingConstants.CLASS_NAME_BULKTAG_DATASOURCE, calssName,
ModelConstants.DNET_PROVENANCE_ACTIONS, ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS), ModelConstants.DNET_PROVENANCE_ACTIONS),
"1"))); "1")));
@ -166,17 +171,17 @@ public class SparkBulkTagJob {
}); });
} }
return ds; return ds;
}, Encoders.bean(Project.class)) }, Encoders.bean(entityClass))
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(outputPath + "project"); .json(outputPath);
readPath(spark, outputPath + "project", Project.class) readPath(spark, outputPath, entityClass)
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)
.option("compression", "gzip") .option("compression", "gzip")
.json(inputPath + "project"); .json(inputPath);
} }
private static void execDatasourceTag(SparkSession spark, String inputPath, String outputPath, private static void execDatasourceTag(SparkSession spark, String inputPath, String outputPath,

View File

@ -13,6 +13,9 @@ public class TaggingConstants {
public static final String CLASS_ID_CZENODO = "community:zenodocommunity"; public static final String CLASS_ID_CZENODO = "community:zenodocommunity";
public static final String CLASS_ID_ADVANCED_CONSTRAINT = "community:advconstraint"; public static final String CLASS_ID_ADVANCED_CONSTRAINT = "community:advconstraint";
public static final String CLASS_ID_PROJECT = "community:project";
public static final String CLASS_ID_ORGANIZATION = "community:organization";
public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/"; public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/";
public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject"; public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject";
@ -20,5 +23,8 @@ public class TaggingConstants {
public static final String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo"; public static final String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo";
public static final String CLASS_NAME_BULKTAG_ADVANCED_CONSTRAINT = "Bulktagging for Community - Advanced Constraints"; public static final String CLASS_NAME_BULKTAG_ADVANCED_CONSTRAINT = "Bulktagging for Community - Advanced Constraints";
public static final String CLASS_NAME_BULKTAG_PROJECT = "Bulktagging for Community - Project";
public static final String CLASS_NAME_BULKTAG_ORGANIZATION = "Bulktagging for Community - Organization";
public static final String TAGGING_TRUST = "0.8"; public static final String TAGGING_TRUST = "0.8";
} }

View File

@ -465,6 +465,138 @@ public class BulkTagJobTest {
} }
@Test
void organizationTag() throws Exception {
	// Sample graph used as input for the bulk tagging job.
	final String sourcePath = getClass()
		.getResource("/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource/")
		.getPath();

	// Copy the pathMap resources into the working directory, where the job is told to look for them.
	final LocalFileSystem localFs = FileSystem.getLocal(new Configuration());
	final org.apache.hadoop.fs.Path pathMapResources = new org.apache.hadoop.fs.Path(
		getClass().getResource("/eu/dnetlib/dhp/bulktag/pathMap/").getPath());
	final org.apache.hadoop.fs.Path pathMapTarget = new org.apache.hadoop.fs.Path(
		workingDir.toString() + "/data/bulktagging/protoMap");
	localFs.copyFromLocalFile(false, pathMapResources, pathMapTarget);

	final String[] jobArgs = new String[] {
		"-isSparkSessionManaged", Boolean.FALSE.toString(),
		"-sourcePath", sourcePath,
		"-taggingConf", taggingConf,
		"-outputPath", workingDir.toString() + "/",
		"-baseURL", "https://services.openaire.eu/openaire/community/",
		"-pathMap", workingDir.toString() + "/data/bulktagging/protoMap/pathMap",
		"-nameNode", "local"
	};
	SparkBulkTagJob.main(jobArgs);

	// Read back the organizations written by the job.
	final JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
	final JavaRDD<Organization> organizations = jsc
		.textFile(workingDir.toString() + "/organization")
		.map(line -> OBJECT_MAPPER.readValue(line, Organization.class));

	Assertions.assertEquals(4, organizations.count());

	final org.apache.spark.sql.Dataset<Organization> organizationDs = spark
		.createDataset(organizations.rdd(), Encoders.bean(Organization.class));
	organizationDs.createOrReplaceTempView("organization");

	// One row per (organization, context, datainfo) produced by bulktagging.
	final String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
		+ "from organization "
		+ "lateral view explode(context) c as MyT "
		+ "lateral view explode(MyT.datainfo) d as MyD "
		+ "where MyD.inferenceprovenance = 'bulktagging'";

	final org.apache.spark.sql.Dataset<Row> tagged = spark.sql(query);
	tagged.show(false);

	Assertions.assertEquals(3, tagged.count());

	// Every tag must carry the organization-specific provenance class id and name.
	final long withProvenanceId = tagged.filter("provenance = 'community:organization'").count();
	Assertions.assertEquals(3, withProvenanceId);

	final long withProvenanceName = tagged
		.filter("name = 'Bulktagging for Community - Organization'")
		.count();
	Assertions.assertEquals(3, withProvenanceName);

	// Expected distribution of the tags across communities.
	Assertions.assertEquals(1, tagged.filter("community = 'netherlands'").count());
	Assertions.assertEquals(1, tagged.filter("community = 'beopen'").count());
	Assertions.assertEquals(1, tagged.filter("community = 'mes'").count());
}
@Test
void projectTag() throws Exception {
	// Sample graph used as input for the bulk tagging job.
	final String sourcePath = getClass()
		.getResource("/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource/")
		.getPath();

	// Copy the pathMap resources into the working directory, where the job is told to look for them.
	final LocalFileSystem localFs = FileSystem.getLocal(new Configuration());
	final org.apache.hadoop.fs.Path pathMapResources = new org.apache.hadoop.fs.Path(
		getClass().getResource("/eu/dnetlib/dhp/bulktag/pathMap/").getPath());
	final org.apache.hadoop.fs.Path pathMapTarget = new org.apache.hadoop.fs.Path(
		workingDir.toString() + "/data/bulktagging/protoMap");
	localFs.copyFromLocalFile(false, pathMapResources, pathMapTarget);

	final String[] jobArgs = new String[] {
		"-isSparkSessionManaged", Boolean.FALSE.toString(),
		"-sourcePath", sourcePath,
		"-taggingConf", taggingConf,
		"-outputPath", workingDir.toString() + "/",
		"-baseURL", "https://services.openaire.eu/openaire/community/",
		"-pathMap", workingDir.toString() + "/data/bulktagging/protoMap/pathMap",
		"-nameNode", "local"
	};
	SparkBulkTagJob.main(jobArgs);

	// Read back the projects written by the job.
	final JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
	final JavaRDD<Project> projects = jsc
		.textFile(workingDir.toString() + "/project")
		.map(line -> OBJECT_MAPPER.readValue(line, Project.class));

	Assertions.assertEquals(4, projects.count());

	final org.apache.spark.sql.Dataset<Project> projectDs = spark
		.createDataset(projects.rdd(), Encoders.bean(Project.class));
	projectDs.createOrReplaceTempView("project");

	// One row per (project, context, datainfo) produced by bulktagging.
	final String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
		+ "from project "
		+ "lateral view explode(context) c as MyT "
		+ "lateral view explode(MyT.datainfo) d as MyD "
		+ "where MyD.inferenceprovenance = 'bulktagging'";

	final org.apache.spark.sql.Dataset<Row> tagged = spark.sql(query);
	tagged.show(false);

	Assertions.assertEquals(4, tagged.count());

	// Every tag must carry the project-specific provenance class id and name.
	final long withProvenanceId = tagged.filter("provenance = 'community:project'").count();
	Assertions.assertEquals(4, withProvenanceId);

	final long withProvenanceName = tagged
		.filter("name = 'Bulktagging for Community - Project'")
		.count();
	Assertions.assertEquals(4, withProvenanceName);

	// Expected distribution of the tags across communities.
	Assertions.assertEquals(1, tagged.filter("community = 'enermaps'").count());
	Assertions.assertEquals(1, tagged.filter("community = 'clarin'").count());
	Assertions.assertEquals(2, tagged.filter("community = 'dh-ch'").count());
}
@Test @Test
void bulktagByZenodoCommunityTest() throws Exception { void bulktagByZenodoCommunityTest() throws Exception {
final String sourcePath = getClass() final String sourcePath = getClass()

View File

@ -14,4 +14,7 @@ public class ProvisionConstants {
return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION; return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
} }
public static final String PUBLIC_ALIAS_NAME = "public";
public static final String SHADOW_ALIAS_NAME = "shadow";
} }

View File

@ -9,6 +9,7 @@ import org.apache.commons.io.IOUtils;
import org.apache.solr.client.solrj.SolrResponse; import org.apache.solr.client.solrj.SolrResponse;
import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrClient; import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.response.UpdateResponse; import org.apache.solr.client.solrj.response.UpdateResponse;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -23,7 +24,7 @@ public class SolrAdminApplication implements Closeable {
private static final Logger log = LoggerFactory.getLogger(SolrAdminApplication.class); private static final Logger log = LoggerFactory.getLogger(SolrAdminApplication.class);
enum Action { enum Action {
DELETE_BY_QUERY, COMMIT DELETE_BY_QUERY, COMMIT, UPDATE_ALIASES
} }
private final CloudSolrClient solrClient; private final CloudSolrClient solrClient;
@ -39,9 +40,6 @@ public class SolrAdminApplication implements Closeable {
final String isLookupUrl = parser.get("isLookupUrl"); final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl); log.info("isLookupUrl: {}", isLookupUrl);
final String format = parser.get("format");
log.info("format: {}", format);
final Action action = Action.valueOf(parser.get("action")); final Action action = Action.valueOf(parser.get("action"));
log.info("action: {}", action); log.info("action: {}", action);
@ -59,11 +57,21 @@ public class SolrAdminApplication implements Closeable {
final String zkHost = isLookup.getZkHost(); final String zkHost = isLookup.getZkHost();
log.info("zkHost: {}", zkHost); log.info("zkHost: {}", zkHost);
final String collection = ProvisionConstants.getCollectionName(format); final String publicFormat = parser.get("publicFormat");
log.info("collection: {}", collection); log.info("publicFormat: {}", publicFormat);
final String shadowFormat = parser.get("shadowFormat");
log.info("shadowFormat: {}", shadowFormat);
// get collection names from metadata format profiles names
final String publicCollection = ProvisionConstants.getCollectionName(publicFormat);
log.info("publicCollection: {}", publicCollection);
final String shadowCollection = ProvisionConstants.getCollectionName(shadowFormat);
log.info("shadowCollection: {}", shadowCollection);
try (SolrAdminApplication app = new SolrAdminApplication(zkHost)) { try (SolrAdminApplication app = new SolrAdminApplication(zkHost)) {
app.execute(action, collection, query, commit); app.execute(action, query, commit, publicCollection, shadowCollection);
} }
} }
@ -72,22 +80,29 @@ public class SolrAdminApplication implements Closeable {
this.solrClient = new CloudSolrClient.Builder(zk.getHosts(), zk.getChroot()).build(); this.solrClient = new CloudSolrClient.Builder(zk.getHosts(), zk.getChroot()).build();
} }
public SolrResponse commit(String collection) throws IOException, SolrServerException { public SolrResponse commit(String shadowCollection) throws IOException, SolrServerException {
return execute(Action.COMMIT, collection, null, true); return execute(Action.COMMIT, null, true, null, shadowCollection);
} }
public SolrResponse execute(Action action, String collection, String query, boolean commit) public SolrResponse execute(Action action, String query, boolean commit,
String publicCollection, String shadowCollection)
throws IOException, SolrServerException { throws IOException, SolrServerException {
switch (action) { switch (action) {
case DELETE_BY_QUERY: case DELETE_BY_QUERY:
UpdateResponse rsp = solrClient.deleteByQuery(collection, query); UpdateResponse rsp = solrClient.deleteByQuery(shadowCollection, query);
if (commit) { if (commit) {
solrClient.commit(collection); return solrClient.commit(shadowCollection);
} }
return rsp; return rsp;
case COMMIT: case COMMIT:
return solrClient.commit(collection); return solrClient.commit(shadowCollection);
case UPDATE_ALIASES:
this.updateAliases(publicCollection, shadowCollection);
return null;
default: default:
throw new IllegalArgumentException("action not managed: " + action); throw new IllegalArgumentException("action not managed: " + action);
} }
@ -98,4 +113,30 @@ public class SolrAdminApplication implements Closeable {
solrClient.close(); solrClient.close();
} }
/**
 * Points the {@link ProvisionConstants#PUBLIC_ALIAS_NAME} and {@link ProvisionConstants#SHADOW_ALIAS_NAME}
 * aliases at the given collections.
 * <p>
 * The aliases are reassigned in place instead of being deleted and recreated: Solr's CREATEALIAS replaces an
 * alias that already exists, so there is no interval in which the public alias is undefined, and a failure
 * part-way through cannot leave an alias missing.
 *
 * @param publicCollection the collection the public alias must resolve to
 * @param shadowCollection the collection the shadow alias must resolve to
 * @throws SolrServerException if Solr reports an error while processing a request
 * @throws IOException if communication with Solr fails
 */
private void updateAliases(String publicCollection, String shadowCollection)
	throws SolrServerException, IOException {

	// create-or-replace: no preliminary delete needed
	this.createAlias(ProvisionConstants.PUBLIC_ALIAS_NAME, publicCollection);
	this.createAlias(ProvisionConstants.SHADOW_ALIAS_NAME, shadowCollection);
}
/**
 * Removes the given alias through the Solr Collections API.
 *
 * @param aliasName name of the alias to delete
 * @return the response obtained by processing the delete-alias request
 * @throws SolrServerException if Solr reports an error while processing the request
 * @throws IOException if communication with Solr fails
 */
public SolrResponse deleteAlias(String aliasName) throws SolrServerException, IOException {
	final CollectionAdminRequest.DeleteAlias request = CollectionAdminRequest.deleteAlias(aliasName);

	log.info("deleting alias: {}", aliasName);

	return request.process(solrClient);
}
/**
 * Creates an alias for the given collection through the Solr Collections API.
 *
 * @param aliasName name of the alias to create
 * @param collection name of the collection the alias refers to
 * @return the response obtained by processing the create-alias request
 * @throws IOException if communication with Solr fails
 * @throws SolrServerException if Solr reports an error while processing the request
 */
public SolrResponse createAlias(String aliasName, String collection) throws IOException, SolrServerException {
	final CollectionAdminRequest.CreateAlias request = CollectionAdminRequest.createAlias(aliasName, collection);

	log.info("creating alias: {} for collection: {}", aliasName, collection);

	return request.process(solrClient);
}
} }

View File

@ -36,7 +36,7 @@ public class SolrRecordDumpJob extends AbstractSolrRecordTransformJob {
private final String inputPath; private final String inputPath;
private final String format; private final String shadowFormat;
private final String outputPath; private final String outputPath;
@ -61,8 +61,8 @@ public class SolrRecordDumpJob extends AbstractSolrRecordTransformJob {
final String inputPath = parser.get("inputPath"); final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath); log.info("inputPath: {}", inputPath);
final String format = parser.get("format"); final String shadowFormat = parser.get("shadowFormat");
log.info("format: {}", format); log.info("shadowFormat: {}", shadowFormat);
final String outputPath = Optional final String outputPath = Optional
.ofNullable(parser.get("outputPath")) .ofNullable(parser.get("outputPath"))
@ -95,27 +95,24 @@ public class SolrRecordDumpJob extends AbstractSolrRecordTransformJob {
final String isLookupUrl = parser.get("isLookupUrl"); final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl); log.info("isLookupUrl: {}", isLookupUrl);
final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl)); final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
new SolrRecordDumpJob(spark, inputPath, format, outputPath).run(isLookup); new SolrRecordDumpJob(spark, inputPath, shadowFormat, outputPath).run(isLookup);
}); });
} }
public SolrRecordDumpJob(SparkSession spark, String inputPath, String format, String outputPath) { public SolrRecordDumpJob(SparkSession spark, String inputPath, String shadowFormat, String outputPath) {
this.spark = spark; this.spark = spark;
this.inputPath = inputPath; this.inputPath = inputPath;
this.format = format; this.shadowFormat = shadowFormat;
this.outputPath = outputPath; this.outputPath = outputPath;
} }
public void run(ISLookupClient isLookup) throws ISLookUpException, TransformerException { public void run(ISLookupClient isLookup) throws ISLookUpException, TransformerException {
final String fields = isLookup.getLayoutSource(format); final String fields = isLookup.getLayoutSource(shadowFormat);
log.info("fields: {}", fields); log.info("fields: {}", fields);
final String xslt = isLookup.getLayoutTransformer(); final String xslt = isLookup.getLayoutTransformer();
final String dsId = isLookup.getDsId(format); final String indexRecordXslt = getLayoutTransformer(shadowFormat, fields, xslt);
log.info("dsId: {}", dsId);
final String indexRecordXslt = getLayoutTransformer(format, fields, xslt);
log.info("indexRecordTransformer {}", indexRecordXslt); log.info("indexRecordTransformer {}", indexRecordXslt);
final Encoder<TupleWrapper> encoder = Encoders.bean(TupleWrapper.class); final Encoder<TupleWrapper> encoder = Encoders.bean(TupleWrapper.class);

View File

@ -40,6 +40,8 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
private final String format; private final String format;
private final String shadowCollection;
private final int batchSize; private final int batchSize;
private final SparkSession spark; private final SparkSession spark;
@ -63,8 +65,11 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
final String inputPath = parser.get("inputPath"); final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath); log.info("inputPath: {}", inputPath);
final String format = parser.get("format"); final String shadowFormat = parser.get("shadowFormat");
log.info("format: {}", format); log.info("shadowFormat: {}", shadowFormat);
final String shadowCollection = ProvisionConstants.getCollectionName(shadowFormat);
log.info("shadowCollection: {}", shadowCollection);
final Integer batchSize = Optional final Integer batchSize = Optional
.ofNullable(parser.get("batchSize")) .ofNullable(parser.get("batchSize"))
@ -85,15 +90,17 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
final String isLookupUrl = parser.get("isLookupUrl"); final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl); log.info("isLookupUrl: {}", isLookupUrl);
final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl)); final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
new XmlIndexingJob(spark, inputPath, format, batchSize) new XmlIndexingJob(spark, inputPath, shadowFormat, shadowCollection, batchSize)
.run(isLookup); .run(isLookup);
}); });
} }
public XmlIndexingJob(SparkSession spark, String inputPath, String format, Integer batchSize) { public XmlIndexingJob(SparkSession spark, String inputPath, String format, String shadowCollection,
Integer batchSize) {
this.spark = spark; this.spark = spark;
this.inputPath = inputPath; this.inputPath = inputPath;
this.format = format; this.format = format;
this.shadowCollection = shadowCollection;
this.batchSize = batchSize; this.batchSize = batchSize;
} }
@ -103,12 +110,6 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
final String xslt = isLookup.getLayoutTransformer(); final String xslt = isLookup.getLayoutTransformer();
final String dsId = isLookup.getDsId(format);
log.info("dsId: {}", dsId);
final String collection = ProvisionConstants.getCollectionName(format);
log.info("collection: {}", collection);
final String zkHost = isLookup.getZkHost(); final String zkHost = isLookup.getZkHost();
log.info("zkHost: {}", zkHost); log.info("zkHost: {}", zkHost);
@ -130,7 +131,7 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
.javaRDD() .javaRDD()
.map( .map(
t -> new StreamingInputDocumentFactory().parseDocument(t.getXml(), t.getJson())); t -> new StreamingInputDocumentFactory().parseDocument(t.getXml(), t.getJson()));
DHPSolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd()); DHPSolrSupport.indexDocs(zkHost, shadowCollection, batchSize, docs.rdd());
} }
} }

View File

@ -30,11 +30,14 @@ import eu.dnetlib.dhp.schema.solr.Context;
import eu.dnetlib.dhp.schema.solr.Country; import eu.dnetlib.dhp.schema.solr.Country;
import eu.dnetlib.dhp.schema.solr.Datasource; import eu.dnetlib.dhp.schema.solr.Datasource;
import eu.dnetlib.dhp.schema.solr.EoscIfGuidelines; import eu.dnetlib.dhp.schema.solr.EoscIfGuidelines;
import eu.dnetlib.dhp.schema.solr.ExternalReference;
import eu.dnetlib.dhp.schema.solr.Instance; import eu.dnetlib.dhp.schema.solr.Instance;
import eu.dnetlib.dhp.schema.solr.Journal; import eu.dnetlib.dhp.schema.solr.Journal;
import eu.dnetlib.dhp.schema.solr.Measure;
import eu.dnetlib.dhp.schema.solr.OpenAccessColor; import eu.dnetlib.dhp.schema.solr.OpenAccessColor;
import eu.dnetlib.dhp.schema.solr.OpenAccessRoute; import eu.dnetlib.dhp.schema.solr.OpenAccessRoute;
import eu.dnetlib.dhp.schema.solr.Organization; import eu.dnetlib.dhp.schema.solr.Organization;
import eu.dnetlib.dhp.schema.solr.Pid;
import eu.dnetlib.dhp.schema.solr.Project; import eu.dnetlib.dhp.schema.solr.Project;
import eu.dnetlib.dhp.schema.solr.Result; import eu.dnetlib.dhp.schema.solr.Result;
import eu.dnetlib.dhp.schema.solr.Subject; import eu.dnetlib.dhp.schema.solr.Subject;
@ -76,6 +79,7 @@ public class ProvisionModelSupport {
r.setCollectedfrom(asProvenance(e.getCollectedfrom())); r.setCollectedfrom(asProvenance(e.getCollectedfrom()));
r.setContext(asContext(e.getContext(), contextMapper)); r.setContext(asContext(e.getContext(), contextMapper));
r.setPid(asPid(e.getPid())); r.setPid(asPid(e.getPid()));
r.setMeasures(mapMeasures(e.getMeasures()));
if (e instanceof eu.dnetlib.dhp.schema.oaf.Result) { if (e instanceof eu.dnetlib.dhp.schema.oaf.Result) {
r.setResult(mapResult((eu.dnetlib.dhp.schema.oaf.Result) e)); r.setResult(mapResult((eu.dnetlib.dhp.schema.oaf.Result) e));
@ -106,6 +110,14 @@ public class ProvisionModelSupport {
final RelatedEntity re = rew.getTarget(); final RelatedEntity re = rew.getTarget();
final RecordType relatedRecordType = RecordType.valueOf(re.getType()); final RecordType relatedRecordType = RecordType.valueOf(re.getType());
final Relation relation = rew.getRelation(); final Relation relation = rew.getRelation();
final String relationProvenance = Optional
.ofNullable(relation.getDataInfo())
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(null))
.orElse(null);
rr rr
.setHeader( .setHeader(
RelatedRecordHeader RelatedRecordHeader
@ -113,7 +125,9 @@ public class ProvisionModelSupport {
relation.getRelType(), relation.getRelType(),
relation.getRelClass(), relation.getRelClass(),
StringUtils.substringAfter(relation.getTarget(), IdentifierFactory.ID_PREFIX_SEPARATOR), StringUtils.substringAfter(relation.getTarget(), IdentifierFactory.ID_PREFIX_SEPARATOR),
relatedRecordType)); relatedRecordType,
relationProvenance,
Optional.ofNullable(relation.getDataInfo()).map(DataInfo::getTrust).orElse(null)));
rr.setAcronym(re.getAcronym()); rr.setAcronym(re.getAcronym());
rr.setCode(re.getCode()); rr.setCode(re.getCode());
@ -131,11 +145,20 @@ public class ProvisionModelSupport {
rr.setOfficialname(re.getOfficialname()); rr.setOfficialname(re.getOfficialname());
rr.setOpenairecompatibility(mapCodeLabel(re.getOpenairecompatibility())); rr.setOpenairecompatibility(mapCodeLabel(re.getOpenairecompatibility()));
rr.setPid(asPid(re.getPid())); rr.setPid(asPid(re.getPid()));
rr.setProjectTitle(rr.getProjectTitle()); rr.setWebsiteurl(re.getWebsiteurl());
rr.setProjectTitle(re.getProjectTitle());
rr.setPublisher(re.getPublisher()); rr.setPublisher(re.getPublisher());
rr.setResulttype(mapQualifier(re.getResulttype())); rr.setResulttype(mapQualifier(re.getResulttype()));
rr.setTitle(Optional.ofNullable(re.getTitle()).map(StructuredProperty::getValue).orElse(null)); rr.setTitle(Optional.ofNullable(re.getTitle()).map(StructuredProperty::getValue).orElse(null));
if (relation.getValidated() == null) {
relation.setValidated(false);
}
if (ModelConstants.OUTCOME.equals(relation.getSubRelType())
&& StringUtils.isNotBlank(relation.getValidationDate())) {
rr.setValidationDate(relation.getValidationDate());
}
return rr; return rr;
} }
@ -266,6 +289,7 @@ public class ProvisionModelSupport {
ds.setOfficialname(mapField(d.getOfficialname())); ds.setOfficialname(mapField(d.getOfficialname()));
ds.setDescription(mapField(d.getDescription())); ds.setDescription(mapField(d.getDescription()));
ds.setJournal(mapJournal(d.getJournal())); ds.setJournal(mapJournal(d.getJournal()));
ds.setWebsiteurl(mapField(d.getWebsiteurl()));
ds.setLogourl(mapField(d.getLogourl())); ds.setLogourl(mapField(d.getLogourl()));
ds.setAccessinfopackage(mapFieldList(d.getAccessinfopackage())); ds.setAccessinfopackage(mapFieldList(d.getAccessinfopackage()));
ds.setCertificates(mapField(d.getCertificates())); ds.setCertificates(mapField(d.getCertificates()));
@ -311,6 +335,7 @@ public class ProvisionModelSupport {
ds.setSubjects(asSubjectSP(d.getSubjects())); ds.setSubjects(asSubjectSP(d.getSubjects()));
ds.setSubmissionpolicyurl(d.getSubmissionpolicyurl()); ds.setSubmissionpolicyurl(d.getSubmissionpolicyurl());
ds.setThematic(d.getThematic()); ds.setThematic(d.getThematic());
ds.setContentpolicies(mapCodeLabel(d.getContentpolicies()));
ds.setVersioncontrol(d.getVersioncontrol()); ds.setVersioncontrol(d.getVersioncontrol());
ds.setVersioning(mapField(d.getVersioning())); ds.setVersioning(mapField(d.getVersioning()));
@ -326,6 +351,7 @@ public class ProvisionModelSupport {
rs.setOtherTitles(getOtherTitles(r.getTitle())); rs.setOtherTitles(getOtherTitles(r.getTitle()));
rs.setDescription(mapFieldList(r.getDescription())); rs.setDescription(mapFieldList(r.getDescription()));
rs.setSubject(asSubject(r.getSubject())); rs.setSubject(asSubject(r.getSubject()));
rs.setLanguage(asLanguage(r.getLanguage()));
rs.setPublicationdate(mapField(r.getDateofacceptance())); rs.setPublicationdate(mapField(r.getDateofacceptance()));
rs.setPublisher(mapField(r.getPublisher())); rs.setPublisher(mapField(r.getPublisher()));
rs.setEmbargoenddate(mapField(r.getEmbargoenddate())); rs.setEmbargoenddate(mapField(r.getEmbargoenddate()));
@ -341,17 +367,17 @@ public class ProvisionModelSupport {
rs.setCountry(asCountry(r.getCountry())); rs.setCountry(asCountry(r.getCountry()));
rs.setEoscifguidelines(asEOSCIF(r.getEoscifguidelines())); rs.setEoscifguidelines(asEOSCIF(r.getEoscifguidelines()));
rs.setGreen(r.getIsGreen()); rs.setIsGreen(r.getIsGreen());
rs rs
.setOpenAccessColor( .setOpenAccessColor(
Optional Optional
.ofNullable(r.getOpenAccessColor()) .ofNullable(r.getOpenAccessColor())
.map(color -> OpenAccessColor.valueOf(color.toString())) .map(color -> OpenAccessColor.valueOf(color.toString()))
.orElse(null)); .orElse(null));
rs.setInDiamondJournal(r.getIsInDiamondJournal()); rs.setIsInDiamondJournal(r.getIsInDiamondJournal());
rs.setPubliclyFunded(r.getPubliclyFunded()); rs.setPubliclyFunded(r.getPubliclyFunded());
rs.setTransformativeAgreement(r.getTransformativeAgreement()); rs.setTransformativeAgreement(r.getTransformativeAgreement());
rs.setExternalReference(mapExternalReference(r.getExternalReference()));
rs.setInstance(mapInstances(r.getInstance())); rs.setInstance(mapInstances(r.getInstance()));
if (r instanceof Publication) { if (r instanceof Publication) {
@ -375,6 +401,13 @@ public class ProvisionModelSupport {
return rs; return rs;
} }
/**
 * Converts a language qualifier into a {@link Language}, built from the qualifier's class id and class name.
 *
 * @param lang the qualifier to convert, may be {@code null}
 * @return the converted language, or {@code null} when {@code lang} is {@code null}
 */
private static Language asLanguage(Qualifier lang) {
	if (lang == null) {
		return null;
	}
	return Language.newInstance(lang.getClassid(), lang.getClassname());
}
@Nullable @Nullable
private static List<String> getOtherTitles(List<StructuredProperty> titleList) { private static List<String> getOtherTitles(List<StructuredProperty> titleList) {
return Optional return Optional
@ -422,7 +455,7 @@ public class ProvisionModelSupport {
Instance i = new Instance(); Instance i = new Instance();
i.setCollectedfrom(asProvenance(instance.getCollectedfrom())); i.setCollectedfrom(asProvenance(instance.getCollectedfrom()));
i.setHostedby(asProvenance(instance.getHostedby())); i.setHostedby(asProvenance(instance.getHostedby()));
i.setFulltext(i.getFulltext()); i.setFulltext(instance.getFulltext());
i.setPid(asPid(instance.getPid())); i.setPid(asPid(instance.getPid()));
i.setAlternateIdentifier(asPid(instance.getAlternateIdentifier())); i.setAlternateIdentifier(asPid(instance.getAlternateIdentifier()));
i.setAccessright(mapAccessRight(instance.getAccessright())); i.setAccessright(mapAccessRight(instance.getAccessright()));
@ -453,7 +486,8 @@ public class ProvisionModelSupport {
private static AccessRight mapAccessRight(eu.dnetlib.dhp.schema.oaf.AccessRight accessright) { private static AccessRight mapAccessRight(eu.dnetlib.dhp.schema.oaf.AccessRight accessright) {
return AccessRight return AccessRight
.newInstance( .newInstance(
mapQualifier(accessright), accessright.getClassid(),
accessright.getClassname(),
Optional Optional
.ofNullable(accessright.getOpenAccessRoute()) .ofNullable(accessright.getOpenAccessRoute())
.map(route -> OpenAccessRoute.valueOf(route.toString())) .map(route -> OpenAccessRoute.valueOf(route.toString()))
@ -508,7 +542,46 @@ public class ProvisionModelSupport {
} }
private static Provenance asProvenance(KeyValue keyValue) { private static Provenance asProvenance(KeyValue keyValue) {
return Optional.ofNullable(keyValue).map(cf -> Provenance.newInstance(cf.getKey(), cf.getValue())).orElse(null); return Optional
.ofNullable(keyValue)
.map(
kv -> Provenance
.newInstance(
StringUtils.substringAfter(kv.getKey(), IdentifierFactory.ID_PREFIX_SEPARATOR),
kv.getValue()))
.orElse(null);
}
/**
 * Converts a list of OAF measures: each one is rebuilt from its id and its units, the latter converted via
 * {@code mapCodeLabelKV}.
 *
 * @param measures the measures to convert, may be {@code null}
 * @return the converted measures in the same order, or {@code null} when {@code measures} is {@code null}
 */
private static List<Measure> mapMeasures(List<eu.dnetlib.dhp.schema.oaf.Measure> measures) {
	if (measures == null) {
		return null;
	}
	return measures
		.stream()
		.map(measure -> Measure.newInstance(measure.getId(), mapCodeLabelKV(measure.getUnit())))
		.collect(Collectors.toList());
}
private static List<ExternalReference> mapExternalReference(
List<eu.dnetlib.dhp.schema.oaf.ExternalReference> externalReference) {
return Optional
.ofNullable(externalReference)
.map(
ext -> ext
.stream()
.map(
e -> ExternalReference
.newInstance(
e.getSitename(),
e.getLabel(),
e.getAlternateLabel(),
e.getUrl(),
mapCodeLabel(e.getQualifier()),
e.getRefidentifier(),
e.getQuery()))
.collect(Collectors.toList()))
.orElse(Lists.newArrayList());
} }
private static List<Context> asContext(List<eu.dnetlib.dhp.schema.oaf.Context> ctxList, private static List<Context> asContext(List<eu.dnetlib.dhp.schema.oaf.Context> ctxList,
@ -529,7 +602,7 @@ public class ProvisionModelSupport {
} }
return Optional return Optional
.ofNullable(contexts) .of(contexts)
.map( .map(
ctx -> ctx ctx -> ctx
.stream() .stream()
@ -581,7 +654,14 @@ public class ProvisionModelSupport {
.map( .map(
pids -> pids pids -> pids
.stream() .stream()
.map(p -> Pid.newInstance(p.getQualifier().getClassname(), p.getValue())) .filter(p -> Objects.nonNull(p.getQualifier()))
.filter(p -> Objects.nonNull(p.getQualifier().getClassid()))
.map(
p -> Pid
.newInstance(
p.getValue(),
p.getQualifier().getClassid(),
p.getQualifier().getClassname()))
.collect(Collectors.toList())) .collect(Collectors.toList()))
.orElse(null); .orElse(null);
} }
@ -607,7 +687,9 @@ public class ProvisionModelSupport {
.stream() .stream()
.filter(s -> Objects.nonNull(s.getQualifier())) .filter(s -> Objects.nonNull(s.getQualifier()))
.filter(s -> Objects.nonNull(s.getQualifier().getClassname())) .filter(s -> Objects.nonNull(s.getQualifier().getClassname()))
.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassname())) .map(
s -> Subject
.newInstance(s.getValue(), s.getQualifier().getClassid(), s.getQualifier().getClassname()))
.collect(Collectors.toList())) .collect(Collectors.toList()))
.orElse(null); .orElse(null);
} }
@ -620,7 +702,9 @@ public class ProvisionModelSupport {
.stream() .stream()
.filter(s -> Objects.nonNull(s.getQualifier())) .filter(s -> Objects.nonNull(s.getQualifier()))
.filter(s -> Objects.nonNull(s.getQualifier().getClassname())) .filter(s -> Objects.nonNull(s.getQualifier().getClassname()))
.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassname())) .map(
s -> Subject
.newInstance(s.getValue(), s.getQualifier().getClassid(), s.getQualifier().getClassname()))
.collect(Collectors.toList())) .collect(Collectors.toList()))
.orElse(null); .orElse(null);
} }
@ -689,7 +773,7 @@ public class ProvisionModelSupport {
private static CodeLabel mapCodeLabel(KeyValue kv) { private static CodeLabel mapCodeLabel(KeyValue kv) {
return Optional return Optional
.ofNullable(kv) .ofNullable(kv)
.map(q -> CodeLabel.newInstance(kv.getKey(), kv.getValue())) .map(k -> CodeLabel.newInstance(k.getKey(), k.getValue()))
.orElse(null); .orElse(null);
} }

View File

@ -219,6 +219,13 @@ public class XmlRecordFactory implements Serializable {
if (entity.getMeasures() != null) { if (entity.getMeasures() != null) {
metadata.addAll(measuresAsXml(entity.getMeasures())); metadata.addAll(measuresAsXml(entity.getMeasures()));
} }
if (entity.getContext() != null) {
contexts.addAll(entity.getContext().stream().map(Context::getId).collect(Collectors.toList()));
/* FIXME: Workaround for CLARIN mining issue: #3670#note-29 */
if (contexts.contains("dh-ch::subcommunity::2")) {
contexts.add("clarin");
}
}
if (ModelSupport.isResult(type)) { if (ModelSupport.isResult(type)) {
final Result r = (Result) entity; final Result r = (Result) entity;
@ -245,14 +252,6 @@ public class XmlRecordFactory implements Serializable {
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
if (r.getContext() != null) {
contexts.addAll(r.getContext().stream().map(c -> c.getId()).collect(Collectors.toList()));
/* FIXME: Workaround for CLARIN mining issue: #3670#note-29 */
if (contexts.contains("dh-ch::subcommunity::2")) {
contexts.add("clarin");
}
}
if (r.getTitle() != null) { if (r.getTitle() != null) {
metadata metadata
.addAll( .addAll(
@ -1315,7 +1314,7 @@ public class XmlRecordFactory implements Serializable {
instance instance
.getCollectedfrom() .getCollectedfrom()
.stream() .stream()
.filter(cf -> kvNotBlank(cf)) .filter(XmlRecordFactory::kvNotBlank)
.map(cf -> XmlSerializationUtils.mapKeyValue("collectedfrom", cf)) .map(cf -> XmlSerializationUtils.mapKeyValue("collectedfrom", cf))
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
@ -1326,7 +1325,7 @@ public class XmlRecordFactory implements Serializable {
instance instance
.getHostedby() .getHostedby()
.stream() .stream()
.filter(hb -> kvNotBlank(hb)) .filter(XmlRecordFactory::kvNotBlank)
.map(hb -> XmlSerializationUtils.mapKeyValue("hostedby", hb)) .map(hb -> XmlSerializationUtils.mapKeyValue("hostedby", hb))
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
@ -1336,7 +1335,7 @@ public class XmlRecordFactory implements Serializable {
instance instance
.getDateofacceptance() .getDateofacceptance()
.stream() .stream()
.filter(d -> isNotBlank(d)) .filter(StringUtils::isNotBlank)
.map(d -> XmlSerializationUtils.asXmlElement("dateofacceptance", d)) .map(d -> XmlSerializationUtils.asXmlElement("dateofacceptance", d))
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
@ -1346,7 +1345,7 @@ public class XmlRecordFactory implements Serializable {
instance instance
.getInstancetype() .getInstancetype()
.stream() .stream()
.filter(t -> !StringUtils.isNotBlank(t.getClassid())) .filter(t -> StringUtils.isNotBlank(t.getClassid()))
.map(t -> XmlSerializationUtils.mapQualifier("instancetype", t)) .map(t -> XmlSerializationUtils.mapQualifier("instancetype", t))
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
@ -1356,7 +1355,7 @@ public class XmlRecordFactory implements Serializable {
instance instance
.getDistributionlocation() .getDistributionlocation()
.stream() .stream()
.filter(d -> isNotBlank(d)) .filter(StringUtils::isNotBlank)
.map(d -> XmlSerializationUtils.asXmlElement("distributionlocation", d)) .map(d -> XmlSerializationUtils.asXmlElement("distributionlocation", d))
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
@ -1409,7 +1408,7 @@ public class XmlRecordFactory implements Serializable {
instance instance
.getLicense() .getLicense()
.stream() .stream()
.filter(d -> isNotBlank(d)) .filter(StringUtils::isNotBlank)
.map(d -> XmlSerializationUtils.asXmlElement("license", d)) .map(d -> XmlSerializationUtils.asXmlElement("license", d))
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
@ -1540,11 +1539,16 @@ public class XmlRecordFactory implements Serializable {
.min(new RefereedComparator()) .min(new RefereedComparator())
.orElse(XmlInstance.UNKNOWN_REVIEW_LEVEL)); .orElse(XmlInstance.UNKNOWN_REVIEW_LEVEL));
Map<String, Qualifier> instanceTypes = Maps.newHashMap();
instances.forEach(p -> { instances.forEach(p -> {
final Instance i = p.getRight(); final Instance i = p.getRight();
instance.getCollectedfrom().add(i.getCollectedfrom()); instance.getCollectedfrom().add(i.getCollectedfrom());
instance.getHostedby().add(i.getHostedby()); instance.getHostedby().add(i.getHostedby());
instance.getInstancetype().add(i.getInstancetype());
if (Optional.ofNullable(i.getInstancetype()).map(Qualifier::getClassid).isPresent()) {
instanceTypes.putIfAbsent(i.getInstancetype().getClassid(), i.getInstancetype());
}
instance instance
.setProcessingchargeamount( .setProcessingchargeamount(
Optional.ofNullable(i.getProcessingchargeamount()).map(apc -> apc.getValue()).orElse(null)); Optional.ofNullable(i.getProcessingchargeamount()).map(apc -> apc.getValue()).orElse(null));
@ -1571,6 +1575,8 @@ public class XmlRecordFactory implements Serializable {
.ifPresent(instance::setFulltext); .ifPresent(instance::setFulltext);
}); });
instance.getInstancetype().addAll(instanceTypes.values());
if (instance.getHostedby().size() > 1 if (instance.getHostedby().size() > 1
&& instance.getHostedby().stream().anyMatch(hb -> ModelConstants.UNKNOWN_REPOSITORY.equals(hb))) { && instance.getHostedby().stream().anyMatch(hb -> ModelConstants.UNKNOWN_REPOSITORY.equals(hb))) {
instance.getHostedby().remove(ModelConstants.UNKNOWN_REPOSITORY); instance.getHostedby().remove(ModelConstants.UNKNOWN_REPOSITORY);
@ -1596,9 +1602,7 @@ public class XmlRecordFactory implements Serializable {
private List<String> buildContexts(final String type, final Set<String> contexts) { private List<String> buildContexts(final String type, final Set<String> contexts) {
final List<String> res = Lists.newArrayList(); final List<String> res = Lists.newArrayList();
if (contextMapper != null if (contextMapper != null && !contextMapper.isEmpty()) {
&& !contextMapper.isEmpty()
&& MainEntityType.result.toString().equals(type)) {
XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot"); XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot");

View File

@ -13,7 +13,7 @@
}, },
{ {
"paramName": "f", "paramName": "f",
"paramLongName": "format", "paramLongName": "shadowFormat",
"paramDescription": "MDFormat name found in the IS profile", "paramDescription": "MDFormat name found in the IS profile",
"paramRequired": true "paramRequired": true
}, },

View File

@ -13,8 +13,8 @@
}, },
{ {
"paramName": "f", "paramName": "f",
"paramLongName": "format", "paramLongName": "shadowFormat",
"paramDescription": "MDFormat name found in the IS profile", "paramDescription": "MDFormat name found in the IS profile bound to the shadow index collection to feed",
"paramRequired": true "paramRequired": true
}, },
{ {

View File

@ -5,12 +5,6 @@
"paramDescription": "the URL to the ISLookUp Service", "paramDescription": "the URL to the ISLookUp Service",
"paramRequired": true "paramRequired": true
}, },
{
"paramName": "f",
"paramLongName": "format",
"paramDescription": "metadata format profile name",
"paramRequired": true
},
{ {
"paramName": "a", "paramName": "a",
"paramLongName": "action", "paramLongName": "action",
@ -28,5 +22,18 @@
"paramLongName": "commit", "paramLongName": "commit",
"paramDescription": "should the action be followed by a commit?", "paramDescription": "should the action be followed by a commit?",
"paramRequired": false "paramRequired": false
},
{
"paramName": "pf",
"paramLongName": "publicFormat",
"paramDescription": "the name of the public metadata format profile - used to create an alias",
"paramRequired": false
},
{
"paramName": "sf",
"paramLongName": "shadowFormat",
"paramDescription": "the name of the shadow metadata format profile - used to create an alias",
"paramRequired": false
} }
] ]

View File

@ -35,7 +35,7 @@
<description>maximum number of relations allowed for each entity, grouped by target</description> <description>maximum number of relations allowed for each entity, grouped by target</description>
</property> </property>
<property> <property>
<name>format</name> <name>shadowFormat</name>
<description>metadata format name (DMF|TMF)</description> <description>metadata format name (DMF|TMF)</description>
</property> </property>
<property> <property>
@ -133,6 +133,7 @@
<case to="create_payloads">${wf:conf('resumeFrom') eq 'create_payloads'}</case> <case to="create_payloads">${wf:conf('resumeFrom') eq 'create_payloads'}</case>
<case to="drop_solr_collection">${wf:conf('resumeFrom') eq 'drop_solr_collection'}</case> <case to="drop_solr_collection">${wf:conf('resumeFrom') eq 'drop_solr_collection'}</case>
<case to="to_solr_index">${wf:conf('resumeFrom') eq 'to_solr_index'}</case> <case to="to_solr_index">${wf:conf('resumeFrom') eq 'to_solr_index'}</case>
<case to="update_solr_aliases">${wf:conf('resumeFrom') eq 'update_solr_aliases'}</case>
<default to="prepare_relations"/> <default to="prepare_relations"/>
</switch> </switch>
</decision> </decision>
@ -641,8 +642,8 @@
</configuration> </configuration>
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class> <main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg> <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--format</arg><arg>${format}</arg>
<arg>--action</arg><arg>DELETE_BY_QUERY</arg> <arg>--action</arg><arg>DELETE_BY_QUERY</arg>
<arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
<arg>--query</arg><arg>${solrDeletionQuery}</arg> <arg>--query</arg><arg>${solrDeletionQuery}</arg>
<arg>--commit</arg><arg>true</arg> <arg>--commit</arg><arg>true</arg>
</java> </java>
@ -672,7 +673,7 @@
</spark-opts> </spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/xml_json</arg> <arg>--inputPath</arg><arg>${workingDir}/xml_json</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg> <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--format</arg><arg>${format}</arg> <arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
<arg>--batchSize</arg><arg>${batchSize}</arg> <arg>--batchSize</arg><arg>${batchSize}</arg>
</spark> </spark>
<ok to="commit_solr_collection"/> <ok to="commit_solr_collection"/>
@ -689,7 +690,7 @@
</configuration> </configuration>
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class> <main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg> <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--format</arg><arg>${format}</arg> <arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
<arg>--action</arg><arg>COMMIT</arg> <arg>--action</arg><arg>COMMIT</arg>
</java> </java>
<ok to="End"/> <ok to="End"/>
@ -714,12 +715,31 @@
</spark-opts> </spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/xml_json</arg> <arg>--inputPath</arg><arg>${workingDir}/xml_json</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg> <arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--format</arg><arg>${format}</arg> <arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
<arg>--outputPath</arg><arg>${workingDir}/solr_documents</arg> <arg>--outputPath</arg><arg>${workingDir}/solr_documents</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<!-- Action that updates the solr core aliases - out of order execution: among the transitions visible here it is reached only through the decision case on the 'resumeFrom' configuration parameter (resumeFrom = update_solr_aliases) -->
<action name="update_solr_aliases">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--action</arg><arg>UPDATE_ALIASES</arg>
<!-- names of the public and shadow metadata format profiles, passed to SolrAdminApplication for the alias update -->
<!-- NOTE(review): 'publicFormat' is not among the workflow properties visible in this view - confirm it is declared or supplied by the caller -->
<arg>--publicFormat</arg><arg>${publicFormat}</arg>
<arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -5,6 +5,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.io.IOException; import java.io.IOException;
import java.io.StringReader;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.util.Arrays; import java.util.Arrays;
@ -16,6 +17,9 @@ import javax.xml.transform.TransformerException;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.solr.client.solrj.util.ClientUtils; import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@ -34,7 +38,6 @@ import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
/** /**
* This test can be used to produce a record that can be manually fed to Solr in XML format. * This test can be used to produce a record that can be manually fed to Solr in XML format.
*
* The input is a JoinedEntity, i.e. a json representation of an OpenAIRE entity that embeds all the linked entities. * The input is a JoinedEntity, i.e. a json representation of an OpenAIRE entity that embeds all the linked entities.
*/ */
public class IndexRecordTransformerTest { public class IndexRecordTransformerTest {
@ -54,7 +57,7 @@ public class IndexRecordTransformerTest {
} }
@Test @Test
public void testPublicationRecordTransformation() throws IOException, TransformerException { public void testPublicationRecordTransformation() throws IOException, TransformerException, DocumentException {
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false, final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
PayloadConverterJob.schemaLocation); PayloadConverterJob.schemaLocation);
@ -71,11 +74,15 @@ public class IndexRecordTransformerTest {
new RelatedEntityWrapper(rel, new RelatedEntityWrapper(rel,
CreateRelatedEntitiesJob_phase1.asRelatedEntity(pj, Project.class)))); CreateRelatedEntitiesJob_phase1.asRelatedEntity(pj, Project.class))));
final String record = xmlRecordFactory.build(je); final String xmlRecord = xmlRecordFactory.build(je);
assertNotNull(record); assertNotNull(xmlRecord);
testRecordTransformation(record); Document doc = new SAXReader().read(new StringReader(xmlRecord));
assertEquals("Article", doc.valueOf("//children/instance/instancetype/@classname"));
testRecordTransformation(xmlRecord);
} }
@Test @Test

View File

@ -4,16 +4,20 @@ package eu.dnetlib.dhp.oa.provision;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.assertTrue;
import org.apache.solr.client.solrj.request.SolrPing;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
import org.apache.solr.client.solrj.response.SolrPingResponse; import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse; import org.apache.solr.client.solrj.response.UpdateResponse;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
class SolrAdminApplicationTest extends SolrTest { class SolrAdminApplicationTest extends SolrTest {
@Test @Test
void testPing() throws Exception { void testPing() throws Exception {
SolrPingResponse pingResponse = miniCluster.getSolrClient().ping(); final SolrPing ping = new SolrPing();
ping.getParams().set("collection", ProvisionConstants.SHADOW_ALIAS_NAME);
SolrPingResponse pingResponse = ping.process(miniCluster.getSolrClient());
log.info("pingResponse: '{}'", pingResponse.getStatus()); log.info("pingResponse: '{}'", pingResponse.getStatus());
assertEquals(0, pingResponse.getStatus()); assertEquals(0, pingResponse.getStatus());
} }
@ -24,7 +28,7 @@ class SolrAdminApplicationTest extends SolrTest {
SolrAdminApplication admin = new SolrAdminApplication(miniCluster.getSolrClient().getZkHost()); SolrAdminApplication admin = new SolrAdminApplication(miniCluster.getSolrClient().getZkHost());
UpdateResponse rsp = (UpdateResponse) admin UpdateResponse rsp = (UpdateResponse) admin
.execute(SolrAdminApplication.Action.DELETE_BY_QUERY, DEFAULT_COLLECTION, "*:*", false); .execute(SolrAdminApplication.Action.DELETE_BY_QUERY, "*:*", false, null, SHADOW_COLLECTION);
assertEquals(0, rsp.getStatus()); assertEquals(0, rsp.getStatus());
} }
@ -34,9 +38,30 @@ class SolrAdminApplicationTest extends SolrTest {
SolrAdminApplication admin = new SolrAdminApplication(miniCluster.getSolrClient().getZkHost()); SolrAdminApplication admin = new SolrAdminApplication(miniCluster.getSolrClient().getZkHost());
UpdateResponse rsp = (UpdateResponse) admin.commit(DEFAULT_COLLECTION); UpdateResponse rsp = (UpdateResponse) admin.commit(SHADOW_COLLECTION);
assertEquals(0, rsp.getStatus()); assertEquals(0, rsp.getStatus());
} }
/**
 * Creates the public alias on top of the shadow collection through the admin application
 * and expects the returned response to report status 0.
 */
@Test
void testAdminApplication_CREATE_ALIAS() throws Exception {
	// the admin application is built from the ZooKeeper host exposed by the mini cluster's client
	final String zkHost = miniCluster.getSolrClient().getZkHost();
	final SolrAdminApplication solrAdmin = new SolrAdminApplication(zkHost);

	final CollectionAdminResponse aliasResponse = (CollectionAdminResponse) solrAdmin
		.createAlias(ProvisionConstants.PUBLIC_ALIAS_NAME, SHADOW_COLLECTION);

	assertEquals(0, aliasResponse.getStatus());
}
/**
 * Deletes the public alias through the admin application and expects the returned
 * response to report status 0.
 */
@Test
void testAdminApplication_DELETE_ALIAS() throws Exception {
	// the admin application is built from the ZooKeeper host exposed by the mini cluster's client
	final String zkHost = miniCluster.getSolrClient().getZkHost();
	final SolrAdminApplication solrAdmin = new SolrAdminApplication(zkHost);

	final CollectionAdminResponse aliasResponse = (CollectionAdminResponse) solrAdmin
		.deleteAlias(ProvisionConstants.PUBLIC_ALIAS_NAME);

	assertEquals(0, aliasResponse.getStatus());
}
} }

View File

@ -1,21 +1,40 @@
package eu.dnetlib.dhp.oa.provision; package eu.dnetlib.dhp.oa.provision;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.net.URI; import java.net.URI;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.embedded.JettyConfig;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.ConfigSetAdminRequest;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.cloud.MiniSolrCloudCluster;
import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.CoreAdminParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*; import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith; import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.Mock; import org.mockito.Mock;
import org.mockito.Mockito; import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension; import org.mockito.junit.jupiter.MockitoExtension;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument; import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient; import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
@ -23,7 +42,18 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ExtendWith(MockitoExtension.class) @ExtendWith(MockitoExtension.class)
public class SolrConfigExploreTest extends SolrExploreTest { public class SolrConfigExploreTest {
protected static final Logger log = LoggerFactory.getLogger(SolrConfigExploreTest.class);
protected static final String SHADOW_FORMAT = "c1";
protected static final String SHADOW_COLLECTION = SHADOW_FORMAT + "-index-openaire";
protected static final String PUBLIC_FORMAT = "c2";
protected static final String PUBLIC_COLLECTION = PUBLIC_FORMAT + "-index-openaire";
protected static final String CONFIG_NAME = "testConfig";
protected static SolrAdminApplication admin;
protected static SparkSession spark; protected static SparkSession spark;
@ -35,15 +65,17 @@ public class SolrConfigExploreTest extends SolrExploreTest {
@Mock @Mock
private ISLookupClient isLookupClient; private ISLookupClient isLookupClient;
@TempDir
public static Path workingDir;
protected static MiniSolrCloudCluster miniCluster;
@BeforeEach @BeforeEach
public void prepareMocks() throws ISLookUpException, IOException { public void prepareMocks() throws ISLookUpException, IOException {
isLookupClient.setIsLookup(isLookUpService); isLookupClient.setIsLookup(isLookUpService);
int solrPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort(); int solrPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort();
Mockito
.when(isLookupClient.getDsId(Mockito.anyString()))
.thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", solrPort)); Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", solrPort));
Mockito Mockito
.when(isLookupClient.getLayoutSource(Mockito.anyString())) .when(isLookupClient.getLayoutSource(Mockito.anyString()))
@ -54,7 +86,7 @@ public class SolrConfigExploreTest extends SolrExploreTest {
} }
@BeforeAll @BeforeAll
public static void before() { public static void setup() throws Exception {
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
conf.setAppName(XmlIndexingJobTest.class.getSimpleName()); conf.setAppName(XmlIndexingJobTest.class.getSimpleName());
@ -70,15 +102,75 @@ public class SolrConfigExploreTest extends SolrExploreTest {
spark = SparkSession spark = SparkSession
.builder() .builder()
.appName(XmlIndexingJobTest.class.getSimpleName()) .appName(SolrConfigExploreTest.class.getSimpleName())
.config(conf) .config(conf)
.getOrCreate(); .getOrCreate();
// random unassigned HTTP port
final int jettyPort = 0;
final JettyConfig jettyConfig = JettyConfig.builder().setPort(jettyPort).build();
log.info(String.format("working directory: %s", workingDir.toString()));
System.setProperty("solr.log.dir", workingDir.resolve("logs").toString());
// create a MiniSolrCloudCluster instance
miniCluster = new MiniSolrCloudCluster(2, workingDir.resolve("solr"), jettyConfig);
// Upload Solr configuration directory to ZooKeeper
String solrZKConfigDir = "src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/exploreTestConfig";
File configDir = new File(solrZKConfigDir);
miniCluster.uploadConfigSet(configDir.toPath(), CONFIG_NAME);
// override settings in the solrconfig include
System.setProperty("solr.tests.maxBufferedDocs", "100000");
System.setProperty("solr.tests.maxIndexingThreads", "-1");
System.setProperty("solr.tests.ramBufferSizeMB", "100");
// use non-test classes so RandomizedRunner isn't necessary
System.setProperty("solr.tests.mergeScheduler", "org.apache.lucene.index.ConcurrentMergeScheduler");
System.setProperty("solr.directoryFactory", "solr.RAMDirectoryFactory");
System.setProperty("solr.lock.type", "single");
log.info(new ConfigSetAdminRequest.List().process(miniCluster.getSolrClient()).toString());
log
.info(
CollectionAdminRequest.ClusterStatus
.getClusterStatus()
.process(miniCluster.getSolrClient())
.toString());
NamedList<Object> res = createCollection(
miniCluster.getSolrClient(), SHADOW_COLLECTION, 4, 2, 20, CONFIG_NAME);
res.forEach(o -> log.info(o.toString()));
// miniCluster.getSolrClient().setDefaultCollection(SHADOW_COLLECTION);
res = createCollection(
miniCluster.getSolrClient(), PUBLIC_COLLECTION, 4, 2, 20, CONFIG_NAME);
res.forEach(o -> log.info(o.toString()));
admin = new SolrAdminApplication(miniCluster.getZkClient().getZkServerAddress());
CollectionAdminResponse rsp = (CollectionAdminResponse) admin
.createAlias(ProvisionConstants.PUBLIC_ALIAS_NAME, PUBLIC_COLLECTION);
assertEquals(0, rsp.getStatus());
rsp = (CollectionAdminResponse) admin.createAlias(ProvisionConstants.SHADOW_ALIAS_NAME, SHADOW_COLLECTION);
assertEquals(0, rsp.getStatus());
log
.info(
CollectionAdminRequest.ClusterStatus
.getClusterStatus()
.process(miniCluster.getSolrClient())
.toString());
} }
@AfterAll @AfterAll
public static void tearDown() { public static void tearDown() throws Exception {
spark.stop(); spark.stop();
miniCluster.shutdown();
FileUtils.deleteDirectory(workingDir.toFile());
} }
@Test @Test
@ -86,8 +178,10 @@ public class SolrConfigExploreTest extends SolrExploreTest {
String inputPath = "src/test/resources/eu/dnetlib/dhp/oa/provision/xml"; String inputPath = "src/test/resources/eu/dnetlib/dhp/oa/provision/xml";
new XmlIndexingJob(spark, inputPath, FORMAT, batchSize).run(isLookupClient); new XmlIndexingJob(spark, inputPath, SHADOW_FORMAT, ProvisionConstants.SHADOW_ALIAS_NAME, batchSize)
Assertions.assertEquals(0, miniCluster.getSolrClient().commit().getStatus()); .run(isLookupClient);
Assertions
.assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus());
String[] queryStrings = { String[] queryStrings = {
"cancer", "cancer",
@ -101,14 +195,14 @@ public class SolrConfigExploreTest extends SolrExploreTest {
query.add(CommonParams.Q, q); query.add(CommonParams.Q, q);
query.set("debugQuery", "on"); query.set("debugQuery", "on");
log.info("Submit query to Solr with params: {}", query.toString()); log.info("Submit query to Solr with params: {}", query);
QueryResponse rsp = miniCluster.getSolrClient().query(query); QueryResponse rsp = miniCluster.getSolrClient().query(ProvisionConstants.SHADOW_ALIAS_NAME, query);
// System.out.println(rsp.getHighlighting()); // System.out.println(rsp.getHighlighting());
// System.out.println(rsp.getExplainMap()); // System.out.println(rsp.getExplainMap());
for (SolrDocument doc : rsp.getResults()) { for (SolrDocument doc : rsp.getResults()) {
System.out log
.println( .info(
doc.get("score") + "\t" + doc.get("score") + "\t" +
doc.get("__indexrecordidentifier") + "\t" + doc.get("__indexrecordidentifier") + "\t" +
doc.get("resultidentifier") + "\t" + doc.get("resultidentifier") + "\t" +
@ -122,4 +216,18 @@ public class SolrConfigExploreTest extends SolrExploreTest {
} }
} }
} }
/**
 * Sends a collection CREATE request to the {@code /admin/collections} path using the given client.
 *
 * @param client the Solr client used to submit the request
 * @param name value sent as the {@code name} parameter
 * @param numShards value sent as the {@code numShards} parameter
 * @param replicationFactor value sent as the {@code replicationFactor} parameter
 * @param maxShardsPerNode value sent as the {@code maxShardsPerNode} parameter
 * @param configName value sent as the {@code collection.configName} parameter
 * @return the raw response returned by the client for the request
 * @throws Exception whatever the client raises while processing the request
 */
protected static NamedList<Object> createCollection(CloudSolrClient client, String name, int numShards,
	int replicationFactor, int maxShardsPerNode, String configName) throws Exception {
	// parameters are set in the same order as before; each set(...) returns the same params object
	final ModifiableSolrParams createParams = new ModifiableSolrParams()
		.set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name())
		.set("name", name)
		.set("numShards", numShards)
		.set("replicationFactor", replicationFactor)
		.set("collection.configName", configName)
		.set("maxShardsPerNode", maxShardsPerNode);

	final QueryRequest createRequest = new QueryRequest(createParams);
	createRequest.setPath("/admin/collections");

	return client.request(createRequest);
}
} }

View File

@ -2,24 +2,15 @@
package eu.dnetlib.dhp.oa.provision; package eu.dnetlib.dhp.oa.provision;
import java.io.IOException; import java.io.IOException;
import java.io.StringReader;
import java.net.URI; import java.net.URI;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.CommonParams;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.dom4j.io.SAXReader;
import org.junit.jupiter.api.*; import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith; import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock; import org.mockito.Mock;
@ -50,9 +41,6 @@ public class SolrConfigTest extends SolrTest {
int solrPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort(); int solrPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort();
Mockito
.when(isLookupClient.getDsId(Mockito.anyString()))
.thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", solrPort)); Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", solrPort));
Mockito Mockito
.when(isLookupClient.getLayoutSource(Mockito.anyString())) .when(isLookupClient.getLayoutSource(Mockito.anyString()))
@ -95,9 +83,10 @@ public class SolrConfigTest extends SolrTest {
String inputPath = "src/test/resources/eu/dnetlib/dhp/oa/provision/xml"; String inputPath = "src/test/resources/eu/dnetlib/dhp/oa/provision/xml";
new XmlIndexingJob(spark, inputPath, FORMAT, batchSize) new XmlIndexingJob(spark, inputPath, SHADOW_FORMAT, ProvisionConstants.SHADOW_ALIAS_NAME, batchSize)
.run(isLookupClient); .run(isLookupClient);
Assertions.assertEquals(0, miniCluster.getSolrClient().commit().getStatus()); Assertions
.assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus());
String[] queryStrings = { String[] queryStrings = {
"cancer", "cancer",
@ -109,8 +98,8 @@ public class SolrConfigTest extends SolrTest {
SolrQuery query = new SolrQuery(); SolrQuery query = new SolrQuery();
query.add(CommonParams.Q, q); query.add(CommonParams.Q, q);
log.info("Submit query to Solr with params: {}", query.toString()); log.info("Submit query to Solr with params: {}", query);
QueryResponse rsp = miniCluster.getSolrClient().query(query); QueryResponse rsp = miniCluster.getSolrClient().query(ProvisionConstants.SHADOW_ALIAS_NAME, query);
for (SolrDocument doc : rsp.getResults()) { for (SolrDocument doc : rsp.getResults()) {
System.out System.out

View File

@ -34,58 +34,6 @@ public abstract class SolrExploreTest {
@TempDir @TempDir
public static Path workingDir; public static Path workingDir;
@BeforeAll
public static void setup() throws Exception {
// random unassigned HTTP port
final int jettyPort = 0;
final JettyConfig jettyConfig = JettyConfig.builder().setPort(jettyPort).build();
log.info(String.format("working directory: %s", workingDir.toString()));
System.setProperty("solr.log.dir", workingDir.resolve("logs").toString());
// create a MiniSolrCloudCluster instance
miniCluster = new MiniSolrCloudCluster(2, workingDir.resolve("solr"), jettyConfig);
// Upload Solr configuration directory to ZooKeeper
String solrZKConfigDir = "src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/exploreTestConfig";
File configDir = new File(solrZKConfigDir);
miniCluster.uploadConfigSet(configDir.toPath(), CONFIG_NAME);
// override settings in the solrconfig include
System.setProperty("solr.tests.maxBufferedDocs", "100000");
System.setProperty("solr.tests.maxIndexingThreads", "-1");
System.setProperty("solr.tests.ramBufferSizeMB", "100");
// use non-test classes so RandomizedRunner isn't necessary
System.setProperty("solr.tests.mergeScheduler", "org.apache.lucene.index.ConcurrentMergeScheduler");
System.setProperty("solr.directoryFactory", "solr.RAMDirectoryFactory");
System.setProperty("solr.lock.type", "single");
log.info(new ConfigSetAdminRequest.List().process(miniCluster.getSolrClient()).toString());
log
.info(
CollectionAdminRequest.ClusterStatus
.getClusterStatus()
.process(miniCluster.getSolrClient())
.toString());
NamedList<Object> res = createCollection(
miniCluster.getSolrClient(), DEFAULT_COLLECTION, 4, 2, 20, CONFIG_NAME);
res.forEach(o -> log.info(o.toString()));
miniCluster.getSolrClient().setDefaultCollection(DEFAULT_COLLECTION);
log
.info(
CollectionAdminRequest.ClusterStatus
.getClusterStatus()
.process(miniCluster.getSolrClient())
.toString());
}
@AfterAll @AfterAll
public static void shutDown() throws Exception { public static void shutDown() throws Exception {
miniCluster.shutdown(); miniCluster.shutdown();

View File

@ -1,6 +1,8 @@
package eu.dnetlib.dhp.oa.provision; package eu.dnetlib.dhp.oa.provision;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.File; import java.io.File;
import java.nio.file.Path; import java.nio.file.Path;
@ -10,6 +12,7 @@ import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest; import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.ConfigSetAdminRequest; import org.apache.solr.client.solrj.request.ConfigSetAdminRequest;
import org.apache.solr.client.solrj.request.QueryRequest; import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
import org.apache.solr.cloud.MiniSolrCloudCluster; import org.apache.solr.cloud.MiniSolrCloudCluster;
import org.apache.solr.common.params.CollectionParams; import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.params.CoreAdminParams; import org.apache.solr.common.params.CoreAdminParams;
@ -21,14 +24,21 @@ import org.junit.jupiter.api.io.TempDir;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import sun.security.provider.SHA;
public abstract class SolrTest { public abstract class SolrTest {
protected static final Logger log = LoggerFactory.getLogger(SolrTest.class); protected static final Logger log = LoggerFactory.getLogger(SolrTest.class);
protected static final String FORMAT = "test"; protected static final String SHADOW_FORMAT = "c1";
protected static final String DEFAULT_COLLECTION = FORMAT + "-index-openaire"; protected static final String SHADOW_COLLECTION = SHADOW_FORMAT + "-index-openaire";
protected static final String PUBLIC_FORMAT = "c2";
protected static final String PUBLIC_COLLECTION = PUBLIC_FORMAT + "-index-openaire";
protected static final String CONFIG_NAME = "testConfig"; protected static final String CONFIG_NAME = "testConfig";
protected static SolrAdminApplication admin;
protected static MiniSolrCloudCluster miniCluster; protected static MiniSolrCloudCluster miniCluster;
@TempDir @TempDir
@ -72,10 +82,21 @@ public abstract class SolrTest {
.toString()); .toString());
NamedList<Object> res = createCollection( NamedList<Object> res = createCollection(
miniCluster.getSolrClient(), DEFAULT_COLLECTION, 4, 2, 20, CONFIG_NAME); miniCluster.getSolrClient(), SHADOW_COLLECTION, 4, 2, 20, CONFIG_NAME);
res.forEach(o -> log.info(o.toString())); res.forEach(o -> log.info(o.toString()));
miniCluster.getSolrClient().setDefaultCollection(DEFAULT_COLLECTION); // miniCluster.getSolrClient().setDefaultCollection(SHADOW_COLLECTION);
res = createCollection(
miniCluster.getSolrClient(), PUBLIC_COLLECTION, 4, 2, 20, CONFIG_NAME);
res.forEach(o -> log.info(o.toString()));
admin = new SolrAdminApplication(miniCluster.getZkClient().getZkServerAddress());
CollectionAdminResponse rsp = (CollectionAdminResponse) admin
.createAlias(ProvisionConstants.PUBLIC_ALIAS_NAME, PUBLIC_COLLECTION);
assertEquals(0, rsp.getStatus());
rsp = (CollectionAdminResponse) admin.createAlias(ProvisionConstants.SHADOW_ALIAS_NAME, SHADOW_COLLECTION);
assertEquals(0, rsp.getStatus());
log log
.info( .info(
@ -83,12 +104,12 @@ public abstract class SolrTest {
.getClusterStatus() .getClusterStatus()
.process(miniCluster.getSolrClient()) .process(miniCluster.getSolrClient())
.toString()); .toString());
} }
@AfterAll @AfterAll
public static void shutDown() throws Exception { public static void shutDown() throws Exception {
miniCluster.shutdown(); miniCluster.shutdown();
admin.close();
FileUtils.deleteDirectory(workingDir.toFile()); FileUtils.deleteDirectory(workingDir.toFile());
} }

View File

@ -10,6 +10,7 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrResponse;
import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.CommonParams;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -50,9 +51,6 @@ public class XmlIndexingJobTest extends SolrTest {
int solrPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort(); int solrPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort();
Mockito
.when(isLookupClient.getDsId(Mockito.anyString()))
.thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", solrPort)); Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", solrPort));
Mockito Mockito
.when(isLookupClient.getLayoutSource(Mockito.anyString())) .when(isLookupClient.getLayoutSource(Mockito.anyString()))
@ -103,46 +101,72 @@ public class XmlIndexingJobTest extends SolrTest {
long nRecord = records.count(); long nRecord = records.count();
new XmlIndexingJob(spark, inputPath, FORMAT, batchSize).run(isLookupClient); new XmlIndexingJob(spark, inputPath, SHADOW_FORMAT, ProvisionConstants.SHADOW_ALIAS_NAME, batchSize)
.run(isLookupClient);
assertEquals(0, miniCluster.getSolrClient().commit().getStatus()); assertEquals(0, miniCluster.getSolrClient().commit(SHADOW_COLLECTION).getStatus());
QueryResponse rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "*:*")); QueryResponse rsp = miniCluster
.getSolrClient()
.query(
ProvisionConstants.SHADOW_ALIAS_NAME,
new SolrQuery().add(CommonParams.Q, "*:*"));
assertEquals( assertEquals(
nRecord, rsp.getResults().getNumFound(), nRecord, rsp.getResults().getNumFound(),
"the number of indexed records should be equal to the number of input records"); "the number of indexed records should be equal to the number of input records");
rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "isgreen:true")); rsp = miniCluster
.getSolrClient()
.query(
ProvisionConstants.SHADOW_ALIAS_NAME,
new SolrQuery().add(CommonParams.Q, "isgreen:true"));
assertEquals( assertEquals(
0, rsp.getResults().getNumFound(), 4, rsp.getResults().getNumFound(),
"the number of indexed records having isgreen = true"); "the number of indexed records having isgreen = true");
rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "openaccesscolor:bronze")); rsp = miniCluster
.getSolrClient()
.query(
ProvisionConstants.SHADOW_ALIAS_NAME,
new SolrQuery().add(CommonParams.Q, "openaccesscolor:bronze"));
assertEquals( assertEquals(
0, rsp.getResults().getNumFound(), 2, rsp.getResults().getNumFound(),
"the number of indexed records having openaccesscolor = bronze"); "the number of indexed records having openaccesscolor = bronze");
rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "isindiamondjournal:true")); rsp = miniCluster
.getSolrClient()
.query(
ProvisionConstants.SHADOW_ALIAS_NAME,
new SolrQuery().add(CommonParams.Q, "isindiamondjournal:true"));
assertEquals( assertEquals(
0, rsp.getResults().getNumFound(), 0, rsp.getResults().getNumFound(),
"the number of indexed records having isindiamondjournal = true"); "the number of indexed records having isindiamondjournal = true");
rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "publiclyfunded:true")); rsp = miniCluster
.getSolrClient()
.query(
ProvisionConstants.SHADOW_ALIAS_NAME,
new SolrQuery().add(CommonParams.Q, "publiclyfunded:true"));
assertEquals( assertEquals(
0, rsp.getResults().getNumFound(), 0, rsp.getResults().getNumFound(),
"the number of indexed records having publiclyfunded = true"); "the number of indexed records having publiclyfunded = true");
rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "peerreviewed:true")); rsp = miniCluster
.getSolrClient()
.query(
ProvisionConstants.SHADOW_ALIAS_NAME,
new SolrQuery().add(CommonParams.Q, "peerreviewed:true"));
assertEquals( assertEquals(
0, rsp.getResults().getNumFound(), 35, rsp.getResults().getNumFound(),
"the number of indexed records having peerreviewed = true"); "the number of indexed records having peerreviewed = true");
rsp = miniCluster rsp = miniCluster
.getSolrClient() .getSolrClient()
.query( .query(
ProvisionConstants.SHADOW_ALIAS_NAME,
new SolrQuery() new SolrQuery()
.add(CommonParams.Q, "objidentifier:\"iddesignpres::ae77e56e84ad058d9e7f19fa2f7325db\"") .add(CommonParams.Q, "objidentifier:\"57a035e5b1ae::236d6d8c1e03368b5ae72acfeeb11bbc\"")
.add(CommonParams.FL, "__json")); .add(CommonParams.FL, "__json"));
assertEquals( assertEquals(
1, rsp.getResults().getNumFound(), 1, rsp.getResults().getNumFound(),
@ -158,6 +182,22 @@ public class XmlIndexingJobTest extends SolrTest {
log.info((String) json.get()); log.info((String) json.get());
admin
.execute(
SolrAdminApplication.Action.UPDATE_ALIASES, null, false,
SHADOW_COLLECTION, PUBLIC_COLLECTION);
rsp = miniCluster
.getSolrClient()
.query(
ProvisionConstants.PUBLIC_ALIAS_NAME,
new SolrQuery()
.add(CommonParams.Q, "objidentifier:\"57a035e5b1ae::236d6d8c1e03368b5ae72acfeeb11bbc\"")
.add(CommonParams.FL, "__json"));
assertEquals(
1, rsp.getResults().getNumFound(),
"the number of indexed records having the given identifier, found in the public collection");
} }
} }

View File

@ -1,8 +1,7 @@
package eu.dnetlib.dhp.oa.provision; package eu.dnetlib.dhp.oa.provision;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.*;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.io.IOException; import java.io.IOException;
import java.io.StringReader; import java.io.StringReader;
@ -22,6 +21,7 @@ import com.google.common.collect.Lists;
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity; import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity; import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper; import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
import eu.dnetlib.dhp.oa.provision.utils.ContextDef;
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper; import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory; import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
@ -51,7 +51,7 @@ public class XmlRecordFactoryTest {
assertNotNull(doc); assertNotNull(doc);
// System.out.println(doc.asXML()); System.out.println(doc.asXML());
assertEquals("0000-0001-9613-6638", doc.valueOf("//creator[@rank = '1']/@orcid")); assertEquals("0000-0001-9613-6638", doc.valueOf("//creator[@rank = '1']/@orcid"));
assertEquals("0000-0001-9613-6639", doc.valueOf("//creator[@rank = '1']/@orcid_pending")); assertEquals("0000-0001-9613-6639", doc.valueOf("//creator[@rank = '1']/@orcid_pending"));
@ -267,4 +267,39 @@ public class XmlRecordFactoryTest {
} }
/**
 * Checks that the single context attached to the project loaded from
 * {@code project_aka.json} is rendered in the generated XML record as a
 * nested {@code context / category / concept} structure.
 *
 * <p>The {@link ContextMapper} is populated with the three identifiers that
 * make up the hierarchy ({@code dh-ch}, {@code dh-ch::projects} and
 * {@code dh-ch::projects::2}); the assertions at the end verify that each
 * of them shows up at the expected depth of the XML document.
 *
 * @throws DocumentException if the XML produced by the factory cannot be parsed
 * @throws IOException if the JSON fixture cannot be read or deserialized
 */
@Test
public void test_AKA_project() throws DocumentException, IOException {

	final ContextMapper contextMapper = new ContextMapper();
	contextMapper
		.put("dh-ch", new ContextDef("dh-ch", "Digital Humanities and Cultural Heritage", "context", "community"));
	contextMapper.put("dh-ch::projects", new ContextDef("dh-ch::projects", "DH-CH Projects", "category", ""));
	contextMapper
		.put("dh-ch::projects::2", new ContextDef("dh-ch::projects::2", "ARIADNE", "concept", "community"));

	// NOTE(review): the meaning of the boolean flag is not visible from this test - confirm against XmlRecordFactory
	final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
		PayloadConverterJob.schemaLocation);

	final Project p;
	// try-with-resources releases the classpath stream; the charset is explicit so that the
	// fixture is decoded the same way regardless of the platform default encoding
	try (java.io.InputStream in = getClass().getResourceAsStream("project_aka.json")) {
		// fail with a readable message instead of a NullPointerException when the fixture is missing
		assertNotNull(in, "test resource project_aka.json not found");
		p = OBJECT_MAPPER
			.readValue(
				IOUtils.toString(in, java.nio.charset.StandardCharsets.UTF_8),
				Project.class);
	}

	// sanity checks on the fixture itself: exactly one context, pointing at the concept
	assertNotNull(p.getContext());
	assertEquals(1, p.getContext().size());
	assertEquals("dh-ch::projects::2", p.getContext().get(0).getId());

	final String xml = xmlRecordFactory.build(new JoinedEntity(p));

	assertNotNull(xml);

	final Document doc = new SAXReader().read(new StringReader(xml));

	assertNotNull(doc);

	// the concept id must have been expanded into its full context/category/concept chain
	assertEquals("dh-ch", doc.valueOf("//context/@id"));
	assertEquals("dh-ch::projects", doc.valueOf("//context/category/@id"));
	assertEquals("dh-ch::projects::2", doc.valueOf("//context/category/concept/@id"));

}
} }

View File

@ -68,15 +68,12 @@
<FIELD copy="true" indexable="false" name="externalreflabel" result="false" stat="false" tokenizable="true" xpath="distinct-values(//*[local-name()='entity']/*//children/externalreference/label)"/> <FIELD copy="true" indexable="false" name="externalreflabel" result="false" stat="false" tokenizable="true" xpath="distinct-values(//*[local-name()='entity']/*//children/externalreference/label)"/>
<FIELD copy="true" indexable="true" name="resultidentifier" result="false" stat="false" type="string_ci" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/children/instance/webresource/*[local-name()='url'])"/> <FIELD copy="true" indexable="true" name="resultidentifier" result="false" stat="false" type="string_ci" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/children/instance/webresource/*[local-name()='url'])"/>
<FIELD copy="true" indexable="false" name="resultsource" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/source)"/> <FIELD copy="true" indexable="false" name="resultsource" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/source)"/>
<FIELD indexable="true" multivalued="false" name="isgreen" result="false" stat="false" type="boolean" value="//*[local-name()='entity']/*[local-name()='result']/isgreen"/> <FIELD indexable="true" multivalued="false" name="isgreen" result="false" stat="false" type="boolean" value="//*[local-name()='entity']/*[local-name()='result']/isgreen"/>
<FIELD indexable="true" multivalued="false" name="openaccesscolor" result="false" stat="false" tokenizable="false" value="//*[local-name()='entity']/*[local-name()='result']/openaccesscolor"/> <FIELD indexable="true" multivalued="false" name="openaccesscolor" result="false" stat="false" tokenizable="false" value="//*[local-name()='entity']/*[local-name()='result']/openaccesscolor"/>
<FIELD indexable="true" multivalued="false" name="isindiamondjournal" result="false" stat="false" type="boolean" value="//*[local-name()='entity']/*[local-name()='result']/isindiamondjournal"/> <FIELD indexable="true" multivalued="false" name="isindiamondjournal" result="false" stat="false" type="boolean" value="//*[local-name()='entity']/*[local-name()='result']/isindiamondjournal"/>
<FIELD indexable="true" multivalued="false" name="publiclyfunded" result="false" stat="false" type="boolean" value="//*[local-name()='entity']/*[local-name()='result']/publiclyfunded"/> <FIELD indexable="true" multivalued="false" name="publiclyfunded" result="false" stat="false" type="boolean" value="//*[local-name()='entity']/*[local-name()='result']/publiclyfunded"/>
<FIELD indexable="true" multivalued="false" name="peerreviewed" result="false" stat="false" type="boolean" value="some $refereed in //*[local-name()='entity']/*[local-name()='result']/children/instance/*[local-name()='refereed']/@classid satisfies ($refereed = '0001')"/> <FIELD indexable="true" multivalued="false" name="peerreviewed" result="false" stat="false" type="boolean" value="some $refereed in //*[local-name()='entity']/*[local-name()='result']/children/instance/*[local-name()='refereed']/@classid satisfies ($refereed = '0001')"/>
<FIELD indexable="true" multivalued="false" name="haslicense" result="false" stat="false" type="boolean" value="some $license in //*[local-name()='entity']/*[local-name()='result']/children/instance/*[local-name()='license']/text() satisfies (string-length($license) &gt; 0)"/>
<FIELD indexable="true" name="eoscifguidelines" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name() = 'result']/eoscifguidelines/@code)"/><!-- FOS and SDGs non tokenizable for faceted search--> <FIELD indexable="true" name="eoscifguidelines" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name() = 'result']/eoscifguidelines/@code)"/><!-- FOS and SDGs non tokenizable for faceted search-->
<FIELD indexable="true" name="fos" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/subject[@classid='FOS'])"/> <FIELD indexable="true" name="fos" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/subject[@classid='FOS'])"/>
<FIELD indexable="true" name="foslabel" result="false" stat="false" tokenizable="false" value="concat(./text(), '||', replace(./text(), '^\d+\s', ''))" xpath="//*[local-name()='entity']/*[local-name()='result']/subject[@classid='FOS']"/> <FIELD indexable="true" name="foslabel" result="false" stat="false" tokenizable="false" value="concat(./text(), '||', replace(./text(), '^\d+\s', ''))" xpath="//*[local-name()='entity']/*[local-name()='result']/subject[@classid='FOS']"/>
@ -93,6 +90,7 @@
<FIELD indexable="true" name="relorganizationid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='organization'])"/> <FIELD indexable="true" name="relorganizationid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='organization'])"/>
<FIELD copy="true" indexable="true" name="relorganizationname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalname)"/> <FIELD copy="true" indexable="true" name="relorganizationname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalname)"/>
<FIELD copy="true" indexable="true" name="relorganizationshortname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalshortname)"/> <FIELD copy="true" indexable="true" name="relorganizationshortname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalshortname)"/>
<FIELD indexable="true" name="relorganization" result="false" stat="false" tokenizable="false" value="distinct-values(concat(./to, '||', ./legalname))" xpath="//*[local-name()='entity']/*//rel[./to/@type='organization']"/>
<FIELD indexable="true" name="relresultid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='publication' or @type='dataset' or @type='software' or @type='otherresearchproduct'])"/> <FIELD indexable="true" name="relresultid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='publication' or @type='dataset' or @type='software' or @type='otherresearchproduct'])"/>
<FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@type)"/> <FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@type)"/>
<FIELD indexable="true" name="relclass" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@class)"/> <FIELD indexable="true" name="relclass" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@class)"/>
@ -122,6 +120,7 @@
<FIELD indexable="true" name="categoryid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/context/category/@id)"/> <FIELD indexable="true" name="categoryid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/context/category/@id)"/>
<FIELD indexable="true" name="conceptname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/context/category//concept/@label)"/><!-- new index field for country info from different xpaths for any type of entity --> <FIELD indexable="true" name="conceptname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/context/category//concept/@label)"/><!-- new index field for country info from different xpaths for any type of entity -->
<FIELD indexable="true" name="country" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/country/@classid | //*[local-name()='entity']/*//rel[./to/@type='organization']/country/@classid | //*[local-name()='entity']//funder/@jurisdiction)"/> <FIELD indexable="true" name="country" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/country/@classid | //*[local-name()='entity']/*//rel[./to/@type='organization']/country/@classid | //*[local-name()='entity']//funder/@jurisdiction)"/>
<FIELD indexable="true" name="countrynojurisdiction" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/country/@classid | //*[local-name()='entity']/*//rel[./to/@type='organization']/country/@classid)"/>
<FIELD indexable="false" name="oafentity" result="true" stat="false" tokenizable="false" xpath="//*[local-name() = 'entity']"/><!-- impact indicators --> <FIELD indexable="false" name="oafentity" result="true" stat="false" tokenizable="false" xpath="//*[local-name() = 'entity']"/><!-- impact indicators -->
<FIELD copy="false" indexable="true" multivalued="false" name="influence" result="false" stat="false" type="pfloat" xpath="//measure[@id='influence']/@score/number()"/> <FIELD copy="false" indexable="true" multivalued="false" name="influence" result="false" stat="false" type="pfloat" xpath="//measure[@id='influence']/@score/number()"/>
<FIELD copy="false" indexable="true" multivalued="false" name="influence_class" result="false" stat="false" type="string" xpath="//measure[@id='influence']/@class/string()"/> <FIELD copy="false" indexable="true" multivalued="false" name="influence_class" result="false" stat="false" type="string" xpath="//measure[@id='influence']/@class/string()"/>

File diff suppressed because one or more lines are too long

View File

@ -194,228 +194,173 @@
<fieldType name="tints" class="solr.TrieIntField" positionIncrementGap="0" docValues="true" multiValued="true" precisionStep="8"/> <fieldType name="tints" class="solr.TrieIntField" positionIncrementGap="0" docValues="true" multiValued="true" precisionStep="8"/>
<fieldType name="tlong" class="solr.TrieLongField" positionIncrementGap="0" docValues="true" precisionStep="8"/> <fieldType name="tlong" class="solr.TrieLongField" positionIncrementGap="0" docValues="true" precisionStep="8"/>
<fieldType name="tlongs" class="solr.TrieLongField" positionIncrementGap="0" docValues="true" multiValued="true" precisionStep="8"/> <fieldType name="tlongs" class="solr.TrieLongField" positionIncrementGap="0" docValues="true" multiValued="true" precisionStep="8"/>
<!-- Indexed fields -->
<field name="__all" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="__all" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="__deleted" type="boolean" default="false" omitNorms="true" omitTermFreqAndPositions="true" indexed="true" stored="false"/>
<field name="__dsid" type="string" omitNorms="true" omitTermFreqAndPositions="true" indexed="true" stored="true"/>
<field name="__dsversion" type="pdate" omitNorms="true" omitTermFreqAndPositions="true" indexed="true" stored="true"/>
<field name="__indexrecordidentifier" type="string" multiValued="false" indexed="true" required="true" stored="true"/> <field name="__indexrecordidentifier" type="string" multiValued="false" indexed="true" required="true" stored="true"/>
<field name="__result" type="string" docValues="false" multiValued="false" indexed="false" stored="true"/>
<field name="__json" type="string" docValues="false" multiValued="false" indexed="false" stored="true"/> <field name="__json" type="string" docValues="false" multiValued="false" indexed="false" stored="true"/>
<field name="__result" type="string" docValues="false" multiValued="false" indexed="false" stored="true"/>
<field name="_root_" type="string" docValues="false" indexed="true" stored="false"/> <field name="_root_" type="string" docValues="false" indexed="true" stored="false"/>
<field name="_version_" type="long" multiValued="false" indexed="true" stored="true"/> <field name="_version_" type="long" multiValued="false" indexed="true" stored="true"/>
<field name="authorid" type="string_ci" multiValued="true" indexed="true" stored="false"/> <field name="authorid" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="authoridtype" type="string_ci" multiValued="true" indexed="true" stored="false"/> <field name="categoryid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="categoryid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="citation_count" type="pint" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="categoryname" type="string" multiValued="true" indexed="true" stored="false"/> <field name="citation_count_class" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="collectedfrom" type="string" multiValued="true" indexed="true" stored="false"/> <field name="collectedfromdatasourceid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="collectedfromdatasourceid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="collectedfromname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="collectedfromname" type="string" multiValued="true" indexed="true" stored="false"/> <field name="community" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="community" type="string" multiValued="true" indexed="true" stored="false"/> <field name="communityid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="communityid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="conceptname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="communityname" type="string" multiValued="true" indexed="true" stored="false"/> <field name="contextid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="conceptid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="contextname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="conceptname" type="string" multiValued="true" indexed="true" stored="false"/> <field name="country" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="contextid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="countrynojurisdiction" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="contextname" type="string" multiValued="true" indexed="true" stored="false"/> <field name="datasourcecompatibilityid" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="contexttype" type="string" multiValued="true" indexed="true" stored="false"/> <field name="datasourcecompatibilityname" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="country" type="string" multiValued="true" indexed="true" stored="false"/> <field name="datasourceenglishname" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="datasourcecompatibilityid" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="datasourcecompatibilityname" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="datasourceenglishname" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="datasourcejurisdiction" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/> <field name="datasourcejurisdiction" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="datasourceodcontenttypes" type="string" multiValued="true" indexed="true" stored="false"/> <field name="datasourceodcontenttypes" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="datasourceoddescription" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="datasourceoddescription" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="datasourceodlanguages" type="string" multiValued="true" indexed="true" stored="false"/> <field name="datasourceodlanguages" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="datasourceodsubjects" type="string" multiValued="true" indexed="true" stored="false"/> <field name="datasourceodsubjects" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="datasourceofficialname" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="datasourceofficialname" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="datasourcesubject" type="ngramtext" multiValued="true" indexed="true" stored="false"/> <field name="datasourcesubject" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="datasourcethematic" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/> <field name="datasourcethematic" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="datasourcetypename" type="string" multiValued="false" indexed="true" stored="false"/> <field name="datasourcetypename" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="datasourcetypeuiid" type="string" multiValued="false" indexed="true" stored="false"/> <field name="datasourcetypeuiid" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="datasourcetypeuiname" type="string" multiValued="false" indexed="true" stored="false"/> <field name="datasourcetypeuiname" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="dateofcollection" type="pdate" multiValued="false" indexed="true" stored="false"/> <field name="dateofcollection" type="date" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="deletedbyinference" type="string" multiValued="true" indexed="true" stored="false"/> <field name="deletedbyinference" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="eoscdatasourcetype" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/> <field name="eoscdatasourcetype" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="eoscifguidelines" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/> <field name="eoscifguidelines" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="eosctype" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/> <field name="eosctype" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="externalrefclass" type="string" multiValued="true" indexed="true" stored="false"/> <field name="externalreflabel" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="externalrefid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="fos" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="externalreflabel" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="foslabel" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="externalrefsite" type="string" multiValued="true" indexed="true" stored="false"/> <field name="funder" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="funder" type="string" multiValued="true" indexed="true" stored="false"/> <field name="funderid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="funderid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="fundershortname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="funderjurisdiction" type="string" multiValued="true" indexed="true" stored="false"/> <field name="fundinglevel0_description" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundername" type="string" multiValued="true" indexed="true" stored="false"/> <field name="fundinglevel0_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="funderoriginalname" type="string" multiValued="true" indexed="true" stored="false"/> <field name="fundinglevel0_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundershortname" type="string" multiValued="true" indexed="true" stored="false"/> <field name="fundinglevel1_description" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel0_description" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="fundinglevel1_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel0_id" type="string" multiValued="true" indexed="true" stored="false"/> <field name="fundinglevel1_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel0_name" type="string" multiValued="true" indexed="true" stored="false"/> <field name="fundinglevel2_description" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel1_description" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="fundinglevel2_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel1_id" type="string" multiValued="true" indexed="true" stored="false"/> <field name="fundinglevel2_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel1_name" type="string" multiValued="true" indexed="true" stored="false"/> <field name="haslicense" type="boolean" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="fundinglevel2_description" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="impulse" type="pint" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="fundinglevel2_id" type="string" multiValued="true" indexed="true" stored="false"/> <field name="impulse_class" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="fundinglevel2_name" type="string" multiValued="true" indexed="true" stored="false"/> <field name="influence" type="pfloat" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="inferenceprovenance" type="string" multiValued="true" indexed="true" stored="false"/> <field name="influence_class" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="inferred" type="string" multiValued="true" indexed="true" stored="false"/> <field name="instancetypename" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="instancetypename" type="string" multiValued="true" indexed="true" stored="false"/> <field name="isgreen" type="boolean" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="oafentity" type="string" multiValued="true" indexed="false" stored="false"/> <field name="isindiamondjournal" type="boolean" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="oaftype" type="string" multiValued="true" indexed="true" stored="false"/> <field name="oafentity" type="string" docValues="false" multiValued="true" indexed="false" stored="false"/>
<field name="objidentifier" type="string" multiValued="true" indexed="true" stored="false"/> <field name="oaftype" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="organizationalternativenames" type="ngramtext" multiValued="true" indexed="true" stored="false"/> <field name="objidentifier" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="organizationcountryname" type="string" multiValued="false" indexed="true" stored="false"/> <field name="openaccesscolor" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="organizationdupid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="orcidtypevalue" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="organizationecenterprise" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="organizationalternativenames" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="organizationecinternationalorganization" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="organizationdupid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="organizationecinternationalorganizationeurinterests" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="organizationlegalname" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="organizationeclegalbody" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="organizationlegalshortname" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="organizationeclegalperson" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="originalid" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="organizationecnonprofit" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="peerreviewed" type="boolean" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="organizationecnutscode" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="pid" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="organizationecresearchorganization" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="pidclassid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="organizationecsmevalidated" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="popularity" type="pfloat" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="organizationlegalname" type="ngramtext" multiValued="true" indexed="true" stored="false"/> <field name="popularity_alt" type="pfloat" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="organizationlegalshortname" type="ngramtext" multiValued="true" indexed="true" stored="false"/> <field name="popularity_alt_class" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="originalid" type="string_ci" multiValued="true" indexed="true" stored="false"/> <field name="popularity_class" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="pid" type="string_ci" multiValued="true" indexed="true" stored="false"/> <field name="projectacronym" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="pidclassid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="projectcallidentifier" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="pidclassname" type="string" multiValued="true" indexed="true" stored="false"/> <field name="projectcode" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="projectacronym" type="ngramtext" multiValued="true" indexed="true" stored="false"/> <field name="projectcode_nt" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="projectcallidentifier" type="string" multiValued="false" indexed="true" stored="false"/> <field name="projectduration" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="projectcode" type="ngramtext" multiValued="true" indexed="true" stored="false"/> <field name="projectecsc39" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="projectcode_nt" type="string" multiValued="true" indexed="true" stored="false"/> <field name="projectenddate" type="date" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="projectcontracttypename" type="string" multiValued="false" indexed="true" stored="false"/> <field name="projectendyear" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="projectduration" type="string" multiValued="false" indexed="true" stored="false"/> <field name="projectkeywords" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="projectecarticle29_3" type="string" multiValued="false" indexed="true" stored="false"/> <field name="projectoamandatepublications" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="projectecsc39" type="string" multiValued="false" indexed="true" stored="false"/> <field name="projectstartdate" type="date" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="projectenddate" type="pdate" multiValued="false" indexed="true" stored="false"/> <field name="projectstartyear" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="projectendyear" type="string" multiValued="false" indexed="true" stored="false"/> <field name="projecttitle" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="projectkeywords" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="projecttitle_alternative" type="text_en" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="projectoamandatepublications" type="string" multiValued="false" indexed="true" stored="false"/> <field name="provenanceactionclassid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="projectstartdate" type="pdate" multiValued="false" indexed="true" stored="false"/> <field name="publiclyfunded" type="boolean" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="projectstartyear" type="string" multiValued="false" indexed="true" stored="false"/> <field name="relclass" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="projectsubject" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relcontracttypename" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="projecttitle" type="ngramtext" multiValued="true" indexed="true" stored="false"/> <field name="reldatasourcecompatibilityid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="provenanceactionclassid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relfunder" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relclass" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relfunderid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relcollectedfromid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relfundershortname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relcollectedfromname" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relfundinglevel0_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relcontracttypeid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relfundinglevel0_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relcontracttypename" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="relfundinglevel1_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="reldatasourcecompatibilityid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relfundinglevel1_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfunder" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relfundinglevel2_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfunderid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relfundinglevel2_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfunderjurisdiction" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relorganizationcountryid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfundername" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relorganizationcountryname" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfundershortname" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relorganizationid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel0_id" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relorganizationname" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel0_name" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relorganizationshortname" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel1_id" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relproject" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel1_name" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relprojectcode" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel2_id" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relprojectid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel2_name" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relprojectname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relinferenceprovenance" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relprojecttitle" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relinferred" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relresultid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relorganizationcountryid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="relresulttype" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relorganizationcountryname" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="resultacceptanceyear" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="relorganizationid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="resultauthor" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relorganizationname" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="resultbestaccessright" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="relorganizationshortname" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="resultdateofacceptance" type="date" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="relproject" type="string" multiValued="true" indexed="true" stored="false"/> <field name="resultdescription" type="text_en" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relprojectcode" type="string" multiValued="true" indexed="true" stored="false"/> <field name="resultdupid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relprojectid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="resultembargoenddate" type="date" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="relprojectname" type="string" multiValued="true" indexed="true" stored="false"/> <field name="resultembargoendyear" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="relprojecttitle" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="resulthostingdatasource" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relprovenanceactionclassid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="resulthostingdatasourceid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relresultid" type="string" multiValued="true" indexed="true" stored="false"/> <field name="resultidentifier" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relresulttype" type="string" multiValued="true" indexed="true" stored="false"/> <field name="resultlanguagename" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="reltrust" type="string" multiValued="true" indexed="true" stored="false"/> <field name="resultpublisher" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="resultacceptanceyear" type="string" multiValued="false" indexed="true" stored="false"/> <field name="resultsource" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="resultaccessright" type="string" multiValued="true" indexed="true" stored="false"/> <field name="resultsubject" type="text_en" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="resultauthor" type="text_common" multiValued="true" indexed="true" stored="false"/> <field name="resulttitle" type="text_en" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="resultauthor_nt" type="string_ci" multiValued="true" indexed="true" stored="false"/> <field name="resulttypeid" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="resultbestaccessright" type="string" multiValued="false" indexed="true" stored="false"/> <field name="sdg" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="resultdateofacceptance" type="pdate" multiValued="false" indexed="true" stored="false"/> <field name="semrelid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="resultdescription" type="text_en" multiValued="true" indexed="true" stored="false"/> <field name="status" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="resultdupid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="resultembargoenddate" type="pdate" multiValued="false" indexed="true" stored="false"/>
<field name="resultembargoendyear" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="resulthostingdatasource" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="resulthostingdatasourceid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="resulthostingdatasourcename" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="resultidentifier" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="resultlanguagename" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="resultlicense" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="resultpublisher" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="resultsource" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="resultsubject" type="text_en" multiValued="true" indexed="true" stored="false"/>
<field name="resultsubjectclass" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="resulttitle" type="text_en" multiValued="true" indexed="true" stored="false"/>
<field name="resulttypeid" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="resulttypename" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="semrelid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="text" type="text_common" indexed="false" stored="false"/> <field name="text" type="text_common" indexed="false" stored="false"/>
<field name="trust" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="versioning" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="isgreen" type="boolean" multiValued="false" indexed="true" stored="false"/>
<field name="openaccesscolor" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="isindiamondjournal" type="boolean" multiValued="false" indexed="true" stored="false"/>
<field name="publiclyfunded" type="boolean" multiValued="false" indexed="true" stored="false"/>
<field name="peerreviewed" type="boolean" multiValued="false" indexed="true" stored="false"/>
<!-- Copy field definitions follow: -->
<!-- Data source -->
<copyField source="datasourceenglishname" dest="__all"/> <copyField source="datasourceenglishname" dest="__all"/>
<copyField source="datasourceoddescription" dest="__all"/> <copyField source="datasourceoddescription" dest="__all"/>
<copyField source="datasourceodsubjects" dest="__all"/>
<copyField source="datasourceofficialname" dest="__all"/> <copyField source="datasourceofficialname" dest="__all"/>
<copyField source="datasourcesubject" dest="__all"/> <copyField source="datasourcesubject" dest="__all"/>
<copyField source="externalreflabel" dest="__all"/>
<!-- Organization --> <copyField source="fundinglevel0_description" dest="__all"/>
<copyField source="fundinglevel1_description" dest="__all"/>
<copyField source="fundinglevel2_description" dest="__all"/>
<copyField source="organizationalternativenames" dest="__all"/> <copyField source="organizationalternativenames" dest="__all"/>
<copyField source="organizationecenterprise" dest="__all"/>
<copyField source="organizationecinternationalorganization" dest="__all"/>
<copyField source="organizationecinternationalorganizationeurinterests" dest="__all"/>
<copyField source="organizationeclegalbody" dest="__all"/>
<copyField source="organizationeclegalperson" dest="__all"/>
<copyField source="organizationecnonprofit" dest="__all"/>
<copyField source="organizationecnutscode" dest="__all"/>
<copyField source="organizationecresearchorganization" dest="__all"/>
<copyField source="organizationecsmevalidated" dest="__all"/>
<copyField source="organizationlegalname" dest="__all"/> <copyField source="organizationlegalname" dest="__all"/>
<copyField source="organizationlegalshortname" dest="__all"/> <copyField source="organizationlegalshortname" dest="__all"/>
<!-- Project -->
<copyField source="projectacronym" dest="__all"/> <copyField source="projectacronym" dest="__all"/>
<copyField source="projectcode" dest="__all"/> <copyField source="projectcode" dest="__all"/>
<copyField source="projectkeywords" dest="__all"/> <copyField source="projectkeywords" dest="__all"/>
<copyField source="projecttitle" dest="__all"/> <copyField source="projecttitle" dest="__all"/>
<copyField source="projecttitle_alternative" dest="__all"/>
<!-- Result -->
<copyField source="resultpublisher" dest="__all"/>
<copyField source="resultsource" dest="__all"/>
<copyField source="resultidentifier" dest="__all"/>
<copyField source="resultauthor" dest="__all"/>
<copyField source="resulttitle" dest="__all"/>
<copyField source="resultdescription" dest="__all"/>
<copyField source="resultsubject" dest="__all"/>
<copyField source="resultacceptanceyear" dest="__all"/>
<!-- Other -->
<copyField source="externalreflabel" dest="__all"/>
<copyField source="fundinglevel0_description" dest="__all"/>
<copyField source="fundinglevel1_description" dest="__all"/>
<copyField source="fundinglevel2_description" dest="__all"/>
<copyField source="relcontracttypename" dest="__all"/> <copyField source="relcontracttypename" dest="__all"/>
<copyField source="relorganizationcountryname" dest="__all"/> <copyField source="relorganizationcountryname" dest="__all"/>
<copyField source="relorganizationname" dest="__all"/> <copyField source="relorganizationname" dest="__all"/>
<copyField source="relorganizationshortname" dest="__all"/> <copyField source="relorganizationshortname" dest="__all"/>
<copyField source="relprojecttitle" dest="__all"/>
<copyField source="relprojectname" dest="__all"/> <copyField source="relprojectname" dest="__all"/>
<copyField source="relprojecttitle" dest="__all"/>
<copyField source="resultacceptanceyear" dest="__all"/>
<copyField source="resultauthor" dest="__all"/>
<copyField source="resultdescription" dest="__all"/>
<copyField source="resultidentifier" dest="__all"/>
<copyField source="resultpublisher" dest="__all"/>
<copyField source="resultsource" dest="__all"/>
<copyField source="resulttitle" dest="__all"/>
</schema> </schema>

View File

@ -63,7 +63,7 @@ function copydb() {
start_db_time=$(date +%s) start_db_time=$(date +%s)
# Delete the old DB from Impala cluster (if exists). # Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later impala-shell -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n"
@ -120,7 +120,7 @@ function copydb() {
start_create_schema_time=$(date +%s) start_create_schema_time=$(date +%s)
# create the new database (with the same name) # create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" impala-shell -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala. # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
@ -148,7 +148,7 @@ function copydb() {
exit 5 exit 5
fi # This error is not FATAL, do we do not return from this function, in normal circumstances. fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
@ -182,7 +182,7 @@ function copydb() {
new_num_of_views_to_retry=0 new_num_of_views_to_retry=0
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later impala-shell -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nspecific_errors: ${specific_errors}\n"
@ -212,7 +212,7 @@ function copydb() {
previous_num_of_views_to_retry=$new_num_of_views_to_retry previous_num_of_views_to_retry=$new_num_of_views_to_retry
done done
entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`) entities_on_impala=(`impala-shell -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n" echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n"
start_compute_stats_time=$(date +%s) start_compute_stats_time=$(date +%s)
@ -222,9 +222,9 @@ function copydb() {
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp". # Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}" impala-shell -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
sleep 1 sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log impala-shell -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n"

View File

@ -63,7 +63,7 @@ function copydb() {
start_db_time=$(date +%s) start_db_time=$(date +%s)
# Delete the old DB from Impala cluster (if exists). # Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later impala-shell -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n"
@ -120,7 +120,7 @@ function copydb() {
start_create_schema_time=$(date +%s) start_create_schema_time=$(date +%s)
# create the new database (with the same name) # create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" impala-shell -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala. # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
@ -148,7 +148,7 @@ function copydb() {
exit 5 exit 5
fi # This error is not FATAL, do we do not return from this function, in normal circumstances. fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
@ -182,7 +182,7 @@ function copydb() {
new_num_of_views_to_retry=0 new_num_of_views_to_retry=0
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later impala-shell -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nspecific_errors: ${specific_errors}\n"
@ -212,7 +212,7 @@ function copydb() {
previous_num_of_views_to_retry=$new_num_of_views_to_retry previous_num_of_views_to_retry=$new_num_of_views_to_retry
done done
entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`) entities_on_impala=(`impala-shell -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n" echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n"
start_compute_stats_time=$(date +%s) start_compute_stats_time=$(date +%s)
@ -222,9 +222,9 @@ function copydb() {
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp". # Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}" impala-shell -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
sleep 1 sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log impala-shell -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n"

View File

@ -63,7 +63,7 @@ function copydb() {
start_db_time=$(date +%s) start_db_time=$(date +%s)
# Delete the old DB from Impala cluster (if exists). # Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later impala-shell -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n"
@ -120,7 +120,7 @@ function copydb() {
start_create_schema_time=$(date +%s) start_create_schema_time=$(date +%s)
# create the new database (with the same name) # create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" impala-shell -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala. # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
@ -148,7 +148,7 @@ function copydb() {
exit 5 exit 5
fi # This error is not FATAL, do we do not return from this function, in normal circumstances. fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
@ -182,7 +182,7 @@ function copydb() {
new_num_of_views_to_retry=0 new_num_of_views_to_retry=0
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later impala-shell -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nspecific_errors: ${specific_errors}\n"
@ -212,7 +212,7 @@ function copydb() {
previous_num_of_views_to_retry=$new_num_of_views_to_retry previous_num_of_views_to_retry=$new_num_of_views_to_retry
done done
entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`) entities_on_impala=(`impala-shell -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n" echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n"
start_compute_stats_time=$(date +%s) start_compute_stats_time=$(date +%s)
@ -222,9 +222,9 @@ function copydb() {
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp". # Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}" impala-shell -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
sleep 1 sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log impala-shell -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n"

View File

@ -1,4 +1,4 @@
<workflow-app name="Graph Stats" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="Promote Graph Stats" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>stats_db_name</name> <name>stats_db_name</name>

View File

@ -65,7 +65,7 @@ function copydb() {
start_db_time=$(date +%s) start_db_time=$(date +%s)
# Delete the old DB from Impala cluster (if exists). # Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later impala-shell -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n"
@ -122,7 +122,7 @@ function copydb() {
start_create_schema_time=$(date +%s) start_create_schema_time=$(date +%s)
# create the new database (with the same name) # create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}" impala-shell -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala. # Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file. # So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
@ -150,7 +150,7 @@ function copydb() {
exit 5 exit 5
fi # This error is not FATAL, do we do not return from this function, in normal circumstances. fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
else else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log impala-shell -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
@ -184,7 +184,7 @@ function copydb() {
new_num_of_views_to_retry=0 new_num_of_views_to_retry=0
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words. for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later impala-shell -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"` specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n" echo -e "\nspecific_errors: ${specific_errors}\n"
@ -214,7 +214,7 @@ function copydb() {
previous_num_of_views_to_retry=$new_num_of_views_to_retry previous_num_of_views_to_retry=$new_num_of_views_to_retry
done done
entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`) entities_on_impala=(`impala-shell -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n" echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n"
start_compute_stats_time=$(date +%s) start_compute_stats_time=$(date +%s)
@ -224,9 +224,9 @@ function copydb() {
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines. create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it. if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp". # Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}" impala-shell -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
sleep 1 sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log impala-shell -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"` log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n" echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n"
@ -271,8 +271,7 @@ copydb $MONITOR_DB'_institutions'
copydb $MONITOR_DB'_ris_tail' copydb $MONITOR_DB'_ris_tail'
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other" contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
for i in ${contexts} for i in ${contexts}; do
do tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
copydb ${MONITOR_DB}'_'${tmp} copydb ${MONITOR_DB}'_'${tmp}
done done

View File

@ -6,21 +6,26 @@ then
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder} ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi fi
export HADOOP_USER_NAME=$3
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
function createPDFsAggregated() { function createPDFsAggregated() {
db=$1 db=$1
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table if exists indi_is_result_accessible"; impala-shell -i ${IMPALA_HOSTNAME} -d ${db} -q "drop table if exists indi_is_result_accessible";
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "create table indi_is_result_accessible stored as parquet as impala-shell -i ${IMPALA_HOSTNAME} -d ${db} -q "create table indi_is_result_accessible stored as parquet as
select distinct p.id, coalesce(is_result_accessible, 0) as is_result_accessible from result p select distinct p.id, coalesce(is_result_accessible, 0) as is_result_accessible from result p
left outer join left outer join
(select id, 1 as is_result_accessible from (select pl.* from result r (select id, 1 as is_result_accessible from (select pl.* from result r
join pdfaggregation_i.publication p on r.id=p.id join pdfaggregation_i.publication p on r.id=p.id
join pdfaggregation_i.payload pl on pl.id=p.id join pdfaggregation_i.payload pl on pl.id=p.id
union all union all
select pl.* from result r select pl.* from result r
join pdfaggregation_i.publication p on r.id=p.dedupid join pdfaggregation_i.publication p on r.id=p.dedupid
join pdfaggregation_i.payload pl on pl.id=p.id) foo) tmp on p.id=tmp.id"; join pdfaggregation_i.payload pl on pl.id=p.id) foo)
tmp on p.id=tmp.id";
} }
STATS_DB=$1 STATS_DB=$1
@ -35,8 +40,7 @@ createPDFsAggregated $MONITOR_DB'_institutions'
createPDFsAggregated $MONITOR_DB'_ris_tail' createPDFsAggregated $MONITOR_DB'_ris_tail'
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other" contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
for i in ${contexts} for i in ${contexts}; do
do tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
createPDFsAggregated ${MONITOR_DB}'_'${tmp} createPDFsAggregated ${MONITOR_DB}'_'${tmp}
done done

View File

@ -51,49 +51,6 @@
<artifactId>hadoop-distcp</artifactId> <artifactId>hadoop-distcp</artifactId>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-api</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-common</artifactId>
<exclusions>
<exclusion>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
</exclusion>
<exclusion>
<groupId>saxonica</groupId>
<artifactId>saxon</artifactId>
</exclusion>
<exclusion>
<groupId>saxonica</groupId>
<artifactId>saxon-dom</artifactId>
</exclusion>
<exclusion>
<groupId>jgrapht</groupId>
<artifactId>jgrapht</artifactId>
</exclusion>
<exclusion>
<groupId>net.sf.ehcache</groupId>
<artifactId>ehcache</artifactId>
</exclusion>
<exclusion>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.*</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>apache</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency> <dependency>
<groupId>org.apache.httpcomponents</groupId> <groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId> <artifactId>httpclient</artifactId>

27
pom.xml
View File

@ -440,29 +440,6 @@
<scope>provided</scope> <scope>provided</scope>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-common</artifactId>
<version>${dnet-actionmanager-common.version}</version>
<exclusions>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-api</artifactId>
<version>${dnet-actionmanager-api.version}</version>
<exclusions>
<exclusion>
<groupId>eu.dnetlib</groupId>
<artifactId>cnr-misc-utils</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency> <dependency>
<groupId>eu.dnetlib</groupId> <groupId>eu.dnetlib</groupId>
<artifactId>cnr-rmi-api</artifactId> <artifactId>cnr-rmi-api</artifactId>
@ -960,7 +937,7 @@
<commons.logging.version>1.1.3</commons.logging.version> <commons.logging.version>1.1.3</commons.logging.version>
<commons-validator.version>1.7</commons-validator.version> <commons-validator.version>1.7</commons-validator.version>
<dateparser.version>1.0.7</dateparser.version> <dateparser.version>1.0.7</dateparser.version>
<dhp-schemas.version>[6.1.3-SNAPSHOT]</dhp-schemas.version> <dhp-schemas.version>[7.0.0]</dhp-schemas.version>
<dhp.cdh.version>cdh5.9.2</dhp.cdh.version> <dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
<dhp.commons.lang.version>3.5</dhp.commons.lang.version> <dhp.commons.lang.version>3.5</dhp.commons.lang.version>
<dhp.guava.version>11.0.2</dhp.guava.version> <dhp.guava.version>11.0.2</dhp.guava.version>
@ -969,8 +946,6 @@
<dhp.oozie.version>4.1.0-${dhp.cdh.version}</dhp.oozie.version> <dhp.oozie.version>4.1.0-${dhp.cdh.version}</dhp.oozie.version>
<dhp.site.skip>true</dhp.site.skip> <dhp.site.skip>true</dhp.site.skip>
<dhp.spark.version>2.4.0.cloudera2</dhp.spark.version> <dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version> <dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
<google.gson.version>2.2.2</google.gson.version> <google.gson.version>2.2.2</google.gson.version>
<log4j.version>1.2.17</log4j.version> <log4j.version>1.2.17</log4j.version>