merged from beta
This commit is contained in:
commit
06e3985b77
|
@ -328,7 +328,7 @@ public class MergeUtils {
|
|||
final T merged = mergeOafFields(original, enrich, trust);
|
||||
|
||||
merged.setOriginalId(unionDistinctListOfString(merged.getOriginalId(), enrich.getOriginalId()));
|
||||
merged.setPid(unionDistinctLists(merged.getPid(), enrich.getPid(), trust));
|
||||
merged.setPid(mergeLists(merged.getPid(), enrich.getPid(), trust, MergeUtils::spKeyExtractor, (p1, p2) -> p1));
|
||||
merged.setDateofcollection(LocalDateTime.now().toString());
|
||||
merged
|
||||
.setDateoftransformation(
|
||||
|
@ -464,6 +464,10 @@ public class MergeUtils {
|
|||
merge.setIsInDiamondJournal(booleanOR(merge.getIsInDiamondJournal(), enrich.getIsInDiamondJournal()));
|
||||
merge.setPubliclyFunded(booleanOR(merge.getPubliclyFunded(), enrich.getPubliclyFunded()));
|
||||
|
||||
if (StringUtils.isBlank(merge.getTransformativeAgreement())) {
|
||||
merge.setTransformativeAgreement(enrich.getTransformativeAgreement());
|
||||
}
|
||||
|
||||
return merge;
|
||||
}
|
||||
|
||||
|
@ -655,6 +659,13 @@ public class MergeUtils {
|
|||
return d1;
|
||||
}
|
||||
|
||||
if (StringUtils.contains(d1.getValue(), "null")) {
|
||||
return d2;
|
||||
}
|
||||
if (StringUtils.contains(d2.getValue(), "null")) {
|
||||
return d1;
|
||||
}
|
||||
|
||||
return Stream
|
||||
.of(d1, d2)
|
||||
.min(
|
||||
|
|
|
@ -2,31 +2,41 @@
|
|||
package eu.dnetlib.pace.clustering;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
|
||||
@ClusteringClass("keywordsclustering")
|
||||
public class KeywordsClustering extends AbstractClusteringFunction {
|
||||
@ClusteringClass("legalnameclustering")
|
||||
public class LegalnameClustering extends AbstractClusteringFunction {
|
||||
|
||||
public KeywordsClustering(Map<String, Object> params) {
|
||||
private static final Pattern CITY_CODE_PATTERN = Pattern.compile("city::\\d+");
|
||||
private static final Pattern KEYWORD_CODE_PATTERN = Pattern.compile("key::\\d+");
|
||||
|
||||
public LegalnameClustering(Map<String, Object> params) {
|
||||
super(params);
|
||||
}
|
||||
|
||||
public Set<String> getRegexList(String input, Pattern codeRegex) {
|
||||
Matcher matcher = codeRegex.matcher(input);
|
||||
Set<String> cities = new HashSet<>();
|
||||
while (matcher.find()) {
|
||||
cities.add(matcher.group());
|
||||
}
|
||||
return cities;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Collection<String> doApply(final Config conf, String s) {
|
||||
|
||||
// takes city codes and keywords codes without duplicates
|
||||
Set<String> keywords = getKeywords(s, conf.translationMap(), paramOrDefault("windowSize", 4));
|
||||
Set<String> cities = getCities(s, paramOrDefault("windowSize", 4));
|
||||
|
||||
// list of combination to return as result
|
||||
final Collection<String> combinations = new LinkedHashSet<String>();
|
||||
|
||||
for (String keyword : keywordsToCodes(keywords, conf.translationMap())) {
|
||||
for (String city : citiesToCodes(cities)) {
|
||||
for (String keyword : getRegexList(s, KEYWORD_CODE_PATTERN)) {
|
||||
for (String city : getRegexList(s, CITY_CODE_PATTERN)) {
|
||||
combinations.add(keyword + "-" + city);
|
||||
if (combinations.size() >= paramOrDefault("max", 2)) {
|
||||
return combinations;
|
||||
|
@ -42,9 +52,6 @@ public class KeywordsClustering extends AbstractClusteringFunction {
|
|||
return fields
|
||||
.stream()
|
||||
.filter(f -> !f.isEmpty())
|
||||
.map(KeywordsClustering::cleanup)
|
||||
.map(KeywordsClustering::normalize)
|
||||
.map(s -> filterAllStopWords(s))
|
||||
.map(s -> doApply(conf, s))
|
||||
.map(c -> filterBlacklisted(c, ngramBlacklist))
|
||||
.flatMap(c -> c.stream())
|
|
@ -27,6 +27,14 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
|
|||
private static Map<String, String> cityMap = AbstractPaceFunctions
|
||||
.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
|
||||
|
||||
// keywords map to be used when translating the keyword names into codes
|
||||
private static Map<String, String> keywordMap = AbstractPaceFunctions
|
||||
.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
|
||||
|
||||
// country map to be used when inferring the country from the city name
|
||||
private static Map<String, String> countryMap = AbstractPaceFunctions
|
||||
.loadCountryMapFromClasspath("/eu/dnetlib/pace/config/country_map.csv");
|
||||
|
||||
// list of stopwords in different languages
|
||||
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
|
||||
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
|
||||
|
@ -74,6 +82,64 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
|
|||
return s12;
|
||||
}
|
||||
|
||||
public static String countryInference(final String original, String inferFrom) {
|
||||
if (!original.equalsIgnoreCase("unknown"))
|
||||
return original;
|
||||
|
||||
inferFrom = cleanup(inferFrom);
|
||||
inferFrom = normalize(inferFrom);
|
||||
inferFrom = filterAllStopWords(inferFrom);
|
||||
Set<String> cities = getCities(inferFrom, 4);
|
||||
return citiesToCountry(cities).stream().findFirst().orElse("UNKNOWN");
|
||||
}
|
||||
|
||||
public static String cityInference(String original) {
|
||||
original = cleanup(original);
|
||||
original = normalize(original);
|
||||
original = filterAllStopWords(original);
|
||||
|
||||
Set<String> cities = getCities(original, 4);
|
||||
|
||||
for (String city : cities) {
|
||||
original = original.replaceAll(city, cityMap.get(city));
|
||||
}
|
||||
|
||||
return original;
|
||||
}
|
||||
|
||||
public static String keywordInference(String original) {
|
||||
original = cleanup(original);
|
||||
original = normalize(original);
|
||||
original = filterAllStopWords(original);
|
||||
|
||||
Set<String> keywords = getKeywords(original, keywordMap, 4);
|
||||
|
||||
for (String keyword : keywords) {
|
||||
original = original.replaceAll(keyword, keywordMap.get(keyword));
|
||||
}
|
||||
|
||||
return original;
|
||||
}
|
||||
|
||||
public static String cityKeywordInference(String original) {
|
||||
original = cleanup(original);
|
||||
original = normalize(original);
|
||||
original = filterAllStopWords(original);
|
||||
|
||||
Set<String> keywords = getKeywords(original, keywordMap, 4);
|
||||
Set<String> cities = getCities(original, 4);
|
||||
|
||||
for (String keyword : keywords) {
|
||||
original = original.replaceAll(keyword, keywordMap.get(keyword));
|
||||
}
|
||||
|
||||
for (String city : cities) {
|
||||
original = original.replaceAll(city, cityMap.get(city));
|
||||
}
|
||||
|
||||
return original;
|
||||
}
|
||||
|
||||
protected static String fixXML(final String a) {
|
||||
|
||||
return a
|
||||
|
@ -208,6 +274,30 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
|
|||
return m;
|
||||
}
|
||||
|
||||
public static Map<String, String> loadCountryMapFromClasspath(final String classpath) {
|
||||
|
||||
Transliterator transliterator = Transliterator.getInstance("Any-Eng");
|
||||
|
||||
final Map<String, String> m = new HashMap<>();
|
||||
try {
|
||||
for (final String s : IOUtils
|
||||
.readLines(AbstractPaceFunctions.class.getResourceAsStream(classpath), StandardCharsets.UTF_8)) {
|
||||
// string is like this: country_code;city1;city2;city3
|
||||
String[] line = s.split(";");
|
||||
String value = line[0];
|
||||
for (int i = 1; i < line.length; i++) {
|
||||
String city = fixAliases(transliterator.transliterate(line[i].toLowerCase()));
|
||||
String code = cityMap.get(city);
|
||||
m.put(code, value);
|
||||
}
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
return new HashMap<>();
|
||||
}
|
||||
return m;
|
||||
|
||||
}
|
||||
|
||||
public static String removeKeywords(String s, Set<String> keywords) {
|
||||
|
||||
s = " " + s + " ";
|
||||
|
@ -237,6 +327,10 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
|
|||
return toCodes(keywords, cityMap);
|
||||
}
|
||||
|
||||
public static Set<String> citiesToCountry(Set<String> cities) {
|
||||
return toCodes(toCodes(cities, cityMap), countryMap);
|
||||
}
|
||||
|
||||
protected static String firstLC(final String s) {
|
||||
return StringUtils.substring(s, 0, 1).toLowerCase();
|
||||
}
|
||||
|
|
|
@ -47,9 +47,21 @@ public class FieldDef implements Serializable {
|
|||
|
||||
private String clean;
|
||||
|
||||
private String infer;
|
||||
|
||||
private String inferenceFrom;
|
||||
|
||||
public FieldDef() {
|
||||
}
|
||||
|
||||
public String getInferenceFrom() {
|
||||
return inferenceFrom;
|
||||
}
|
||||
|
||||
public void setInferenceFrom(final String inferenceFrom) {
|
||||
this.inferenceFrom = inferenceFrom;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
@ -126,6 +138,14 @@ public class FieldDef implements Serializable {
|
|||
this.clean = clean;
|
||||
}
|
||||
|
||||
public String getInfer() {
|
||||
return infer;
|
||||
}
|
||||
|
||||
public void setInfer(String infer) {
|
||||
this.infer = infer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
try {
|
||||
|
|
|
@ -123,9 +123,19 @@ case class SparkModel(conf: DedupConfig) {
|
|||
case _ => res(index)
|
||||
}
|
||||
}
|
||||
|
||||
if (StringUtils.isNotBlank(fdef.getInfer)) {
|
||||
val inferFrom : String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
|
||||
res(index) = res(index) match {
|
||||
case x: Seq[String] => x.map(inference(_, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer))
|
||||
case _ => inference(res(index).toString, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
res
|
||||
|
||||
}
|
||||
|
||||
new GenericRowWithSchema(values, schema)
|
||||
|
@ -146,5 +156,17 @@ case class SparkModel(conf: DedupConfig) {
|
|||
res
|
||||
}
|
||||
|
||||
/** Applies the inference selected by `infertype` to `value`.
  *
  * "country" infers the country from `inferfrom`; "city", "keyword" and "city_keyword" rewrite
  * `value` itself. Any other type leaves `value` unchanged.
  */
def inference(value: String, inferfrom: String, infertype: String): String =
  infertype match {
    case "country"      => AbstractPaceFunctions.countryInference(value, inferfrom)
    case "city"         => AbstractPaceFunctions.cityInference(value)
    case "keyword"      => AbstractPaceFunctions.keywordInference(value)
    case "city_keyword" => AbstractPaceFunctions.cityKeywordInference(value)
    case _              => value
  }
|
||||
|
||||
}
|
||||
|
||||
|
|
|
@ -1,48 +0,0 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("cityMatch")
|
||||
public class CityMatch extends AbstractStringComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
|
||||
public CityMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
|
||||
Set<String> codes1 = citiesToCodes(cities1);
|
||||
Set<String> codes2 = citiesToCodes(cities2);
|
||||
|
||||
// if no cities are detected, the comparator gives 1.0
|
||||
if (codes1.isEmpty() && codes2.isEmpty())
|
||||
return 1.0;
|
||||
else {
|
||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||
return -1; // undefined if one of the two has no cities
|
||||
return commonElementsPercentage(codes1, codes2);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("codeMatch")
|
||||
public class CodeMatch extends AbstractStringComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
|
||||
private Pattern CODE_REGEX;
|
||||
|
||||
public CodeMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
this.CODE_REGEX = Pattern.compile(params.getOrDefault("codeRegex", "[a-zA-Z]::\\d+"));
|
||||
}
|
||||
|
||||
public Set<String> getRegexList(String input) {
|
||||
Matcher matcher = this.CODE_REGEX.matcher(input);
|
||||
Set<String> cities = new HashSet<>();
|
||||
while (matcher.find()) {
|
||||
cities.add(matcher.group());
|
||||
}
|
||||
return cities;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
|
||||
Set<String> codes1 = getRegexList(a);
|
||||
Set<String> codes2 = getRegexList(b);
|
||||
|
||||
// if no codes are detected, the comparator gives 1.0
|
||||
if (codes1.isEmpty() && codes2.isEmpty())
|
||||
return 1.0;
|
||||
else {
|
||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||
return -1; // undefined if one of the two has no codes
|
||||
return commonElementsPercentage(codes1, codes2);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("countryMatch")
|
||||
public class CountryMatch extends AbstractStringComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
|
||||
public CountryMatch(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public CountryMatch(final double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected CountryMatch(final double weight, final AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
|
||||
if (a.isEmpty() || b.isEmpty()) {
|
||||
return -1.0; // return -1 if a field is missing
|
||||
}
|
||||
if (a.equalsIgnoreCase("unknown") || b.equalsIgnoreCase("unknown")) {
|
||||
return -1.0; // return -1 if a country is UNKNOWN
|
||||
}
|
||||
|
||||
return a.equals(b) ? 1.0 : 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(final double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,59 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("jaroWinklerLegalname")
|
||||
public class JaroWinklerLegalname extends AbstractStringComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
|
||||
private final String CITY_CODE_REGEX = "city::\\d+";
|
||||
private final String KEYWORD_CODE_REGEX = "key::\\d+";
|
||||
|
||||
public JaroWinklerLegalname(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public JaroWinklerLegalname(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinklerLegalname(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
|
||||
String ca = a.replaceAll(CITY_CODE_REGEX, "").replaceAll(KEYWORD_CODE_REGEX, " ");
|
||||
String cb = b.replaceAll(CITY_CODE_REGEX, "").replaceAll(KEYWORD_CODE_REGEX, " ");
|
||||
|
||||
ca = ca.replaceAll("[ ]{2,}", " ");
|
||||
cb = cb.replaceAll("[ ]{2,}", " ");
|
||||
|
||||
if (ca.isEmpty() && cb.isEmpty())
|
||||
return 1.0;
|
||||
else
|
||||
return normalize(ssalgo.score(ca, cb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,74 +0,0 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import com.wcohen.ss.AbstractStringDistance;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("jaroWinklerNormalizedName")
|
||||
public class JaroWinklerNormalizedName extends AbstractStringComparator {
|
||||
|
||||
private Map<String, String> params;
|
||||
|
||||
public JaroWinklerNormalizedName(Map<String, String> params) {
|
||||
super(params, new com.wcohen.ss.JaroWinkler());
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public JaroWinklerNormalizedName(double weight) {
|
||||
super(weight, new com.wcohen.ss.JaroWinkler());
|
||||
}
|
||||
|
||||
protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
|
||||
super(weight, ssalgo);
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(String a, String b, final Config conf) {
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
Set<String> keywords1 = getKeywords(
|
||||
ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> keywords2 = getKeywords(
|
||||
cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
|
||||
Set<String> cities1 = getCities(ca, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> cities2 = getCities(cb, Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
|
||||
ca = removeKeywords(ca, keywords1);
|
||||
ca = removeKeywords(ca, cities1);
|
||||
cb = removeKeywords(cb, keywords2);
|
||||
cb = removeKeywords(cb, cities2);
|
||||
|
||||
ca = ca.replaceAll("[ ]{2,}", " ");
|
||||
cb = cb.replaceAll("[ ]{2,}", " ");
|
||||
|
||||
if (ca.isEmpty() && cb.isEmpty())
|
||||
return 1.0;
|
||||
else
|
||||
return normalize(ssalgo.score(ca, cb));
|
||||
}
|
||||
|
||||
@Override
|
||||
public double getWeight() {
|
||||
return super.weight;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double normalize(double d) {
|
||||
return d;
|
||||
}
|
||||
|
||||
}
|
|
@ -1,50 +0,0 @@
|
|||
|
||||
package eu.dnetlib.pace.tree;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import eu.dnetlib.pace.config.Config;
|
||||
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
|
||||
import eu.dnetlib.pace.tree.support.ComparatorClass;
|
||||
|
||||
@ComparatorClass("keywordMatch")
|
||||
public class KeywordMatch extends AbstractStringComparator {
|
||||
|
||||
Map<String, String> params;
|
||||
|
||||
public KeywordMatch(Map<String, String> params) {
|
||||
super(params);
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
@Override
|
||||
public double distance(final String a, final String b, final Config conf) {
|
||||
|
||||
String ca = cleanup(a);
|
||||
String cb = cleanup(b);
|
||||
|
||||
ca = normalize(ca);
|
||||
cb = normalize(cb);
|
||||
|
||||
ca = filterAllStopWords(ca);
|
||||
cb = filterAllStopWords(cb);
|
||||
|
||||
Set<String> keywords1 = getKeywords(
|
||||
ca, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
Set<String> keywords2 = getKeywords(
|
||||
cb, conf.translationMap(), Integer.parseInt(params.getOrDefault("windowSize", "4")));
|
||||
|
||||
Set<String> codes1 = toCodes(keywords1, conf.translationMap());
|
||||
Set<String> codes2 = toCodes(keywords2, conf.translationMap());
|
||||
|
||||
// if no cities are detected, the comparator gives 1.0
|
||||
if (codes1.isEmpty() && codes2.isEmpty())
|
||||
return 1.0;
|
||||
else {
|
||||
if (codes1.isEmpty() ^ codes2.isEmpty())
|
||||
return -1.0; // undefined if one of the two has no keywords
|
||||
return commonElementsPercentage(codes1, codes2);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -48,7 +48,7 @@ public class TreeNodeDef implements Serializable {
|
|||
// function for the evaluation of the node
|
||||
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
|
||||
|
||||
TreeNodeStats stats = new TreeNodeStats();
|
||||
TreeNodeStats stats = new TreeNodeStats(ignoreUndefined);
|
||||
|
||||
// for each field in the node, it computes the
|
||||
for (FieldConf fieldConf : fields) {
|
||||
|
|
|
@ -9,8 +9,11 @@ public class TreeNodeStats implements Serializable {
|
|||
|
||||
private Map<String, FieldStats> results; // this is an accumulator for the results of the node
|
||||
|
||||
public TreeNodeStats() {
|
||||
private final boolean ignoreUndefined;
|
||||
|
||||
public TreeNodeStats(boolean ignoreUndefined) {
|
||||
this.results = new HashMap<>();
|
||||
this.ignoreUndefined = ignoreUndefined;
|
||||
}
|
||||
|
||||
public Map<String, FieldStats> getResults() {
|
||||
|
@ -22,7 +25,10 @@ public class TreeNodeStats implements Serializable {
|
|||
}
|
||||
|
||||
public int fieldsCount() {
|
||||
return this.results.size();
|
||||
if (ignoreUndefined)
|
||||
return this.results.size();
|
||||
else
|
||||
return this.results.size() - undefinedCount(); // do not count undefined
|
||||
}
|
||||
|
||||
public int undefinedCount() {
|
||||
|
@ -78,11 +84,22 @@ public class TreeNodeStats implements Serializable {
|
|||
double min = 100.0; // random high value
|
||||
for (FieldStats fs : this.results.values()) {
|
||||
if (fs.getResult() < min) {
|
||||
if (fs.getResult() >= 0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined()))
|
||||
if (fs.getResult() == -1) {
|
||||
if (fs.isCountIfUndefined()) {
|
||||
min = 0.0;
|
||||
} else {
|
||||
min = -1;
|
||||
}
|
||||
} else {
|
||||
min = fs.getResult();
|
||||
}
|
||||
}
|
||||
}
|
||||
return min;
|
||||
if (ignoreUndefined) {
|
||||
return min == -1.0 ? 0.0 : min;
|
||||
} else {
|
||||
return min;
|
||||
}
|
||||
}
|
||||
|
||||
// if at least one is true, return 1.0
|
||||
|
@ -91,7 +108,11 @@ public class TreeNodeStats implements Serializable {
|
|||
if (fieldStats.getResult() >= fieldStats.getThreshold())
|
||||
return 1.0;
|
||||
}
|
||||
return 0.0;
|
||||
if (!ignoreUndefined && undefinedCount() > 0) {
|
||||
return -1.0;
|
||||
} else {
|
||||
return 0.0;
|
||||
}
|
||||
}
|
||||
|
||||
// if at least one is false, return 0.0
|
||||
|
@ -100,7 +121,7 @@ public class TreeNodeStats implements Serializable {
|
|||
|
||||
if (fieldStats.getResult() == -1) {
|
||||
if (fieldStats.isCountIfUndefined())
|
||||
return 0.0;
|
||||
return ignoreUndefined ? 0.0 : -1.0;
|
||||
} else {
|
||||
if (fieldStats.getResult() < fieldStats.getThreshold())
|
||||
return 0.0;
|
||||
|
|
|
@ -44,12 +44,10 @@ public class TreeProcessor {
|
|||
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
|
||||
treeStats.addNodeStats(nextNodeName, stats);
|
||||
|
||||
// if ignoreUndefined=false the miss is considered as undefined
|
||||
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
|
||||
double finalScore = stats.getFinalScore(currentNode.getAggregation());
|
||||
if (finalScore == -1.0)
|
||||
nextNodeName = currentNode.getUndefined();
|
||||
}
|
||||
// if ignoreUndefined=true the miss is ignored and the score computed anyway
|
||||
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
|
||||
else if (finalScore >= currentNode.getThreshold()) {
|
||||
nextNodeName = currentNode.getPositive();
|
||||
} else {
|
||||
nextNodeName = currentNode.getNegative();
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -8,6 +8,7 @@ import org.junit.jupiter.api.Test;
|
|||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.mongodb.connection.Cluster;
|
||||
|
||||
import eu.dnetlib.pace.AbstractPaceTest;
|
||||
import eu.dnetlib.pace.common.AbstractPaceFunctions;
|
||||
|
@ -177,41 +178,16 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testKeywordsClustering() {
|
||||
public void legalnameClustering() {
|
||||
|
||||
final ClusteringFunction cf = new KeywordsClustering(params);
|
||||
final String s = "Polytechnic University of Turin";
|
||||
final ClusteringFunction cf = new LegalnameClustering(params);
|
||||
String s = "key::1 key::2 city::1";
|
||||
System.out.println(s);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
|
||||
|
||||
final String s1 = "POLITECNICO DI TORINO";
|
||||
System.out.println(s1);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s1)));
|
||||
|
||||
final String s2 = "Universita farmaceutica culturale di milano bergamo";
|
||||
System.out.println("s2 = " + s2);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s2)));
|
||||
|
||||
final String s3 = "universita universita milano milano";
|
||||
System.out.println("s3 = " + s3);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s3)));
|
||||
|
||||
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
|
||||
System.out.println("s4 = " + s4);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s4)));
|
||||
|
||||
final String s5 = "İstanbul Ticarət Universiteti";
|
||||
System.out.println("s5 = " + s5);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s5)));
|
||||
|
||||
final String s6 = "National and Kapodistrian University of Athens";
|
||||
System.out.println("s6 = " + s6);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s6)));
|
||||
|
||||
final String s7 = "Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών";
|
||||
System.out.println("s7 = " + s7);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s7)));
|
||||
|
||||
s = "key::1 key::2 city::1 city::2";
|
||||
System.out.println(s);
|
||||
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -54,4 +54,47 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
|
|||
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void countryInferenceTest() {
|
||||
assertEquals("IT", countryInference("UNKNOWN", "Università di Bologna"));
|
||||
assertEquals("UK", countryInference("UK", "Università di Bologna"));
|
||||
assertEquals("IT", countryInference("UNKNOWN", "Universiteé de Naples"));
|
||||
assertEquals("UNKNOWN", countryInference("UNKNOWN", "Università del Lavoro"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void cityInferenceTest() {
|
||||
assertEquals("universita city::3181928", cityInference("Università di Bologna"));
|
||||
assertEquals("university city::3170647", cityInference("University of Pisa"));
|
||||
assertEquals("universita", cityInference("Università del lavoro"));
|
||||
assertEquals("universita city::3173331 city::3169522", cityInference("Università di Modena e Reggio Emilia"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void keywordInferenceTest() {
|
||||
assertEquals("key::41 turin", keywordInference("Polytechnic University of Turin"));
|
||||
assertEquals("key::41 torino", keywordInference("POLITECNICO DI TORINO"));
|
||||
assertEquals(
|
||||
"key::1 key::60 key::81 milano bergamo",
|
||||
keywordInference("Universita farmaceutica culturale di milano bergamo"));
|
||||
assertEquals("key::1 key::1 milano milano", keywordInference("universita universita milano milano"));
|
||||
assertEquals(
|
||||
"key::10 kapodistriako panepistemio athenon",
|
||||
keywordInference("Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void cityKeywordInferenceTest() {
|
||||
assertEquals("key::41 city::3165524", cityKeywordInference("Polytechnic University of Turin"));
|
||||
assertEquals("key::41 city::3165524", cityKeywordInference("POLITECNICO DI TORINO"));
|
||||
assertEquals(
|
||||
"key::1 key::60 key::81 city::3173435 city::3182164",
|
||||
cityKeywordInference("Universita farmaceutica culturale di milano bergamo"));
|
||||
assertEquals(
|
||||
"key::1 key::1 city::3173435 city::3173435", cityKeywordInference("universita universita milano milano"));
|
||||
assertEquals(
|
||||
"key::10 kapodistriako panepistemio city::264371",
|
||||
cityKeywordInference("Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών"));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -35,6 +35,7 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
params.put("name_th", "0.95");
|
||||
params.put("jpath_value", "$.value");
|
||||
params.put("jpath_classid", "$.qualifier.classid");
|
||||
params.put("codeRegex", "key::\\d+");
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -44,52 +45,23 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void cityMatchTest() {
|
||||
final CityMatch cityMatch = new CityMatch(params);
|
||||
public void codeMatchTest() {
|
||||
CodeMatch codeMatch = new CodeMatch(params);
|
||||
|
||||
// both names with no cities
|
||||
assertEquals(1.0, cityMatch.distance("Università", "Centro di ricerca", conf));
|
||||
// both names with no codes
|
||||
assertEquals(1.0, codeMatch.distance("testing1", "testing2", conf));
|
||||
|
||||
// one of the two names with no cities
|
||||
assertEquals(-1.0, cityMatch.distance("Università di Bologna", "Centro di ricerca", conf));
|
||||
// one of the two names with no codes
|
||||
assertEquals(-1.0, codeMatch.distance("testing1 key::1", "testing", conf));
|
||||
|
||||
// both names with cities (same)
|
||||
assertEquals(1.0, cityMatch.distance("Universita di Bologna", "Biblioteca di Bologna", conf));
|
||||
// both names with codes (same)
|
||||
assertEquals(1.0, codeMatch.distance("testing1 key::1", "testing2 key::1", conf));
|
||||
|
||||
// both names with cities (different)
|
||||
assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf));
|
||||
assertEquals(0.0, cityMatch.distance("Franklin College", "Concordia College", conf));
|
||||
// both names with codes (different)
|
||||
assertEquals(0.0, codeMatch.distance("testing1 key::1", "testing2 key::2", conf));
|
||||
|
||||
// particular cases
|
||||
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
|
||||
assertEquals(
|
||||
1.0,
|
||||
cityMatch
|
||||
.distance(
|
||||
"Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology",
|
||||
conf));
|
||||
|
||||
// failing because 'Allen' is a transliterated Greek stopword
|
||||
// assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
|
||||
assertEquals(-1.0, cityMatch.distance("Washington (United States)", "United States Military Academy", conf));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void keywordMatchTest() {
|
||||
params.put("threshold", "0.5");
|
||||
|
||||
final KeywordMatch keywordMatch = new KeywordMatch(params);
|
||||
|
||||
assertEquals(
|
||||
0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
|
||||
assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf));
|
||||
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
|
||||
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
|
||||
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
|
||||
assertEquals(2.0 / 3.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
|
||||
assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf));
|
||||
assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf));
|
||||
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
|
||||
// both names with codes (1 same, 1 different)
|
||||
assertEquals(0.5, codeMatch.distance("key::1 key::2 testing1", "key::1 testing", conf));
|
||||
|
||||
}
|
||||
|
||||
|
@ -155,15 +127,15 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void jaroWinklerNormalizedNameTest() {
|
||||
public void jaroWinklerLegalnameTest() {
|
||||
|
||||
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
|
||||
final JaroWinklerLegalname jaroWinklerLegalname = new JaroWinklerLegalname(params);
|
||||
|
||||
double result = jaroWinklerNormalizedName
|
||||
.distance("AT&T (United States)", "United States Military Academy", conf);
|
||||
double result = jaroWinklerLegalname
|
||||
.distance("AT&T (United States)", "United States key::2 key::1", conf);
|
||||
System.out.println("result = " + result);
|
||||
|
||||
result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
|
||||
result = jaroWinklerLegalname.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
|
||||
System.out.println("result = " + result);
|
||||
|
||||
}
|
||||
|
@ -336,4 +308,23 @@ public class ComparatorTest extends AbstractPaceTest {
|
|||
System.out.println("compare = " + compare);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void countryMatch() {
|
||||
|
||||
CountryMatch countryMatch = new CountryMatch(params);
|
||||
|
||||
double result = countryMatch.distance("UNKNOWN", "UNKNOWN", conf);
|
||||
assertEquals(-1.0, result);
|
||||
|
||||
result = countryMatch.distance("CL", "UNKNOWN", conf);
|
||||
assertEquals(-1.0, result);
|
||||
|
||||
result = countryMatch.distance("CL", "IT", conf);
|
||||
assertEquals(0.0, result);
|
||||
|
||||
result = countryMatch.distance("CL", "CL", conf);
|
||||
assertEquals(1.0, result);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -51,48 +51,5 @@
|
|||
<artifactId>hadoop-distcp</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-actionmanager-api</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-actionmanager-common</artifactId>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>saxonica</groupId>
|
||||
<artifactId>saxon</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>saxonica</groupId>
|
||||
<artifactId>saxon-dom</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>jgrapht</groupId>
|
||||
<artifactId>jgrapht</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>net.sf.ehcache</groupId>
|
||||
<artifactId>ehcache</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.springframework</groupId>
|
||||
<artifactId>spring-test</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.apache.*</groupId>
|
||||
<artifactId>*</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>apache</groupId>
|
||||
<artifactId>*</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
</project>
|
||||
|
|
|
@ -4,7 +4,6 @@ package eu.dnetlib.dhp.actionmanager;
|
|||
import java.io.Serializable;
|
||||
import java.io.StringReader;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
@ -22,7 +21,6 @@ import com.google.common.base.Splitter;
|
|||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.actionmanager.rmi.ActionManagerException;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
@ -65,7 +63,7 @@ public class ISClient implements Serializable {
|
|||
.map(t -> buildDirectory(basePath, t))
|
||||
.collect(Collectors.toList()))
|
||||
.orElseThrow(() -> new IllegalStateException("empty set list"));
|
||||
} catch (ActionManagerException | ISLookUpException e) {
|
||||
} catch (ISLookUpException e) {
|
||||
throw new IllegalStateException("unable to query ActionSets info from the IS");
|
||||
}
|
||||
}
|
||||
|
@ -89,31 +87,18 @@ public class ISClient implements Serializable {
|
|||
return Joiner.on("/").join(basePath, t.getMiddle(), t.getRight());
|
||||
}
|
||||
|
||||
private String getBasePathHDFS(ISLookUpService isLookup) throws ActionManagerException {
|
||||
private String getBasePathHDFS(ISLookUpService isLookup) throws ISLookUpException {
|
||||
return queryServiceProperty(isLookup, "basePath");
|
||||
}
|
||||
|
||||
private String queryServiceProperty(ISLookUpService isLookup, final String propertyName)
|
||||
throws ActionManagerException {
|
||||
throws ISLookUpException {
|
||||
final String q = "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='"
|
||||
+ propertyName
|
||||
+ "']/@value/string()";
|
||||
log.debug("quering for service property: {}", q);
|
||||
try {
|
||||
final List<String> value = isLookup.quickSearchProfile(q);
|
||||
return Iterables.getOnlyElement(value);
|
||||
} catch (ISLookUpException e) {
|
||||
String msg = "Error accessing service profile, using query: " + q;
|
||||
log.error(msg, e);
|
||||
throw new ActionManagerException(msg, e);
|
||||
} catch (NoSuchElementException e) {
|
||||
String msg = "missing service property: " + propertyName;
|
||||
log.error(msg, e);
|
||||
throw new ActionManagerException(msg, e);
|
||||
} catch (IllegalArgumentException e) {
|
||||
String msg = "found more than one service property: " + propertyName;
|
||||
log.error(msg, e);
|
||||
throw new ActionManagerException(msg, e);
|
||||
}
|
||||
|
||||
final List<String> value = isLookup.quickSearchProfile(q);
|
||||
return Iterables.getOnlyElement(value);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -42,6 +42,9 @@ public class Constants {
|
|||
public static final String NULL = "NULL";
|
||||
public static final String NA = "N/A";
|
||||
|
||||
public static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
|
||||
public static final String WEB_CRAWL_NAME = "Web Crawl";
|
||||
|
||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private Constants() {
|
||||
|
|
|
@ -41,9 +41,9 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelations.class);
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
private static final String ID_PREFIX = "50|doi_________::";
|
||||
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:bipinference";
|
||||
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by BIP!";
|
||||
public static final String BIP_INFERENCE_PROVENANCE = "bip:affiliation:crossref";
|
||||
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference";
|
||||
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE";
|
||||
public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation";
|
||||
|
||||
public static <I extends Result> void main(String[] args) throws Exception {
|
||||
|
||||
|
@ -71,6 +71,9 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
final String dataciteInputPath = parser.get("dataciteInputPath");
|
||||
log.info("dataciteInputPath: {}", dataciteInputPath);
|
||||
|
||||
final String webcrawlInputPath = parser.get("webCrawlInputPath");
|
||||
log.info("webcrawlInputPath: {}", webcrawlInputPath);
|
||||
|
||||
final String outputPath = parser.get("outputPath");
|
||||
log.info("outputPath: {}", outputPath);
|
||||
|
||||
|
@ -102,10 +105,16 @@ public class PrepareAffiliationRelations implements Serializable {
|
|||
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
|
||||
spark, dataciteInputPath, collectedFromDatacite);
|
||||
|
||||
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
|
||||
.listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME);
|
||||
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
|
||||
spark, webcrawlInputPath, collectedFromWebCrawl);
|
||||
|
||||
crossrefRelations
|
||||
.union(pubmedRelations)
|
||||
.union(openAPCRelations)
|
||||
.union(dataciteRelations)
|
||||
.union(webCrawlRelations)
|
||||
.saveAsHadoopFile(
|
||||
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);
|
||||
|
||||
|
|
|
@ -5,7 +5,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
|||
|
||||
import java.io.Serializable;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
|
@ -21,6 +20,7 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.actionmanager.Constants;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
|
@ -44,8 +44,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
|||
private static final String PMID_PREFIX = "50|pmid________::";
|
||||
|
||||
private static final String PMCID_PREFIX = "50|pmc_________::";
|
||||
private static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
|
||||
private static final String WEB_CRAWL_NAME = "Web Crawl";
|
||||
|
||||
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
|
@ -104,8 +103,6 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
|||
final String ror = ROR_PREFIX
|
||||
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
|
||||
ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
|
||||
// ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
|
||||
// ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
|
||||
|
||||
return ret
|
||||
.iterator();
|
||||
|
@ -145,11 +142,6 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
|||
"institution.country_code as country_code", "publication_year")
|
||||
.distinct();
|
||||
|
||||
// .selectExpr(
|
||||
// "id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
|
||||
// "institution.country_code as country_code", "publication_year")
|
||||
// .distinct();
|
||||
|
||||
}
|
||||
|
||||
private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {
|
||||
|
@ -220,7 +212,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
|||
ModelConstants.IS_AUTHOR_INSTITUTION_OF,
|
||||
Arrays
|
||||
.asList(
|
||||
OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
|
||||
OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)),
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false, null, false, false,
|
||||
|
@ -239,7 +231,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
|
|||
ModelConstants.HAS_AUTHOR_INSTITUTION,
|
||||
Arrays
|
||||
.asList(
|
||||
OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
|
||||
OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)),
|
||||
OafMapperUtils
|
||||
.dataInfo(
|
||||
false, null, false, false,
|
||||
|
|
|
@ -0,0 +1,76 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.researchfi;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Spliterator;
|
||||
import java.util.Spliterators;
|
||||
import java.util.stream.Stream;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.http.NameValuePair;
|
||||
import org.apache.http.client.entity.UrlEncodedFormEntity;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpPost;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.apache.http.message.BasicNameValuePair;
|
||||
import org.json.JSONObject;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
|
||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
|
||||
public class ResearchFiCollectorPlugin implements CollectorPlugin {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(ResearchFiCollectorPlugin.class);
|
||||
|
||||
@Override
|
||||
public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report)
|
||||
throws CollectorException {
|
||||
|
||||
final String authUrl = api.getParams().get("auth_url");
|
||||
final String clientId = api.getParams().get("auth_client_id");
|
||||
final String clientSecret = api.getParams().get("auth_client_secret");
|
||||
|
||||
final String authToken = authenticate(authUrl, clientId, clientSecret);
|
||||
|
||||
final Iterator<String> iter = new ResearchFiIterator(api.getBaseUrl(), authToken);
|
||||
|
||||
return StreamSupport.stream(Spliterators.spliteratorUnknownSize(iter, Spliterator.ORDERED), false);
|
||||
}
|
||||
|
||||
private String authenticate(final String authUrl, final String clientId, final String clientSecret)
|
||||
throws CollectorException {
|
||||
try (final CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
final HttpPost req = new HttpPost(authUrl);
|
||||
final List<NameValuePair> params = new ArrayList<>();
|
||||
params.add(new BasicNameValuePair("grant_type", "client_credentials"));
|
||||
params.add(new BasicNameValuePair("client_id", clientId));
|
||||
params.add(new BasicNameValuePair("client_secret", clientSecret));
|
||||
|
||||
req.setEntity(new UrlEncodedFormEntity(params, "UTF-8"));
|
||||
|
||||
try (final CloseableHttpResponse response = client.execute(req)) {
|
||||
final String content = IOUtils.toString(response.getEntity().getContent());
|
||||
final JSONObject obj = new JSONObject(content);
|
||||
final String token = obj.getString("access_token");
|
||||
if (StringUtils.isNotBlank(token)) {
|
||||
return token;
|
||||
}
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
log.warn("Error obtaining access token", e);
|
||||
throw new CollectorException("Error obtaining access token", e);
|
||||
}
|
||||
throw new CollectorException("Access token is missing");
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,117 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.researchfi;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Queue;
|
||||
import java.util.concurrent.PriorityBlockingQueue;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.math.NumberUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.http.Header;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
import org.json.JSONArray;
|
||||
|
||||
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
|
||||
public class ResearchFiIterator implements Iterator<String> {
|
||||
|
||||
private static final Log log = LogFactory.getLog(ResearchFiIterator.class);
|
||||
|
||||
private static final int PAGE_SIZE = 100;
|
||||
|
||||
private final String baseUrl;
|
||||
private final String authToken;
|
||||
private int currPage;
|
||||
private int nPages;
|
||||
|
||||
private final Queue<String> queue = new PriorityBlockingQueue<>();
|
||||
|
||||
public ResearchFiIterator(final String baseUrl, final String authToken) {
|
||||
this.baseUrl = baseUrl;
|
||||
this.authToken = authToken;
|
||||
this.currPage = 0;
|
||||
this.nPages = 0;
|
||||
}
|
||||
|
||||
private void verifyStarted() {
|
||||
if (this.currPage == 0) {
|
||||
try {
|
||||
nextCall();
|
||||
} catch (final CollectorException e) {
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
synchronized (this.queue) {
|
||||
verifyStarted();
|
||||
return !this.queue.isEmpty();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String next() {
|
||||
synchronized (this.queue) {
|
||||
verifyStarted();
|
||||
final String res = this.queue.poll();
|
||||
while (this.queue.isEmpty() && (this.currPage < this.nPages)) {
|
||||
try {
|
||||
nextCall();
|
||||
} catch (final CollectorException e) {
|
||||
throw new IllegalStateException(e);
|
||||
}
|
||||
}
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
||||
private void nextCall() throws CollectorException {
|
||||
|
||||
this.currPage += 1;
|
||||
|
||||
final String url;
|
||||
if (!this.baseUrl.contains("?")) {
|
||||
url = String.format("%s?PageNumber=%d&PageSize=%d", this.baseUrl, this.currPage, PAGE_SIZE);
|
||||
} else if (!this.baseUrl.contains("PageSize=")) {
|
||||
url = String.format("%s&PageNumber=%d&PageSize=%d", this.baseUrl, this.currPage, PAGE_SIZE);
|
||||
} else {
|
||||
url = String.format("%s&PageNumber=%d", this.baseUrl, this.currPage);
|
||||
}
|
||||
log.info("Calling url: " + url);
|
||||
|
||||
try (final CloseableHttpClient client = HttpClients.createDefault()) {
|
||||
|
||||
final HttpGet req = new HttpGet(url);
|
||||
req.addHeader("Authorization", "Bearer " + this.authToken);
|
||||
try (final CloseableHttpResponse response = client.execute(req)) {
|
||||
for (final Header header : response.getAllHeaders()) {
|
||||
log.debug("HEADER: " + header.getName() + " = " + header.getValue());
|
||||
if ("x-page-count".equals(header.getName())) {
|
||||
final int totalPages = NumberUtils.toInt(header.getValue());
|
||||
if (this.nPages != totalPages) {
|
||||
this.nPages = NumberUtils.toInt(header.getValue());
|
||||
log.info("Total pages: " + totalPages);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
final String content = IOUtils.toString(response.getEntity().getContent());
|
||||
final JSONArray jsonArray = new JSONArray(content);
|
||||
|
||||
jsonArray.forEach(obj -> this.queue.add(JsonUtils.convertToXML(obj.toString())));
|
||||
}
|
||||
} catch (final Throwable e) {
|
||||
log.warn("Error calling url: " + url, e);
|
||||
throw new CollectorException("Error calling url: " + url, e);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -28,7 +28,13 @@
|
|||
"paramLongName": "dataciteInputPath",
|
||||
"paramDescription": "the path to get the input data from Datacite",
|
||||
"paramRequired": true
|
||||
},
|
||||
},{
|
||||
"paramName": "wip",
|
||||
"paramLongName": "webCrawlInputPath",
|
||||
"paramDescription": "the path to get the input data from Web Crawl",
|
||||
"paramRequired": true
|
||||
}
|
||||
,
|
||||
{
|
||||
"paramName": "o",
|
||||
"paramLongName": "outputPath",
|
||||
|
|
|
@ -17,6 +17,10 @@
|
|||
<name>dataciteInputPath</name>
|
||||
<description>the path where to find the inferred affiliation relations from Datacite</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>webCrawlInputPath</name>
|
||||
<description>the path where to find the inferred affiliation relations from webCrawl</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>outputPath</name>
|
||||
<description>the path where to store the actionset</description>
|
||||
|
@ -112,7 +116,7 @@
|
|||
<arg>--pubmedInputPath</arg><arg>${pubmedInputPath}</arg>
|
||||
<arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
|
||||
<arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
|
||||
|
||||
<arg>--webCrawlInputPath</arg><arg>${webCrawlInputPath}</arg>
|
||||
<arg>--outputPath</arg><arg>${outputPath}</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
|
|
|
@ -1,10 +1,5 @@
|
|||
[
|
||||
{
|
||||
"id": "100007630",
|
||||
"uri": "http://dx.doi.org/10.13039/100007630",
|
||||
"name": "College of Engineering and Informatics, National University of Ireland, Galway",
|
||||
"synonym": []
|
||||
},
|
||||
|
||||
{
|
||||
"id": "100007731",
|
||||
"uri": "http://dx.doi.org/10.13039/100007731",
|
||||
|
@ -432,13 +427,13 @@
|
|||
"id": "501100001634",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001634",
|
||||
"name": "University of Galway",
|
||||
"synonym": []
|
||||
"synonym": ["501100019905", "100007630", "501100020570", "501100023852"]
|
||||
},
|
||||
{
|
||||
"id": "501100001635",
|
||||
"uri": "http://dx.doi.org/10.13039/501100001635",
|
||||
"name": "University of Limerick",
|
||||
"synonym": []
|
||||
"synonym": ["501100014531"]
|
||||
},
|
||||
{
|
||||
"id": "501100001636",
|
||||
|
@ -468,7 +463,7 @@
|
|||
"id": "501100002736",
|
||||
"uri": "http://dx.doi.org/10.13039/501100002736",
|
||||
"name": "Covidien",
|
||||
"synonym": []
|
||||
"synonym": ["501100003956"]
|
||||
},
|
||||
{
|
||||
"id": "501100002755",
|
||||
|
@ -518,12 +513,6 @@
|
|||
"name": "Irish Institute of Clinical Neuroscience",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100003956",
|
||||
"uri": "http://dx.doi.org/10.13039/501100003956",
|
||||
"name": "Aspect Medical Systems",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100004162",
|
||||
"uri": "http://dx.doi.org/10.13039/501100004162",
|
||||
|
@ -644,12 +633,7 @@
|
|||
"name": "Irish Centre for High-End Computing",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100019905",
|
||||
"uri": "http://dx.doi.org/10.13039/501100019905",
|
||||
"name": "Galway University Foundation",
|
||||
"synonym": []
|
||||
},
|
||||
|
||||
{
|
||||
"id": "501100020036",
|
||||
"uri": "http://dx.doi.org/10.13039/501100020036",
|
||||
|
@ -824,12 +808,7 @@
|
|||
"name": "Energy Policy Research Centre, Economic and Social Research Institute",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100014531",
|
||||
"uri": "http://dx.doi.org/10.13039/501100014531",
|
||||
"name": "Physical Education and Sport Sciences Department, University of Limerick",
|
||||
"synonym": []
|
||||
},
|
||||
|
||||
{
|
||||
"id": "501100014745",
|
||||
"uri": "http://dx.doi.org/10.13039/501100014745",
|
||||
|
@ -842,22 +821,11 @@
|
|||
"name": "ADAPT - Centre for Digital Content Technology",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100020570",
|
||||
"uri": "http://dx.doi.org/10.13039/501100020570",
|
||||
"name": "College of Medicine, Nursing and Health Sciences, National University of Ireland, Galway",
|
||||
"synonym": []
|
||||
},
|
||||
|
||||
{
|
||||
"id": "501100020871",
|
||||
"uri": "http://dx.doi.org/10.13039/501100020871",
|
||||
"name": "Bernal Institute, University of Limerick",
|
||||
"synonym": []
|
||||
},
|
||||
{
|
||||
"id": "501100023852",
|
||||
"uri": "http://dx.doi.org/10.13039/501100023852",
|
||||
"name": "Moore Institute for Research in the Humanities and Social Studies, University of Galway",
|
||||
"synonym": []
|
||||
}
|
||||
]
|
|
@ -48,12 +48,37 @@
|
|||
<description>Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb.</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>JAVA_HOME</name>
|
||||
<value>/srv/java/openjdk-17</value>
|
||||
<description>Used to configure the Java home location for oozie.launcher.mapreduce.map.env</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>JAVA_OPTS</name>
|
||||
<value>-Dcom.sun.security.enableAIAcaIssuers=true</value>
|
||||
<description>Used to configure the JAVA_OPTS parameter</description>
|
||||
</property>
|
||||
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.map.env</name>
|
||||
<value>JAVA_HOME=${JAVA_HOME}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="collection_mode"/>
|
||||
|
@ -99,7 +124,7 @@
|
|||
<action name="CollectionWorker">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.collection.CollectorWorkerApplication</main-class>
|
||||
<java-opts>${collection_java_xmx}</java-opts>
|
||||
<java-opts>${JAVA_OPTS} ${collection_java_xmx}</java-opts>
|
||||
<arg>--apidescriptor</arg><arg>${apiDescription}</arg>
|
||||
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||
<arg>--workflowId</arg><arg>${workflowId}</arg>
|
||||
|
|
|
@ -93,7 +93,7 @@ case object Crossref2Oaf {
|
|||
|
||||
val cf = new KeyValue
|
||||
cf.setValue("UnpayWall")
|
||||
cf.setKey(s"10|openaire____:${DHPUtils.md5("UnpayWall".toLowerCase)}")
|
||||
cf.setKey(s"10|openaire____::${DHPUtils.md5("UnpayWall".toLowerCase)}")
|
||||
cf
|
||||
|
||||
}
|
||||
|
|
|
@ -88,6 +88,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
"-pubmedInputPath", crossrefAffiliationRelationPath,
|
||||
"-openapcInputPath", crossrefAffiliationRelationPath,
|
||||
"-dataciteInputPath", crossrefAffiliationRelationPath,
|
||||
"-webCrawlInputPath", crossrefAffiliationRelationPath,
|
||||
"-outputPath", outputPath
|
||||
});
|
||||
|
||||
|
@ -104,7 +105,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
// );
|
||||
// }
|
||||
// count the number of relations
|
||||
assertEquals(80, tmp.count());
|
||||
assertEquals(120, tmp.count());
|
||||
|
||||
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
|
||||
dataset.createOrReplaceTempView("result");
|
||||
|
@ -115,7 +116,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
// verify that we have equal number of bi-directional relations
|
||||
Assertions
|
||||
.assertEquals(
|
||||
40, execVerification
|
||||
60, execVerification
|
||||
.filter(
|
||||
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
|
||||
.collectAsList()
|
||||
|
@ -123,7 +124,7 @@ public class PrepareAffiliationRelationsTest {
|
|||
|
||||
Assertions
|
||||
.assertEquals(
|
||||
40, execVerification
|
||||
60, execVerification
|
||||
.filter(
|
||||
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
|
||||
.collectAsList()
|
||||
|
|
|
@ -0,0 +1,58 @@
|
|||
|
||||
package eu.dnetlib.dhp.collection.plugin.researchfi;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.DocumentHelper;
|
||||
import org.junit.jupiter.api.Disabled;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.dhp.collection.ApiDescriptor;
|
||||
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
|
||||
import eu.dnetlib.dhp.common.collection.CollectorException;
|
||||
|
||||
public class ResearchFiCollectorPluginTest {
|
||||
|
||||
private final ResearchFiCollectorPlugin plugin = new ResearchFiCollectorPlugin();
|
||||
|
||||
@Test
|
||||
@Disabled
|
||||
void testCollect() throws CollectorException {
|
||||
final ApiDescriptor api = new ApiDescriptor();
|
||||
api.setBaseUrl("https://research.fi/api/rest/v1/funding-decisions?FunderName=AKA&FundingStartYearFrom=2022");
|
||||
api.setProtocol("research_fi");
|
||||
api
|
||||
.getParams()
|
||||
.put("auth_url", "https://researchfi-auth.2.rahtiapp.fi/realms/publicapi/protocol/openid-connect/token");
|
||||
api.getParams().put("auth_client_id", "");
|
||||
api.getParams().put("auth_client_secret", "");
|
||||
|
||||
final AtomicLong count = new AtomicLong(0);
|
||||
final Set<String> ids = new HashSet<>();
|
||||
|
||||
this.plugin.collect(api, new AggregatorReport()).forEach(s -> {
|
||||
|
||||
if (count.getAndIncrement() == 0) {
|
||||
System.out.println("First: " + s);
|
||||
}
|
||||
|
||||
try {
|
||||
final String id = DocumentHelper.parseText(s).valueOf("/recordWrap/funderProjectNumber");
|
||||
if (ids.contains(id)) {
|
||||
System.out.println("Id already present: " + id);
|
||||
}
|
||||
ids.add(id);
|
||||
} catch (final DocumentException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
});
|
||||
|
||||
System.out.println("Total records: " + count);
|
||||
System.out.println("Total identifiers: " + ids.size());
|
||||
|
||||
}
|
||||
|
||||
}
|
|
@ -4,4 +4,6 @@
|
|||
{"DOI":"10.1142\/s021821650200186x","Matchings":[{"RORid":"https:\/\/ror.org\/035xkbk20","Confidence":1},{"RORid":"https:\/\/ror.org\/05apxxy63","Confidence":1}]}
|
||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
|
||||
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
|
||||
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
|
||||
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
|
||||
{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
|
||||
{"DOI": "10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}
|
|
@ -26,15 +26,15 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
|
|||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
|
||||
public class PrepareSimpleEntititiesJob {
|
||||
public class PrepareSimpleEntitiesJob {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(PrepareSimpleEntititiesJob.class);
|
||||
private static final Logger log = LoggerFactory.getLogger(PrepareSimpleEntitiesJob.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
PrepareSimpleEntititiesJob.class
|
||||
PrepareSimpleEntitiesJob.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
|
||||
parser.parseArgument(args);
|
||||
|
|
@ -160,8 +160,7 @@ public class ConversionUtils {
|
|||
.stream()
|
||||
.filter(Objects::nonNull)
|
||||
.filter(pid -> pid.getQualifier() != null)
|
||||
.filter(pid -> pid.getQualifier().getClassid() != null)
|
||||
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase(ModelConstants.ORCID))
|
||||
.filter(pid -> StringUtils.startsWithIgnoreCase(pid.getQualifier().getClassid(), ModelConstants.ORCID))
|
||||
.map(StructuredProperty::getValue)
|
||||
.map(ConversionUtils::cleanOrcid)
|
||||
.filter(StringUtils::isNotBlank)
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
</property>
|
||||
<property>
|
||||
<name>outputDir</name>
|
||||
<description>the path where the the generated data will be stored</description>
|
||||
<description>the path where the generated data will be stored</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>datasourceIdWhitelist</name>
|
||||
|
@ -179,17 +179,18 @@
|
|||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>PrepareSimpleEntititiesJob</name>
|
||||
<class>eu.dnetlib.dhp.broker.oa.PrepareSimpleEntititiesJob</class>
|
||||
<class>eu.dnetlib.dhp.broker.oa.PrepareSimpleEntitiesJob</class>
|
||||
<jar>dhp-broker-events-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.sql.shuffle.partitions=5000
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
|
@ -209,11 +210,12 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.sql.shuffle.partitions=8000
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
|
@ -234,11 +236,12 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.sql.shuffle.partitions=8000
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
|
@ -258,11 +261,12 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.sql.shuffle.partitions=5000
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
|
@ -282,11 +286,12 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.sql.shuffle.partitions=10000
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
|
@ -306,11 +311,12 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.sql.shuffle.partitions=2000
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
|
@ -332,11 +338,12 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.sql.shuffle.partitions=8000
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
|
@ -356,11 +363,12 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.sql.shuffle.partitions=8000
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
|
@ -380,11 +388,12 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.sql.shuffle.partitions=8000
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
|
@ -404,11 +413,12 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.sql.shuffle.partitions=8000
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
|
@ -428,11 +438,12 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.sql.shuffle.partitions=8000
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
|
@ -452,11 +463,12 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.sql.shuffle.partitions=8000
|
||||
</spark-opts>
|
||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
|
@ -476,11 +488,12 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
--conf spark.sql.shuffle.partitions=8000
|
||||
</spark-opts>
|
||||
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||
|
@ -503,6 +516,7 @@
|
|||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
|
@ -535,6 +549,7 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -562,6 +577,7 @@
|
|||
--executor-cores=${sparkExecutorCores}
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
|
@ -585,6 +601,7 @@
|
|||
<spark-opts>
|
||||
--executor-memory=${sparkExecutorMemory}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
|
||||
--conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
|
|
|
@ -0,0 +1,66 @@
|
|||
|
||||
package eu.dnetlib.dhp.broker.oa.matchers.simple;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.broker.objects.OaBrokerAuthor;
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
|
||||
class EnrichMissingAuthorOrcidTest {
|
||||
|
||||
final EnrichMissingAuthorOrcid matcher = new EnrichMissingAuthorOrcid();
|
||||
|
||||
@BeforeEach
|
||||
void setUp() throws Exception {
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFindDifferences_1() {
|
||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
||||
final List<OaBrokerAuthor> list = this.matcher.findDifferences(source, target);
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFindDifferences_2() {
|
||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
||||
|
||||
source.getCreators().add(new OaBrokerAuthor("Claudio Atzori", "0000-0001-9613-6639"));
|
||||
target.getCreators().add(new OaBrokerAuthor("Claudio Atzori", null));
|
||||
|
||||
final List<OaBrokerAuthor> list = this.matcher.findDifferences(source, target);
|
||||
assertEquals(1, list.size());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFindDifferences_3() {
|
||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
||||
|
||||
source.getCreators().add(new OaBrokerAuthor("Claudio Atzori", null));
|
||||
target.getCreators().add(new OaBrokerAuthor("Claudio Atzori", "0000-0001-9613-6639"));
|
||||
|
||||
final List<OaBrokerAuthor> list = this.matcher.findDifferences(source, target);
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testFindDifferences_4() {
|
||||
final OaBrokerMainEntity source = new OaBrokerMainEntity();
|
||||
final OaBrokerMainEntity target = new OaBrokerMainEntity();
|
||||
source.getCreators().add(new OaBrokerAuthor("Claudio Atzori", "0000-0001-9613-6639"));
|
||||
target.getCreators().add(new OaBrokerAuthor("Claudio Atzori", "0000-0001-9613-6639"));
|
||||
|
||||
final List<OaBrokerAuthor> list = this.matcher.findDifferences(source, target);
|
||||
assertTrue(list.isEmpty());
|
||||
}
|
||||
|
||||
}
|
|
@ -2,27 +2,32 @@
|
|||
package eu.dnetlib.dhp.broker.oa.util;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
|
||||
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
class ConversionUtilsTest {
|
||||
public class ConversionUtilsTest {
|
||||
|
||||
@BeforeEach
|
||||
void setUp() throws Exception {
|
||||
public void setUp() throws Exception {
|
||||
}
|
||||
|
||||
@Test
|
||||
void testAllResultPids() {
|
||||
public void testAllResultPids() {
|
||||
final Qualifier qf = new Qualifier();
|
||||
qf.setClassid("test");
|
||||
qf.setClassname("test");
|
||||
|
@ -91,4 +96,42 @@ class ConversionUtilsTest {
|
|||
assertEquals(6, list.size());
|
||||
}
|
||||
|
||||
public void testOafResultToBrokerResult() {
|
||||
|
||||
final Author a1 = createAuthor("Michele Artini", "0000-0002-4406-428X");
|
||||
final Author a2 = createAuthor("Claudio Atzori", "http://orcid.org/0000-0001-9613-6639");
|
||||
final Author a3 = createAuthor("Alessia Bardi", null);
|
||||
|
||||
final Result r = new Result();
|
||||
r.setAuthor(Arrays.asList(a1, a2, a3));
|
||||
|
||||
final OaBrokerMainEntity br = ConversionUtils.oafResultToBrokerResult(r);
|
||||
|
||||
assertEquals(3, br.getCreators().size());
|
||||
assertEquals("0000-0002-4406-428X", br.getCreators().get(0).getOrcid());
|
||||
assertEquals("0000-0001-9613-6639", br.getCreators().get(1).getOrcid());
|
||||
assertNull(br.getCreators().get(2).getOrcid());
|
||||
}
|
||||
|
||||
private Author createAuthor(final String name, final String orcid) {
|
||||
|
||||
final Author a = new Author();
|
||||
a.setFullname("Michele Artini");
|
||||
|
||||
if (orcid != null) {
|
||||
final Qualifier q = new Qualifier();
|
||||
q.setClassid(ModelConstants.ORCID);
|
||||
q.setClassname(ModelConstants.ORCID);
|
||||
q.setSchemeid("dnet:pids");
|
||||
q.setSchemename("dnet:pids");
|
||||
|
||||
final StructuredProperty pid = new StructuredProperty();
|
||||
pid.setQualifier(q);
|
||||
pid.setValue(orcid);
|
||||
|
||||
a.setPid(Arrays.asList(pid));
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -203,8 +203,8 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
|
|||
WindowSpec w = Window
|
||||
.partitionBy("groupId")
|
||||
.orderBy(
|
||||
col("lastUsage").desc_nulls_last(),
|
||||
col("pidType").asc_nulls_last(),
|
||||
col("lastUsage").desc_nulls_last(),
|
||||
col("collectedfrom").desc_nulls_last(),
|
||||
col("date").asc_nulls_last(),
|
||||
col("id").asc_nulls_last());
|
||||
|
|
|
@ -15,4 +15,12 @@
|
|||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hiveMetastoreUris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>pivotHistoryDatabase</name>
|
||||
<value></value>
|
||||
</property>
|
||||
</configuration>
|
|
@ -198,6 +198,8 @@
|
|||
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
|
||||
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
|
||||
<arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
|
||||
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
|
||||
<arg>--pivotHistoryDatabase</arg><arg>${pivotHistoryDatabase}</arg>
|
||||
</spark>
|
||||
<ok to="PrepareOrgRels"/>
|
||||
<error to="Kill"/>
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -190,7 +190,7 @@ public class SparkDedupTest implements Serializable {
|
|||
System.out.println("orp_simrel = " + orp_simrel);
|
||||
|
||||
if (CHECK_CARDINALITIES) {
|
||||
assertEquals(751, orgs_simrel);
|
||||
assertEquals(742, orgs_simrel);
|
||||
assertEquals(566, pubs_simrel);
|
||||
assertEquals(113, sw_simrel);
|
||||
assertEquals(148, ds_simrel);
|
||||
|
@ -251,7 +251,7 @@ public class SparkDedupTest implements Serializable {
|
|||
|
||||
// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
|
||||
if (CHECK_CARDINALITIES) {
|
||||
assertEquals(751, orgs_simrel);
|
||||
assertEquals(742, orgs_simrel);
|
||||
assertEquals(566, pubs_simrel);
|
||||
assertEquals(148, ds_simrel);
|
||||
assertEquals(280, orp_simrel);
|
||||
|
@ -442,7 +442,7 @@ public class SparkDedupTest implements Serializable {
|
|||
final List<Relation> merges = pubs
|
||||
.filter("source == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'")
|
||||
.collectAsList();
|
||||
assertEquals(3, merges.size());
|
||||
assertEquals(1, merges.size());
|
||||
Set<String> dups = Sets
|
||||
.newHashSet(
|
||||
"50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73",
|
||||
|
@ -451,7 +451,7 @@ public class SparkDedupTest implements Serializable {
|
|||
merges.forEach(r -> {
|
||||
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
|
||||
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
|
||||
assertEquals(ModelConstants.MERGES, r.getRelClass());
|
||||
assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
|
||||
assertTrue(dups.contains(r.getTarget()));
|
||||
});
|
||||
|
||||
|
@ -561,7 +561,7 @@ public class SparkDedupTest implements Serializable {
|
|||
System.out.println("orp_mergerel = " + orp_mergerel);
|
||||
|
||||
if (CHECK_CARDINALITIES) {
|
||||
assertEquals(1268, orgs_mergerel);
|
||||
assertEquals(1278, orgs_mergerel);
|
||||
assertEquals(1156, pubs.count());
|
||||
assertEquals(292, sw_mergerel);
|
||||
assertEquals(476, ds_mergerel);
|
||||
|
@ -618,7 +618,7 @@ public class SparkDedupTest implements Serializable {
|
|||
System.out.println("orp_deduprecord = " + orp_deduprecord);
|
||||
|
||||
if (CHECK_CARDINALITIES) {
|
||||
assertEquals(86, orgs_deduprecord);
|
||||
assertEquals(78, orgs_deduprecord);
|
||||
assertEquals(96, pubs.count());
|
||||
assertEquals(47, sw_deduprecord);
|
||||
assertEquals(97, ds_deduprecord);
|
||||
|
@ -761,7 +761,7 @@ public class SparkDedupTest implements Serializable {
|
|||
|
||||
if (CHECK_CARDINALITIES) {
|
||||
assertEquals(930, publications);
|
||||
assertEquals(839, organizations);
|
||||
assertEquals(831, organizations);
|
||||
assertEquals(100, projects);
|
||||
assertEquals(100, datasource);
|
||||
assertEquals(196, softwares);
|
||||
|
|
|
@ -22,8 +22,11 @@ import java.util.Properties;
|
|||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
|
@ -143,7 +146,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
|
|||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
||||
.count();
|
||||
|
||||
assertEquals(86, orgs_simrel);
|
||||
assertEquals(92, orgs_simrel);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -172,7 +175,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
|
|||
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
|
||||
.count();
|
||||
|
||||
assertEquals(122, orgs_simrel);
|
||||
assertEquals(128, orgs_simrel);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -207,7 +210,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
|
|||
.read()
|
||||
.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
|
||||
.count();
|
||||
assertEquals(132, orgs_mergerel);
|
||||
assertEquals(128, orgs_mergerel);
|
||||
|
||||
// verify that a DiffRel is in the mergerels (to be sure that the job supposed to remove them has something to
|
||||
// do)
|
||||
|
|
|
@ -9,6 +9,7 @@ import org.junit.jupiter.api.Assertions;
|
|||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.platform.commons.util.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.oa.dedup.SparkOpenorgsDedupTest;
|
||||
import eu.dnetlib.pace.config.DedupConfig;
|
||||
import eu.dnetlib.pace.model.SparkModel;
|
||||
|
||||
|
@ -24,6 +25,31 @@ class JsonPathTest {
|
|||
|
||||
Row row = SparkModel.apply(conf).rowFromJson(org);
|
||||
|
||||
System.out.println("row = " + row);
|
||||
Assertions.assertNotNull(row);
|
||||
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||
|
||||
System.out.println("row = " + row.getAs("countrytitle"));
|
||||
}
|
||||
|
||||
@Test
|
||||
void jsonToModelTest() throws IOException {
|
||||
DedupConfig conf = DedupConfig
|
||||
.load(
|
||||
IOUtils
|
||||
.toString(
|
||||
SparkOpenorgsDedupTest.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json")));
|
||||
|
||||
final String org = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));
|
||||
|
||||
Row row = SparkModel.apply(conf).rowFromJson(org);
|
||||
// to check that the same parsing returns the same row
|
||||
Row row1 = SparkModel.apply(conf).rowFromJson(org);
|
||||
|
||||
Assertions.assertEquals(row, row1);
|
||||
System.out.println("row = " + row);
|
||||
Assertions.assertNotNull(row);
|
||||
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
|
||||
}
|
||||
|
|
|
@ -4,8 +4,8 @@
|
|||
"dedupRun" : "001",
|
||||
"entityType" : "organization",
|
||||
"subEntityValue": "organization",
|
||||
"orderField" : "legalname",
|
||||
"queueMaxSize" : "2000",
|
||||
"orderField" : "original_legalname",
|
||||
"queueMaxSize" : "100000",
|
||||
"groupMaxSize" : "50",
|
||||
"slidingWindowSize" : "200",
|
||||
"idPath":"$.id",
|
||||
|
@ -15,10 +15,10 @@
|
|||
},
|
||||
"pace" : {
|
||||
"clustering" : [
|
||||
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||
{ "name" : "sortedngrampairs", "fields" : [ "original_legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
|
||||
{ "name" : "suffixprefix", "fields" : [ "original_legalname" ], "params" : { "max" : 1, "len" : "3" } },
|
||||
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
|
||||
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
|
||||
{ "name" : "legalnameclustering", "fields" : [ "legalname" ], "params" : { "max": 2} }
|
||||
],
|
||||
"decisionTree" : {
|
||||
"start": {
|
||||
|
@ -29,16 +29,23 @@
|
|||
"weight": 1,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
},
|
||||
{
|
||||
"field": "rorid",
|
||||
"comparator": "exactMatch",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "false",
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 1,
|
||||
"aggregation": "AVG",
|
||||
"aggregation": "OR",
|
||||
"positive": "MATCH",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "layer2",
|
||||
"undefined": "necessaryConditions",
|
||||
"ignoreUndefined": "false"
|
||||
},
|
||||
"layer2": {
|
||||
"necessaryConditions": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "websiteurl",
|
||||
|
@ -49,20 +56,20 @@
|
|||
},
|
||||
{
|
||||
"field": "country",
|
||||
"comparator": "exactMatch",
|
||||
"comparator": "countryMatch",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "true",
|
||||
"params": {}
|
||||
},
|
||||
{
|
||||
"field": "legalname",
|
||||
"field": "original_legalname",
|
||||
"comparator": "numbersMatch",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "true",
|
||||
"params": {}
|
||||
},
|
||||
{
|
||||
"field": "legalname",
|
||||
"field": "original_legalname",
|
||||
"comparator": "romansMatch",
|
||||
"weight": 1,
|
||||
"countIfUndefined": "true",
|
||||
|
@ -71,68 +78,64 @@
|
|||
],
|
||||
"threshold": 1,
|
||||
"aggregation": "AND",
|
||||
"positive": "layer3",
|
||||
"positive": "cityCheck",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "layer3",
|
||||
"undefined": "cityCheck",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"layer3": {
|
||||
"cityCheck": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "legalname",
|
||||
"comparator": "cityMatch",
|
||||
"comparator": "codeMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "true",
|
||||
"params": {
|
||||
"windowSize": "4"
|
||||
"codeRegex": "city::\\d+"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 0.1,
|
||||
"aggregation": "AVG",
|
||||
"positive": "layer4",
|
||||
"positive": "keywordCheck",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "NO_MATCH",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"layer4": {
|
||||
"keywordCheck": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "legalname",
|
||||
"comparator": "keywordMatch",
|
||||
"comparator": "codeMatch",
|
||||
"weight": 1.0,
|
||||
"countIfUndefined": "true",
|
||||
"params": {
|
||||
"windowSize": "4"
|
||||
"codeRegex": "key::\\d+"
|
||||
}
|
||||
}
|
||||
],
|
||||
"threshold": 0.7,
|
||||
"aggregation": "AVG",
|
||||
"positive": "layer5",
|
||||
"positive": "nameCheck",
|
||||
"negative": "NO_MATCH",
|
||||
"undefined": "layer5",
|
||||
"undefined": "nameCheck",
|
||||
"ignoreUndefined": "true"
|
||||
},
|
||||
"layer5": {
|
||||
"nameCheck": {
|
||||
"fields": [
|
||||
{
|
||||
"field": "legalname",
|
||||
"comparator": "jaroWinklerNormalizedName",
|
||||
"comparator": "jaroWinklerLegalname",
|
||||
"weight": 0.9,
|
||||
"countIfUndefined": "true",
|
||||
"params": {
|
||||
"windowSize": "4"
|
||||
}
|
||||
"params": {}
|
||||
},
|
||||
{
|
||||
"field": "legalshortname",
|
||||
"comparator": "jaroWinklerNormalizedName",
|
||||
"comparator": "jaroWinklerLegalname",
|
||||
"weight": 0.1,
|
||||
"countIfUndefined": "false",
|
||||
"params": {
|
||||
"windowSize": 4
|
||||
}
|
||||
"params": {}
|
||||
}
|
||||
],
|
||||
"threshold": 0.9,
|
||||
|
@ -144,126 +147,16 @@
|
|||
}
|
||||
},
|
||||
"model" : [
|
||||
{ "name" : "country", "type" : "String", "path" : "$.country.classid"},
|
||||
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
|
||||
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
|
||||
{ "name" : "country", "type" : "String", "path" : "$.country.classid", "infer" : "country", "inferenceFrom" : "$.legalname.value"},
|
||||
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value", "infer" : "city_keyword"},
|
||||
{ "name" : "original_legalname", "type" : "String", "path" : "$.legalname.value" },
|
||||
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value", "infer" : "city_keyword"},
|
||||
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
|
||||
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"},
|
||||
{ "name" : "rorid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='ROR')].value"},
|
||||
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
|
||||
],
|
||||
"blacklists" : {
|
||||
"legalname" : []
|
||||
},
|
||||
"synonyms": {
|
||||
"key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
|
||||
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
|
||||
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
|
||||
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
|
||||
"key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],
|
||||
"key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],
|
||||
"key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"],
|
||||
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
|
||||
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
|
||||
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
|
||||
"key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
|
||||
"key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
|
||||
"key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
|
||||
"key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],
|
||||
"key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],
|
||||
"key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],
|
||||
"key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],
|
||||
"key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],
|
||||
"key::19": ["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"],
|
||||
"key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],
|
||||
"key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],
|
||||
"key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],
|
||||
"key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],
|
||||
"key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],
|
||||
"key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],
|
||||
"key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],
|
||||
"key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],
|
||||
"key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],
|
||||
"key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],
|
||||
"key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],
|
||||
"key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],
|
||||
"key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"],
|
||||
"key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],
|
||||
"key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],
|
||||
"key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],
|
||||
"key::36": ["authority","autorità","autorité","авторитет","autoriteit"],
|
||||
"key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],
|
||||
"key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],
|
||||
"key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"],
|
||||
"key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],
|
||||
"key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],
|
||||
"key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],
|
||||
"key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],
|
||||
"key::44": ["academic","accademico","académique","universitaire","акадеческий academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],
|
||||
"key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],
|
||||
"key::46": ["division","divisione","division","отделение","divisie","τμήμα"],
|
||||
"key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"],
|
||||
"key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
|
||||
"key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
|
||||
"key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
|
||||
"key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""],
|
||||
"key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""],
|
||||
"key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""],
|
||||
"key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""],
|
||||
"key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""],
|
||||
"key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""],
|
||||
"key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""],
|
||||
"key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""],
|
||||
"key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""],
|
||||
"key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""],
|
||||
"key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""],
|
||||
"key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""],
|
||||
"key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""],
|
||||
"key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""],
|
||||
"key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""],
|
||||
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""],
|
||||
"key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""],
|
||||
"key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""],
|
||||
"key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""],
|
||||
"key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""],
|
||||
"key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""],
|
||||
"key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""],
|
||||
"key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""],
|
||||
"key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""],
|
||||
"key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline",""],
|
||||
"key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""],
|
||||
"key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""],
|
||||
"key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""],
|
||||
"key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""],
|
||||
"key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""],
|
||||
"key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""],
|
||||
"key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""],
|
||||
"key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""],
|
||||
"key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""],
|
||||
"key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""],
|
||||
"key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""],
|
||||
"key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""],
|
||||
"key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""],
|
||||
"key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""],
|
||||
"key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""],
|
||||
"key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""],
|
||||
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
|
||||
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
|
||||
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
|
||||
"key::95": ["mechanics", "mechanical", "meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
|
||||
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],
|
||||
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],
|
||||
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
|
||||
"key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""],
|
||||
"key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""],
|
||||
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""],
|
||||
"key::102": ["informatics","informatica","informática","informática","informatica",""],
|
||||
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
|
||||
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"],
|
||||
"key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"],
|
||||
"key::106" : ["seminary", "seminario", "seminaire", "seminar"],
|
||||
"key::107" : ["agricultural forestry", "af", "a f"],
|
||||
"key::108" : ["agricultural mechanical", "am", "a m"],
|
||||
"key::109" : ["catholic", "catholique", "katholische", "catolica", "cattolica", "catolico"]
|
||||
}
|
||||
"blacklists" : {},
|
||||
"synonyms": {}
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
|
@ -33,10 +33,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|||
import eu.dnetlib.dhp.bulktag.community.*;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.Context;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||
import scala.Tuple2;
|
||||
|
||||
|
@ -114,27 +111,35 @@ public class SparkBulkTagJob {
|
|||
extendCommunityConfigurationForEOSC(spark, inputPath, cc);
|
||||
execBulkTag(
|
||||
spark, inputPath, outputPath, protoMap, cc);
|
||||
execEntityTag(
|
||||
spark, inputPath + "organization", outputPath + "organization",
|
||||
Utils.getCommunityOrganization(baseURL), Organization.class, TaggingConstants.CLASS_ID_ORGANIZATION,
|
||||
TaggingConstants.CLASS_NAME_BULKTAG_ORGANIZATION);
|
||||
execEntityTag(
|
||||
spark, inputPath + "project", outputPath + "project", Utils.getCommunityProjects(baseURL),
|
||||
Project.class, TaggingConstants.CLASS_ID_PROJECT, TaggingConstants.CLASS_NAME_BULKTAG_PROJECT);
|
||||
execDatasourceTag(spark, inputPath, outputPath, Utils.getDatasourceCommunities(baseURL));
|
||||
execProjectTag(spark, inputPath, outputPath, Utils.getCommunityProjects(baseURL));
|
||||
|
||||
});
|
||||
}
|
||||
|
||||
private static void execProjectTag(SparkSession spark, String inputPath, String outputPath,
|
||||
CommunityEntityMap communityProjects) {
|
||||
Dataset<Project> projects = readPath(spark, inputPath + "project", Project.class);
|
||||
private static <E extends OafEntity> void execEntityTag(SparkSession spark, String inputPath, String outputPath,
|
||||
CommunityEntityMap communityEntity, Class<E> entityClass,
|
||||
String classID, String calssName) {
|
||||
Dataset<E> entity = readPath(spark, inputPath, entityClass);
|
||||
Dataset<EntityCommunities> pc = spark
|
||||
.createDataset(
|
||||
communityProjects
|
||||
communityEntity
|
||||
.keySet()
|
||||
.stream()
|
||||
.map(k -> EntityCommunities.newInstance(k, communityProjects.get(k)))
|
||||
.map(k -> EntityCommunities.newInstance(k, communityEntity.get(k)))
|
||||
.collect(Collectors.toList()),
|
||||
Encoders.bean(EntityCommunities.class));
|
||||
|
||||
projects
|
||||
.joinWith(pc, projects.col("id").equalTo(pc.col("entityId")), "left")
|
||||
.map((MapFunction<Tuple2<Project, EntityCommunities>, Project>) t2 -> {
|
||||
Project ds = t2._1();
|
||||
entity
|
||||
.joinWith(pc, entity.col("id").equalTo(pc.col("entityId")), "left")
|
||||
.map((MapFunction<Tuple2<E, EntityCommunities>, E>) t2 -> {
|
||||
E ds = t2._1();
|
||||
if (t2._2() != null) {
|
||||
List<String> context = Optional
|
||||
.ofNullable(ds.getContext())
|
||||
|
@ -156,8 +161,8 @@ public class SparkBulkTagJob {
|
|||
false, TaggingConstants.BULKTAG_DATA_INFO_TYPE, true, false,
|
||||
OafMapperUtils
|
||||
.qualifier(
|
||||
TaggingConstants.CLASS_ID_DATASOURCE,
|
||||
TaggingConstants.CLASS_NAME_BULKTAG_DATASOURCE,
|
||||
classID,
|
||||
calssName,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS,
|
||||
ModelConstants.DNET_PROVENANCE_ACTIONS),
|
||||
"1")));
|
||||
|
@ -166,17 +171,17 @@ public class SparkBulkTagJob {
|
|||
});
|
||||
}
|
||||
return ds;
|
||||
}, Encoders.bean(Project.class))
|
||||
}, Encoders.bean(entityClass))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(outputPath + "project");
|
||||
.json(outputPath);
|
||||
|
||||
readPath(spark, outputPath + "project", Project.class)
|
||||
readPath(spark, outputPath, entityClass)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.option("compression", "gzip")
|
||||
.json(inputPath + "project");
|
||||
.json(inputPath);
|
||||
}
|
||||
|
||||
private static void execDatasourceTag(SparkSession spark, String inputPath, String outputPath,
|
||||
|
|
|
@ -13,6 +13,9 @@ public class TaggingConstants {
|
|||
public static final String CLASS_ID_CZENODO = "community:zenodocommunity";
|
||||
public static final String CLASS_ID_ADVANCED_CONSTRAINT = "community:advconstraint";
|
||||
|
||||
public static final String CLASS_ID_PROJECT = "community:project";
|
||||
public static final String CLASS_ID_ORGANIZATION = "community:organization";
|
||||
|
||||
public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/";
|
||||
|
||||
public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject";
|
||||
|
@ -20,5 +23,8 @@ public class TaggingConstants {
|
|||
public static final String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo";
|
||||
public static final String CLASS_NAME_BULKTAG_ADVANCED_CONSTRAINT = "Bulktagging for Community - Advanced Constraints";
|
||||
|
||||
public static final String CLASS_NAME_BULKTAG_PROJECT = "Bulktagging for Community - Project";
|
||||
public static final String CLASS_NAME_BULKTAG_ORGANIZATION = "Bulktagging for Community - Organization";
|
||||
|
||||
public static final String TAGGING_TRUST = "0.8";
|
||||
}
|
||||
|
|
|
@ -465,6 +465,138 @@ public class BulkTagJobTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
void organizationTag() throws Exception {
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource/")
|
||||
.getPath();
|
||||
LocalFileSystem fs = FileSystem.getLocal(new Configuration());
|
||||
fs
|
||||
.copyFromLocalFile(
|
||||
false, new org.apache.hadoop.fs.Path(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/bulktag/pathMap/")
|
||||
.getPath()),
|
||||
new org.apache.hadoop.fs.Path(workingDir.toString() + "/data/bulktagging/protoMap"));
|
||||
SparkBulkTagJob
|
||||
.main(
|
||||
new String[] {
|
||||
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath", sourcePath,
|
||||
"-taggingConf", taggingConf,
|
||||
|
||||
"-outputPath", workingDir.toString() + "/",
|
||||
"-baseURL", "https://services.openaire.eu/openaire/community/",
|
||||
|
||||
"-pathMap", workingDir.toString() + "/data/bulktagging/protoMap/pathMap",
|
||||
"-nameNode", "local"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Organization> tmp = sc
|
||||
.textFile(workingDir.toString() + "/organization")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Organization.class));
|
||||
|
||||
Assertions.assertEquals(4, tmp.count());
|
||||
org.apache.spark.sql.Dataset<Organization> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(Organization.class));
|
||||
|
||||
verificationDataset.createOrReplaceTempView("organization");
|
||||
|
||||
String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
|
||||
+ "from organization "
|
||||
+ "lateral view explode(context) c as MyT "
|
||||
+ "lateral view explode(MyT.datainfo) d as MyD "
|
||||
+ "where MyD.inferenceprovenance = 'bulktagging'";
|
||||
|
||||
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
|
||||
|
||||
idExplodeCommunity.show(false);
|
||||
|
||||
Assertions.assertEquals(3, idExplodeCommunity.count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3, idExplodeCommunity.filter("provenance = 'community:organization'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
3,
|
||||
idExplodeCommunity
|
||||
.filter("name = 'Bulktagging for Community - Organization'")
|
||||
.count());
|
||||
|
||||
Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'netherlands'").count());
|
||||
Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'beopen'").count());
|
||||
Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'mes'").count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void projectTag() throws Exception {
|
||||
final String sourcePath = getClass()
|
||||
.getResource("/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource/")
|
||||
.getPath();
|
||||
LocalFileSystem fs = FileSystem.getLocal(new Configuration());
|
||||
fs
|
||||
.copyFromLocalFile(
|
||||
false, new org.apache.hadoop.fs.Path(getClass()
|
||||
.getResource("/eu/dnetlib/dhp/bulktag/pathMap/")
|
||||
.getPath()),
|
||||
new org.apache.hadoop.fs.Path(workingDir.toString() + "/data/bulktagging/protoMap"));
|
||||
SparkBulkTagJob
|
||||
.main(
|
||||
new String[] {
|
||||
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-sourcePath", sourcePath,
|
||||
"-taggingConf", taggingConf,
|
||||
|
||||
"-outputPath", workingDir.toString() + "/",
|
||||
"-baseURL", "https://services.openaire.eu/openaire/community/",
|
||||
|
||||
"-pathMap", workingDir.toString() + "/data/bulktagging/protoMap/pathMap",
|
||||
"-nameNode", "local"
|
||||
});
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
|
||||
JavaRDD<Project> tmp = sc
|
||||
.textFile(workingDir.toString() + "/project")
|
||||
.map(item -> OBJECT_MAPPER.readValue(item, Project.class));
|
||||
|
||||
Assertions.assertEquals(4, tmp.count());
|
||||
org.apache.spark.sql.Dataset<Project> verificationDataset = spark
|
||||
.createDataset(tmp.rdd(), Encoders.bean(Project.class));
|
||||
|
||||
verificationDataset.createOrReplaceTempView("project");
|
||||
|
||||
String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
|
||||
+ "from project "
|
||||
+ "lateral view explode(context) c as MyT "
|
||||
+ "lateral view explode(MyT.datainfo) d as MyD "
|
||||
+ "where MyD.inferenceprovenance = 'bulktagging'";
|
||||
|
||||
org.apache.spark.sql.Dataset<Row> idExplodeCommunity = spark.sql(query);
|
||||
|
||||
idExplodeCommunity.show(false);
|
||||
|
||||
Assertions.assertEquals(4, idExplodeCommunity.count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
4, idExplodeCommunity.filter("provenance = 'community:project'").count());
|
||||
Assertions
|
||||
.assertEquals(
|
||||
4,
|
||||
idExplodeCommunity
|
||||
.filter("name = 'Bulktagging for Community - Project'")
|
||||
.count());
|
||||
|
||||
Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'enermaps'").count());
|
||||
Assertions.assertEquals(1, idExplodeCommunity.filter("community = 'clarin'").count());
|
||||
Assertions.assertEquals(2, idExplodeCommunity.filter("community = 'dh-ch'").count());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void bulktagByZenodoCommunityTest() throws Exception {
|
||||
final String sourcePath = getClass()
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -14,4 +14,7 @@ public class ProvisionConstants {
|
|||
return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
|
||||
}
|
||||
|
||||
public static final String PUBLIC_ALIAS_NAME = "public";
|
||||
public static final String SHADOW_ALIAS_NAME = "shadow";
|
||||
|
||||
}
|
||||
|
|
|
@ -9,6 +9,7 @@ import org.apache.commons.io.IOUtils;
|
|||
import org.apache.solr.client.solrj.SolrResponse;
|
||||
import org.apache.solr.client.solrj.SolrServerException;
|
||||
import org.apache.solr.client.solrj.impl.CloudSolrClient;
|
||||
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
|
||||
import org.apache.solr.client.solrj.response.UpdateResponse;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -23,7 +24,7 @@ public class SolrAdminApplication implements Closeable {
|
|||
private static final Logger log = LoggerFactory.getLogger(SolrAdminApplication.class);
|
||||
|
||||
enum Action {
|
||||
DELETE_BY_QUERY, COMMIT
|
||||
DELETE_BY_QUERY, COMMIT, UPDATE_ALIASES
|
||||
}
|
||||
|
||||
private final CloudSolrClient solrClient;
|
||||
|
@ -39,9 +40,6 @@ public class SolrAdminApplication implements Closeable {
|
|||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
log.info("isLookupUrl: {}", isLookupUrl);
|
||||
|
||||
final String format = parser.get("format");
|
||||
log.info("format: {}", format);
|
||||
|
||||
final Action action = Action.valueOf(parser.get("action"));
|
||||
log.info("action: {}", action);
|
||||
|
||||
|
@ -59,11 +57,21 @@ public class SolrAdminApplication implements Closeable {
|
|||
final String zkHost = isLookup.getZkHost();
|
||||
log.info("zkHost: {}", zkHost);
|
||||
|
||||
final String collection = ProvisionConstants.getCollectionName(format);
|
||||
log.info("collection: {}", collection);
|
||||
final String publicFormat = parser.get("publicFormat");
|
||||
log.info("publicFormat: {}", publicFormat);
|
||||
|
||||
final String shadowFormat = parser.get("shadowFormat");
|
||||
log.info("shadowFormat: {}", shadowFormat);
|
||||
|
||||
// derive the collection names from the metadata format profile names
|
||||
final String publicCollection = ProvisionConstants.getCollectionName(publicFormat);
|
||||
log.info("publicCollection: {}", publicCollection);
|
||||
|
||||
final String shadowCollection = ProvisionConstants.getCollectionName(shadowFormat);
|
||||
log.info("shadowCollection: {}", shadowCollection);
|
||||
|
||||
try (SolrAdminApplication app = new SolrAdminApplication(zkHost)) {
|
||||
app.execute(action, collection, query, commit);
|
||||
app.execute(action, query, commit, publicCollection, shadowCollection);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -72,22 +80,29 @@ public class SolrAdminApplication implements Closeable {
|
|||
this.solrClient = new CloudSolrClient.Builder(zk.getHosts(), zk.getChroot()).build();
|
||||
}
|
||||
|
||||
public SolrResponse commit(String collection) throws IOException, SolrServerException {
|
||||
return execute(Action.COMMIT, collection, null, true);
|
||||
public SolrResponse commit(String shadowCollection) throws IOException, SolrServerException {
|
||||
return execute(Action.COMMIT, null, true, null, shadowCollection);
|
||||
}
|
||||
|
||||
public SolrResponse execute(Action action, String collection, String query, boolean commit)
|
||||
public SolrResponse execute(Action action, String query, boolean commit,
|
||||
String publicCollection, String shadowCollection)
|
||||
throws IOException, SolrServerException {
|
||||
switch (action) {
|
||||
|
||||
case DELETE_BY_QUERY:
|
||||
UpdateResponse rsp = solrClient.deleteByQuery(collection, query);
|
||||
UpdateResponse rsp = solrClient.deleteByQuery(shadowCollection, query);
|
||||
if (commit) {
|
||||
solrClient.commit(collection);
|
||||
return solrClient.commit(shadowCollection);
|
||||
}
|
||||
return rsp;
|
||||
|
||||
case COMMIT:
|
||||
return solrClient.commit(collection);
|
||||
return solrClient.commit(shadowCollection);
|
||||
|
||||
case UPDATE_ALIASES:
|
||||
this.updateAliases(publicCollection, shadowCollection);
|
||||
return null;
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("action not managed: " + action);
|
||||
}
|
||||
|
@ -98,4 +113,30 @@ public class SolrAdminApplication implements Closeable {
|
|||
solrClient.close();
|
||||
}
|
||||
|
||||
private void updateAliases(String publicCollection, String shadowCollection)
|
||||
throws SolrServerException, IOException {
|
||||
|
||||
// delete current aliases
|
||||
this.deleteAlias(ProvisionConstants.PUBLIC_ALIAS_NAME);
|
||||
this.deleteAlias(ProvisionConstants.SHADOW_ALIAS_NAME);
|
||||
|
||||
// create aliases
|
||||
this.createAlias(ProvisionConstants.PUBLIC_ALIAS_NAME, publicCollection);
|
||||
this.createAlias(ProvisionConstants.SHADOW_ALIAS_NAME, shadowCollection);
|
||||
|
||||
}
|
||||
|
||||
public SolrResponse deleteAlias(String aliasName) throws SolrServerException, IOException {
|
||||
CollectionAdminRequest.DeleteAlias deleteAliasRequest = CollectionAdminRequest.deleteAlias(aliasName);
|
||||
log.info("deleting alias: {}", aliasName);
|
||||
return deleteAliasRequest.process(solrClient);
|
||||
}
|
||||
|
||||
public SolrResponse createAlias(String aliasName, String collection) throws IOException, SolrServerException {
|
||||
CollectionAdminRequest.CreateAlias createAliasRequest = CollectionAdminRequest
|
||||
.createAlias(aliasName, collection);
|
||||
log.info("creating alias: {} for collection: {}", aliasName, collection);
|
||||
return createAliasRequest.process(solrClient);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -36,7 +36,7 @@ public class SolrRecordDumpJob extends AbstractSolrRecordTransformJob {
|
|||
|
||||
private final String inputPath;
|
||||
|
||||
private final String format;
|
||||
private final String shadowFormat;
|
||||
|
||||
private final String outputPath;
|
||||
|
||||
|
@ -61,8 +61,8 @@ public class SolrRecordDumpJob extends AbstractSolrRecordTransformJob {
|
|||
final String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String format = parser.get("format");
|
||||
log.info("format: {}", format);
|
||||
final String shadowFormat = parser.get("shadowFormat");
|
||||
log.info("shadowFormat: {}", shadowFormat);
|
||||
|
||||
final String outputPath = Optional
|
||||
.ofNullable(parser.get("outputPath"))
|
||||
|
@ -95,27 +95,24 @@ public class SolrRecordDumpJob extends AbstractSolrRecordTransformJob {
|
|||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
log.info("isLookupUrl: {}", isLookupUrl);
|
||||
final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
|
||||
new SolrRecordDumpJob(spark, inputPath, format, outputPath).run(isLookup);
|
||||
new SolrRecordDumpJob(spark, inputPath, shadowFormat, outputPath).run(isLookup);
|
||||
});
|
||||
}
|
||||
|
||||
public SolrRecordDumpJob(SparkSession spark, String inputPath, String format, String outputPath) {
|
||||
public SolrRecordDumpJob(SparkSession spark, String inputPath, String shadowFormat, String outputPath) {
|
||||
this.spark = spark;
|
||||
this.inputPath = inputPath;
|
||||
this.format = format;
|
||||
this.shadowFormat = shadowFormat;
|
||||
this.outputPath = outputPath;
|
||||
}
|
||||
|
||||
public void run(ISLookupClient isLookup) throws ISLookUpException, TransformerException {
|
||||
final String fields = isLookup.getLayoutSource(format);
|
||||
final String fields = isLookup.getLayoutSource(shadowFormat);
|
||||
log.info("fields: {}", fields);
|
||||
|
||||
final String xslt = isLookup.getLayoutTransformer();
|
||||
|
||||
final String dsId = isLookup.getDsId(format);
|
||||
log.info("dsId: {}", dsId);
|
||||
|
||||
final String indexRecordXslt = getLayoutTransformer(format, fields, xslt);
|
||||
final String indexRecordXslt = getLayoutTransformer(shadowFormat, fields, xslt);
|
||||
log.info("indexRecordTransformer {}", indexRecordXslt);
|
||||
|
||||
final Encoder<TupleWrapper> encoder = Encoders.bean(TupleWrapper.class);
|
||||
|
|
|
@ -40,6 +40,8 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
|
|||
|
||||
private final String format;
|
||||
|
||||
private final String shadowCollection;
|
||||
|
||||
private final int batchSize;
|
||||
|
||||
private final SparkSession spark;
|
||||
|
@ -63,8 +65,11 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
|
|||
final String inputPath = parser.get("inputPath");
|
||||
log.info("inputPath: {}", inputPath);
|
||||
|
||||
final String format = parser.get("format");
|
||||
log.info("format: {}", format);
|
||||
final String shadowFormat = parser.get("shadowFormat");
|
||||
log.info("shadowFormat: {}", shadowFormat);
|
||||
|
||||
final String shadowCollection = ProvisionConstants.getCollectionName(shadowFormat);
|
||||
log.info("shadowCollection: {}", shadowCollection);
|
||||
|
||||
final Integer batchSize = Optional
|
||||
.ofNullable(parser.get("batchSize"))
|
||||
|
@ -85,15 +90,17 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
|
|||
final String isLookupUrl = parser.get("isLookupUrl");
|
||||
log.info("isLookupUrl: {}", isLookupUrl);
|
||||
final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
|
||||
new XmlIndexingJob(spark, inputPath, format, batchSize)
|
||||
new XmlIndexingJob(spark, inputPath, shadowFormat, shadowCollection, batchSize)
|
||||
.run(isLookup);
|
||||
});
|
||||
}
|
||||
|
||||
public XmlIndexingJob(SparkSession spark, String inputPath, String format, Integer batchSize) {
|
||||
public XmlIndexingJob(SparkSession spark, String inputPath, String format, String shadowCollection,
|
||||
Integer batchSize) {
|
||||
this.spark = spark;
|
||||
this.inputPath = inputPath;
|
||||
this.format = format;
|
||||
this.shadowCollection = shadowCollection;
|
||||
this.batchSize = batchSize;
|
||||
}
|
||||
|
||||
|
@ -103,12 +110,6 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
|
|||
|
||||
final String xslt = isLookup.getLayoutTransformer();
|
||||
|
||||
final String dsId = isLookup.getDsId(format);
|
||||
log.info("dsId: {}", dsId);
|
||||
|
||||
final String collection = ProvisionConstants.getCollectionName(format);
|
||||
log.info("collection: {}", collection);
|
||||
|
||||
final String zkHost = isLookup.getZkHost();
|
||||
log.info("zkHost: {}", zkHost);
|
||||
|
||||
|
@ -130,7 +131,7 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
|
|||
.javaRDD()
|
||||
.map(
|
||||
t -> new StreamingInputDocumentFactory().parseDocument(t.getXml(), t.getJson()));
|
||||
DHPSolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
|
||||
DHPSolrSupport.indexDocs(zkHost, shadowCollection, batchSize, docs.rdd());
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -30,11 +30,14 @@ import eu.dnetlib.dhp.schema.solr.Context;
|
|||
import eu.dnetlib.dhp.schema.solr.Country;
|
||||
import eu.dnetlib.dhp.schema.solr.Datasource;
|
||||
import eu.dnetlib.dhp.schema.solr.EoscIfGuidelines;
|
||||
import eu.dnetlib.dhp.schema.solr.ExternalReference;
|
||||
import eu.dnetlib.dhp.schema.solr.Instance;
|
||||
import eu.dnetlib.dhp.schema.solr.Journal;
|
||||
import eu.dnetlib.dhp.schema.solr.Measure;
|
||||
import eu.dnetlib.dhp.schema.solr.OpenAccessColor;
|
||||
import eu.dnetlib.dhp.schema.solr.OpenAccessRoute;
|
||||
import eu.dnetlib.dhp.schema.solr.Organization;
|
||||
import eu.dnetlib.dhp.schema.solr.Pid;
|
||||
import eu.dnetlib.dhp.schema.solr.Project;
|
||||
import eu.dnetlib.dhp.schema.solr.Result;
|
||||
import eu.dnetlib.dhp.schema.solr.Subject;
|
||||
|
@ -76,6 +79,7 @@ public class ProvisionModelSupport {
|
|||
r.setCollectedfrom(asProvenance(e.getCollectedfrom()));
|
||||
r.setContext(asContext(e.getContext(), contextMapper));
|
||||
r.setPid(asPid(e.getPid()));
|
||||
r.setMeasures(mapMeasures(e.getMeasures()));
|
||||
|
||||
if (e instanceof eu.dnetlib.dhp.schema.oaf.Result) {
|
||||
r.setResult(mapResult((eu.dnetlib.dhp.schema.oaf.Result) e));
|
||||
|
@ -106,6 +110,14 @@ public class ProvisionModelSupport {
|
|||
final RelatedEntity re = rew.getTarget();
|
||||
final RecordType relatedRecordType = RecordType.valueOf(re.getType());
|
||||
final Relation relation = rew.getRelation();
|
||||
final String relationProvenance = Optional
|
||||
.ofNullable(relation.getDataInfo())
|
||||
.map(
|
||||
d -> Optional
|
||||
.ofNullable(d.getProvenanceaction())
|
||||
.map(Qualifier::getClassid)
|
||||
.orElse(null))
|
||||
.orElse(null);
|
||||
rr
|
||||
.setHeader(
|
||||
RelatedRecordHeader
|
||||
|
@ -113,7 +125,9 @@ public class ProvisionModelSupport {
|
|||
relation.getRelType(),
|
||||
relation.getRelClass(),
|
||||
StringUtils.substringAfter(relation.getTarget(), IdentifierFactory.ID_PREFIX_SEPARATOR),
|
||||
relatedRecordType));
|
||||
relatedRecordType,
|
||||
relationProvenance,
|
||||
Optional.ofNullable(relation.getDataInfo()).map(DataInfo::getTrust).orElse(null)));
|
||||
|
||||
rr.setAcronym(re.getAcronym());
|
||||
rr.setCode(re.getCode());
|
||||
|
@ -131,11 +145,20 @@ public class ProvisionModelSupport {
|
|||
rr.setOfficialname(re.getOfficialname());
|
||||
rr.setOpenairecompatibility(mapCodeLabel(re.getOpenairecompatibility()));
|
||||
rr.setPid(asPid(re.getPid()));
|
||||
rr.setProjectTitle(rr.getProjectTitle());
|
||||
rr.setWebsiteurl(re.getWebsiteurl());
|
||||
rr.setProjectTitle(re.getProjectTitle());
|
||||
rr.setPublisher(re.getPublisher());
|
||||
rr.setResulttype(mapQualifier(re.getResulttype()));
|
||||
rr.setTitle(Optional.ofNullable(re.getTitle()).map(StructuredProperty::getValue).orElse(null));
|
||||
|
||||
if (relation.getValidated() == null) {
|
||||
relation.setValidated(false);
|
||||
}
|
||||
if (ModelConstants.OUTCOME.equals(relation.getSubRelType())
|
||||
&& StringUtils.isNotBlank(relation.getValidationDate())) {
|
||||
rr.setValidationDate(relation.getValidationDate());
|
||||
}
|
||||
|
||||
return rr;
|
||||
}
|
||||
|
||||
|
@ -266,6 +289,7 @@ public class ProvisionModelSupport {
|
|||
ds.setOfficialname(mapField(d.getOfficialname()));
|
||||
ds.setDescription(mapField(d.getDescription()));
|
||||
ds.setJournal(mapJournal(d.getJournal()));
|
||||
ds.setWebsiteurl(mapField(d.getWebsiteurl()));
|
||||
ds.setLogourl(mapField(d.getLogourl()));
|
||||
ds.setAccessinfopackage(mapFieldList(d.getAccessinfopackage()));
|
||||
ds.setCertificates(mapField(d.getCertificates()));
|
||||
|
@ -311,6 +335,7 @@ public class ProvisionModelSupport {
|
|||
ds.setSubjects(asSubjectSP(d.getSubjects()));
|
||||
ds.setSubmissionpolicyurl(d.getSubmissionpolicyurl());
|
||||
ds.setThematic(d.getThematic());
|
||||
ds.setContentpolicies(mapCodeLabel(d.getContentpolicies()));
|
||||
ds.setVersioncontrol(d.getVersioncontrol());
|
||||
ds.setVersioning(mapField(d.getVersioning()));
|
||||
|
||||
|
@ -326,6 +351,7 @@ public class ProvisionModelSupport {
|
|||
rs.setOtherTitles(getOtherTitles(r.getTitle()));
|
||||
rs.setDescription(mapFieldList(r.getDescription()));
|
||||
rs.setSubject(asSubject(r.getSubject()));
|
||||
rs.setLanguage(asLanguage(r.getLanguage()));
|
||||
rs.setPublicationdate(mapField(r.getDateofacceptance()));
|
||||
rs.setPublisher(mapField(r.getPublisher()));
|
||||
rs.setEmbargoenddate(mapField(r.getEmbargoenddate()));
|
||||
|
@ -341,17 +367,17 @@ public class ProvisionModelSupport {
|
|||
rs.setCountry(asCountry(r.getCountry()));
|
||||
rs.setEoscifguidelines(asEOSCIF(r.getEoscifguidelines()));
|
||||
|
||||
rs.setGreen(r.getIsGreen());
|
||||
rs.setIsGreen(r.getIsGreen());
|
||||
rs
|
||||
.setOpenAccessColor(
|
||||
Optional
|
||||
.ofNullable(r.getOpenAccessColor())
|
||||
.map(color -> OpenAccessColor.valueOf(color.toString()))
|
||||
.orElse(null));
|
||||
rs.setInDiamondJournal(r.getIsInDiamondJournal());
|
||||
rs.setIsInDiamondJournal(r.getIsInDiamondJournal());
|
||||
rs.setPubliclyFunded(r.getPubliclyFunded());
|
||||
rs.setTransformativeAgreement(r.getTransformativeAgreement());
|
||||
|
||||
rs.setExternalReference(mapExternalReference(r.getExternalReference()));
|
||||
rs.setInstance(mapInstances(r.getInstance()));
|
||||
|
||||
if (r instanceof Publication) {
|
||||
|
@ -375,6 +401,13 @@ public class ProvisionModelSupport {
|
|||
return rs;
|
||||
}
|
||||
|
||||
private static Language asLanguage(Qualifier lang) {
|
||||
return Optional
|
||||
.ofNullable(lang)
|
||||
.map(q -> Language.newInstance(q.getClassid(), q.getClassname()))
|
||||
.orElse(null);
|
||||
}
|
||||
|
||||
@Nullable
|
||||
private static List<String> getOtherTitles(List<StructuredProperty> titleList) {
|
||||
return Optional
|
||||
|
@ -422,7 +455,7 @@ public class ProvisionModelSupport {
|
|||
Instance i = new Instance();
|
||||
i.setCollectedfrom(asProvenance(instance.getCollectedfrom()));
|
||||
i.setHostedby(asProvenance(instance.getHostedby()));
|
||||
i.setFulltext(i.getFulltext());
|
||||
i.setFulltext(instance.getFulltext());
|
||||
i.setPid(asPid(instance.getPid()));
|
||||
i.setAlternateIdentifier(asPid(instance.getAlternateIdentifier()));
|
||||
i.setAccessright(mapAccessRight(instance.getAccessright()));
|
||||
|
@ -453,7 +486,8 @@ public class ProvisionModelSupport {
|
|||
private static AccessRight mapAccessRight(eu.dnetlib.dhp.schema.oaf.AccessRight accessright) {
|
||||
return AccessRight
|
||||
.newInstance(
|
||||
mapQualifier(accessright),
|
||||
accessright.getClassid(),
|
||||
accessright.getClassname(),
|
||||
Optional
|
||||
.ofNullable(accessright.getOpenAccessRoute())
|
||||
.map(route -> OpenAccessRoute.valueOf(route.toString()))
|
||||
|
@ -508,7 +542,46 @@ public class ProvisionModelSupport {
|
|||
}
|
||||
|
||||
private static Provenance asProvenance(KeyValue keyValue) {
|
||||
return Optional.ofNullable(keyValue).map(cf -> Provenance.newInstance(cf.getKey(), cf.getValue())).orElse(null);
|
||||
return Optional
|
||||
.ofNullable(keyValue)
|
||||
.map(
|
||||
kv -> Provenance
|
||||
.newInstance(
|
||||
StringUtils.substringAfter(kv.getKey(), IdentifierFactory.ID_PREFIX_SEPARATOR),
|
||||
kv.getValue()))
|
||||
.orElse(null);
|
||||
}
|
||||
|
||||
private static List<Measure> mapMeasures(List<eu.dnetlib.dhp.schema.oaf.Measure> measures) {
|
||||
return Optional
|
||||
.ofNullable(measures)
|
||||
.map(
|
||||
ml -> ml
|
||||
.stream()
|
||||
.map(m -> Measure.newInstance(m.getId(), mapCodeLabelKV(m.getUnit())))
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(null);
|
||||
}
|
||||
|
||||
private static List<ExternalReference> mapExternalReference(
|
||||
List<eu.dnetlib.dhp.schema.oaf.ExternalReference> externalReference) {
|
||||
return Optional
|
||||
.ofNullable(externalReference)
|
||||
.map(
|
||||
ext -> ext
|
||||
.stream()
|
||||
.map(
|
||||
e -> ExternalReference
|
||||
.newInstance(
|
||||
e.getSitename(),
|
||||
e.getLabel(),
|
||||
e.getAlternateLabel(),
|
||||
e.getUrl(),
|
||||
mapCodeLabel(e.getQualifier()),
|
||||
e.getRefidentifier(),
|
||||
e.getQuery()))
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(Lists.newArrayList());
|
||||
}
|
||||
|
||||
private static List<Context> asContext(List<eu.dnetlib.dhp.schema.oaf.Context> ctxList,
|
||||
|
@ -529,7 +602,7 @@ public class ProvisionModelSupport {
|
|||
}
|
||||
|
||||
return Optional
|
||||
.ofNullable(contexts)
|
||||
.of(contexts)
|
||||
.map(
|
||||
ctx -> ctx
|
||||
.stream()
|
||||
|
@ -581,7 +654,14 @@ public class ProvisionModelSupport {
|
|||
.map(
|
||||
pids -> pids
|
||||
.stream()
|
||||
.map(p -> Pid.newInstance(p.getQualifier().getClassname(), p.getValue()))
|
||||
.filter(p -> Objects.nonNull(p.getQualifier()))
|
||||
.filter(p -> Objects.nonNull(p.getQualifier().getClassid()))
|
||||
.map(
|
||||
p -> Pid
|
||||
.newInstance(
|
||||
p.getValue(),
|
||||
p.getQualifier().getClassid(),
|
||||
p.getQualifier().getClassname()))
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(null);
|
||||
}
|
||||
|
@ -607,7 +687,9 @@ public class ProvisionModelSupport {
|
|||
.stream()
|
||||
.filter(s -> Objects.nonNull(s.getQualifier()))
|
||||
.filter(s -> Objects.nonNull(s.getQualifier().getClassname()))
|
||||
.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassname()))
|
||||
.map(
|
||||
s -> Subject
|
||||
.newInstance(s.getValue(), s.getQualifier().getClassid(), s.getQualifier().getClassname()))
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(null);
|
||||
}
|
||||
|
@ -620,7 +702,9 @@ public class ProvisionModelSupport {
|
|||
.stream()
|
||||
.filter(s -> Objects.nonNull(s.getQualifier()))
|
||||
.filter(s -> Objects.nonNull(s.getQualifier().getClassname()))
|
||||
.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassname()))
|
||||
.map(
|
||||
s -> Subject
|
||||
.newInstance(s.getValue(), s.getQualifier().getClassid(), s.getQualifier().getClassname()))
|
||||
.collect(Collectors.toList()))
|
||||
.orElse(null);
|
||||
}
|
||||
|
@ -689,7 +773,7 @@ public class ProvisionModelSupport {
|
|||
private static CodeLabel mapCodeLabel(KeyValue kv) {
|
||||
return Optional
|
||||
.ofNullable(kv)
|
||||
.map(q -> CodeLabel.newInstance(kv.getKey(), kv.getValue()))
|
||||
.map(k -> CodeLabel.newInstance(k.getKey(), k.getValue()))
|
||||
.orElse(null);
|
||||
}
|
||||
|
||||
|
|
|
@ -219,6 +219,13 @@ public class XmlRecordFactory implements Serializable {
|
|||
if (entity.getMeasures() != null) {
|
||||
metadata.addAll(measuresAsXml(entity.getMeasures()));
|
||||
}
|
||||
if (entity.getContext() != null) {
|
||||
contexts.addAll(entity.getContext().stream().map(Context::getId).collect(Collectors.toList()));
|
||||
/* FIXME: Workaround for CLARIN mining issue: #3670#note-29 */
|
||||
if (contexts.contains("dh-ch::subcommunity::2")) {
|
||||
contexts.add("clarin");
|
||||
}
|
||||
}
|
||||
|
||||
if (ModelSupport.isResult(type)) {
|
||||
final Result r = (Result) entity;
|
||||
|
@ -245,14 +252,6 @@ public class XmlRecordFactory implements Serializable {
|
|||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
||||
if (r.getContext() != null) {
|
||||
contexts.addAll(r.getContext().stream().map(c -> c.getId()).collect(Collectors.toList()));
|
||||
/* FIXME: Workaround for CLARIN mining issue: #3670#note-29 */
|
||||
if (contexts.contains("dh-ch::subcommunity::2")) {
|
||||
contexts.add("clarin");
|
||||
}
|
||||
}
|
||||
|
||||
if (r.getTitle() != null) {
|
||||
metadata
|
||||
.addAll(
|
||||
|
@ -1315,7 +1314,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
instance
|
||||
.getCollectedfrom()
|
||||
.stream()
|
||||
.filter(cf -> kvNotBlank(cf))
|
||||
.filter(XmlRecordFactory::kvNotBlank)
|
||||
.map(cf -> XmlSerializationUtils.mapKeyValue("collectedfrom", cf))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
@ -1326,7 +1325,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
instance
|
||||
.getHostedby()
|
||||
.stream()
|
||||
.filter(hb -> kvNotBlank(hb))
|
||||
.filter(XmlRecordFactory::kvNotBlank)
|
||||
.map(hb -> XmlSerializationUtils.mapKeyValue("hostedby", hb))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
@ -1336,7 +1335,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
instance
|
||||
.getDateofacceptance()
|
||||
.stream()
|
||||
.filter(d -> isNotBlank(d))
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.map(d -> XmlSerializationUtils.asXmlElement("dateofacceptance", d))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
@ -1346,7 +1345,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
instance
|
||||
.getInstancetype()
|
||||
.stream()
|
||||
.filter(t -> !StringUtils.isNotBlank(t.getClassid()))
|
||||
.filter(t -> StringUtils.isNotBlank(t.getClassid()))
|
||||
.map(t -> XmlSerializationUtils.mapQualifier("instancetype", t))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
@ -1356,7 +1355,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
instance
|
||||
.getDistributionlocation()
|
||||
.stream()
|
||||
.filter(d -> isNotBlank(d))
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.map(d -> XmlSerializationUtils.asXmlElement("distributionlocation", d))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
@ -1409,7 +1408,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
instance
|
||||
.getLicense()
|
||||
.stream()
|
||||
.filter(d -> isNotBlank(d))
|
||||
.filter(StringUtils::isNotBlank)
|
||||
.map(d -> XmlSerializationUtils.asXmlElement("license", d))
|
||||
.collect(Collectors.toList()));
|
||||
}
|
||||
|
@ -1540,11 +1539,16 @@ public class XmlRecordFactory implements Serializable {
|
|||
.min(new RefereedComparator())
|
||||
.orElse(XmlInstance.UNKNOWN_REVIEW_LEVEL));
|
||||
|
||||
Map<String, Qualifier> instanceTypes = Maps.newHashMap();
|
||||
|
||||
instances.forEach(p -> {
|
||||
final Instance i = p.getRight();
|
||||
instance.getCollectedfrom().add(i.getCollectedfrom());
|
||||
instance.getHostedby().add(i.getHostedby());
|
||||
instance.getInstancetype().add(i.getInstancetype());
|
||||
|
||||
if (Optional.ofNullable(i.getInstancetype()).map(Qualifier::getClassid).isPresent()) {
|
||||
instanceTypes.putIfAbsent(i.getInstancetype().getClassid(), i.getInstancetype());
|
||||
}
|
||||
instance
|
||||
.setProcessingchargeamount(
|
||||
Optional.ofNullable(i.getProcessingchargeamount()).map(apc -> apc.getValue()).orElse(null));
|
||||
|
@ -1571,6 +1575,8 @@ public class XmlRecordFactory implements Serializable {
|
|||
.ifPresent(instance::setFulltext);
|
||||
});
|
||||
|
||||
instance.getInstancetype().addAll(instanceTypes.values());
|
||||
|
||||
if (instance.getHostedby().size() > 1
|
||||
&& instance.getHostedby().stream().anyMatch(hb -> ModelConstants.UNKNOWN_REPOSITORY.equals(hb))) {
|
||||
instance.getHostedby().remove(ModelConstants.UNKNOWN_REPOSITORY);
|
||||
|
@ -1596,9 +1602,7 @@ public class XmlRecordFactory implements Serializable {
|
|||
private List<String> buildContexts(final String type, final Set<String> contexts) {
|
||||
final List<String> res = Lists.newArrayList();
|
||||
|
||||
if (contextMapper != null
|
||||
&& !contextMapper.isEmpty()
|
||||
&& MainEntityType.result.toString().equals(type)) {
|
||||
if (contextMapper != null && !contextMapper.isEmpty()) {
|
||||
|
||||
XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot");
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
},
|
||||
{
|
||||
"paramName": "f",
|
||||
"paramLongName": "format",
|
||||
"paramLongName": "shadowFormat",
|
||||
"paramDescription": "MDFormat name found in the IS profile",
|
||||
"paramRequired": true
|
||||
},
|
||||
|
|
|
@ -13,8 +13,8 @@
|
|||
},
|
||||
{
|
||||
"paramName": "f",
|
||||
"paramLongName": "format",
|
||||
"paramDescription": "MDFormat name found in the IS profile",
|
||||
"paramLongName": "shadowFormat",
|
||||
"paramDescription": "MDFormat name found in the IS profile bound to the shadow index collection to feed",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
|
|
|
@ -5,12 +5,6 @@
|
|||
"paramDescription": "the URL to the ISLookUp Service",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "f",
|
||||
"paramLongName": "format",
|
||||
"paramDescription": "metadata format profile name",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "a",
|
||||
"paramLongName": "action",
|
||||
|
@ -28,5 +22,18 @@
|
|||
"paramLongName": "commit",
|
||||
"paramDescription": "should the action be followed by a commit?",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "pf",
|
||||
"paramLongName": "publicFormat",
|
||||
"paramDescription": "the name of the public metadata format profile - used to create an alias",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "sf",
|
||||
"paramLongName": "shadowFormat",
|
||||
"paramDescription": "the name of the shadow metadata format profile - used to create an alias",
|
||||
"paramRequired": false
|
||||
}
|
||||
|
||||
]
|
|
@ -35,7 +35,7 @@
|
|||
<description>maximum number of relations allowed for a each entity grouping by target</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>format</name>
|
||||
<name>shadowFormat</name>
|
||||
<description>metadata format name (DMF|TMF)</description>
|
||||
</property>
|
||||
<property>
|
||||
|
@ -133,6 +133,7 @@
|
|||
<case to="create_payloads">${wf:conf('resumeFrom') eq 'create_payloads'}</case>
|
||||
<case to="drop_solr_collection">${wf:conf('resumeFrom') eq 'drop_solr_collection'}</case>
|
||||
<case to="to_solr_index">${wf:conf('resumeFrom') eq 'to_solr_index'}</case>
|
||||
<case to="update_solr_aliases">${wf:conf('resumeFrom') eq 'update_solr_aliases'}</case>
|
||||
<default to="prepare_relations"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
@ -641,8 +642,8 @@
|
|||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--format</arg><arg>${format}</arg>
|
||||
<arg>--action</arg><arg>DELETE_BY_QUERY</arg>
|
||||
<arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
|
||||
<arg>--query</arg><arg>${solrDeletionQuery}</arg>
|
||||
<arg>--commit</arg><arg>true</arg>
|
||||
</java>
|
||||
|
@ -672,7 +673,7 @@
|
|||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/xml_json</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--format</arg><arg>${format}</arg>
|
||||
<arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
|
||||
<arg>--batchSize</arg><arg>${batchSize}</arg>
|
||||
</spark>
|
||||
<ok to="commit_solr_collection"/>
|
||||
|
@ -689,7 +690,7 @@
|
|||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--format</arg><arg>${format}</arg>
|
||||
<arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
|
||||
<arg>--action</arg><arg>COMMIT</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
|
@ -714,12 +715,31 @@
|
|||
</spark-opts>
|
||||
<arg>--inputPath</arg><arg>${workingDir}/xml_json</arg>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--format</arg><arg>${format}</arg>
|
||||
<arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
|
||||
<arg>--outputPath</arg><arg>${workingDir}/solr_documents</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<!-- Action that updates the Solr aliases - executed out of order, only when selected via the 'resumeFrom' parameter -->
|
||||
<action name="update_solr_aliases">
|
||||
<java>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
</configuration>
|
||||
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
|
||||
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
|
||||
<arg>--action</arg><arg>UPDATE_ALIASES</arg>
|
||||
<arg>--publicFormat</arg><arg>${publicFormat}</arg>
|
||||
<arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -5,6 +5,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
|
|||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Arrays;
|
||||
|
@ -16,6 +17,9 @@ import javax.xml.transform.TransformerException;
|
|||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.solr.client.solrj.util.ClientUtils;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentException;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
@ -34,7 +38,6 @@ import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
|
|||
|
||||
/**
|
||||
* This test can be used to produce a record that can be manually fed to Solr in XML format.
|
||||
*
|
||||
* The input is a JoinedEntity, i.e. a json representation of an OpenAIRE entity that embeds all the linked entities.
|
||||
*/
|
||||
public class IndexRecordTransformerTest {
|
||||
|
@ -54,7 +57,7 @@ public class IndexRecordTransformerTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testPublicationRecordTransformation() throws IOException, TransformerException {
|
||||
public void testPublicationRecordTransformation() throws IOException, TransformerException, DocumentException {
|
||||
|
||||
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
|
||||
PayloadConverterJob.schemaLocation);
|
||||
|
@ -71,11 +74,15 @@ public class IndexRecordTransformerTest {
|
|||
new RelatedEntityWrapper(rel,
|
||||
CreateRelatedEntitiesJob_phase1.asRelatedEntity(pj, Project.class))));
|
||||
|
||||
final String record = xmlRecordFactory.build(je);
|
||||
final String xmlRecord = xmlRecordFactory.build(je);
|
||||
|
||||
assertNotNull(record);
|
||||
assertNotNull(xmlRecord);
|
||||
|
||||
testRecordTransformation(record);
|
||||
Document doc = new SAXReader().read(new StringReader(xmlRecord));
|
||||
|
||||
assertEquals("Article", doc.valueOf("//children/instance/instancetype/@classname"));
|
||||
|
||||
testRecordTransformation(xmlRecord);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
|
@ -4,16 +4,20 @@ package eu.dnetlib.dhp.oa.provision;
|
|||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import org.apache.solr.client.solrj.request.SolrPing;
|
||||
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
|
||||
import org.apache.solr.client.solrj.response.SolrPingResponse;
|
||||
import org.apache.solr.client.solrj.response.UpdateResponse;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
class SolrAdminApplicationTest extends SolrTest {
|
||||
|
||||
@Test
|
||||
void testPing() throws Exception {
|
||||
SolrPingResponse pingResponse = miniCluster.getSolrClient().ping();
|
||||
final SolrPing ping = new SolrPing();
|
||||
ping.getParams().set("collection", ProvisionConstants.SHADOW_ALIAS_NAME);
|
||||
SolrPingResponse pingResponse = ping.process(miniCluster.getSolrClient());
|
||||
|
||||
log.info("pingResponse: '{}'", pingResponse.getStatus());
|
||||
assertEquals(0, pingResponse.getStatus());
|
||||
}
|
||||
|
@ -24,7 +28,7 @@ class SolrAdminApplicationTest extends SolrTest {
|
|||
SolrAdminApplication admin = new SolrAdminApplication(miniCluster.getSolrClient().getZkHost());
|
||||
|
||||
UpdateResponse rsp = (UpdateResponse) admin
|
||||
.execute(SolrAdminApplication.Action.DELETE_BY_QUERY, DEFAULT_COLLECTION, "*:*", false);
|
||||
.execute(SolrAdminApplication.Action.DELETE_BY_QUERY, "*:*", false, null, SHADOW_COLLECTION);
|
||||
|
||||
assertEquals(0, rsp.getStatus());
|
||||
}
|
||||
|
@ -34,9 +38,30 @@ class SolrAdminApplicationTest extends SolrTest {
|
|||
|
||||
SolrAdminApplication admin = new SolrAdminApplication(miniCluster.getSolrClient().getZkHost());
|
||||
|
||||
UpdateResponse rsp = (UpdateResponse) admin.commit(DEFAULT_COLLECTION);
|
||||
UpdateResponse rsp = (UpdateResponse) admin.commit(SHADOW_COLLECTION);
|
||||
|
||||
assertEquals(0, rsp.getStatus());
|
||||
}
|
||||
|
||||
@Test
|
||||
void testAdminApplication_CREATE_ALIAS() throws Exception {
|
||||
|
||||
SolrAdminApplication admin = new SolrAdminApplication(miniCluster.getSolrClient().getZkHost());
|
||||
|
||||
CollectionAdminResponse rsp = (CollectionAdminResponse) admin
|
||||
.createAlias(ProvisionConstants.PUBLIC_ALIAS_NAME, SHADOW_COLLECTION);
|
||||
assertEquals(0, rsp.getStatus());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
void testAdminApplication_DELETE_ALIAS() throws Exception {
|
||||
|
||||
SolrAdminApplication admin = new SolrAdminApplication(miniCluster.getSolrClient().getZkHost());
|
||||
|
||||
CollectionAdminResponse rsp = (CollectionAdminResponse) admin.deleteAlias(ProvisionConstants.PUBLIC_ALIAS_NAME);
|
||||
assertEquals(0, rsp.getStatus());
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,21 +1,40 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.nio.file.Path;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.solr.client.solrj.SolrQuery;
|
||||
import org.apache.solr.client.solrj.embedded.JettyConfig;
|
||||
import org.apache.solr.client.solrj.impl.CloudSolrClient;
|
||||
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
|
||||
import org.apache.solr.client.solrj.request.ConfigSetAdminRequest;
|
||||
import org.apache.solr.client.solrj.request.QueryRequest;
|
||||
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
|
||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||
import org.apache.solr.cloud.MiniSolrCloudCluster;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.params.CollectionParams;
|
||||
import org.apache.solr.common.params.CommonParams;
|
||||
import org.apache.solr.common.params.CoreAdminParams;
|
||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.Mockito;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
|
||||
|
@ -23,7 +42,18 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
|||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
public class SolrConfigExploreTest extends SolrExploreTest {
|
||||
public class SolrConfigExploreTest {
|
||||
|
||||
protected static final Logger log = LoggerFactory.getLogger(SolrConfigExploreTest.class);
|
||||
|
||||
protected static final String SHADOW_FORMAT = "c1";
|
||||
protected static final String SHADOW_COLLECTION = SHADOW_FORMAT + "-index-openaire";
|
||||
protected static final String PUBLIC_FORMAT = "c2";
|
||||
protected static final String PUBLIC_COLLECTION = PUBLIC_FORMAT + "-index-openaire";
|
||||
|
||||
protected static final String CONFIG_NAME = "testConfig";
|
||||
|
||||
protected static SolrAdminApplication admin;
|
||||
|
||||
protected static SparkSession spark;
|
||||
|
||||
|
@ -35,15 +65,17 @@ public class SolrConfigExploreTest extends SolrExploreTest {
|
|||
@Mock
|
||||
private ISLookupClient isLookupClient;
|
||||
|
||||
@TempDir
|
||||
public static Path workingDir;
|
||||
|
||||
protected static MiniSolrCloudCluster miniCluster;
|
||||
|
||||
@BeforeEach
|
||||
public void prepareMocks() throws ISLookUpException, IOException {
|
||||
isLookupClient.setIsLookup(isLookUpService);
|
||||
|
||||
int solrPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort();
|
||||
|
||||
Mockito
|
||||
.when(isLookupClient.getDsId(Mockito.anyString()))
|
||||
.thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
|
||||
Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", solrPort));
|
||||
Mockito
|
||||
.when(isLookupClient.getLayoutSource(Mockito.anyString()))
|
||||
|
@ -54,7 +86,7 @@ public class SolrConfigExploreTest extends SolrExploreTest {
|
|||
}
|
||||
|
||||
@BeforeAll
|
||||
public static void before() {
|
||||
public static void setup() throws Exception {
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.setAppName(XmlIndexingJobTest.class.getSimpleName());
|
||||
|
@ -70,15 +102,75 @@ public class SolrConfigExploreTest extends SolrExploreTest {
|
|||
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(XmlIndexingJobTest.class.getSimpleName())
|
||||
.appName(SolrConfigExploreTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
|
||||
// random unassigned HTTP port
|
||||
final int jettyPort = 0;
|
||||
final JettyConfig jettyConfig = JettyConfig.builder().setPort(jettyPort).build();
|
||||
|
||||
log.info(String.format("working directory: %s", workingDir.toString()));
|
||||
System.setProperty("solr.log.dir", workingDir.resolve("logs").toString());
|
||||
|
||||
// create a MiniSolrCloudCluster instance
|
||||
miniCluster = new MiniSolrCloudCluster(2, workingDir.resolve("solr"), jettyConfig);
|
||||
|
||||
// Upload Solr configuration directory to ZooKeeper
|
||||
String solrZKConfigDir = "src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/exploreTestConfig";
|
||||
File configDir = new File(solrZKConfigDir);
|
||||
|
||||
miniCluster.uploadConfigSet(configDir.toPath(), CONFIG_NAME);
|
||||
|
||||
// override settings in the solrconfig include
|
||||
System.setProperty("solr.tests.maxBufferedDocs", "100000");
|
||||
System.setProperty("solr.tests.maxIndexingThreads", "-1");
|
||||
System.setProperty("solr.tests.ramBufferSizeMB", "100");
|
||||
|
||||
// use non-test classes so RandomizedRunner isn't necessary
|
||||
System.setProperty("solr.tests.mergeScheduler", "org.apache.lucene.index.ConcurrentMergeScheduler");
|
||||
System.setProperty("solr.directoryFactory", "solr.RAMDirectoryFactory");
|
||||
System.setProperty("solr.lock.type", "single");
|
||||
|
||||
log.info(new ConfigSetAdminRequest.List().process(miniCluster.getSolrClient()).toString());
|
||||
log
|
||||
.info(
|
||||
CollectionAdminRequest.ClusterStatus
|
||||
.getClusterStatus()
|
||||
.process(miniCluster.getSolrClient())
|
||||
.toString());
|
||||
|
||||
NamedList<Object> res = createCollection(
|
||||
miniCluster.getSolrClient(), SHADOW_COLLECTION, 4, 2, 20, CONFIG_NAME);
|
||||
res.forEach(o -> log.info(o.toString()));
|
||||
|
||||
// miniCluster.getSolrClient().setDefaultCollection(SHADOW_COLLECTION);
|
||||
|
||||
res = createCollection(
|
||||
miniCluster.getSolrClient(), PUBLIC_COLLECTION, 4, 2, 20, CONFIG_NAME);
|
||||
res.forEach(o -> log.info(o.toString()));
|
||||
|
||||
admin = new SolrAdminApplication(miniCluster.getZkClient().getZkServerAddress());
|
||||
CollectionAdminResponse rsp = (CollectionAdminResponse) admin
|
||||
.createAlias(ProvisionConstants.PUBLIC_ALIAS_NAME, PUBLIC_COLLECTION);
|
||||
assertEquals(0, rsp.getStatus());
|
||||
rsp = (CollectionAdminResponse) admin.createAlias(ProvisionConstants.SHADOW_ALIAS_NAME, SHADOW_COLLECTION);
|
||||
assertEquals(0, rsp.getStatus());
|
||||
|
||||
log
|
||||
.info(
|
||||
CollectionAdminRequest.ClusterStatus
|
||||
.getClusterStatus()
|
||||
.process(miniCluster.getSolrClient())
|
||||
.toString());
|
||||
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void tearDown() {
|
||||
public static void tearDown() throws Exception {
|
||||
spark.stop();
|
||||
miniCluster.shutdown();
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -86,8 +178,10 @@ public class SolrConfigExploreTest extends SolrExploreTest {
|
|||
|
||||
String inputPath = "src/test/resources/eu/dnetlib/dhp/oa/provision/xml";
|
||||
|
||||
new XmlIndexingJob(spark, inputPath, FORMAT, batchSize).run(isLookupClient);
|
||||
Assertions.assertEquals(0, miniCluster.getSolrClient().commit().getStatus());
|
||||
new XmlIndexingJob(spark, inputPath, SHADOW_FORMAT, ProvisionConstants.SHADOW_ALIAS_NAME, batchSize)
|
||||
.run(isLookupClient);
|
||||
Assertions
|
||||
.assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus());
|
||||
|
||||
String[] queryStrings = {
|
||||
"cancer",
|
||||
|
@ -101,14 +195,14 @@ public class SolrConfigExploreTest extends SolrExploreTest {
|
|||
query.add(CommonParams.Q, q);
|
||||
query.set("debugQuery", "on");
|
||||
|
||||
log.info("Submit query to Solr with params: {}", query.toString());
|
||||
QueryResponse rsp = miniCluster.getSolrClient().query(query);
|
||||
log.info("Submit query to Solr with params: {}", query);
|
||||
QueryResponse rsp = miniCluster.getSolrClient().query(ProvisionConstants.SHADOW_ALIAS_NAME, query);
|
||||
// System.out.println(rsp.getHighlighting());
|
||||
// System.out.println(rsp.getExplainMap());
|
||||
|
||||
for (SolrDocument doc : rsp.getResults()) {
|
||||
System.out
|
||||
.println(
|
||||
log
|
||||
.info(
|
||||
doc.get("score") + "\t" +
|
||||
doc.get("__indexrecordidentifier") + "\t" +
|
||||
doc.get("resultidentifier") + "\t" +
|
||||
|
@ -122,4 +216,18 @@ public class SolrConfigExploreTest extends SolrExploreTest {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected static NamedList<Object> createCollection(CloudSolrClient client, String name, int numShards,
|
||||
int replicationFactor, int maxShardsPerNode, String configName) throws Exception {
|
||||
ModifiableSolrParams modParams = new ModifiableSolrParams();
|
||||
modParams.set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name());
|
||||
modParams.set("name", name);
|
||||
modParams.set("numShards", numShards);
|
||||
modParams.set("replicationFactor", replicationFactor);
|
||||
modParams.set("collection.configName", configName);
|
||||
modParams.set("maxShardsPerNode", maxShardsPerNode);
|
||||
QueryRequest request = new QueryRequest(modParams);
|
||||
request.setPath("/admin/collections");
|
||||
return client.request(request);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,24 +2,15 @@
|
|||
package eu.dnetlib.dhp.oa.provision;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.net.URI;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.solr.client.solrj.SolrQuery;
|
||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrInputField;
|
||||
import org.apache.solr.common.params.CommonParams;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.dom4j.io.SAXReader;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.Mock;
|
||||
|
@ -50,9 +41,6 @@ public class SolrConfigTest extends SolrTest {
|
|||
|
||||
int solrPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort();
|
||||
|
||||
Mockito
|
||||
.when(isLookupClient.getDsId(Mockito.anyString()))
|
||||
.thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
|
||||
Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", solrPort));
|
||||
Mockito
|
||||
.when(isLookupClient.getLayoutSource(Mockito.anyString()))
|
||||
|
@ -95,9 +83,10 @@ public class SolrConfigTest extends SolrTest {
|
|||
|
||||
String inputPath = "src/test/resources/eu/dnetlib/dhp/oa/provision/xml";
|
||||
|
||||
new XmlIndexingJob(spark, inputPath, FORMAT, batchSize)
|
||||
new XmlIndexingJob(spark, inputPath, SHADOW_FORMAT, ProvisionConstants.SHADOW_ALIAS_NAME, batchSize)
|
||||
.run(isLookupClient);
|
||||
Assertions.assertEquals(0, miniCluster.getSolrClient().commit().getStatus());
|
||||
Assertions
|
||||
.assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus());
|
||||
|
||||
String[] queryStrings = {
|
||||
"cancer",
|
||||
|
@ -109,8 +98,8 @@ public class SolrConfigTest extends SolrTest {
|
|||
SolrQuery query = new SolrQuery();
|
||||
query.add(CommonParams.Q, q);
|
||||
|
||||
log.info("Submit query to Solr with params: {}", query.toString());
|
||||
QueryResponse rsp = miniCluster.getSolrClient().query(query);
|
||||
log.info("Submit query to Solr with params: {}", query);
|
||||
QueryResponse rsp = miniCluster.getSolrClient().query(ProvisionConstants.SHADOW_ALIAS_NAME, query);
|
||||
|
||||
for (SolrDocument doc : rsp.getResults()) {
|
||||
System.out
|
||||
|
|
|
@ -34,58 +34,6 @@ public abstract class SolrExploreTest {
|
|||
@TempDir
|
||||
public static Path workingDir;
|
||||
|
||||
@BeforeAll
|
||||
public static void setup() throws Exception {
|
||||
|
||||
// random unassigned HTTP port
|
||||
final int jettyPort = 0;
|
||||
final JettyConfig jettyConfig = JettyConfig.builder().setPort(jettyPort).build();
|
||||
|
||||
log.info(String.format("working directory: %s", workingDir.toString()));
|
||||
System.setProperty("solr.log.dir", workingDir.resolve("logs").toString());
|
||||
|
||||
// create a MiniSolrCloudCluster instance
|
||||
miniCluster = new MiniSolrCloudCluster(2, workingDir.resolve("solr"), jettyConfig);
|
||||
|
||||
// Upload Solr configuration directory to ZooKeeper
|
||||
String solrZKConfigDir = "src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/exploreTestConfig";
|
||||
File configDir = new File(solrZKConfigDir);
|
||||
|
||||
miniCluster.uploadConfigSet(configDir.toPath(), CONFIG_NAME);
|
||||
|
||||
// override settings in the solrconfig include
|
||||
System.setProperty("solr.tests.maxBufferedDocs", "100000");
|
||||
System.setProperty("solr.tests.maxIndexingThreads", "-1");
|
||||
System.setProperty("solr.tests.ramBufferSizeMB", "100");
|
||||
|
||||
// use non-test classes so RandomizedRunner isn't necessary
|
||||
System.setProperty("solr.tests.mergeScheduler", "org.apache.lucene.index.ConcurrentMergeScheduler");
|
||||
System.setProperty("solr.directoryFactory", "solr.RAMDirectoryFactory");
|
||||
System.setProperty("solr.lock.type", "single");
|
||||
|
||||
log.info(new ConfigSetAdminRequest.List().process(miniCluster.getSolrClient()).toString());
|
||||
log
|
||||
.info(
|
||||
CollectionAdminRequest.ClusterStatus
|
||||
.getClusterStatus()
|
||||
.process(miniCluster.getSolrClient())
|
||||
.toString());
|
||||
|
||||
NamedList<Object> res = createCollection(
|
||||
miniCluster.getSolrClient(), DEFAULT_COLLECTION, 4, 2, 20, CONFIG_NAME);
|
||||
res.forEach(o -> log.info(o.toString()));
|
||||
|
||||
miniCluster.getSolrClient().setDefaultCollection(DEFAULT_COLLECTION);
|
||||
|
||||
log
|
||||
.info(
|
||||
CollectionAdminRequest.ClusterStatus
|
||||
.getClusterStatus()
|
||||
.process(miniCluster.getSolrClient())
|
||||
.toString());
|
||||
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void shutDown() throws Exception {
|
||||
miniCluster.shutdown();
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Path;
|
||||
|
||||
|
@ -10,6 +12,7 @@ import org.apache.solr.client.solrj.impl.CloudSolrClient;
|
|||
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
|
||||
import org.apache.solr.client.solrj.request.ConfigSetAdminRequest;
|
||||
import org.apache.solr.client.solrj.request.QueryRequest;
|
||||
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
|
||||
import org.apache.solr.cloud.MiniSolrCloudCluster;
|
||||
import org.apache.solr.common.params.CollectionParams;
|
||||
import org.apache.solr.common.params.CoreAdminParams;
|
||||
|
@ -21,14 +24,21 @@ import org.junit.jupiter.api.io.TempDir;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import sun.security.provider.SHA;
|
||||
|
||||
public abstract class SolrTest {
|
||||
|
||||
protected static final Logger log = LoggerFactory.getLogger(SolrTest.class);
|
||||
|
||||
protected static final String FORMAT = "test";
|
||||
protected static final String DEFAULT_COLLECTION = FORMAT + "-index-openaire";
|
||||
protected static final String SHADOW_FORMAT = "c1";
|
||||
protected static final String SHADOW_COLLECTION = SHADOW_FORMAT + "-index-openaire";
|
||||
protected static final String PUBLIC_FORMAT = "c2";
|
||||
protected static final String PUBLIC_COLLECTION = PUBLIC_FORMAT + "-index-openaire";
|
||||
|
||||
protected static final String CONFIG_NAME = "testConfig";
|
||||
|
||||
protected static SolrAdminApplication admin;
|
||||
|
||||
protected static MiniSolrCloudCluster miniCluster;
|
||||
|
||||
@TempDir
|
||||
|
@ -72,10 +82,21 @@ public abstract class SolrTest {
|
|||
.toString());
|
||||
|
||||
NamedList<Object> res = createCollection(
|
||||
miniCluster.getSolrClient(), DEFAULT_COLLECTION, 4, 2, 20, CONFIG_NAME);
|
||||
miniCluster.getSolrClient(), SHADOW_COLLECTION, 4, 2, 20, CONFIG_NAME);
|
||||
res.forEach(o -> log.info(o.toString()));
|
||||
|
||||
miniCluster.getSolrClient().setDefaultCollection(DEFAULT_COLLECTION);
|
||||
// miniCluster.getSolrClient().setDefaultCollection(SHADOW_COLLECTION);
|
||||
|
||||
res = createCollection(
|
||||
miniCluster.getSolrClient(), PUBLIC_COLLECTION, 4, 2, 20, CONFIG_NAME);
|
||||
res.forEach(o -> log.info(o.toString()));
|
||||
|
||||
admin = new SolrAdminApplication(miniCluster.getZkClient().getZkServerAddress());
|
||||
CollectionAdminResponse rsp = (CollectionAdminResponse) admin
|
||||
.createAlias(ProvisionConstants.PUBLIC_ALIAS_NAME, PUBLIC_COLLECTION);
|
||||
assertEquals(0, rsp.getStatus());
|
||||
rsp = (CollectionAdminResponse) admin.createAlias(ProvisionConstants.SHADOW_ALIAS_NAME, SHADOW_COLLECTION);
|
||||
assertEquals(0, rsp.getStatus());
|
||||
|
||||
log
|
||||
.info(
|
||||
|
@ -83,12 +104,12 @@ public abstract class SolrTest {
|
|||
.getClusterStatus()
|
||||
.process(miniCluster.getSolrClient())
|
||||
.toString());
|
||||
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void shutDown() throws Exception {
|
||||
miniCluster.shutdown();
|
||||
admin.close();
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
}
|
||||
|
||||
|
|
|
@ -10,6 +10,7 @@ import java.util.Optional;
|
|||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.solr.client.solrj.SolrQuery;
|
||||
import org.apache.solr.client.solrj.SolrResponse;
|
||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||
import org.apache.solr.common.params.CommonParams;
|
||||
import org.apache.spark.SparkConf;
|
||||
|
@ -50,9 +51,6 @@ public class XmlIndexingJobTest extends SolrTest {
|
|||
|
||||
int solrPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort();
|
||||
|
||||
Mockito
|
||||
.when(isLookupClient.getDsId(Mockito.anyString()))
|
||||
.thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
|
||||
Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", solrPort));
|
||||
Mockito
|
||||
.when(isLookupClient.getLayoutSource(Mockito.anyString()))
|
||||
|
@ -103,46 +101,72 @@ public class XmlIndexingJobTest extends SolrTest {
|
|||
|
||||
long nRecord = records.count();
|
||||
|
||||
new XmlIndexingJob(spark, inputPath, FORMAT, batchSize).run(isLookupClient);
|
||||
new XmlIndexingJob(spark, inputPath, SHADOW_FORMAT, ProvisionConstants.SHADOW_ALIAS_NAME, batchSize)
|
||||
.run(isLookupClient);
|
||||
|
||||
assertEquals(0, miniCluster.getSolrClient().commit().getStatus());
|
||||
assertEquals(0, miniCluster.getSolrClient().commit(SHADOW_COLLECTION).getStatus());
|
||||
|
||||
QueryResponse rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "*:*"));
|
||||
QueryResponse rsp = miniCluster
|
||||
.getSolrClient()
|
||||
.query(
|
||||
ProvisionConstants.SHADOW_ALIAS_NAME,
|
||||
new SolrQuery().add(CommonParams.Q, "*:*"));
|
||||
|
||||
assertEquals(
|
||||
nRecord, rsp.getResults().getNumFound(),
|
||||
"the number of indexed records should be equal to the number of input records");
|
||||
|
||||
rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "isgreen:true"));
|
||||
rsp = miniCluster
|
||||
.getSolrClient()
|
||||
.query(
|
||||
ProvisionConstants.SHADOW_ALIAS_NAME,
|
||||
new SolrQuery().add(CommonParams.Q, "isgreen:true"));
|
||||
assertEquals(
|
||||
0, rsp.getResults().getNumFound(),
|
||||
4, rsp.getResults().getNumFound(),
|
||||
"the number of indexed records having isgreen = true");
|
||||
|
||||
rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "openaccesscolor:bronze"));
|
||||
rsp = miniCluster
|
||||
.getSolrClient()
|
||||
.query(
|
||||
ProvisionConstants.SHADOW_ALIAS_NAME,
|
||||
new SolrQuery().add(CommonParams.Q, "openaccesscolor:bronze"));
|
||||
assertEquals(
|
||||
0, rsp.getResults().getNumFound(),
|
||||
2, rsp.getResults().getNumFound(),
|
||||
"the number of indexed records having openaccesscolor = bronze");
|
||||
|
||||
rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "isindiamondjournal:true"));
|
||||
rsp = miniCluster
|
||||
.getSolrClient()
|
||||
.query(
|
||||
ProvisionConstants.SHADOW_ALIAS_NAME,
|
||||
new SolrQuery().add(CommonParams.Q, "isindiamondjournal:true"));
|
||||
assertEquals(
|
||||
0, rsp.getResults().getNumFound(),
|
||||
"the number of indexed records having isindiamondjournal = true");
|
||||
|
||||
rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "publiclyfunded:true"));
|
||||
rsp = miniCluster
|
||||
.getSolrClient()
|
||||
.query(
|
||||
ProvisionConstants.SHADOW_ALIAS_NAME,
|
||||
new SolrQuery().add(CommonParams.Q, "publiclyfunded:true"));
|
||||
assertEquals(
|
||||
0, rsp.getResults().getNumFound(),
|
||||
"the number of indexed records having publiclyfunded = true");
|
||||
|
||||
rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "peerreviewed:true"));
|
||||
rsp = miniCluster
|
||||
.getSolrClient()
|
||||
.query(
|
||||
ProvisionConstants.SHADOW_ALIAS_NAME,
|
||||
new SolrQuery().add(CommonParams.Q, "peerreviewed:true"));
|
||||
assertEquals(
|
||||
0, rsp.getResults().getNumFound(),
|
||||
35, rsp.getResults().getNumFound(),
|
||||
"the number of indexed records having peerreviewed = true");
|
||||
|
||||
rsp = miniCluster
|
||||
.getSolrClient()
|
||||
.query(
|
||||
ProvisionConstants.SHADOW_ALIAS_NAME,
|
||||
new SolrQuery()
|
||||
.add(CommonParams.Q, "objidentifier:\"iddesignpres::ae77e56e84ad058d9e7f19fa2f7325db\"")
|
||||
.add(CommonParams.Q, "objidentifier:\"57a035e5b1ae::236d6d8c1e03368b5ae72acfeeb11bbc\"")
|
||||
.add(CommonParams.FL, "__json"));
|
||||
assertEquals(
|
||||
1, rsp.getResults().getNumFound(),
|
||||
|
@ -158,6 +182,22 @@ public class XmlIndexingJobTest extends SolrTest {
|
|||
|
||||
log.info((String) json.get());
|
||||
|
||||
admin
|
||||
.execute(
|
||||
SolrAdminApplication.Action.UPDATE_ALIASES, null, false,
|
||||
SHADOW_COLLECTION, PUBLIC_COLLECTION);
|
||||
|
||||
rsp = miniCluster
|
||||
.getSolrClient()
|
||||
.query(
|
||||
ProvisionConstants.PUBLIC_ALIAS_NAME,
|
||||
new SolrQuery()
|
||||
.add(CommonParams.Q, "objidentifier:\"57a035e5b1ae::236d6d8c1e03368b5ae72acfeeb11bbc\"")
|
||||
.add(CommonParams.FL, "__json"));
|
||||
|
||||
assertEquals(
|
||||
1, rsp.getResults().getNumFound(),
|
||||
"the number of indexed records having the given identifier, found in the public collection");
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.provision;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
@ -22,6 +21,7 @@ import com.google.common.collect.Lists;
|
|||
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
|
||||
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
|
||||
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.ContextDef;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
|
||||
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
@ -51,7 +51,7 @@ public class XmlRecordFactoryTest {
|
|||
|
||||
assertNotNull(doc);
|
||||
|
||||
// System.out.println(doc.asXML());
|
||||
System.out.println(doc.asXML());
|
||||
|
||||
assertEquals("0000-0001-9613-6638", doc.valueOf("//creator[@rank = '1']/@orcid"));
|
||||
assertEquals("0000-0001-9613-6639", doc.valueOf("//creator[@rank = '1']/@orcid_pending"));
|
||||
|
@ -267,4 +267,39 @@ public class XmlRecordFactoryTest {
|
|||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test_AKA_project() throws DocumentException, IOException {
|
||||
final ContextMapper contextMapper = new ContextMapper();
|
||||
|
||||
contextMapper
|
||||
.put("dh-ch", new ContextDef("dh-ch", "Digital Humanities and Cultural Heritage", "context", "community"));
|
||||
contextMapper.put("dh-ch::projects", new ContextDef("dh-ch::projects", "DH-CH Projects", "category", ""));
|
||||
contextMapper
|
||||
.put("dh-ch::projects::2", new ContextDef("dh-ch::projects::2", "ARIADNE", "concept", "community"));
|
||||
|
||||
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
|
||||
PayloadConverterJob.schemaLocation);
|
||||
|
||||
final Project p = OBJECT_MAPPER
|
||||
.readValue(
|
||||
IOUtils.toString(getClass().getResourceAsStream("project_aka.json")),
|
||||
Project.class);
|
||||
|
||||
assertNotNull(p.getContext());
|
||||
assertEquals(1, p.getContext().size());
|
||||
assertEquals("dh-ch::projects::2", p.getContext().get(0).getId());
|
||||
|
||||
final String xml = xmlRecordFactory.build(new JoinedEntity(p));
|
||||
|
||||
assertNotNull(xml);
|
||||
|
||||
final Document doc = new SAXReader().read(new StringReader(xml));
|
||||
|
||||
assertNotNull(doc);
|
||||
|
||||
assertEquals("dh-ch", doc.valueOf("//context/@id"));
|
||||
assertEquals("dh-ch::projects", doc.valueOf("//context/category/@id"));
|
||||
assertEquals("dh-ch::projects::2", doc.valueOf("//context/category/concept/@id"));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -68,15 +68,12 @@
|
|||
<FIELD copy="true" indexable="false" name="externalreflabel" result="false" stat="false" tokenizable="true" xpath="distinct-values(//*[local-name()='entity']/*//children/externalreference/label)"/>
|
||||
<FIELD copy="true" indexable="true" name="resultidentifier" result="false" stat="false" type="string_ci" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/children/instance/webresource/*[local-name()='url'])"/>
|
||||
<FIELD copy="true" indexable="false" name="resultsource" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/source)"/>
|
||||
|
||||
<FIELD indexable="true" multivalued="false" name="isgreen" result="false" stat="false" type="boolean" value="//*[local-name()='entity']/*[local-name()='result']/isgreen"/>
|
||||
<FIELD indexable="true" multivalued="false" name="openaccesscolor" result="false" stat="false" tokenizable="false" value="//*[local-name()='entity']/*[local-name()='result']/openaccesscolor"/>
|
||||
<FIELD indexable="true" multivalued="false" name="isindiamondjournal" result="false" stat="false" type="boolean" value="//*[local-name()='entity']/*[local-name()='result']/isindiamondjournal"/>
|
||||
<FIELD indexable="true" multivalued="false" name="publiclyfunded" result="false" stat="false" type="boolean" value="//*[local-name()='entity']/*[local-name()='result']/publiclyfunded"/>
|
||||
|
||||
<FIELD indexable="true" multivalued="false" name="peerreviewed" result="false" stat="false" type="boolean" value="some $refereed in //*[local-name()='entity']/*[local-name()='result']/children/instance/*[local-name()='refereed']/@classid satisfies ($refereed = '0001')"/>
|
||||
|
||||
|
||||
<FIELD indexable="true" multivalued="false" name="haslicense" result="false" stat="false" type="boolean" value="some $license in //*[local-name()='entity']/*[local-name()='result']/children/instance/*[local-name()='license']/text() satisfies (string-length($license) > 0)"/>
|
||||
<FIELD indexable="true" name="eoscifguidelines" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name() = 'result']/eoscifguidelines/@code)"/><!-- FOS and SDGs non tokenizable for faceted search-->
|
||||
<FIELD indexable="true" name="fos" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/subject[@classid='FOS'])"/>
|
||||
<FIELD indexable="true" name="foslabel" result="false" stat="false" tokenizable="false" value="concat(./text(), '||', replace(./text(), '^\d+\s', ''))" xpath="//*[local-name()='entity']/*[local-name()='result']/subject[@classid='FOS']"/>
|
||||
|
@ -93,6 +90,7 @@
|
|||
<FIELD indexable="true" name="relorganizationid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='organization'])"/>
|
||||
<FIELD copy="true" indexable="true" name="relorganizationname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalname)"/>
|
||||
<FIELD copy="true" indexable="true" name="relorganizationshortname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalshortname)"/>
|
||||
<FIELD indexable="true" name="relorganization" result="false" stat="false" tokenizable="false" value="distinct-values(concat(./to, '||', ./legalname))" xpath="//*[local-name()='entity']/*//rel[./to/@type='organization']"/>
|
||||
<FIELD indexable="true" name="relresultid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='publication' or @type='dataset' or @type='software' or @type='otherresearchproduct'])"/>
|
||||
<FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@type)"/>
|
||||
<FIELD indexable="true" name="relclass" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@class)"/>
|
||||
|
@ -122,6 +120,7 @@
|
|||
<FIELD indexable="true" name="categoryid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/context/category/@id)"/>
|
||||
<FIELD indexable="true" name="conceptname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/context/category//concept/@label)"/><!-- new index field for country info from different xpaths for any type of entity -->
|
||||
<FIELD indexable="true" name="country" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/country/@classid | //*[local-name()='entity']/*//rel[./to/@type='organization']/country/@classid | //*[local-name()='entity']//funder/@jurisdiction)"/>
|
||||
<FIELD indexable="true" name="countrynojurisdiction" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/country/@classid | //*[local-name()='entity']/*//rel[./to/@type='organization']/country/@classid)"/>
|
||||
<FIELD indexable="false" name="oafentity" result="true" stat="false" tokenizable="false" xpath="//*[local-name() = 'entity']"/><!-- impact indicators -->
|
||||
<FIELD copy="false" indexable="true" multivalued="false" name="influence" result="false" stat="false" type="pfloat" xpath="//measure[@id='influence']/@score/number()"/>
|
||||
<FIELD copy="false" indexable="true" multivalued="false" name="influence_class" result="false" stat="false" type="string" xpath="//measure[@id='influence']/@class/string()"/>
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -194,228 +194,173 @@
|
|||
<fieldType name="tints" class="solr.TrieIntField" positionIncrementGap="0" docValues="true" multiValued="true" precisionStep="8"/>
|
||||
<fieldType name="tlong" class="solr.TrieLongField" positionIncrementGap="0" docValues="true" precisionStep="8"/>
|
||||
<fieldType name="tlongs" class="solr.TrieLongField" positionIncrementGap="0" docValues="true" multiValued="true" precisionStep="8"/>
|
||||
|
||||
<!-- Indexed fields -->
|
||||
<field name="__all" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="__deleted" type="boolean" default="false" omitNorms="true" omitTermFreqAndPositions="true" indexed="true" stored="false"/>
|
||||
<field name="__dsid" type="string" omitNorms="true" omitTermFreqAndPositions="true" indexed="true" stored="true"/>
|
||||
<field name="__dsversion" type="pdate" omitNorms="true" omitTermFreqAndPositions="true" indexed="true" stored="true"/>
|
||||
<field name="__indexrecordidentifier" type="string" multiValued="false" indexed="true" required="true" stored="true"/>
|
||||
<field name="__result" type="string" docValues="false" multiValued="false" indexed="false" stored="true"/>
|
||||
<field name="__json" type="string" docValues="false" multiValued="false" indexed="false" stored="true"/>
|
||||
<field name="__result" type="string" docValues="false" multiValued="false" indexed="false" stored="true"/>
|
||||
<field name="_root_" type="string" docValues="false" indexed="true" stored="false"/>
|
||||
<field name="_version_" type="long" multiValued="false" indexed="true" stored="true"/>
|
||||
<field name="authorid" type="string_ci" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="authoridtype" type="string_ci" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="categoryid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="categoryname" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="collectedfrom" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="collectedfromdatasourceid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="collectedfromname" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="community" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="communityid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="communityname" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="conceptid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="conceptname" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="contextid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="contextname" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="contexttype" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="country" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourcecompatibilityid" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="datasourcecompatibilityname" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="datasourceenglishname" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="authorid" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="categoryid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="citation_count" type="pint" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="citation_count_class" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="collectedfromdatasourceid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="collectedfromname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="community" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="communityid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="conceptname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="contextid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="contextname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="country" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="countrynojurisdiction" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourcecompatibilityid" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="datasourcecompatibilityname" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="datasourceenglishname" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourcejurisdiction" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourceodcontenttypes" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourceoddescription" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourceodlanguages" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourceodsubjects" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourceofficialname" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourcesubject" type="ngramtext" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourceodcontenttypes" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourceoddescription" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourceodlanguages" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourceodsubjects" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourceofficialname" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourcesubject" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourcethematic" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourcetypename" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="datasourcetypeuiid" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="datasourcetypeuiname" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="dateofcollection" type="pdate" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="deletedbyinference" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="datasourcetypename" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="datasourcetypeuiid" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="datasourcetypeuiname" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="dateofcollection" type="date" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="deletedbyinference" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="eoscdatasourcetype" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="eoscifguidelines" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="eosctype" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="externalrefclass" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="externalrefid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="externalreflabel" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="externalrefsite" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="funder" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="funderid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="funderjurisdiction" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundername" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="funderoriginalname" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundershortname" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel0_description" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel0_id" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel0_name" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel1_description" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel1_id" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel1_name" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel2_description" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel2_id" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel2_name" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="inferenceprovenance" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="inferred" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="instancetypename" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="oafentity" type="string" multiValued="true" indexed="false" stored="false"/>
|
||||
<field name="oaftype" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="objidentifier" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="organizationalternativenames" type="ngramtext" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="organizationcountryname" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="organizationdupid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="organizationecenterprise" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="organizationecinternationalorganization" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="organizationecinternationalorganizationeurinterests" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="organizationeclegalbody" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="organizationeclegalperson" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="organizationecnonprofit" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="organizationecnutscode" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="organizationecresearchorganization" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="organizationecsmevalidated" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="organizationlegalname" type="ngramtext" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="organizationlegalshortname" type="ngramtext" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="originalid" type="string_ci" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="pid" type="string_ci" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="pidclassid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="pidclassname" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="projectacronym" type="ngramtext" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="projectcallidentifier" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectcode" type="ngramtext" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="projectcode_nt" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="projectcontracttypename" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectduration" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectecarticle29_3" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectecsc39" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectenddate" type="pdate" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectendyear" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectkeywords" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="projectoamandatepublications" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectstartdate" type="pdate" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectstartyear" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectsubject" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="projecttitle" type="ngramtext" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="provenanceactionclassid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relclass" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relcollectedfromid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relcollectedfromname" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relcontracttypeid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relcontracttypename" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="reldatasourcecompatibilityid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfunder" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfunderid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfunderjurisdiction" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfundername" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfundershortname" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfundinglevel0_id" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfundinglevel0_name" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfundinglevel1_id" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfundinglevel1_name" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfundinglevel2_id" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfundinglevel2_name" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relinferenceprovenance" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relinferred" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relorganizationcountryid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relorganizationcountryname" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relorganizationid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relorganizationname" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relorganizationshortname" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relproject" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relprojectcode" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relprojectid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relprojectname" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relprojecttitle" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relprovenanceactionclassid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relresultid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relresulttype" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="reltrust" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultacceptanceyear" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="resultaccessright" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultauthor" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultauthor_nt" type="string_ci" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultbestaccessright" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="resultdateofacceptance" type="pdate" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="resultdescription" type="text_en" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultdupid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultembargoenddate" type="pdate" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="resultembargoendyear" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="resulthostingdatasource" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resulthostingdatasourceid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resulthostingdatasourcename" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultidentifier" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultlanguagename" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="resultlicense" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultpublisher" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultsource" type="text_common" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultsubject" type="text_en" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultsubjectclass" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resulttitle" type="text_en" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resulttypeid" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="resulttypename" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="semrelid" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="externalreflabel" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fos" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="foslabel" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="funder" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="funderid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundershortname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel0_description" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel0_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel0_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel1_description" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel1_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel1_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel2_description" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel2_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="fundinglevel2_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="haslicense" type="boolean" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="impulse" type="pint" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="impulse_class" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="influence" type="pfloat" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="influence_class" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="instancetypename" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="isgreen" type="boolean" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="isindiamondjournal" type="boolean" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="oafentity" type="string" docValues="false" multiValued="true" indexed="false" stored="false"/>
|
||||
<field name="oaftype" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="objidentifier" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="openaccesscolor" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="orcidtypevalue" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="organizationalternativenames" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="organizationdupid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="organizationlegalname" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="organizationlegalshortname" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="originalid" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="peerreviewed" type="boolean" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="pid" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="pidclassid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="popularity" type="pfloat" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="popularity_alt" type="pfloat" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="popularity_alt_class" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="popularity_class" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectacronym" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="projectcallidentifier" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectcode" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="projectcode_nt" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="projectduration" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectecsc39" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectenddate" type="date" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectendyear" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectkeywords" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="projectoamandatepublications" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectstartdate" type="date" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projectstartyear" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="projecttitle" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="projecttitle_alternative" type="text_en" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="provenanceactionclassid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="publiclyfunded" type="boolean" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="relclass" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relcontracttypename" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="reldatasourcecompatibilityid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfunder" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfunderid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfundershortname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfundinglevel0_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfundinglevel0_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfundinglevel1_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfundinglevel1_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfundinglevel2_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relfundinglevel2_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relorganizationcountryid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relorganizationcountryname" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relorganizationid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relorganizationname" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relorganizationshortname" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relproject" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relprojectcode" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relprojectid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relprojectname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relprojecttitle" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relresultid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="relresulttype" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultacceptanceyear" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="resultauthor" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultbestaccessright" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="resultdateofacceptance" type="date" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="resultdescription" type="text_en" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultdupid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultembargoenddate" type="date" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="resultembargoendyear" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="resulthostingdatasource" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resulthostingdatasourceid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultidentifier" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultlanguagename" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="resultpublisher" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultsource" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resultsubject" type="text_en" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resulttitle" type="text_en" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="resulttypeid" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="sdg" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="semrelid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="status" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="text" type="text_common" indexed="false" stored="false"/>
|
||||
<field name="trust" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="versioning" type="string" multiValued="true" indexed="true" stored="false"/>
|
||||
<field name="isgreen" type="boolean" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="openaccesscolor" type="string" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="isindiamondjournal" type="boolean" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="publiclyfunded" type="boolean" multiValued="false" indexed="true" stored="false"/>
|
||||
<field name="peerreviewed" type="boolean" multiValued="false" indexed="true" stored="false"/>
|
||||
|
||||
<!-- Copy field definitions follow: -->
|
||||
|
||||
<!-- Data source -->
|
||||
<copyField source="datasourceenglishname" dest="__all"/>
|
||||
<copyField source="datasourceoddescription" dest="__all"/>
|
||||
<copyField source="datasourceodsubjects" dest="__all"/>
|
||||
<copyField source="datasourceofficialname" dest="__all"/>
|
||||
<copyField source="datasourcesubject" dest="__all"/>
|
||||
|
||||
<!-- Organization -->
|
||||
<copyField source="externalreflabel" dest="__all"/>
|
||||
<copyField source="fundinglevel0_description" dest="__all"/>
|
||||
<copyField source="fundinglevel1_description" dest="__all"/>
|
||||
<copyField source="fundinglevel2_description" dest="__all"/>
|
||||
<copyField source="organizationalternativenames" dest="__all"/>
|
||||
<copyField source="organizationecenterprise" dest="__all"/>
|
||||
<copyField source="organizationecinternationalorganization" dest="__all"/>
|
||||
<copyField source="organizationecinternationalorganizationeurinterests" dest="__all"/>
|
||||
<copyField source="organizationeclegalbody" dest="__all"/>
|
||||
<copyField source="organizationeclegalperson" dest="__all"/>
|
||||
<copyField source="organizationecnonprofit" dest="__all"/>
|
||||
<copyField source="organizationecnutscode" dest="__all"/>
|
||||
<copyField source="organizationecresearchorganization" dest="__all"/>
|
||||
<copyField source="organizationecsmevalidated" dest="__all"/>
|
||||
<copyField source="organizationlegalname" dest="__all"/>
|
||||
<copyField source="organizationlegalshortname" dest="__all"/>
|
||||
|
||||
<!-- Project -->
|
||||
<copyField source="projectacronym" dest="__all"/>
|
||||
<copyField source="projectcode" dest="__all"/>
|
||||
<copyField source="projectkeywords" dest="__all"/>
|
||||
<copyField source="projecttitle" dest="__all"/>
|
||||
|
||||
<!-- Result -->
|
||||
<copyField source="resultpublisher" dest="__all"/>
|
||||
<copyField source="resultsource" dest="__all"/>
|
||||
<copyField source="resultidentifier" dest="__all"/>
|
||||
<copyField source="resultauthor" dest="__all"/>
|
||||
<copyField source="resulttitle" dest="__all"/>
|
||||
<copyField source="resultdescription" dest="__all"/>
|
||||
<copyField source="resultsubject" dest="__all"/>
|
||||
<copyField source="resultacceptanceyear" dest="__all"/>
|
||||
|
||||
<!-- Other -->
|
||||
<copyField source="externalreflabel" dest="__all"/>
|
||||
|
||||
<copyField source="fundinglevel0_description" dest="__all"/>
|
||||
<copyField source="fundinglevel1_description" dest="__all"/>
|
||||
<copyField source="fundinglevel2_description" dest="__all"/>
|
||||
|
||||
<copyField source="projecttitle_alternative" dest="__all"/>
|
||||
<copyField source="relcontracttypename" dest="__all"/>
|
||||
<copyField source="relorganizationcountryname" dest="__all"/>
|
||||
<copyField source="relorganizationname" dest="__all"/>
|
||||
<copyField source="relorganizationshortname" dest="__all"/>
|
||||
<copyField source="relprojecttitle" dest="__all"/>
|
||||
<copyField source="relprojectname" dest="__all"/>
|
||||
|
||||
<copyField source="relprojecttitle" dest="__all"/>
|
||||
<copyField source="resultacceptanceyear" dest="__all"/>
|
||||
<copyField source="resultauthor" dest="__all"/>
|
||||
<copyField source="resultdescription" dest="__all"/>
|
||||
<copyField source="resultidentifier" dest="__all"/>
|
||||
<copyField source="resultpublisher" dest="__all"/>
|
||||
<copyField source="resultsource" dest="__all"/>
|
||||
<copyField source="resulttitle" dest="__all"/>
|
||||
</schema>
|
Binary file not shown.
Binary file not shown.
|
@ -63,7 +63,7 @@ function copydb() {
|
|||
start_db_time=$(date +%s)
|
||||
|
||||
# Delete the old DB from Impala cluster (if exists).
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n"
|
||||
|
@ -120,7 +120,7 @@ function copydb() {
|
|||
start_create_schema_time=$(date +%s)
|
||||
|
||||
# create the new database (with the same name)
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||
|
||||
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
||||
|
@ -148,7 +148,7 @@ function copydb() {
|
|||
exit 5
|
||||
fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
|
||||
else
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||
|
@ -182,7 +182,7 @@ function copydb() {
|
|||
new_num_of_views_to_retry=0
|
||||
|
||||
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
||||
if [ -n "$specific_errors" ]; then
|
||||
echo -e "\nspecific_errors: ${specific_errors}\n"
|
||||
|
@ -212,7 +212,7 @@ function copydb() {
|
|||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||
done
|
||||
|
||||
entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
|
||||
entities_on_impala=(`impala-shell -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
|
||||
echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n"
|
||||
|
||||
start_compute_stats_time=$(date +%s)
|
||||
|
@ -222,9 +222,9 @@ function copydb() {
|
|||
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n"
|
||||
|
|
|
@ -63,7 +63,7 @@ function copydb() {
|
|||
start_db_time=$(date +%s)
|
||||
|
||||
# Delete the old DB from Impala cluster (if exists).
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n"
|
||||
|
@ -120,7 +120,7 @@ function copydb() {
|
|||
start_create_schema_time=$(date +%s)
|
||||
|
||||
# create the new database (with the same name)
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||
|
||||
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
||||
|
@ -148,7 +148,7 @@ function copydb() {
|
|||
exit 5
|
||||
fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
|
||||
else
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||
|
@ -182,7 +182,7 @@ function copydb() {
|
|||
new_num_of_views_to_retry=0
|
||||
|
||||
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
||||
if [ -n "$specific_errors" ]; then
|
||||
echo -e "\nspecific_errors: ${specific_errors}\n"
|
||||
|
@ -212,7 +212,7 @@ function copydb() {
|
|||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||
done
|
||||
|
||||
entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
|
||||
entities_on_impala=(`impala-shell -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
|
||||
echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n"
|
||||
|
||||
start_compute_stats_time=$(date +%s)
|
||||
|
@ -222,9 +222,9 @@ function copydb() {
|
|||
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n"
|
||||
|
|
|
@ -63,7 +63,7 @@ function copydb() {
|
|||
start_db_time=$(date +%s)
|
||||
|
||||
# Delete the old DB from Impala cluster (if exists).
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n"
|
||||
|
@ -120,7 +120,7 @@ function copydb() {
|
|||
start_create_schema_time=$(date +%s)
|
||||
|
||||
# create the new database (with the same name)
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||
|
||||
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
||||
|
@ -148,7 +148,7 @@ function copydb() {
|
|||
exit 5
|
||||
fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
|
||||
else
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||
|
@ -182,7 +182,7 @@ function copydb() {
|
|||
new_num_of_views_to_retry=0
|
||||
|
||||
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
||||
if [ -n "$specific_errors" ]; then
|
||||
echo -e "\nspecific_errors: ${specific_errors}\n"
|
||||
|
@ -212,7 +212,7 @@ function copydb() {
|
|||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||
done
|
||||
|
||||
entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
|
||||
entities_on_impala=(`impala-shell -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
|
||||
echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n"
|
||||
|
||||
start_compute_stats_time=$(date +%s)
|
||||
|
@ -222,9 +222,9 @@ function copydb() {
|
|||
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n"
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
<workflow-app name="Graph Stats" xmlns="uri:oozie:workflow:0.5">
|
||||
<workflow-app name="Promote Graph Stats" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>stats_db_name</name>
|
||||
|
|
|
@ -65,7 +65,7 @@ function copydb() {
|
|||
start_db_time=$(date +%s)
|
||||
|
||||
# Delete the old DB from Impala cluster (if exists).
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n"
|
||||
|
@ -122,7 +122,7 @@ function copydb() {
|
|||
start_create_schema_time=$(date +%s)
|
||||
|
||||
# create the new database (with the same name)
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "create database ${db}"
|
||||
|
||||
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
|
||||
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
|
||||
|
@ -150,7 +150,7 @@ function copydb() {
|
|||
exit 5
|
||||
fi # This error is not FATAL, do we do not return from this function, in normal circumstances.
|
||||
else
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
|
||||
|
@ -184,7 +184,7 @@ function copydb() {
|
|||
new_num_of_views_to_retry=0
|
||||
|
||||
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so wee need to capture them and put them in a file, in order to perform "grep" on them later
|
||||
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
|
||||
if [ -n "$specific_errors" ]; then
|
||||
echo -e "\nspecific_errors: ${specific_errors}\n"
|
||||
|
@ -214,7 +214,7 @@ function copydb() {
|
|||
previous_num_of_views_to_retry=$new_num_of_views_to_retry
|
||||
done
|
||||
|
||||
entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
|
||||
entities_on_impala=(`impala-shell -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
|
||||
echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n"
|
||||
|
||||
start_compute_stats_time=$(date +%s)
|
||||
|
@ -224,9 +224,9 @@ function copydb() {
|
|||
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
|
||||
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
|
||||
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
|
||||
sleep 1
|
||||
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
|
||||
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
|
||||
if [ -n "$log_errors" ]; then
|
||||
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n"
|
||||
|
@ -271,8 +271,7 @@ copydb $MONITOR_DB'_institutions'
|
|||
copydb $MONITOR_DB'_ris_tail'
|
||||
|
||||
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
|
||||
for i in ${contexts}
|
||||
do
|
||||
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
|
||||
for i in ${contexts}; do
|
||||
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
|
||||
copydb ${MONITOR_DB}'_'${tmp}
|
||||
done
|
|
@ -6,21 +6,26 @@ then
|
|||
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
|
||||
fi
|
||||
|
||||
export HADOOP_USER_NAME=$3
|
||||
|
||||
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
|
||||
|
||||
function createPDFsAggregated() {
|
||||
db=$1
|
||||
|
||||
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table if exists indi_is_result_accessible";
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -d ${db} -q "drop table if exists indi_is_result_accessible";
|
||||
|
||||
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "create table indi_is_result_accessible stored as parquet as
|
||||
impala-shell -i ${IMPALA_HOSTNAME} -d ${db} -q "create table indi_is_result_accessible stored as parquet as
|
||||
select distinct p.id, coalesce(is_result_accessible, 0) as is_result_accessible from result p
|
||||
left outer join
|
||||
(select id, 1 as is_result_accessible from (select pl.* from result r
|
||||
join pdfaggregation_i.publication p on r.id=p.id
|
||||
join pdfaggregation_i.payload pl on pl.id=p.id
|
||||
union all
|
||||
select pl.* from result r
|
||||
join pdfaggregation_i.publication p on r.id=p.dedupid
|
||||
join pdfaggregation_i.payload pl on pl.id=p.id) foo) tmp on p.id=tmp.id";
|
||||
(select id, 1 as is_result_accessible from (select pl.* from result r
|
||||
join pdfaggregation_i.publication p on r.id=p.id
|
||||
join pdfaggregation_i.payload pl on pl.id=p.id
|
||||
union all
|
||||
select pl.* from result r
|
||||
join pdfaggregation_i.publication p on r.id=p.dedupid
|
||||
join pdfaggregation_i.payload pl on pl.id=p.id) foo)
|
||||
tmp on p.id=tmp.id";
|
||||
}
|
||||
|
||||
STATS_DB=$1
|
||||
|
@ -35,8 +40,7 @@ createPDFsAggregated $MONITOR_DB'_institutions'
|
|||
createPDFsAggregated $MONITOR_DB'_ris_tail'
|
||||
|
||||
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
|
||||
for i in ${contexts}
|
||||
do
|
||||
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
|
||||
for i in ${contexts}; do
|
||||
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
|
||||
createPDFsAggregated ${MONITOR_DB}'_'${tmp}
|
||||
done
|
|
@ -51,49 +51,6 @@
|
|||
<artifactId>hadoop-distcp</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-actionmanager-api</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-actionmanager-common</artifactId>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>saxonica</groupId>
|
||||
<artifactId>saxon</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>saxonica</groupId>
|
||||
<artifactId>saxon-dom</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>jgrapht</groupId>
|
||||
<artifactId>jgrapht</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>net.sf.ehcache</groupId>
|
||||
<artifactId>ehcache</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.springframework</groupId>
|
||||
<artifactId>spring-test</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.apache.*</groupId>
|
||||
<artifactId>*</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>apache</groupId>
|
||||
<artifactId>*</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.httpcomponents</groupId>
|
||||
<artifactId>httpclient</artifactId>
|
||||
|
|
27
pom.xml
27
pom.xml
|
@ -440,29 +440,6 @@
|
|||
<scope>provided</scope>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-actionmanager-common</artifactId>
|
||||
<version>${dnet-actionmanager-common.version}</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-common</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>dnet-actionmanager-api</artifactId>
|
||||
<version>${dnet-actionmanager-api.version}</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>cnr-misc-utils</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>eu.dnetlib</groupId>
|
||||
<artifactId>cnr-rmi-api</artifactId>
|
||||
|
@ -960,7 +937,7 @@
|
|||
<commons.logging.version>1.1.3</commons.logging.version>
|
||||
<commons-validator.version>1.7</commons-validator.version>
|
||||
<dateparser.version>1.0.7</dateparser.version>
|
||||
<dhp-schemas.version>[6.1.3-SNAPSHOT]</dhp-schemas.version>
|
||||
<dhp-schemas.version>[7.0.0]</dhp-schemas.version>
|
||||
<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
|
||||
<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
|
||||
<dhp.guava.version>11.0.2</dhp.guava.version>
|
||||
|
@ -969,8 +946,6 @@
|
|||
<dhp.oozie.version>4.1.0-${dhp.cdh.version}</dhp.oozie.version>
|
||||
<dhp.site.skip>true</dhp.site.skip>
|
||||
<dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
|
||||
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
|
||||
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
|
||||
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
|
||||
<google.gson.version>2.2.2</google.gson.version>
|
||||
<log4j.version>1.2.17</log4j.version>
|
||||
|
|
Loading…
Reference in New Issue