1
0
Fork 0

merged from beta

This commit is contained in:
Claudio Atzori 2024-07-17 12:01:40 +02:00
commit 06e3985b77
83 changed files with 2230 additions and 1109 deletions

View File

@ -328,7 +328,7 @@ public class MergeUtils {
final T merged = mergeOafFields(original, enrich, trust);
merged.setOriginalId(unionDistinctListOfString(merged.getOriginalId(), enrich.getOriginalId()));
merged.setPid(unionDistinctLists(merged.getPid(), enrich.getPid(), trust));
merged.setPid(mergeLists(merged.getPid(), enrich.getPid(), trust, MergeUtils::spKeyExtractor, (p1, p2) -> p1));
merged.setDateofcollection(LocalDateTime.now().toString());
merged
.setDateoftransformation(
@ -464,6 +464,10 @@ public class MergeUtils {
merge.setIsInDiamondJournal(booleanOR(merge.getIsInDiamondJournal(), enrich.getIsInDiamondJournal()));
merge.setPubliclyFunded(booleanOR(merge.getPubliclyFunded(), enrich.getPubliclyFunded()));
if (StringUtils.isBlank(merge.getTransformativeAgreement())) {
merge.setTransformativeAgreement(enrich.getTransformativeAgreement());
}
return merge;
}
@ -655,6 +659,13 @@ public class MergeUtils {
return d1;
}
if (StringUtils.contains(d1.getValue(), "null")) {
return d2;
}
if (StringUtils.contains(d2.getValue(), "null")) {
return d1;
}
return Stream
.of(d1, d2)
.min(

View File

@ -2,31 +2,41 @@
package eu.dnetlib.pace.clustering;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.pace.config.Config;
@ClusteringClass("keywordsclustering")
public class KeywordsClustering extends AbstractClusteringFunction {
@ClusteringClass("legalnameclustering")
public class LegalnameClustering extends AbstractClusteringFunction {
public KeywordsClustering(Map<String, Object> params) {
private static final Pattern CITY_CODE_PATTERN = Pattern.compile("city::\\d+");
private static final Pattern KEYWORD_CODE_PATTERN = Pattern.compile("key::\\d+");
public LegalnameClustering(Map<String, Object> params) {
super(params);
}
/**
 * Collects every distinct substring of {@code input} that matches {@code codeRegex}.
 *
 * @param input the string to scan
 * @param codeRegex the compiled pattern that identifies a code
 * @return the distinct matches; empty when the pattern never matches
 */
public Set<String> getRegexList(String input, Pattern codeRegex) {
    final Set<String> codes = new HashSet<>();
    for (Matcher m = codeRegex.matcher(input); m.find();) {
        codes.add(m.group());
    }
    return codes;
}
@Override
protected Collection<String> doApply(final Config conf, String s) {
// takes city codes and keywords codes without duplicates
Set<String> keywords = getKeywords(s, conf.translationMap(), paramOrDefault("windowSize", 4));
Set<String> cities = getCities(s, paramOrDefault("windowSize", 4));
// list of combination to return as result
final Collection<String> combinations = new LinkedHashSet<String>();
for (String keyword : keywordsToCodes(keywords, conf.translationMap())) {
for (String city : citiesToCodes(cities)) {
for (String keyword : getRegexList(s, KEYWORD_CODE_PATTERN)) {
for (String city : getRegexList(s, CITY_CODE_PATTERN)) {
combinations.add(keyword + "-" + city);
if (combinations.size() >= paramOrDefault("max", 2)) {
return combinations;
@ -42,9 +52,6 @@ public class KeywordsClustering extends AbstractClusteringFunction {
return fields
.stream()
.filter(f -> !f.isEmpty())
.map(KeywordsClustering::cleanup)
.map(KeywordsClustering::normalize)
.map(s -> filterAllStopWords(s))
.map(s -> doApply(conf, s))
.map(c -> filterBlacklisted(c, ngramBlacklist))
.flatMap(c -> c.stream())

View File

@ -27,6 +27,14 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
private static Map<String, String> cityMap = AbstractPaceFunctions
.loadMapFromClasspath("/eu/dnetlib/pace/config/city_map.csv");
// keywords map to be used when translating the keyword names into codes
private static Map<String, String> keywordMap = AbstractPaceFunctions
.loadMapFromClasspath("/eu/dnetlib/pace/config/translation_map.csv");
// country map to be used when inferring the country from the city name
private static Map<String, String> countryMap = AbstractPaceFunctions
.loadCountryMapFromClasspath("/eu/dnetlib/pace/config/country_map.csv");
// list of stopwords in different languages
protected static Set<String> stopwords_gr = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_gr.txt");
protected static Set<String> stopwords_en = loadFromClasspath("/eu/dnetlib/pace/config/stopwords_en.txt");
@ -74,6 +82,64 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
return s12;
}
/**
 * Returns {@code original} unless it is the "unknown" placeholder, in which case a
 * country is looked up from the cities detected in {@code inferFrom}.
 *
 * @param original the current country value; returned as-is when it is not
 *                 "unknown" (compared ignoring case)
 * @param inferFrom the text searched for city names after cleanup, normalization
 *                  and stop-word filtering
 * @return {@code original}, or the first country produced by {@code citiesToCountry},
 *         or {@code "UNKNOWN"} when no country is found
 */
public static String countryInference(final String original, String inferFrom) {
    final boolean countryIsKnown = !original.equalsIgnoreCase("unknown");
    if (countryIsKnown) {
        return original;
    }

    final String prepared = filterAllStopWords(normalize(cleanup(inferFrom)));

    // 4 is passed as the second getCities argument (called windowSize by the comparators in this change)
    return citiesToCountry(getCities(prepared, 4))
        .stream()
        .findFirst()
        .orElse("UNKNOWN");
}
/**
 * Cleans up {@code original} and replaces every detected city name with its code
 * from {@code cityMap}.
 *
 * @param original the raw name
 * @return the cleaned, normalized, stop-word-filtered name with city names
 *         substituted by their codes
 */
public static String cityInference(String original) {
    original = cleanup(original);
    original = normalize(original);
    original = filterAllStopWords(original);

    for (final String city : getCities(original, 4)) {
        final String code = cityMap.get(city);
        // NOTE(review): getCities presumably returns only cityMap keys; the guard avoids an NPE if not
        if (code != null) {
            // literal replacement: replaceAll would read the city name as a regex
            // and the code as a replacement template
            original = original.replace(city, code);
        }
    }
    return original;
}
/**
 * Cleans up {@code original} and replaces every detected keyword with its code
 * from {@code keywordMap}.
 *
 * @param original the raw name
 * @return the cleaned, normalized, stop-word-filtered name with keywords
 *         substituted by their codes
 */
public static String keywordInference(String original) {
    original = cleanup(original);
    original = normalize(original);
    original = filterAllStopWords(original);

    for (final String keyword : getKeywords(original, keywordMap, 4)) {
        final String code = keywordMap.get(keyword);
        // NOTE(review): getKeywords presumably returns only keywordMap keys; the guard avoids an NPE if not
        if (code != null) {
            // literal replacement: replaceAll would read the keyword as a regex
            // and the code as a replacement template
            original = original.replace(keyword, code);
        }
    }
    return original;
}
/**
 * Cleans up {@code original} and replaces detected keywords and city names with
 * their codes from {@code keywordMap} and {@code cityMap}.
 *
 * <p>Both detections run on the cleaned text before any substitution; keywords
 * are then substituted first, followed by cities.
 *
 * @param original the raw name
 * @return the cleaned, normalized, stop-word-filtered name with keywords and
 *         city names substituted by their codes
 */
public static String cityKeywordInference(String original) {
    original = cleanup(original);
    original = normalize(original);
    original = filterAllStopWords(original);

    final Set<String> keywords = getKeywords(original, keywordMap, 4);
    final Set<String> cities = getCities(original, 4);

    // literal replacements: replaceAll would read the names as regexes and the
    // codes as replacement templates
    for (final String keyword : keywords) {
        final String code = keywordMap.get(keyword);
        if (code != null) {
            original = original.replace(keyword, code);
        }
    }
    for (final String city : cities) {
        final String code = cityMap.get(city);
        if (code != null) {
            original = original.replace(city, code);
        }
    }
    return original;
}
protected static String fixXML(final String a) {
return a
@ -208,6 +274,30 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
return m;
}
/**
 * Builds a city-code to country-code map from a classpath resource whose lines
 * have the form {@code country_code;city1;city2;city3}.
 *
 * <p>Each city name is lower-cased, transliterated with the "Any-Eng"
 * transliterator and passed through {@code fixAliases} before being looked up in
 * {@code cityMap}. Cities with no entry in {@code cityMap} are skipped.
 *
 * @param classpath the classpath location of the resource
 * @return the populated map, or an empty map when anything fails while loading
 */
public static Map<String, String> loadCountryMapFromClasspath(final String classpath) {
    final Transliterator transliterator = Transliterator.getInstance("Any-Eng");
    final Map<String, String> m = new HashMap<>();

    // try-with-resources so the classpath stream is closed once read
    try (final java.io.InputStream in = AbstractPaceFunctions.class.getResourceAsStream(classpath)) {
        for (final String s : IOUtils.readLines(in, StandardCharsets.UTF_8)) {
            // string is like this: country_code;city1;city2;city3
            final String[] line = s.split(";");
            final String value = line[0];
            for (int i = 1; i < line.length; i++) {
                final String city = fixAliases(transliterator.transliterate(line[i].toLowerCase()));
                final String code = cityMap.get(city);
                // a city missing from cityMap has no code: do not register it under a null key
                if (code != null) {
                    m.put(code, value);
                }
            }
        }
    } catch (final Throwable e) {
        // best-effort load: any failure yields an empty map, as before
        return new HashMap<>();
    }
    return m;
}
public static String removeKeywords(String s, Set<String> keywords) {
s = " " + s + " ";
@ -237,6 +327,10 @@ public class AbstractPaceFunctions extends PaceCommonUtils {
return toCodes(keywords, cityMap);
}
/**
 * Maps city names to countries in two steps: names to codes through
 * {@code cityMap}, then those codes through {@code countryMap}.
 *
 * @param cities the city names to translate
 * @return the result of applying {@code toCodes} with {@code cityMap} and then
 *         with {@code countryMap}
 */
public static Set<String> citiesToCountry(Set<String> cities) {
    final Set<String> cityCodes = toCodes(cities, cityMap);
    return toCodes(cityCodes, countryMap);
}
/**
 * Returns the first character of {@code s}, lower-cased.
 *
 * @param s the input string
 * @return the lower-cased result of {@code StringUtils.substring(s, 0, 1)}
 */
protected static String firstLC(final String s) {
    final String head = StringUtils.substring(s, 0, 1);
    return head.toLowerCase();
}

View File

@ -47,9 +47,21 @@ public class FieldDef implements Serializable {
private String clean;
private String infer;
private String inferenceFrom;
public FieldDef() {
}
/**
 * @return the configured {@code inferenceFrom} value; may be unset
 */
public String getInferenceFrom() {
    return this.inferenceFrom;
}
/**
 * Sets the {@code inferenceFrom} value of this field definition.
 *
 * @param inferenceFrom the new value
 */
public void setInferenceFrom(final String inferenceFrom) {
    this.inferenceFrom = inferenceFrom;
}
public String getName() {
return name;
}
@ -126,6 +138,14 @@ public class FieldDef implements Serializable {
this.clean = clean;
}
/**
 * @return the configured {@code infer} value; may be unset
 */
public String getInfer() {
    return this.infer;
}
/**
 * Sets the {@code infer} value of this field definition.
 *
 * @param infer the new value
 */
public void setInfer(String infer) {
    this.infer = infer;
}
@Override
public String toString() {
try {

View File

@ -123,9 +123,19 @@ case class SparkModel(conf: DedupConfig) {
case _ => res(index)
}
}
if (StringUtils.isNotBlank(fdef.getInfer)) {
val inferFrom : String = if (StringUtils.isNotBlank(fdef.getInferenceFrom)) fdef.getInferenceFrom else fdef.getPath
res(index) = res(index) match {
case x: Seq[String] => x.map(inference(_, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer))
case _ => inference(res(index).toString, MapDocumentUtil.getJPathString(inferFrom, documentContext), fdef.getInfer)
}
}
}
res
}
new GenericRowWithSchema(values, schema)
@ -146,5 +156,17 @@ case class SparkModel(conf: DedupConfig) {
res
}
/** Applies the inference selected by `infertype` to `value`.
  *
  * Only the "country" inference uses `inferfrom`; "city", "keyword" and
  * "city_keyword" rewrite `value` itself. Any other type returns `value` unchanged.
  *
  * @param value     the field value to process
  * @param inferfrom the text a country is inferred from
  * @param infertype one of "country", "city", "keyword", "city_keyword"
  * @return the inferred or rewritten value
  */
def inference(value: String, inferfrom: String, infertype: String) : String =
  infertype match {
    case "country"      => AbstractPaceFunctions.countryInference(value, inferfrom)
    case "city"         => AbstractPaceFunctions.cityInference(value)
    case "keyword"      => AbstractPaceFunctions.keywordInference(value)
    case "city_keyword" => AbstractPaceFunctions.cityKeywordInference(value)
    case _              => value
  }
}

View File

@ -1,48 +0,0 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
 * Comparator that scores two names by the city codes detected in them.
 *
 * <p>Both inputs are cleaned up, normalized and stripped of stop words before
 * cities are looked up. The optional {@code windowSize} parameter (default
 * {@code "4"}) is passed to the city lookup.
 */
@ComparatorClass("cityMatch")
public class CityMatch extends AbstractStringComparator {

    private Map<String, String> params;

    public CityMatch(Map<String, String> params) {
        super(params);
        this.params = params;
    }

    /**
     * @return {@code 1.0} when neither name contains a city, {@code -1.0} when only
     *         one of them does, otherwise the value of
     *         {@code commonElementsPercentage} over the two code sets
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {
        final String left = filterAllStopWords(normalize(cleanup(a)));
        final String right = filterAllStopWords(normalize(cleanup(b)));

        final int windowSize = Integer.parseInt(params.getOrDefault("windowSize", "4"));

        final Set<String> codes1 = citiesToCodes(getCities(left, windowSize));
        final Set<String> codes2 = citiesToCodes(getCities(right, windowSize));

        final boolean noCities1 = codes1.isEmpty();
        final boolean noCities2 = codes2.isEmpty();

        // if no cities are detected, the comparator gives 1.0
        if (noCities1 && noCities2) {
            return 1.0;
        }
        // undefined if one of the two has no cities
        if (noCities1 != noCities2) {
            return -1.0;
        }
        return commonElementsPercentage(codes1, codes2);
    }
}

View File

@ -0,0 +1,51 @@
package eu.dnetlib.pace.tree;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
 * Comparator that scores two strings by the codes they contain.
 *
 * <p>A code is any substring matching the {@code codeRegex} parameter. When the
 * parameter is absent the default is {@code [a-zA-Z]+::\d+}, which matches a whole
 * alphabetic prefix followed by {@code ::} and digits (for example {@code key::12}).
 */
@ComparatorClass("codeMatch")
public class CodeMatch extends AbstractStringComparator {

    private Map<String, String> params;

    // compiled once per comparator instance
    private final Pattern codePattern;

    public CodeMatch(Map<String, String> params) {
        super(params);
        this.params = params;
        // "+" after the letter class: without it only the last letter of the prefix
        // is captured, so "key::1" and "city::1" would both yield "y::1"
        this.codePattern = Pattern.compile(params.getOrDefault("codeRegex", "[a-zA-Z]+::\\d+"));
    }

    /**
     * Extracts the distinct codes contained in {@code input}.
     *
     * @param input the string to scan
     * @return the distinct substrings matching the configured code pattern
     */
    public Set<String> getRegexList(String input) {
        final Matcher matcher = this.codePattern.matcher(input);
        final Set<String> codes = new HashSet<>();
        while (matcher.find()) {
            codes.add(matcher.group());
        }
        return codes;
    }

    /**
     * @return {@code 1.0} when neither string contains a code, {@code -1.0} when only
     *         one of them does, otherwise the value of
     *         {@code commonElementsPercentage} over the two code sets
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {
        final Set<String> codes1 = getRegexList(a);
        final Set<String> codes2 = getRegexList(b);

        // if no codes are detected, the comparator gives 1.0
        if (codes1.isEmpty() && codes2.isEmpty()) {
            return 1.0;
        }
        // undefined if one of the two has no codes
        if (codes1.isEmpty() ^ codes2.isEmpty()) {
            return -1.0;
        }
        return commonElementsPercentage(codes1, codes2);
    }
}

View File

@ -0,0 +1,54 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
 * Comparator for country values: an exact, case-sensitive match scores
 * {@code 1.0}, a mismatch {@code 0.0}, and a missing or "unknown" value on either
 * side makes the comparison undefined ({@code -1.0}).
 */
@ComparatorClass("countryMatch")
public class CountryMatch extends AbstractStringComparator {

    private Map<String, String> params;

    public CountryMatch(Map<String, String> params) {
        super(params, new com.wcohen.ss.JaroWinkler());
        this.params = params;
    }

    public CountryMatch(final double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    protected CountryMatch(final double weight, final AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * @return {@code -1.0} if either value is empty or equals "unknown" (ignoring
     *         case), {@code 1.0} if the values are equal, {@code 0.0} otherwise
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {
        // checks are evaluated left to right: missing field first, then UNKNOWN country
        final boolean undefined = a.isEmpty()
            || b.isEmpty()
            || a.equalsIgnoreCase("unknown")
            || b.equalsIgnoreCase("unknown");
        if (undefined) {
            return -1.0;
        }
        if (a.equals(b)) {
            return 1.0;
        }
        return 0.0;
    }

    @Override
    public double getWeight() {
        return super.weight;
    }

    /** Identity: the score is returned as-is. */
    @Override
    protected double normalize(final double d) {
        return d;
    }
}

View File

@ -0,0 +1,59 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
 * Jaro-Winkler comparator for legal names that ignores embedded city and keyword
 * codes ({@code city::<digits>}, {@code key::<digits>}).
 */
@ComparatorClass("jaroWinklerLegalname")
public class JaroWinklerLegalname extends AbstractStringComparator {

    // compiled once for the class instead of on every distance() call
    private static final java.util.regex.Pattern CITY_CODE_PATTERN = java.util.regex.Pattern.compile("city::\\d+");
    private static final java.util.regex.Pattern KEYWORD_CODE_PATTERN = java.util.regex.Pattern.compile("key::\\d+");
    private static final java.util.regex.Pattern MULTIPLE_SPACES_PATTERN = java.util.regex.Pattern.compile("[ ]{2,}");

    private Map<String, String> params;

    public JaroWinklerLegalname(Map<String, String> params) {
        super(params, new com.wcohen.ss.JaroWinkler());
        this.params = params;
    }

    public JaroWinklerLegalname(double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    protected JaroWinklerLegalname(double weight, AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * Scores the two names after their codes have been removed.
     *
     * @return {@code 1.0} when both names are empty once the codes are removed,
     *         otherwise the score of the configured string distance
     */
    @Override
    public double distance(String a, String b, final Config conf) {
        final String ca = dropCodesAndCollapseSpaces(a);
        final String cb = dropCodesAndCollapseSpaces(b);

        if (ca.isEmpty() && cb.isEmpty()) {
            return 1.0;
        }
        return normalize(ssalgo.score(ca, cb));
    }

    /**
     * Removes city codes, replaces keyword codes with a single space, then
     * collapses runs of two or more spaces into one.
     */
    private static String dropCodesAndCollapseSpaces(final String name) {
        final String withoutCities = CITY_CODE_PATTERN.matcher(name).replaceAll("");
        final String withoutCodes = KEYWORD_CODE_PATTERN.matcher(withoutCities).replaceAll(" ");
        return MULTIPLE_SPACES_PATTERN.matcher(withoutCodes).replaceAll(" ");
    }

    @Override
    public double getWeight() {
        return super.weight;
    }

    /** Identity: the score is returned as-is. */
    @Override
    protected double normalize(double d) {
        return d;
    }
}

View File

@ -1,74 +0,0 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import com.wcohen.ss.AbstractStringDistance;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
 * Jaro-Winkler comparator that compares what is left of two names once the
 * detected keywords and cities have been removed.
 *
 * <p>The optional {@code windowSize} parameter (default {@code "4"}) is passed to
 * the keyword and city lookups.
 */
@ComparatorClass("jaroWinklerNormalizedName")
public class JaroWinklerNormalizedName extends AbstractStringComparator {

    private Map<String, String> params;

    public JaroWinklerNormalizedName(Map<String, String> params) {
        super(params, new com.wcohen.ss.JaroWinkler());
        this.params = params;
    }

    public JaroWinklerNormalizedName(double weight) {
        super(weight, new com.wcohen.ss.JaroWinkler());
    }

    protected JaroWinklerNormalizedName(double weight, AbstractStringDistance ssalgo) {
        super(weight, ssalgo);
    }

    /**
     * @return {@code 1.0} when both names are empty after keywords and cities are
     *         removed, otherwise the score of the configured string distance
     */
    @Override
    public double distance(String a, String b, final Config conf) {
        String ca = filterAllStopWords(normalize(cleanup(a)));
        String cb = filterAllStopWords(normalize(cleanup(b)));

        final int windowSize = Integer.parseInt(params.getOrDefault("windowSize", "4"));

        final Set<String> keywords1 = getKeywords(ca, conf.translationMap(), windowSize);
        final Set<String> keywords2 = getKeywords(cb, conf.translationMap(), windowSize);

        final Set<String> cities1 = getCities(ca, windowSize);
        final Set<String> cities2 = getCities(cb, windowSize);

        // drop keywords first, then cities, then collapse the gaps they leave behind
        ca = removeKeywords(removeKeywords(ca, keywords1), cities1).replaceAll("[ ]{2,}", " ");
        cb = removeKeywords(removeKeywords(cb, keywords2), cities2).replaceAll("[ ]{2,}", " ");

        if (ca.isEmpty() && cb.isEmpty()) {
            return 1.0;
        }
        return normalize(ssalgo.score(ca, cb));
    }

    @Override
    public double getWeight() {
        return super.weight;
    }

    /** Identity: the score is returned as-is. */
    @Override
    protected double normalize(double d) {
        return d;
    }
}

View File

@ -1,50 +0,0 @@
package eu.dnetlib.pace.tree;
import java.util.Map;
import java.util.Set;
import eu.dnetlib.pace.config.Config;
import eu.dnetlib.pace.tree.support.AbstractStringComparator;
import eu.dnetlib.pace.tree.support.ComparatorClass;
/**
 * Comparator that scores two names by the keyword codes detected in them.
 *
 * <p>Both inputs are cleaned up, normalized and stripped of stop words before
 * keywords are looked up in the configuration's translation map. The optional
 * {@code windowSize} parameter (default {@code "4"}) is passed to the lookup.
 */
@ComparatorClass("keywordMatch")
public class KeywordMatch extends AbstractStringComparator {

    Map<String, String> params;

    public KeywordMatch(Map<String, String> params) {
        super(params);
        this.params = params;
    }

    /**
     * @return {@code 1.0} when neither name contains a keyword, {@code -1.0} when
     *         only one of them does, otherwise the value of
     *         {@code commonElementsPercentage} over the two code sets
     */
    @Override
    public double distance(final String a, final String b, final Config conf) {
        final String left = filterAllStopWords(normalize(cleanup(a)));
        final String right = filterAllStopWords(normalize(cleanup(b)));

        final int windowSize = Integer.parseInt(params.getOrDefault("windowSize", "4"));

        final Set<String> keywords1 = getKeywords(left, conf.translationMap(), windowSize);
        final Set<String> keywords2 = getKeywords(right, conf.translationMap(), windowSize);

        final Set<String> codes1 = toCodes(keywords1, conf.translationMap());
        final Set<String> codes2 = toCodes(keywords2, conf.translationMap());

        final boolean noKeywords1 = codes1.isEmpty();
        final boolean noKeywords2 = codes2.isEmpty();

        // if no keywords are detected, the comparator gives 1.0
        if (noKeywords1 && noKeywords2) {
            return 1.0;
        }
        // undefined if one of the two has no keywords
        if (noKeywords1 != noKeywords2) {
            return -1.0;
        }
        return commonElementsPercentage(codes1, codes2);
    }
}

View File

@ -48,7 +48,7 @@ public class TreeNodeDef implements Serializable {
// function for the evaluation of the node
public TreeNodeStats evaluate(Row doc1, Row doc2, Config conf) {
TreeNodeStats stats = new TreeNodeStats();
TreeNodeStats stats = new TreeNodeStats(ignoreUndefined);
// for each field in the node, it computes the
for (FieldConf fieldConf : fields) {

View File

@ -9,8 +9,11 @@ public class TreeNodeStats implements Serializable {
private Map<String, FieldStats> results; // this is an accumulator for the results of the node
public TreeNodeStats() {
private final boolean ignoreUndefined;
public TreeNodeStats(boolean ignoreUndefined) {
this.results = new HashMap<>();
this.ignoreUndefined = ignoreUndefined;
}
public Map<String, FieldStats> getResults() {
@ -22,7 +25,10 @@ public class TreeNodeStats implements Serializable {
}
/**
 * Counts the fields that contribute to this node's statistics.
 *
 * @return the number of accumulated results when {@code ignoreUndefined} is set;
 *         otherwise that number minus {@code undefinedCount()}
 */
public int fieldsCount() {
    final int total = this.results.size();
    // when undefined results are not ignored, do not count them
    return ignoreUndefined ? total : total - undefinedCount();
}
public int undefinedCount() {
@ -78,12 +84,23 @@ public class TreeNodeStats implements Serializable {
double min = 100.0; // random high value
for (FieldStats fs : this.results.values()) {
if (fs.getResult() < min) {
if (fs.getResult() >= 0.0 || (fs.getResult() == -1 && fs.isCountIfUndefined()))
if (fs.getResult() == -1) {
if (fs.isCountIfUndefined()) {
min = 0.0;
} else {
min = -1;
}
} else {
min = fs.getResult();
}
}
}
if (ignoreUndefined) {
return min == -1.0 ? 0.0 : min;
} else {
return min;
}
}
// if at least one is true, return 1.0
public double or() {
@ -91,8 +108,12 @@ public class TreeNodeStats implements Serializable {
if (fieldStats.getResult() >= fieldStats.getThreshold())
return 1.0;
}
if (!ignoreUndefined && undefinedCount() > 0) {
return -1.0;
} else {
return 0.0;
}
}
// if at least one is false, return 0.0
public double and() {
@ -100,7 +121,7 @@ public class TreeNodeStats implements Serializable {
if (fieldStats.getResult() == -1) {
if (fieldStats.isCountIfUndefined())
return 0.0;
return ignoreUndefined ? 0.0 : -1.0;
} else {
if (fieldStats.getResult() < fieldStats.getThreshold())
return 0.0;

View File

@ -44,12 +44,10 @@ public class TreeProcessor {
TreeNodeStats stats = currentNode.evaluate(doc1, doc2, config);
treeStats.addNodeStats(nextNodeName, stats);
// if ignoreUndefined=false the miss is considered as undefined
if (!currentNode.isIgnoreUndefined() && stats.undefinedCount() > 0) {
double finalScore = stats.getFinalScore(currentNode.getAggregation());
if (finalScore == -1.0)
nextNodeName = currentNode.getUndefined();
}
// if ignoreUndefined=true the miss is ignored and the score computed anyway
else if (stats.getFinalScore(currentNode.getAggregation()) >= currentNode.getThreshold()) {
else if (finalScore >= currentNode.getThreshold()) {
nextNodeName = currentNode.getPositive();
} else {
nextNodeName = currentNode.getNegative();

File diff suppressed because one or more lines are too long

View File

@ -8,6 +8,7 @@ import org.junit.jupiter.api.Test;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.mongodb.connection.Cluster;
import eu.dnetlib.pace.AbstractPaceTest;
import eu.dnetlib.pace.common.AbstractPaceFunctions;
@ -177,41 +178,16 @@ public class ClusteringFunctionTest extends AbstractPaceTest {
}
@Test
public void testKeywordsClustering() {
public void legalnameClustering() {
final ClusteringFunction cf = new KeywordsClustering(params);
final String s = "Polytechnic University of Turin";
final ClusteringFunction cf = new LegalnameClustering(params);
String s = "key::1 key::2 city::1";
System.out.println(s);
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
final String s1 = "POLITECNICO DI TORINO";
System.out.println(s1);
System.out.println(cf.apply(conf, Lists.newArrayList(s1)));
final String s2 = "Universita farmaceutica culturale di milano bergamo";
System.out.println("s2 = " + s2);
System.out.println(cf.apply(conf, Lists.newArrayList(s2)));
final String s3 = "universita universita milano milano";
System.out.println("s3 = " + s3);
System.out.println(cf.apply(conf, Lists.newArrayList(s3)));
final String s4 = "Politechniki Warszawskiej (Warsaw University of Technology)";
System.out.println("s4 = " + s4);
System.out.println(cf.apply(conf, Lists.newArrayList(s4)));
final String s5 = "İstanbul Ticarət Universiteti";
System.out.println("s5 = " + s5);
System.out.println(cf.apply(conf, Lists.newArrayList(s5)));
final String s6 = "National and Kapodistrian University of Athens";
System.out.println("s6 = " + s6);
System.out.println(cf.apply(conf, Lists.newArrayList(s6)));
final String s7 = "Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών";
System.out.println("s7 = " + s7);
System.out.println(cf.apply(conf, Lists.newArrayList(s7)));
s = "key::1 key::2 city::1 city::2";
System.out.println(s);
System.out.println(cf.apply(conf, Lists.newArrayList(s)));
}
@Test

View File

@ -54,4 +54,47 @@ public class PaceFunctionTest extends AbstractPaceFunctions {
System.out.println("Fixed aliases : " + fixAliases(TEST_STRING));
}
/**
 * Checks that countryInference returns an already known country unchanged and,
 * for an UNKNOWN country, returns the country inferred from the given name, or
 * UNKNOWN when the name yields none.
 */
@Test
public void countryInferenceTest() {
assertEquals("IT", countryInference("UNKNOWN", "Università di Bologna"));
// a known country is returned as-is, whatever the name suggests
assertEquals("UK", countryInference("UK", "Università di Bologna"));
assertEquals("IT", countryInference("UNKNOWN", "Universiteé de Naples"));
// a name that yields no country leaves the value UNKNOWN
assertEquals("UNKNOWN", countryInference("UNKNOWN", "Università del Lavoro"));
}
/**
 * Checks that cityInference returns the cleaned name with each detected city
 * replaced by its city:: code, and leaves a name without cities code-free.
 */
@Test
public void cityInferenceTest() {
assertEquals("universita city::3181928", cityInference("Università di Bologna"));
assertEquals("university city::3170647", cityInference("University of Pisa"));
// no city in the name: only the cleaned text remains
assertEquals("universita", cityInference("Università del lavoro"));
// two cities in the same name are both replaced
assertEquals("universita city::3173331 city::3169522", cityInference("Università di Modena e Reggio Emilia"));
}
/**
 * Checks that keywordInference replaces detected keywords with their key:: codes
 * while city names are left as plain text.
 */
@Test
public void keywordInferenceTest() {
assertEquals("key::41 turin", keywordInference("Polytechnic University of Turin"));
assertEquals("key::41 torino", keywordInference("POLITECNICO DI TORINO"));
assertEquals(
"key::1 key::60 key::81 milano bergamo",
keywordInference("Universita farmaceutica culturale di milano bergamo"));
// repeated keywords are each replaced
assertEquals("key::1 key::1 milano milano", keywordInference("universita universita milano milano"));
// Greek input: the expected output is in Latin script
assertEquals(
"key::10 kapodistriako panepistemio athenon",
keywordInference("Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών"));
}
/**
 * Checks that cityKeywordInference replaces both detected keywords (key:: codes)
 * and detected cities (city:: codes) in the cleaned name.
 */
@Test
public void cityKeywordInferenceTest() {
assertEquals("key::41 city::3165524", cityKeywordInference("Polytechnic University of Turin"));
assertEquals("key::41 city::3165524", cityKeywordInference("POLITECNICO DI TORINO"));
assertEquals(
"key::1 key::60 key::81 city::3173435 city::3182164",
cityKeywordInference("Universita farmaceutica culturale di milano bergamo"));
// repeated keywords and cities are each replaced
assertEquals(
"key::1 key::1 city::3173435 city::3173435", cityKeywordInference("universita universita milano milano"));
// Greek input: the expected output is in Latin script
assertEquals(
"key::10 kapodistriako panepistemio city::264371",
cityKeywordInference("Εθνικό και Καποδιστριακό Πανεπιστήμιο Αθηνών"));
}
}

View File

@ -35,6 +35,7 @@ public class ComparatorTest extends AbstractPaceTest {
params.put("name_th", "0.95");
params.put("jpath_value", "$.value");
params.put("jpath_classid", "$.qualifier.classid");
params.put("codeRegex", "key::\\d+");
}
@Test
@ -44,52 +45,23 @@ public class ComparatorTest extends AbstractPaceTest {
}
@Test
public void cityMatchTest() {
final CityMatch cityMatch = new CityMatch(params);
public void codeMatchTest() {
CodeMatch codeMatch = new CodeMatch(params);
// both names with no cities
assertEquals(1.0, cityMatch.distance("Università", "Centro di ricerca", conf));
// both names with no codes
assertEquals(1.0, codeMatch.distance("testing1", "testing2", conf));
// one of the two names with no cities
assertEquals(-1.0, cityMatch.distance("Università di Bologna", "Centro di ricerca", conf));
// one of the two names with no codes
assertEquals(-1.0, codeMatch.distance("testing1 key::1", "testing", conf));
// both names with cities (same)
assertEquals(1.0, cityMatch.distance("Universita di Bologna", "Biblioteca di Bologna", conf));
// both names with codes (same)
assertEquals(1.0, codeMatch.distance("testing1 key::1", "testing2 key::1", conf));
// both names with cities (different)
assertEquals(0.0, cityMatch.distance("Universita di Bologna", "Universita di Torino", conf));
assertEquals(0.0, cityMatch.distance("Franklin College", "Concordia College", conf));
// both names with codes (different)
assertEquals(0.0, codeMatch.distance("testing1 key::1", "testing2 key::2", conf));
// particular cases
assertEquals(1.0, cityMatch.distance("Free University of Bozen-Bolzano", "Università di Bolzano", conf));
assertEquals(
1.0,
cityMatch
.distance(
"Politechniki Warszawskiej (Warsaw University of Technology)", "Warsaw University of Technology",
conf));
// failing because 'Allen' is a transliterated Greek stopword
// assertEquals(-1.0, cityMatch.distance("Allen (United States)", "United States Military Academy", conf));
assertEquals(-1.0, cityMatch.distance("Washington (United States)", "United States Military Academy", conf));
}
@Test
public void keywordMatchTest() {
params.put("threshold", "0.5");
final KeywordMatch keywordMatch = new KeywordMatch(params);
assertEquals(
0.5, keywordMatch.distance("Biblioteca dell'Universita di Bologna", "Università di Bologna", conf));
assertEquals(1.0, keywordMatch.distance("Universita degli studi di Pisa", "Universita di Pisa", conf));
assertEquals(1.0, keywordMatch.distance("Polytechnic University of Turin", "POLITECNICO DI TORINO", conf));
assertEquals(1.0, keywordMatch.distance("Istanbul Commerce University", "İstanbul Ticarət Universiteti", conf));
assertEquals(1.0, keywordMatch.distance("Franklin College", "Concordia College", conf));
assertEquals(2.0 / 3.0, keywordMatch.distance("University of Georgia", "Georgia State University", conf));
assertEquals(0.5, keywordMatch.distance("University College London", "University of London", conf));
assertEquals(0.5, keywordMatch.distance("Washington State University", "University of Washington", conf));
assertEquals(-1.0, keywordMatch.distance("Allen (United States)", "United States Military Academy", conf));
// both names with codes (1 same, 1 different)
assertEquals(0.5, codeMatch.distance("key::1 key::2 testing1", "key::1 testing", conf));
}
@ -155,15 +127,15 @@ public class ComparatorTest extends AbstractPaceTest {
}
@Test
public void jaroWinklerNormalizedNameTest() {
public void jaroWinklerLegalnameTest() {
final JaroWinklerNormalizedName jaroWinklerNormalizedName = new JaroWinklerNormalizedName(params);
final JaroWinklerLegalname jaroWinklerLegalname = new JaroWinklerLegalname(params);
double result = jaroWinklerNormalizedName
.distance("AT&T (United States)", "United States Military Academy", conf);
double result = jaroWinklerLegalname
.distance("AT&T (United States)", "United States key::2 key::1", conf);
System.out.println("result = " + result);
result = jaroWinklerNormalizedName.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
result = jaroWinklerLegalname.distance("NOAA - Servicio Meteorol\\u00f3gico Nacional", "NOAA - NWS", conf);
System.out.println("result = " + result);
}
@ -336,4 +308,23 @@ public class ComparatorTest extends AbstractPaceTest {
System.out.println("compare = " + compare);
}
/**
 * Checks the CountryMatch scores: undefined (-1.0) when a country is UNKNOWN,
 * 0.0 for different countries and 1.0 for identical ones.
 */
@Test
public void countryMatch() {
    final CountryMatch comparator = new CountryMatch(params);

    // an UNKNOWN country on either side makes the comparison undefined
    assertEquals(-1.0, comparator.distance("UNKNOWN", "UNKNOWN", conf));
    assertEquals(-1.0, comparator.distance("CL", "UNKNOWN", conf));

    // two known countries: mismatch, then match
    assertEquals(0.0, comparator.distance("CL", "IT", conf));
    assertEquals(1.0, comparator.distance("CL", "CL", conf));
}
}

View File

@ -51,48 +51,5 @@
<artifactId>hadoop-distcp</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-api</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-common</artifactId>
<exclusions>
<exclusion>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
</exclusion>
<exclusion>
<groupId>saxonica</groupId>
<artifactId>saxon</artifactId>
</exclusion>
<exclusion>
<groupId>saxonica</groupId>
<artifactId>saxon-dom</artifactId>
</exclusion>
<exclusion>
<groupId>jgrapht</groupId>
<artifactId>jgrapht</artifactId>
</exclusion>
<exclusion>
<groupId>net.sf.ehcache</groupId>
<artifactId>ehcache</artifactId>
</exclusion>
<exclusion>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.*</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>apache</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</project>

View File

@ -4,7 +4,6 @@ package eu.dnetlib.dhp.actionmanager;
import java.io.Serializable;
import java.io.StringReader;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
@ -22,7 +21,6 @@ import com.google.common.base.Splitter;
import com.google.common.collect.Iterables;
import com.google.common.collect.Sets;
import eu.dnetlib.actionmanager.rmi.ActionManagerException;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -65,7 +63,7 @@ public class ISClient implements Serializable {
.map(t -> buildDirectory(basePath, t))
.collect(Collectors.toList()))
.orElseThrow(() -> new IllegalStateException("empty set list"));
} catch (ActionManagerException | ISLookUpException e) {
} catch (ISLookUpException e) {
throw new IllegalStateException("unable to query ActionSets info from the IS");
}
}
@ -89,31 +87,18 @@ public class ISClient implements Serializable {
return Joiner.on("/").join(basePath, t.getMiddle(), t.getRight());
}
private String getBasePathHDFS(ISLookUpService isLookup) throws ActionManagerException {
private String getBasePathHDFS(ISLookUpService isLookup) throws ISLookUpException {
return queryServiceProperty(isLookup, "basePath");
}
private String queryServiceProperty(ISLookUpService isLookup, final String propertyName)
throws ActionManagerException {
throws ISLookUpException {
final String q = "for $x in /RESOURCE_PROFILE[.//RESOURCE_TYPE/@value='ActionManagerServiceResourceType'] return $x//SERVICE_PROPERTIES/PROPERTY[./@ key='"
+ propertyName
+ "']/@value/string()";
log.debug("quering for service property: {}", q);
try {
final List<String> value = isLookup.quickSearchProfile(q);
return Iterables.getOnlyElement(value);
} catch (ISLookUpException e) {
String msg = "Error accessing service profile, using query: " + q;
log.error(msg, e);
throw new ActionManagerException(msg, e);
} catch (NoSuchElementException e) {
String msg = "missing service property: " + propertyName;
log.error(msg, e);
throw new ActionManagerException(msg, e);
} catch (IllegalArgumentException e) {
String msg = "found more than one service property: " + propertyName;
log.error(msg, e);
throw new ActionManagerException(msg, e);
}
}
}

View File

@ -42,6 +42,9 @@ public class Constants {
public static final String NULL = "NULL";
public static final String NA = "N/A";
public static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
public static final String WEB_CRAWL_NAME = "Web Crawl";
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private Constants() {

View File

@ -41,9 +41,9 @@ public class PrepareAffiliationRelations implements Serializable {
private static final Logger log = LoggerFactory.getLogger(PrepareAffiliationRelations.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String ID_PREFIX = "50|doi_________::";
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:bipinference";
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by BIP!";
public static final String BIP_INFERENCE_PROVENANCE = "bip:affiliation:crossref";
public static final String BIP_AFFILIATIONS_CLASSID = "result:organization:openaireinference";
public static final String BIP_AFFILIATIONS_CLASSNAME = "Affiliation relation inferred by OpenAIRE";
public static final String BIP_INFERENCE_PROVENANCE = "openaire:affiliation";
public static <I extends Result> void main(String[] args) throws Exception {
@ -71,6 +71,9 @@ public class PrepareAffiliationRelations implements Serializable {
final String dataciteInputPath = parser.get("dataciteInputPath");
log.info("dataciteInputPath: {}", dataciteInputPath);
final String webcrawlInputPath = parser.get("webCrawlInputPath");
log.info("webcrawlInputPath: {}", webcrawlInputPath);
final String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
@ -102,10 +105,16 @@ public class PrepareAffiliationRelations implements Serializable {
JavaPairRDD<Text, Text> dataciteRelations = prepareAffiliationRelations(
spark, dataciteInputPath, collectedFromDatacite);
List<KeyValue> collectedFromWebCrawl = OafMapperUtils
.listKeyValues(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME);
JavaPairRDD<Text, Text> webCrawlRelations = prepareAffiliationRelations(
spark, webcrawlInputPath, collectedFromWebCrawl);
crossrefRelations
.union(pubmedRelations)
.union(openAPCRelations)
.union(dataciteRelations)
.union(webCrawlRelations)
.saveAsHadoopFile(
outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, BZip2Codec.class);

View File

@ -5,7 +5,6 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.Serializable;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
@ -21,6 +20,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.actionmanager.Constants;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.common.ModelConstants;
@ -44,8 +44,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
private static final String PMID_PREFIX = "50|pmid________::";
private static final String PMCID_PREFIX = "50|pmc_________::";
private static final String WEB_CRAWL_ID = "10|openaire____::fb98a192f6a055ba495ef414c330834b";
private static final String WEB_CRAWL_NAME = "Web Crawl";
public static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
@ -104,8 +103,6 @@ public class CreateActionSetFromWebEntries implements Serializable {
final String ror = ROR_PREFIX
+ IdentifierFactory.md5(PidCleaner.normalizePidValue("ROR", row.getAs("ror")));
ret.addAll(createAffiliationRelationPairDOI(row.getAs("doi"), ror));
// ret.addAll(createAffiliationRelationPairPMID(row.getAs("pmid"), ror));
// ret.addAll(createAffiliationRelationPairPMCID(row.getAs("pmcid"), ror));
return ret
.iterator();
@ -145,11 +142,6 @@ public class CreateActionSetFromWebEntries implements Serializable {
"institution.country_code as country_code", "publication_year")
.distinct();
// .selectExpr(
// "id", "doi", "ids.pmcid as pmcid", "ids.pmid as pmid", "institution.ror as ror",
// "institution.country_code as country_code", "publication_year")
// .distinct();
}
private static Dataset<Row> readBlackList(SparkSession spark, String inputPath) {
@ -220,7 +212,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
ModelConstants.IS_AUTHOR_INSTITUTION_OF,
Arrays
.asList(
OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)),
OafMapperUtils
.dataInfo(
false, null, false, false,
@ -239,7 +231,7 @@ public class CreateActionSetFromWebEntries implements Serializable {
ModelConstants.HAS_AUTHOR_INSTITUTION,
Arrays
.asList(
OafMapperUtils.keyValue(WEB_CRAWL_ID, WEB_CRAWL_NAME)),
OafMapperUtils.keyValue(Constants.WEB_CRAWL_ID, Constants.WEB_CRAWL_NAME)),
OafMapperUtils
.dataInfo(
false, null, false, false,

View File

@ -0,0 +1,76 @@
package eu.dnetlib.dhp.collection.plugin.researchfi;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
/**
 * Collector plugin that harvests records from a token-protected, paginated REST endpoint.
 *
 * The plugin first obtains a bearer token from the configured authentication endpoint
 * (client-credentials grant) and then delegates the paginated download to {@link ResearchFiIterator}.
 */
public class ResearchFiCollectorPlugin implements CollectorPlugin {

	private static final Logger log = LoggerFactory.getLogger(ResearchFiCollectorPlugin.class);

	/**
	 * Authenticates against the configured endpoint and returns a sequential stream of the harvested records.
	 *
	 * The following entries are read from {@code api.getParams()}: {@code auth_url}, {@code auth_client_id} and
	 * {@code auth_client_secret}. The records are fetched lazily, while the returned stream is consumed.
	 *
	 * @param api the api descriptor providing the base url and the authentication parameters
	 * @param report the aggregation report (not used by this plugin)
	 * @return a sequential stream over the records produced by {@link ResearchFiIterator}
	 * @throws CollectorException when the access token cannot be obtained
	 */
	@Override
	public Stream<String> collect(final ApiDescriptor api, final AggregatorReport report)
		throws CollectorException {

		final String authUrl = api.getParams().get("auth_url");
		final String clientId = api.getParams().get("auth_client_id");
		final String clientSecret = api.getParams().get("auth_client_secret");

		final String authToken = authenticate(authUrl, clientId, clientSecret);

		final Iterator<String> iter = new ResearchFiIterator(api.getBaseUrl(), authToken);

		return StreamSupport.stream(Spliterators.spliteratorUnknownSize(iter, Spliterator.ORDERED), false);
	}

	/**
	 * Requests an access token by POSTing a {@code client_credentials} grant to the authentication endpoint.
	 *
	 * @param authUrl the url of the token endpoint, must not be blank
	 * @param clientId the value sent as {@code client_id}
	 * @param clientSecret the value sent as {@code client_secret}
	 * @return the non-blank value of the {@code access_token} field of the JSON response
	 * @throws CollectorException when {@code authUrl} is blank, when the request or the parsing of the response
	 *         fails, or when the returned token is blank
	 */
	private String authenticate(final String authUrl, final String clientId, final String clientSecret)
		throws CollectorException {

		// fail early with an explicit message instead of an opaque error raised by the HTTP client
		if (StringUtils.isBlank(authUrl)) {
			throw new CollectorException("Missing required parameter: auth_url");
		}

		try (final CloseableHttpClient client = HttpClients.createDefault()) {
			final HttpPost req = new HttpPost(authUrl);
			final List<NameValuePair> params = new ArrayList<>();
			params.add(new BasicNameValuePair("grant_type", "client_credentials"));
			params.add(new BasicNameValuePair("client_id", clientId));
			params.add(new BasicNameValuePair("client_secret", clientSecret));

			req.setEntity(new UrlEncodedFormEntity(params, "UTF-8"));

			try (final CloseableHttpResponse response = client.execute(req)) {
				// decode explicitly as UTF-8: the single-argument overload relies on the platform default charset
				final String content = IOUtils.toString(response.getEntity().getContent(), "UTF-8");
				final JSONObject obj = new JSONObject(content);
				final String token = obj.getString("access_token");
				if (StringUtils.isNotBlank(token)) {
					return token;
				}
			}
		} catch (final Exception e) {
			// covers I/O failures as well as malformed or unexpected JSON (e.g. a missing access_token field)
			log.warn("Error obtaining access token", e);
			throw new CollectorException("Error obtaining access token", e);
		}

		// reached only when the response was parsed correctly but carried a blank token
		throw new CollectorException("Access token is missing");
	}

}

View File

@ -0,0 +1,117 @@
package eu.dnetlib.dhp.collection.plugin.researchfi;
import java.util.Iterator;
import java.util.Queue;
import java.util.concurrent.PriorityBlockingQueue;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.math.NumberUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.http.Header;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.json.JSONArray;
import eu.dnetlib.dhp.collection.plugin.utils.JsonUtils;
import eu.dnetlib.dhp.common.collection.CollectorException;
/**
 * Iterator over the records of a paginated REST endpoint protected by a bearer token.
 *
 * Pages are requested one at a time, starting from page 1; the total number of pages is read from the
 * {@code x-page-count} response header. Each element of the JSON array returned by a page is converted to XML
 * with {@link JsonUtils#convertToXML(String)} and buffered until it is consumed.
 */
public class ResearchFiIterator implements Iterator<String> {

	private static final Log log = LogFactory.getLog(ResearchFiIterator.class);

	private static final int PAGE_SIZE = 100;

	private final String baseUrl;
	private final String authToken;

	// number of the last page requested, 0 until the first request is performed
	private int currPage;

	// total number of pages as last announced by the server, 0 when unknown
	private int nPages;

	// NOTE(review): a priority queue returns the buffered records in natural String order, not in the order
	// received from the server - confirm that the ordering of the records is irrelevant for the callers
	private final Queue<String> queue = new PriorityBlockingQueue<>();

	/**
	 * @param baseUrl the url of the endpoint, optionally already containing a query string
	 * @param authToken the token sent in the {@code Authorization: Bearer} header of every request
	 */
	public ResearchFiIterator(final String baseUrl, final String authToken) {
		this.baseUrl = baseUrl;
		this.authToken = authToken;
		this.currPage = 0;
		this.nPages = 0;
	}

	/**
	 * Requests further pages until at least one record is buffered or no pages are left. The first page is always
	 * requested; empty pages are skipped, so they do not end the iteration prematurely.
	 *
	 * @throws IllegalStateException when the request of a page fails
	 */
	private void fillQueue() {
		while (this.queue.isEmpty() && ((this.currPage == 0) || (this.currPage < this.nPages))) {
			try {
				nextCall();
			} catch (final CollectorException e) {
				throw new IllegalStateException(e);
			}
		}
	}

	/**
	 * @return true when at least one more record is available, requesting further pages if needed
	 * @throws IllegalStateException when the request of a page fails
	 */
	@Override
	public boolean hasNext() {
		synchronized (this.queue) {
			fillQueue();
			return !this.queue.isEmpty();
		}
	}

	/**
	 * @return the next buffered record
	 * @throws java.util.NoSuchElementException when no more records are available
	 * @throws IllegalStateException when the request of a page fails
	 */
	@Override
	public String next() {
		synchronized (this.queue) {
			// refill BEFORE polling: a failure while fetching must not discard a record already taken
			fillQueue();
			final String res = this.queue.poll();
			if (res == null) {
				throw new java.util.NoSuchElementException("No more records available from: " + this.baseUrl);
			}
			return res;
		}
	}

	/**
	 * Requests the next page, updates the total number of pages and buffers the records of the page.
	 *
	 * @throws CollectorException when the request fails or the response cannot be parsed as a JSON array
	 */
	private void nextCall() throws CollectorException {

		this.currPage += 1;

		// append the paging parameters, honouring a query string (and a PageSize) already present in the base url
		final String url;
		if (!this.baseUrl.contains("?")) {
			url = String.format("%s?PageNumber=%d&PageSize=%d", this.baseUrl, this.currPage, PAGE_SIZE);
		} else if (!this.baseUrl.contains("PageSize=")) {
			url = String.format("%s&PageNumber=%d&PageSize=%d", this.baseUrl, this.currPage, PAGE_SIZE);
		} else {
			url = String.format("%s&PageNumber=%d", this.baseUrl, this.currPage);
		}
		log.info("Calling url: " + url);

		try (final CloseableHttpClient client = HttpClients.createDefault()) {

			final HttpGet req = new HttpGet(url);
			req.addHeader("Authorization", "Bearer " + this.authToken);
			try (final CloseableHttpResponse response = client.execute(req)) {
				for (final Header header : response.getAllHeaders()) {
					log.debug("HEADER: " + header.getName() + " = " + header.getValue());
					// header names are case-insensitive: the server may send e.g. "X-Page-Count"
					if ("x-page-count".equalsIgnoreCase(header.getName())) {
						final int totalPages = NumberUtils.toInt(header.getValue());
						if (this.nPages != totalPages) {
							this.nPages = totalPages;
							log.info("Total pages: " + totalPages);
						}
					}
				}

				// decode explicitly as UTF-8: the single-argument overload relies on the platform default charset
				final String content = IOUtils.toString(response.getEntity().getContent(), "UTF-8");
				final JSONArray jsonArray = new JSONArray(content);

				jsonArray.forEach(obj -> this.queue.add(JsonUtils.convertToXML(obj.toString())));
			}
		} catch (final Exception e) {
			log.warn("Error calling url: " + url, e);
			throw new CollectorException("Error calling url: " + url, e);
		}
	}

}

View File

@ -28,7 +28,13 @@
"paramLongName": "dataciteInputPath",
"paramDescription": "the path to get the input data from Datacite",
"paramRequired": true
},
},{
"paramName": "wip",
"paramLongName": "webCrawlInputPath",
"paramDescription": "the path to get the input data from Web Crawl",
"paramRequired": true
}
,
{
"paramName": "o",
"paramLongName": "outputPath",

View File

@ -17,6 +17,10 @@
<name>dataciteInputPath</name>
<description>the path where to find the inferred affiliation relations from Datacite</description>
</property>
<property>
<name>webCrawlInputPath</name>
<description>the path where to find the inferred affiliation relations from webCrawl</description>
</property>
<property>
<name>outputPath</name>
<description>the path where to store the actionset</description>
@ -112,7 +116,7 @@
<arg>--pubmedInputPath</arg><arg>${pubmedInputPath}</arg>
<arg>--openapcInputPath</arg><arg>${openapcInputPath}</arg>
<arg>--dataciteInputPath</arg><arg>${dataciteInputPath}</arg>
<arg>--webCrawlInputPath</arg><arg>${webCrawlInputPath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
</spark>
<ok to="End"/>

View File

@ -1,10 +1,5 @@
[
{
"id": "100007630",
"uri": "http://dx.doi.org/10.13039/100007630",
"name": "College of Engineering and Informatics, National University of Ireland, Galway",
"synonym": []
},
{
"id": "100007731",
"uri": "http://dx.doi.org/10.13039/100007731",
@ -432,13 +427,13 @@
"id": "501100001634",
"uri": "http://dx.doi.org/10.13039/501100001634",
"name": "University of Galway",
"synonym": []
"synonym": ["501100019905", "100007630", "501100020570", "501100023852"]
},
{
"id": "501100001635",
"uri": "http://dx.doi.org/10.13039/501100001635",
"name": "University of Limerick",
"synonym": []
"synonym": ["501100014531"]
},
{
"id": "501100001636",
@ -468,7 +463,7 @@
"id": "501100002736",
"uri": "http://dx.doi.org/10.13039/501100002736",
"name": "Covidien",
"synonym": []
"synonym": ["501100003956"]
},
{
"id": "501100002755",
@ -518,12 +513,6 @@
"name": "Irish Institute of Clinical Neuroscience",
"synonym": []
},
{
"id": "501100003956",
"uri": "http://dx.doi.org/10.13039/501100003956",
"name": "Aspect Medical Systems",
"synonym": []
},
{
"id": "501100004162",
"uri": "http://dx.doi.org/10.13039/501100004162",
@ -644,12 +633,7 @@
"name": "Irish Centre for High-End Computing",
"synonym": []
},
{
"id": "501100019905",
"uri": "http://dx.doi.org/10.13039/501100019905",
"name": "Galway University Foundation",
"synonym": []
},
{
"id": "501100020036",
"uri": "http://dx.doi.org/10.13039/501100020036",
@ -824,12 +808,7 @@
"name": "Energy Policy Research Centre, Economic and Social Research Institute",
"synonym": []
},
{
"id": "501100014531",
"uri": "http://dx.doi.org/10.13039/501100014531",
"name": "Physical Education and Sport Sciences Department, University of Limerick",
"synonym": []
},
{
"id": "501100014745",
"uri": "http://dx.doi.org/10.13039/501100014745",
@ -842,22 +821,11 @@
"name": "ADAPT - Centre for Digital Content Technology",
"synonym": []
},
{
"id": "501100020570",
"uri": "http://dx.doi.org/10.13039/501100020570",
"name": "College of Medicine, Nursing and Health Sciences, National University of Ireland, Galway",
"synonym": []
},
{
"id": "501100020871",
"uri": "http://dx.doi.org/10.13039/501100020871",
"name": "Bernal Institute, University of Limerick",
"synonym": []
},
{
"id": "501100023852",
"uri": "http://dx.doi.org/10.13039/501100023852",
"name": "Moore Institute for Research in the Humanities and Social Studies, University of Galway",
"synonym": []
}
]

View File

@ -48,12 +48,37 @@
<description>Used to configure the heap size for the map JVM process. Should be 80% of mapreduce.map.memory.mb.</description>
</property>
<property>
<name>JAVA_HOME</name>
<value>/srv/java/openjdk-17</value>
<description>Used to configure the Java home location for oozie.launcher.mapreduce.map.env</description>
</property>
<property>
<name>JAVA_OPTS</name>
<value>-Dcom.sun.security.enableAIAcaIssuers=true</value>
<description>Used to configure the JAVA_OPTS parameter</description>
</property>
</parameters>
<global>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.launcher.mapreduce.map.env</name>
<value>JAVA_HOME=${JAVA_HOME}</value>
</property>
</configuration>
</global>
<start to="collection_mode"/>
@ -99,7 +124,7 @@
<action name="CollectionWorker">
<java>
<main-class>eu.dnetlib.dhp.collection.CollectorWorkerApplication</main-class>
<java-opts>${collection_java_xmx}</java-opts>
<java-opts>${JAVA_OPTS} ${collection_java_xmx}</java-opts>
<arg>--apidescriptor</arg><arg>${apiDescription}</arg>
<arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--workflowId</arg><arg>${workflowId}</arg>

View File

@ -93,7 +93,7 @@ case object Crossref2Oaf {
val cf = new KeyValue
cf.setValue("UnpayWall")
cf.setKey(s"10|openaire____:${DHPUtils.md5("UnpayWall".toLowerCase)}")
cf.setKey(s"10|openaire____::${DHPUtils.md5("UnpayWall".toLowerCase)}")
cf
}

View File

@ -88,6 +88,7 @@ public class PrepareAffiliationRelationsTest {
"-pubmedInputPath", crossrefAffiliationRelationPath,
"-openapcInputPath", crossrefAffiliationRelationPath,
"-dataciteInputPath", crossrefAffiliationRelationPath,
"-webCrawlInputPath", crossrefAffiliationRelationPath,
"-outputPath", outputPath
});
@ -104,7 +105,7 @@ public class PrepareAffiliationRelationsTest {
// );
// }
// count the number of relations
assertEquals(80, tmp.count());
assertEquals(120, tmp.count());
Dataset<Relation> dataset = spark.createDataset(tmp.rdd(), Encoders.bean(Relation.class));
dataset.createOrReplaceTempView("result");
@ -115,7 +116,7 @@ public class PrepareAffiliationRelationsTest {
// verify that we have equal number of bi-directional relations
Assertions
.assertEquals(
40, execVerification
60, execVerification
.filter(
"relClass='" + ModelConstants.HAS_AUTHOR_INSTITUTION + "'")
.collectAsList()
@ -123,7 +124,7 @@ public class PrepareAffiliationRelationsTest {
Assertions
.assertEquals(
40, execVerification
60, execVerification
.filter(
"relClass='" + ModelConstants.IS_AUTHOR_INSTITUTION_OF + "'")
.collectAsList()

View File

@ -0,0 +1,58 @@
package eu.dnetlib.dhp.collection.plugin.researchfi;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import eu.dnetlib.dhp.collection.ApiDescriptor;
import eu.dnetlib.dhp.common.aggregation.AggregatorReport;
import eu.dnetlib.dhp.common.collection.CollectorException;
/**
 * Manual integration test for {@link ResearchFiCollectorPlugin}: it is disabled because it calls the remote
 * service and needs real client credentials to be filled in.
 */
public class ResearchFiCollectorPluginTest {

	private final ResearchFiCollectorPlugin plugin = new ResearchFiCollectorPlugin();

	/**
	 * Harvests the configured endpoint, prints the first record, reports every duplicated identifier and
	 * finally prints the number of records and of distinct identifiers.
	 */
	@Test
	@Disabled
	void testCollect() throws CollectorException {
		final ApiDescriptor descriptor = new ApiDescriptor();
		descriptor
			.setBaseUrl("https://research.fi/api/rest/v1/funding-decisions?FunderName=AKA&FundingStartYearFrom=2022");
		descriptor.setProtocol("research_fi");
		descriptor
			.getParams()
			.put("auth_url", "https://researchfi-auth.2.rahtiapp.fi/realms/publicapi/protocol/openid-connect/token");
		descriptor.getParams().put("auth_client_id", "");
		descriptor.getParams().put("auth_client_secret", "");

		final AtomicLong total = new AtomicLong(0);
		final Set<String> seenIds = new HashSet<>();

		this.plugin.collect(descriptor, new AggregatorReport()).forEach(record -> {

			final boolean first = total.getAndIncrement() == 0;
			if (first) {
				System.out.println("First: " + record);
			}

			final String id;
			try {
				id = DocumentHelper.parseText(record).valueOf("/recordWrap/funderProjectNumber");
			} catch (final DocumentException e) {
				throw new RuntimeException(e);
			}

			// Set.add returns false when the identifier was already collected
			if (!seenIds.add(id)) {
				System.out.println("Id already present: " + id);
			}
		});

		System.out.println("Total records: " + total);
		System.out.println("Total identifiers: " + seenIds.size());
	}

}

View File

@ -5,3 +5,5 @@
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(575)","Matchings":[{"RORid":"https:\/\/ror.org\/04j198w64","Confidence":0.82}]}
{"DOI":"10.1061\/(asce)0733-9372(2002)128:7(588)","Matchings":[{"RORid":"https:\/\/ror.org\/03m8km719","Confidence":0.8660254038},{"RORid":"https:\/\/ror.org\/02aze4h65","Confidence":0.87}]}
{"DOI":"10.1161\/hy0202.103001","Matchings":[{"RORid":"https:\/\/ror.org\/057xtrt18","Confidence":0.7071067812}]}
{"DOI": "10.1080/13669877.2015.1042504", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/03265fv13"}]}
{"DOI": "10.1007/3-540-47984-8_14", "Matchings": [{"Confidence": 1.0, "RORid": "https://ror.org/00a0n9e72"}]}

View File

@ -26,15 +26,15 @@ import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
public class PrepareSimpleEntititiesJob {
public class PrepareSimpleEntitiesJob {
private static final Logger log = LoggerFactory.getLogger(PrepareSimpleEntititiesJob.class);
private static final Logger log = LoggerFactory.getLogger(PrepareSimpleEntitiesJob.class);
public static void main(final String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
PrepareSimpleEntititiesJob.class
PrepareSimpleEntitiesJob.class
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
parser.parseArgument(args);

View File

@ -160,8 +160,7 @@ public class ConversionUtils {
.stream()
.filter(Objects::nonNull)
.filter(pid -> pid.getQualifier() != null)
.filter(pid -> pid.getQualifier().getClassid() != null)
.filter(pid -> pid.getQualifier().getClassid().equalsIgnoreCase(ModelConstants.ORCID))
.filter(pid -> StringUtils.startsWithIgnoreCase(pid.getQualifier().getClassid(), ModelConstants.ORCID))
.map(StructuredProperty::getValue)
.map(ConversionUtils::cleanOrcid)
.filter(StringUtils::isNotBlank)

View File

@ -7,7 +7,7 @@
</property>
<property>
<name>outputDir</name>
<description>the path where the the generated data will be stored</description>
<description>the path where the generated data will be stored</description>
</property>
<property>
<name>datasourceIdWhitelist</name>
@ -179,17 +179,18 @@
<master>yarn</master>
<mode>cluster</mode>
<name>PrepareSimpleEntititiesJob</name>
<class>eu.dnetlib.dhp.broker.oa.PrepareSimpleEntititiesJob</class>
<class>eu.dnetlib.dhp.broker.oa.PrepareSimpleEntitiesJob</class>
<jar>dhp-broker-events-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=5000
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@ -209,11 +210,12 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=8000
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@ -234,11 +236,12 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=8000
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@ -258,11 +261,12 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=5000
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@ -282,11 +286,12 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=10000
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@ -306,11 +311,12 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=2000
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@ -332,11 +338,12 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=8000
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@ -356,11 +363,12 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=8000
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@ -380,11 +388,12 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=8000
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@ -404,11 +413,12 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=8000
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@ -428,11 +438,12 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=8000
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@ -452,11 +463,12 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=8000
</spark-opts>
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
<arg>--workingDir</arg><arg>${workingDir}</arg>
@ -476,11 +488,12 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.sql.shuffle.partitions=8000
</spark-opts>
<arg>--workingDir</arg><arg>${workingDir}</arg>
<arg>--outputDir</arg><arg>${outputDir}</arg>
@ -503,6 +516,7 @@
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
@ -535,6 +549,7 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -562,6 +577,7 @@
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
@ -585,6 +601,7 @@
<spark-opts>
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.executor.memoryOverhead=${sparkExecutorMemory}
--conf spark.dynamicAllocation.maxExecutors=${sparkMaxExecutorsForIndexing}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}

View File

@ -0,0 +1,66 @@
package eu.dnetlib.dhp.broker.oa.matchers.simple;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.List;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import eu.dnetlib.broker.objects.OaBrokerAuthor;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
/**
 * Tests for {@code EnrichMissingAuthorOrcid.findDifferences(source, target)}.
 *
 * <p>The four cases cover the combinations of ORCID presence on a single creator that
 * appears with the same full name in both entities. Only the case where the source
 * carries an ORCID and the target does not is expected to yield a difference.
 */
class EnrichMissingAuthorOrcidTest {

    /** Full name shared by the creator in both entities. */
    private static final String FULLNAME = "Claudio Atzori";

    /** ORCID attached to the creator when the scenario requires one. */
    private static final String ORCID = "0000-0001-9613-6639";

    final EnrichMissingAuthorOrcid matcher = new EnrichMissingAuthorOrcid();

    @BeforeEach
    void setUp() throws Exception {
    }

    /** Two entities without creators produce no differences. */
    @Test
    void testFindDifferences_1() {
        final OaBrokerMainEntity src = new OaBrokerMainEntity();
        final OaBrokerMainEntity trg = new OaBrokerMainEntity();

        final List<OaBrokerAuthor> differences = this.matcher.findDifferences(src, trg);

        assertTrue(differences.isEmpty());
    }

    /** ORCID present in the source only: exactly one difference is reported. */
    @Test
    void testFindDifferences_2() {
        final OaBrokerMainEntity src = entityWithCreator(FULLNAME, ORCID);
        final OaBrokerMainEntity trg = entityWithCreator(FULLNAME, null);

        final List<OaBrokerAuthor> differences = this.matcher.findDifferences(src, trg);

        assertEquals(1, differences.size());
    }

    /** ORCID present in the target only: nothing is reported. */
    @Test
    void testFindDifferences_3() {
        final OaBrokerMainEntity src = entityWithCreator(FULLNAME, null);
        final OaBrokerMainEntity trg = entityWithCreator(FULLNAME, ORCID);

        final List<OaBrokerAuthor> differences = this.matcher.findDifferences(src, trg);

        assertTrue(differences.isEmpty());
    }

    /** Same ORCID on both sides: nothing is reported. */
    @Test
    void testFindDifferences_4() {
        final OaBrokerMainEntity src = entityWithCreator(FULLNAME, ORCID);
        final OaBrokerMainEntity trg = entityWithCreator(FULLNAME, ORCID);

        final List<OaBrokerAuthor> differences = this.matcher.findDifferences(src, trg);

        assertTrue(differences.isEmpty());
    }

    /**
     * Builds a fresh entity whose creator list holds a single author.
     *
     * @param fullname the author's full name
     * @param orcid the author's ORCID, or {@code null} for an author without one
     * @return a new entity containing exactly that creator
     */
    private static OaBrokerMainEntity entityWithCreator(final String fullname, final String orcid) {
        final OaBrokerMainEntity entity = new OaBrokerMainEntity();
        entity.getCreators().add(new OaBrokerAuthor(fullname, orcid));
        return entity;
    }
}

View File

@ -2,27 +2,32 @@
package eu.dnetlib.dhp.broker.oa.util;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import eu.dnetlib.broker.objects.OaBrokerMainEntity;
import eu.dnetlib.broker.objects.OaBrokerTypedValue;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
class ConversionUtilsTest {
public class ConversionUtilsTest {
@BeforeEach
void setUp() throws Exception {
public void setUp() throws Exception {
}
@Test
void testAllResultPids() {
public void testAllResultPids() {
final Qualifier qf = new Qualifier();
qf.setClassid("test");
qf.setClassname("test");
@ -91,4 +96,42 @@ class ConversionUtilsTest {
assertEquals(6, list.size());
}
/**
 * Checks that {@code ConversionUtils.oafResultToBrokerResult} keeps every author of the
 * result and exposes each ORCID as a bare identifier.
 *
 * <p>Three authors are converted: one with a plain ORCID, one whose ORCID is given as an
 * {@code http://orcid.org/...} URL, and one with no ORCID. The expected creators carry,
 * in the same order, the plain identifier, the identifier without the URL prefix, and
 * {@code null}.
 */
// @Test was missing: without it JUnit does not discover this method and it never runs.
@Test
public void testOafResultToBrokerResult() {
    final Author a1 = createAuthor("Michele Artini", "0000-0002-4406-428X");
    final Author a2 = createAuthor("Claudio Atzori", "http://orcid.org/0000-0001-9613-6639");
    final Author a3 = createAuthor("Alessia Bardi", null);

    final Result r = new Result();
    r.setAuthor(Arrays.asList(a1, a2, a3));

    final OaBrokerMainEntity br = ConversionUtils.oafResultToBrokerResult(r);

    assertEquals(3, br.getCreators().size());
    assertEquals("0000-0002-4406-428X", br.getCreators().get(0).getOrcid());
    assertEquals("0000-0001-9613-6639", br.getCreators().get(1).getOrcid());
    assertNull(br.getCreators().get(2).getOrcid());
}
/**
 * Builds an {@link Author} fixture with the given full name and, optionally, an ORCID pid.
 *
 * @param name the author's full name
 * @param orcid the ORCID value to attach as the author's only pid, or {@code null} to
 *        leave the author without pids
 * @return the new author
 */
private Author createAuthor(final String name, final String orcid) {
    final Author a = new Author();
    // Use the caller-supplied name; the full name was previously hard-coded to
    // "Michele Artini", so every fixture author ended up with the same name.
    a.setFullname(name);
    if (orcid != null) {
        final Qualifier q = new Qualifier();
        q.setClassid(ModelConstants.ORCID);
        q.setClassname(ModelConstants.ORCID);
        q.setSchemeid("dnet:pids");
        q.setSchemename("dnet:pids");
        final StructuredProperty pid = new StructuredProperty();
        pid.setQualifier(q);
        pid.setValue(orcid);
        a.setPid(Arrays.asList(pid));
    }
    return a;
}
}

View File

@ -203,8 +203,8 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
WindowSpec w = Window
.partitionBy("groupId")
.orderBy(
col("lastUsage").desc_nulls_last(),
col("pidType").asc_nulls_last(),
col("lastUsage").desc_nulls_last(),
col("collectedfrom").desc_nulls_last(),
col("date").asc_nulls_last(),
col("id").asc_nulls_last());

View File

@ -15,4 +15,12 @@
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<!-- Default value for the hiveMetastoreUris property: a Hive metastore Thrift URI.
     NOTE(review): the host name looks environment-specific (a test cluster, judging by
     the name); confirm that it is overridden for each deployment. -->
<property>
<name>hiveMetastoreUris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<!-- Default value for the pivotHistoryDatabase property. The value is the single
     character U+200B (zero-width space), not an empty string.
     NOTE(review): presumably a placeholder meaning "no pivot history database" that
     still yields a non-empty property value; confirm how the consuming job treats it. -->
<property>
<name>pivotHistoryDatabase</name>
<value>&#x200B;</value>
</property>
</configuration>

View File

@ -198,6 +198,8 @@
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
<arg>--actionSetId</arg><arg>${actionSetId}</arg>
<arg>--cutConnectedComponent</arg><arg>${cutConnectedComponent}</arg>
<arg>--hiveMetastoreUris</arg><arg>${hiveMetastoreUris}</arg>
<arg>--pivotHistoryDatabase</arg><arg>${pivotHistoryDatabase}</arg>
</spark>
<ok to="PrepareOrgRels"/>
<error to="Kill"/>

File diff suppressed because one or more lines are too long

View File

@ -190,7 +190,7 @@ public class SparkDedupTest implements Serializable {
System.out.println("orp_simrel = " + orp_simrel);
if (CHECK_CARDINALITIES) {
assertEquals(751, orgs_simrel);
assertEquals(742, orgs_simrel);
assertEquals(566, pubs_simrel);
assertEquals(113, sw_simrel);
assertEquals(148, ds_simrel);
@ -251,7 +251,7 @@ public class SparkDedupTest implements Serializable {
// entities simrels supposed to be equal to the number of previous step (no rels in whitelist)
if (CHECK_CARDINALITIES) {
assertEquals(751, orgs_simrel);
assertEquals(742, orgs_simrel);
assertEquals(566, pubs_simrel);
assertEquals(148, ds_simrel);
assertEquals(280, orp_simrel);
@ -442,7 +442,7 @@ public class SparkDedupTest implements Serializable {
final List<Relation> merges = pubs
.filter("source == '50|arXiv_dedup_::c93aeb433eb90ed7a86e29be00791b7c'")
.collectAsList();
assertEquals(3, merges.size());
assertEquals(1, merges.size());
Set<String> dups = Sets
.newHashSet(
"50|doi_________::3b1d0d8e8f930826665df9d6b82fbb73",
@ -451,7 +451,7 @@ public class SparkDedupTest implements Serializable {
merges.forEach(r -> {
assertEquals(ModelConstants.RESULT_RESULT, r.getRelType());
assertEquals(ModelConstants.DEDUP, r.getSubRelType());
assertEquals(ModelConstants.MERGES, r.getRelClass());
assertEquals(ModelConstants.IS_MERGED_IN, r.getRelClass());
assertTrue(dups.contains(r.getTarget()));
});
@ -561,7 +561,7 @@ public class SparkDedupTest implements Serializable {
System.out.println("orp_mergerel = " + orp_mergerel);
if (CHECK_CARDINALITIES) {
assertEquals(1268, orgs_mergerel);
assertEquals(1278, orgs_mergerel);
assertEquals(1156, pubs.count());
assertEquals(292, sw_mergerel);
assertEquals(476, ds_mergerel);
@ -618,7 +618,7 @@ public class SparkDedupTest implements Serializable {
System.out.println("orp_deduprecord = " + orp_deduprecord);
if (CHECK_CARDINALITIES) {
assertEquals(86, orgs_deduprecord);
assertEquals(78, orgs_deduprecord);
assertEquals(96, pubs.count());
assertEquals(47, sw_deduprecord);
assertEquals(97, ds_deduprecord);
@ -761,7 +761,7 @@ public class SparkDedupTest implements Serializable {
if (CHECK_CARDINALITIES) {
assertEquals(930, publications);
assertEquals(839, organizations);
assertEquals(831, organizations);
assertEquals(100, projects);
assertEquals(100, datasource);
assertEquals(196, softwares);

View File

@ -22,8 +22,11 @@ import java.util.Properties;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
@ -143,7 +146,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
.count();
assertEquals(86, orgs_simrel);
assertEquals(92, orgs_simrel);
}
@Test
@ -172,7 +175,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
.load(DedupUtility.createSimRelPath(testOutputBasePath, testActionSetId, "organization"))
.count();
assertEquals(122, orgs_simrel);
assertEquals(128, orgs_simrel);
}
@Test
@ -207,7 +210,7 @@ public class SparkOpenorgsDedupTest implements Serializable {
.read()
.load(testOutputBasePath + "/" + testActionSetId + "/organization_mergerel")
.count();
assertEquals(132, orgs_mergerel);
assertEquals(128, orgs_mergerel);
// verify that a DiffRel is in the mergerels (to be sure that the job supposed to remove them has something to
// do)

View File

@ -9,6 +9,7 @@ import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.platform.commons.util.StringUtils;
import eu.dnetlib.dhp.oa.dedup.SparkOpenorgsDedupTest;
import eu.dnetlib.pace.config.DedupConfig;
import eu.dnetlib.pace.model.SparkModel;
@ -24,6 +25,31 @@ class JsonPathTest {
Row row = SparkModel.apply(conf).rowFromJson(org);
System.out.println("row = " + row);
Assertions.assertNotNull(row);
Assertions.assertTrue(StringUtils.isNotBlank(row.getAs("identifier")));
System.out.println("row = " + row.getAs("countrytitle"));
}
/**
 * Parses the example organization with the current organization dedup configuration and
 * checks that two independent parses yield equal rows, and that the resulting row is
 * non-null and has a non-blank {@code identifier}.
 */
@Test
void jsonToModelTest() throws IOException {
    final String confJson = IOUtils
        .toString(
            SparkOpenorgsDedupTest.class
                .getResourceAsStream("/eu/dnetlib/dhp/dedup/conf/org.curr.conf.json"));
    final DedupConfig dedupConf = DedupConfig.load(confJson);

    final String orgJson = IOUtils.toString(getClass().getResourceAsStream("organization_example1.json"));

    final Row parsed = SparkModel.apply(dedupConf).rowFromJson(orgJson);
    // a second parse with a freshly built model must give back an equal row
    final Row parsedAgain = SparkModel.apply(dedupConf).rowFromJson(orgJson);

    Assertions.assertEquals(parsed, parsedAgain);

    System.out.println("row = " + parsed);

    Assertions.assertNotNull(parsed);
    final String identifier = parsed.getAs("identifier");
    Assertions.assertTrue(StringUtils.isNotBlank(identifier));
}

View File

@ -4,8 +4,8 @@
"dedupRun" : "001",
"entityType" : "organization",
"subEntityValue": "organization",
"orderField" : "legalname",
"queueMaxSize" : "2000",
"orderField" : "original_legalname",
"queueMaxSize" : "100000",
"groupMaxSize" : "50",
"slidingWindowSize" : "200",
"idPath":"$.id",
@ -15,10 +15,10 @@
},
"pace" : {
"clustering" : [
{ "name" : "sortedngrampairs", "fields" : [ "legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "sortedngrampairs", "fields" : [ "original_legalname" ], "params" : { "max" : 2, "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "original_legalname" ], "params" : { "max" : 1, "len" : "3" } },
{ "name" : "urlclustering", "fields" : [ "websiteurl" ], "params" : { } },
{ "name" : "keywordsclustering", "fields" : [ "legalname" ], "params" : { "max": 2, "windowSize": 4} }
{ "name" : "legalnameclustering", "fields" : [ "legalname" ], "params" : { "max": 2} }
],
"decisionTree" : {
"start": {
@ -29,16 +29,23 @@
"weight": 1,
"countIfUndefined": "false",
"params": {}
},
{
"field": "rorid",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1,
"aggregation": "AVG",
"aggregation": "OR",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "layer2",
"undefined": "necessaryConditions",
"ignoreUndefined": "false"
},
"layer2": {
"necessaryConditions": {
"fields": [
{
"field": "websiteurl",
@ -49,20 +56,20 @@
},
{
"field": "country",
"comparator": "exactMatch",
"comparator": "countryMatch",
"weight": 1,
"countIfUndefined": "true",
"params": {}
},
{
"field": "legalname",
"field": "original_legalname",
"comparator": "numbersMatch",
"weight": 1,
"countIfUndefined": "true",
"params": {}
},
{
"field": "legalname",
"field": "original_legalname",
"comparator": "romansMatch",
"weight": 1,
"countIfUndefined": "true",
@ -71,68 +78,64 @@
],
"threshold": 1,
"aggregation": "AND",
"positive": "layer3",
"positive": "cityCheck",
"negative": "NO_MATCH",
"undefined": "layer3",
"undefined": "cityCheck",
"ignoreUndefined": "true"
},
"layer3": {
"cityCheck": {
"fields": [
{
"field": "legalname",
"comparator": "cityMatch",
"comparator": "codeMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"windowSize": "4"
"codeRegex": "city::\\d+"
}
}
],
"threshold": 0.1,
"aggregation": "AVG",
"positive": "layer4",
"positive": "keywordCheck",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
},
"layer4": {
"keywordCheck": {
"fields": [
{
"field": "legalname",
"comparator": "keywordMatch",
"comparator": "codeMatch",
"weight": 1.0,
"countIfUndefined": "true",
"params": {
"windowSize": "4"
"codeRegex": "key::\\d+"
}
}
],
"threshold": 0.7,
"aggregation": "AVG",
"positive": "layer5",
"positive": "nameCheck",
"negative": "NO_MATCH",
"undefined": "layer5",
"undefined": "nameCheck",
"ignoreUndefined": "true"
},
"layer5": {
"nameCheck": {
"fields": [
{
"field": "legalname",
"comparator": "jaroWinklerNormalizedName",
"comparator": "jaroWinklerLegalname",
"weight": 0.9,
"countIfUndefined": "true",
"params": {
"windowSize": "4"
}
"params": {}
},
{
"field": "legalshortname",
"comparator": "jaroWinklerNormalizedName",
"comparator": "jaroWinklerLegalname",
"weight": 0.1,
"countIfUndefined": "false",
"params": {
"windowSize": 4
}
"params": {}
}
],
"threshold": 0.9,
@ -144,126 +147,16 @@
}
},
"model" : [
{ "name" : "country", "type" : "String", "path" : "$.country.classid"},
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value"},
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value" },
{ "name" : "country", "type" : "String", "path" : "$.country.classid", "infer" : "country", "inferenceFrom" : "$.legalname.value"},
{ "name" : "legalshortname", "type" : "String", "path" : "$.legalshortname.value", "infer" : "city_keyword"},
{ "name" : "original_legalname", "type" : "String", "path" : "$.legalname.value" },
{ "name" : "legalname", "type" : "String", "path" : "$.legalname.value", "infer" : "city_keyword"},
{ "name" : "websiteurl", "type" : "URL", "path" : "$.websiteurl.value" },
{ "name" : "gridid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='grid')].value"},
{ "name" : "rorid", "type" : "String", "path" : "$.pid[?(@.qualifier.classid =='ROR')].value"},
{ "name" : "originalId", "type" : "String", "path" : "$.id" }
],
"blacklists" : {
"legalname" : []
},
"synonyms": {
"key::1": ["university","università", "universitas", "università studi","universitario","universitaria","université", "universite", "universitaire","universitaires","universidad","universitade","Universität","universitaet","Uniwersytet","университет","universiteit","πανεπιστήμιο","universitesi","universiteti", "universiti"],
"key::2": ["studies","studi","études","estudios","estudos","Studien","studia","исследования","studies","σπουδές"],
"key::3": ["advanced","superiore","supérieur","supérieure","supérieurs","supérieures","avancado","avancados","fortgeschrittene","fortgeschritten","zaawansowany","передовой","gevorderd","gevorderde","προχωρημένος","προχωρημένη","προχωρημένο","προχωρημένες","προχωρημένα","wyzsza"],
"key::4": ["institute","istituto","institut","instituto","instituto","Institut","instytut","институт","instituut","ινστιτούτο"],
"key::5": ["hospital","ospedale","hôpital","hospital","hospital","Krankenhaus","szpital","больница","ziekenhuis","νοσοκομείο"],
"key::6": ["research","ricerca","recherche","investigacion","pesquisa","Forschung","badania","исследования","onderzoek","έρευνα","erevna","erevnas"],
"key::7": ["college","collegio","colegio","faculdade","Hochschule","Szkoła Wyższa","Высшая школа","κολλέγιο"],
"key::8": ["foundation","fondazione","fondation","fundación","fundação","Stiftung","Fundacja","фонд","stichting","ίδρυμα","idryma"],
"key::9": ["center","centro","centre","centro","centro","zentrum","centrum","центр","centrum","κέντρο"],
"key::10": ["national","nazionale","national","nationale","nationaux","nationales","nacional","nacional","national","krajowy","национальный","nationaal","nationale","εθνικό"],
"key::11": ["association","associazione","association","asociación","associação","Verein","verband","stowarzyszenie","ассоциация","associatie"],
"key::12": ["society","societa","société","sociedad","sociedade","gesellschaft","społeczeństwo","общество","maatschappij","κοινωνία"],
"key::13": ["international","internazionale","international","internacional","internacional","international","międzynarodowy","Международный","internationaal","internationale","διεθνής","διεθνή","διεθνές"],
"key::14": ["community","comunita","communauté","comunidad","comunidade","Gemeinschaft","społeczność","сообщество","gemeenschap","κοινότητα"],
"key::15": ["school","scuola","école","escuela","escola","schule","Szkoła","школа","school","σχολείο"],
"key::16": ["education","educazione","éducation","educacion","Educação","Bildung","Edukacja","образование","opleiding","εκπαίδευση"],
"key::17": ["academy","accademia","académie","academia","academia","Akademie","akademie","академия","academie","ακαδημία"],
"key::18": ["public","pubblico","public","publique","publics","publiques","publico","publico","Öffentlichkeit","publiczny","публичный","publiek","publieke","δημόσιος","δημόσια","δημόσιο"],
"key::19": ["museum","museo","musée","mueso","museu","museum","muzeum","музей","museum","μουσείο"],
"key::20": ["group","gruppo","groupe","grupo","grupo","gruppe","grupa","группа","groep","ομάδα","όμιλος"],
"key::21": ["department","dipartimento","département","departamento","departamento","abteilung","departament","отдел","afdeling","τμήμα"],
"key::22": ["council","consiglio","conseil","Consejo","conselho","gesellschaft","rada","совет","raad","συμβούλιο"],
"key::23": ["library","biblioteca","bibliothèque","biblioteca","biblioteca","Bibliothek","biblioteka","библиотека","bibliotheek","βιβλιοθήκη"],
"key::24": ["ministry","ministero","ministère","ministerio","ministério","Ministerium","ministerstwo","министерство","ministerie","υπουργείο"],
"key::25": ["services","servizi","services","servicios","Serviços","Dienstleistungen","usługi","услуги","diensten","υπηρεσίες"],
"key::26": ["central","centrale","central","centrale","centrales","central","central","zentral","centralny","цетральный","centraal","κεντρικός","κεντρική","κεντρικό","κεντρικά"],
"key::27": ["general","generale","général","générale","généraux","générales","general","geral","general","Allgemeines","general","общий","algemeen","algemene","γενικός","γενική","γενικό","γενικά"],
"key::28": ["applied","applicati","appliqué","appliquée","appliqués","appliquées","aplicado","aplicada","angewendet","stosowany","прикладной","toegepast","toegepaste","εφαρμοσμένος","εφαρμοσμένη","εφαρμοσμένο","εφαρμοσμένα"],
"key::29": ["european","europee","europea","européen","européenne","européens","européennes","europeo","europeu","europäisch","europejski","европейский","Europees","Europese","ευρωπαϊκός","ευρωπαϊκή","ευρωπαϊκό","ευρωπαϊκά"],
"key::30": ["agency","agenzia","agence","agencia","agencia","agentur","agencja","агенция","agentschap","πρακτορείο"],
"key::31": ["laboratory","laboratorio","laboratoire","laboratorio","laboratorio","labor","laboratorium","лаборатория","laboratorium","εργαστήριο"],
"key::32": ["industry","industria","industrie","индустрия","industrie","βιομηχανία"],
"key::33": ["industrial","industriale","industriel","industrielle","industriels","industrielles","индустриальный","industrieel","βιομηχανικός","βιομηχανική","βιομηχανικό","βιομηχανικά","βιομηχανικές"],
"key::34": ["consortium","consorzio","consortium","консорциум","consortium","κοινοπραξία"],
"key::35": ["organization","organizzazione","organisation","organización","organização","organizacja","организация","organisatie","οργανισμός"],
"key::36": ["authority","autorità","autorité","авторитет","autoriteit"],
"key::37": ["federation","federazione","fédération","федерация","federatie","ομοσπονδία"],
"key::38": ["observatory","osservatorio","observatoire","обсерватория","observatorium","αστεροσκοπείο"],
"key::39": ["bureau","ufficio","bureau","офис","bureau","γραφείο"],
"key::40": ["company","impresa","compagnie","société","компания","bedrijf","εταιρία"],
"key::41": ["polytechnic","politecnico","polytechnique","политехника","polytechnisch","πολυτεχνείο","universita politecnica","polytechnic university","universidad politecnica","universitat politecnica","politechnika","politechniki","university technology","university science technology"],
"key::42": ["coalition","coalizione","coalition","коалиция","coalitie","συνασπισμός"],
"key::43": ["initiative","iniziativa","initiative","инициатива","initiatief","πρωτοβουλία"],
"key::44": ["academic","accademico","académique","universitaire","акадеческий academisch","ακαδημαϊκός","ακαδημαϊκή","ακαδημαϊκό","ακαδημαϊκές","ακαδημαϊκοί"],
"key::45": ["institution","istituzione","institution","институциональный","instelling","ινστιτούτο"],
"key::46": ["division","divisione","division","отделение","divisie","τμήμα"],
"key::47": ["committee","comitato","comité","комитет","commissie","επιτροπή"],
"key::48": ["promotion","promozione","продвижение","proothisis","forderung"],
"key::49": ["medical","medicine","clinical","medicina","clinici","médico","medicina","clínica","médico","medicina","clínica","medizinisch","Medizin","klinisch","medisch","geneeskunde","klinisch","ιατρικός","ιατρική","ιατρικό","ιατρικά","κλινικός","κλινική","κλινικό","κλινικά","tıbbi","tıp","klinik","orvosi","orvostudomány","klinikai","zdravniški","medicinski","klinični","meditsiini","kliinik","kliiniline"],
"key::50": ["technology","technological","tecnologia","tecnologie","tecnología","tecnológico","tecnologia","tecnológico","Technologie","technologisch","technologie","technologisch","τεχνολογία","τεχνολογικός","τεχνολογική","τεχνολογικό","teknoloji","teknolojik","technológia","technológiai","tehnologija","tehnološki","tehnoloogia","tehnoloogiline","technologii","technical","texniki","teknik"],
"key::51": ["science","scientific","scienza","scientifiche","scienze","ciencia","científico","ciência","científico","Wissenschaft","wissenschaftlich","wetenschap","wetenschappelijk","επιστήμη","επιστημονικός","επιστημονική","επιστημονικό","επιστημονικά","bilim","bilimsel","tudomány","tudományos","znanost","znanstveni","teadus","teaduslik",""],
"key::52": ["engineering","ingegneria","ingeniería","engenharia","Ingenieurwissenschaft","ingenieurswetenschappen","bouwkunde","μηχανικός","μηχανική","μηχανικό","mühendislik","mérnöki","Inženirstvo","inseneeria","inseneri",""],
"key::53": ["management","gestione","gestionale","gestionali","gestión","administración","gestão","administração","Verwaltung","management","διαχείριση","yönetim","menedzsment","vodstvo","upravljanje","management","juhtkond","juhtimine","haldus",""],
"key::54": ["energy","energia","energía","energia","Energie","energie","ενέργεια","enerji","energia","energija","energia",""],
"key::55": ["agricultural","agriculture","agricoltura","agricole","agrícola","agricultura","agrícola","agricultura","landwirtschaftlich","Landwirtschaft","landbouwkundig","landbouw","αγροτικός","αγροτική","αγροτικό","γεωργικός","γεωργική","γεωργικό","γεωργία","tarımsal","tarım","mezőgazdasági","mezőgazdaság","poljedelski","poljedelstvo","põllumajandus","põllumajanduslik",""],
"key::56": ["information","informazione","información","informação","Information","informatie","πληροφορία","bilgi","információ","informacija","informatsioon","informatycznych",""],
"key::57": ["social","sociali","social","social","Sozial","sociaal","maatschappelijk","κοινωνικός","κοινωνική","κοινωνικό","κοινωνικά","sosyal","szociális","družbeni","sotsiaal","sotsiaalne",""],
"key::58": ["environmental","ambiente","medioambiental","ambiente","medioambiente","meioambiente","Umwelt","milieu","milieuwetenschap","milieukunde","περιβαλλοντικός","περιβαλλοντική","περιβαλλοντικό","περιβαλλοντικά","çevre","környezeti","okoliški","keskonna",""],
"key::59": ["business","economia","economiche","economica","negocio","empresa","negócio","Unternehmen","bedrijf","bedrijfskunde","επιχείρηση","iş","üzleti","posel","ettevõte/äri",""],
"key::60": ["pharmaceuticals","pharmacy","farmacia","farmaceutica","farmacéutica","farmacia","farmacêutica","farmácia","Pharmazeutika","Arzneimittelkunde","farmaceutica","geneesmiddelen","apotheek","φαρμακευτικός","φαρμακευτική","φαρμακευτικό","φαρμακευτικά","φαρμακείο","ilaç","eczane","gyógyszerészeti","gyógyszertár","farmacevtika","lekarništvo","farmaatsia","farmatseutiline",""],
"key::61": ["healthcare","health services","salute","atenciónmédica","cuidadodelasalud","cuidadoscomasaúde","Gesundheitswesen","gezondheidszorg","ιατροφαρμακευτικήπερίθαλψη","sağlıkhizmeti","egészségügy","zdravstvo","tervishoid","tervishoiu",""],
"key::62": ["history","storia","historia","história","Geschichte","geschiedenis","geschiedkunde","ιστορία","tarih","történelem","zgodovina","ajalugu",""],
"key::63": ["materials","materiali","materia","materiales","materiais","materialen","υλικά","τεκμήρια","malzemeler","anyagok","materiali","materjalid","vahendid",""],
"key::64": ["economics","economia","economiche","economica","economía","economia","Wirtschaft","economie","οικονομικά","οικονομικέςεπιστήμες","ekonomi","közgazdaságtan","gospodarstvo","ekonomija","majanduslik","majandus",""],
"key::65": ["therapeutics","terapeutica","terapéutica","terapêutica","therapie","θεραπευτική","tedavibilimi","gyógykezelés","terapevtika","terapeutiline","ravi",""],
"key::66": ["oncology","oncologia","oncologico","oncología","oncologia","Onkologie","oncologie","ογκολογία","onkoloji","onkológia","onkologija","onkoloogia",""],
"key::67": ["natural","naturali","naturale","natural","natural","natürlich","natuurlijk","φυσικός","φυσική","φυσικό","φυσικά","doğal","természetes","naraven","loodus",""],
"key::68": ["educational","educazione","pedagogia","educacional","educativo","educacional","pädagogisch","educatief","εκπαιδευτικός","εκπαιδευτική","εκπαιδευτικό","εκπαιδευτικά","eğitimsel","oktatási","izobraževalen","haridus","hariduslik",""],
"key::69": ["biomedical","biomedica","biomédico","biomédico","biomedizinisch","biomedisch","βιοιατρικός","βιοιατρική","βιοιατρικό","βιοιατρικά","biyomedikal","orvosbiológiai","biomedicinski","biomeditsiiniline",""],
"key::70": ["veterinary","veterinaria","veterinarie","veterinaria","veterinária","tierärtzlich","veterinair","veeartsenijlkunde","κτηνιατρικός","κτηνιατρική","κτηνιατρικό","κτηνιατρικά","veteriner","állatorvosi","veterinar","veterinarski","veterinaaria",""],
"key::71": ["chemistry","chimica","química","química","Chemie","chemie","scheikunde","χημεία","kimya","kémia","kemija","keemia",""],
"key::72": ["security","sicurezza","seguridad","segurança","Sicherheit","veiligheid","ασφάλεια","güvenlik","biztonsági","varnost","turvalisus","julgeolek",""],
"key::73": ["biotechnology","biotecnologia","biotecnologie","biotecnología","biotecnologia","Biotechnologie","biotechnologie","βιοτεχνολογία","biyoteknoloji","biotechnológia","biotehnologija","biotehnoloogia",""],
"key::74": ["military","militare","militari","militar","militar","Militär","militair","leger","στρατιωτικός","στρατιωτική","στρατιωτικό","στρατιωτικά","askeri","katonai","vojaški","vojni","militaar","wojskowa",""],
"key::75": ["theological","teologia","teologico","teológico","tecnológica","theologisch","theologisch","θεολογικός","θεολογική","θεολογικό","θεολογικά","teolojik","technológiai","teološki","teoloogia","usuteadus","teoloogiline",""],
"key::76": ["electronics","elettronica","electrónica","eletrônicos","Elektronik","elektronica","ηλεκτρονική","elektronik","elektronika","elektronika","elektroonika",""],
"key::77": ["forestry","forestale","forestali","silvicultura","forestal","floresta","Forstwirtschaft","bosbouw","δασοκομία","δασολογία","ormancılık","erdészet","gozdarstvo","metsandus",""],
"key::78": ["maritime","marittima","marittime","marittimo","marítimo","marítimo","maritiem","ναυτικός","ναυτική","ναυτικό","ναυτικά","ναυτιλιακός","ναυτιλιακή","ναυτιλιακό","ναυτιλιακά","θαλάσσιος","θαλάσσια","θαλάσσιο","denizcilik","tengeri","morski","mere","merendus",""],
"key::79": ["sports","sport","deportes","esportes","Sport","sport","sportwetenschappen","άθληση","γυμναστικήδραστηριότητα","spor","sport","šport","sport","spordi",""],
"key::80": ["surgery","chirurgia","chirurgiche","cirugía","cirurgia","Chirurgie","chirurgie","heelkunde","εγχείρηση","επέμβαση","χειρουργικήεπέμβαση","cerrahi","sebészet","kirurgija","kirurgia",""],
"key::81": ["cultural","culturale","culturali","cultura","cultural","cultural","kulturell","cultureel","πολιτιστικός","πολιτιστική","πολιτιστικό","πολιτισμικός","πολιτισμική","πολιτισμικό","kültürel","kultúrális","kulturni","kultuuri","kultuuriline",""],
"key::82": ["computerscience","informatica","ordenador","computadora","informática","computación","cienciasdelacomputación","ciênciadacomputação","Computer","computer","υπολογιστής","ηλεκτρονικόςυπολογιστής","bilgisayar","számítógép","računalnik","arvuti",""],
"key::83": ["finance","financial","finanza","finanziarie","finanza","financiero","finanças","financeiro","Finanzen","finanziell","financiën","financieel","χρηματοοικονομικά","χρηματοδότηση","finanse","finansal","pénzügy","pénzügyi","finance","finančni","finants","finantsiline",""],
"key::84": ["communication","comunicazione","comuniciación","comunicação","Kommunikation","communication","επικοινωνία","iletişim","kommunikáció","komuniciranje","kommunikatsioon",""],
"key::85": ["justice","giustizia","justicia","justiça","Recht","Justiz","justitie","gerechtigheid","δικαιοσύνη","υπουργείοδικαιοσύνης","δίκαιο","adalet","igazságügy","pravo","õigus",""],
"key::86": ["aerospace","aerospaziale","aerospaziali","aeroespacio","aeroespaço","Luftfahrt","luchtvaart","ruimtevaart","αεροπορικός","αεροπορική","αεροπορικό","αεροναυπηγικός","αεροναυπηγική","αεροναυπηγικό","αεροναυπηγικά","havacılıkveuzay","légtér","zrakoplovstvo","atmosfäär","kosmos",""],
"key::87": ["dermatology","dermatologia","dermatología","dermatologia","Dermatologie","dermatologie","δρματολογία","dermatoloji","bőrgyógyászat","dermatológia","dermatologija","dermatoloogia",""],
"key::88": ["architecture","architettura","arquitectura","arquitetura","Architektur","architectuur","αρχιτεκτονική","mimarlık","építészet","arhitektura","arhitektuur",""],
"key::89": ["mathematics","matematica","matematiche","matemáticas","matemáticas","Mathematik","wiskunde","mathematica","μαθηματικά","matematik","matematika","matematika","matemaatika",""],
"key::90": ["language","lingue","linguistica","linguistiche","lenguaje","idioma","língua","idioma","Sprache","taal","taalkunde","γλώσσα","dil","nyelv","jezik","keel",""],
"key::91": ["neuroscience","neuroscienza","neurociencia","neurociência","Neurowissenschaft","neurowetenschappen","νευροεπιστήμη","nörobilim","idegtudomány","nevroznanost","neuroteadused",""],
"key::92": ["automation","automazione","automatización","automação","Automatisierung","automatisering","αυτοματοποίηση","otomasyon","automatizálás","avtomatizacija","automatiseeritud",""],
"key::93": ["pediatric","pediatria","pediatriche","pediatrico","pediátrico","pediatría","pediátrico","pediatria","pädiatrisch","pediatrische","παιδιατρική","pediatrik","gyermekgyógyászat","pediatrija","pediaatria",""],
"key::94": ["photonics","fotonica","fotoniche","fotónica","fotônica","Photonik","fotonica","φωτονική","fotonik","fotonika","fotonika","fotoonika",""],
"key::95": ["mechanics", "mechanical", "meccanica","meccaniche","mecánica","mecânica","Mechanik","Maschinenbau","mechanica","werktuigkunde","μηχανικής","mekanik","gépészet","mehanika","mehaanika",""],
"key::96": ["psychiatrics","psichiatria","psichiatrica","psichiatriche","psiquiatría","psiquiatria","Psychiatrie","psychiatrie","ψυχιατρική","psikiyatrik","pszihiátria","psihiatrija","psühhaatria",""],
"key::97": ["psychology","fisiologia","psicología","psicologia","Psychologie","psychologie","ψυχολογία","psikoloji","pszihológia","psihologija","psühholoogia",""],
"key::98": ["automotive","industriaautomobilistica","industriadelautomóvil","automotriz","industriaautomotriz","automotivo","Automobilindustrie","autoindustrie","αυτοκίνητος","αυτοκίνητη","αυτοκίνητο","αυτοκινούμενος","αυτοκινούμενη","αυτοκινούμενο","αυτοκινητιστικός","αυτοκινητιστική","αυτοκινητιστικό","otomotiv","autóipari","samogiben","avtomobilskaindustrija","auto-",""],
"key::99": ["neurology","neurologia","neurologiche","neurología","neurologia","Neurologie","neurologie","zenuwleer","νευρολογία","nöroloji","neurológia","ideggyógyászat","nevrologija","neuroloogia",""],
"key::100": ["geology","geologia","geologiche","geología","geologia","Geologie","geologie","aardkunde","γεωλογία","jeoloji","geológia","földtudomány","geologija","geoloogia",""],
"key::101": ["microbiology","microbiologia","micro-biologia","microbiologiche","microbiología","microbiologia","Mikrobiologie","microbiologie","μικροβιολογία","mikrobiyoloji","mikrobiológia","mikrobiologija","mikrobioloogia",""],
"key::102": ["informatics","informatica","informática","informática","informatica",""],
"key::103": ["forschungsgemeinschaft","comunita ricerca","research community","research foundation","research association"],
"key::104": ["commerce","ticaret","ticarət","commercio","trade","handel","comercio"],
"key::105" : ["state", "stato", "etade", "estado", "statale", "etat", "zustand", "estado"],
"key::106" : ["seminary", "seminario", "seminaire", "seminar"],
"key::107" : ["agricultural forestry", "af", "a f"],
"key::108" : ["agricultural mechanical", "am", "a m"],
"key::109" : ["catholic", "catholique", "katholische", "catolica", "cattolica", "catolico"]
}
"blacklists" : {},
"synonyms": {}
}
}

View File

@ -33,10 +33,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.bulktag.community.*;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
import scala.Tuple2;
@ -114,27 +111,35 @@ public class SparkBulkTagJob {
extendCommunityConfigurationForEOSC(spark, inputPath, cc);
execBulkTag(
spark, inputPath, outputPath, protoMap, cc);
execEntityTag(
spark, inputPath + "organization", outputPath + "organization",
Utils.getCommunityOrganization(baseURL), Organization.class, TaggingConstants.CLASS_ID_ORGANIZATION,
TaggingConstants.CLASS_NAME_BULKTAG_ORGANIZATION);
execEntityTag(
spark, inputPath + "project", outputPath + "project", Utils.getCommunityProjects(baseURL),
Project.class, TaggingConstants.CLASS_ID_PROJECT, TaggingConstants.CLASS_NAME_BULKTAG_PROJECT);
execDatasourceTag(spark, inputPath, outputPath, Utils.getDatasourceCommunities(baseURL));
execProjectTag(spark, inputPath, outputPath, Utils.getCommunityProjects(baseURL));
});
}
private static void execProjectTag(SparkSession spark, String inputPath, String outputPath,
CommunityEntityMap communityProjects) {
Dataset<Project> projects = readPath(spark, inputPath + "project", Project.class);
private static <E extends OafEntity> void execEntityTag(SparkSession spark, String inputPath, String outputPath,
CommunityEntityMap communityEntity, Class<E> entityClass,
String classID, String calssName) {
Dataset<E> entity = readPath(spark, inputPath, entityClass);
Dataset<EntityCommunities> pc = spark
.createDataset(
communityProjects
communityEntity
.keySet()
.stream()
.map(k -> EntityCommunities.newInstance(k, communityProjects.get(k)))
.map(k -> EntityCommunities.newInstance(k, communityEntity.get(k)))
.collect(Collectors.toList()),
Encoders.bean(EntityCommunities.class));
projects
.joinWith(pc, projects.col("id").equalTo(pc.col("entityId")), "left")
.map((MapFunction<Tuple2<Project, EntityCommunities>, Project>) t2 -> {
Project ds = t2._1();
entity
.joinWith(pc, entity.col("id").equalTo(pc.col("entityId")), "left")
.map((MapFunction<Tuple2<E, EntityCommunities>, E>) t2 -> {
E ds = t2._1();
if (t2._2() != null) {
List<String> context = Optional
.ofNullable(ds.getContext())
@ -156,8 +161,8 @@ public class SparkBulkTagJob {
false, TaggingConstants.BULKTAG_DATA_INFO_TYPE, true, false,
OafMapperUtils
.qualifier(
TaggingConstants.CLASS_ID_DATASOURCE,
TaggingConstants.CLASS_NAME_BULKTAG_DATASOURCE,
classID,
calssName,
ModelConstants.DNET_PROVENANCE_ACTIONS,
ModelConstants.DNET_PROVENANCE_ACTIONS),
"1")));
@ -166,17 +171,17 @@ public class SparkBulkTagJob {
});
}
return ds;
}, Encoders.bean(Project.class))
}, Encoders.bean(entityClass))
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath + "project");
.json(outputPath);
readPath(spark, outputPath + "project", Project.class)
readPath(spark, outputPath, entityClass)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(inputPath + "project");
.json(inputPath);
}
private static void execDatasourceTag(SparkSession spark, String inputPath, String outputPath,

View File

@ -13,6 +13,9 @@ public class TaggingConstants {
public static final String CLASS_ID_CZENODO = "community:zenodocommunity";
public static final String CLASS_ID_ADVANCED_CONSTRAINT = "community:advconstraint";
public static final String CLASS_ID_PROJECT = "community:project";
public static final String CLASS_ID_ORGANIZATION = "community:organization";
public static final String ZENODO_COMMUNITY_INDICATOR = "zenodo.org/communities/";
public static final String CLASS_NAME_BULKTAG_SUBJECT = "Bulktagging for Community - Subject";
@ -20,5 +23,8 @@ public class TaggingConstants {
public static final String CLASS_NAME_BULKTAG_ZENODO = "Bulktagging for Community - Zenodo";
public static final String CLASS_NAME_BULKTAG_ADVANCED_CONSTRAINT = "Bulktagging for Community - Advanced Constraints";
public static final String CLASS_NAME_BULKTAG_PROJECT = "Bulktagging for Community - Project";
public static final String CLASS_NAME_BULKTAG_ORGANIZATION = "Bulktagging for Community - Organization";
public static final String TAGGING_TRUST = "0.8";
}

View File

@ -465,6 +465,138 @@ public class BulkTagJobTest {
}
/**
 * Runs the bulk tagging job on the sample input and verifies the tagging applied to the
 * organization records: the expected number of records is written, and each bulktagging
 * context carries the organization provenance class id and class name.
 */
@Test
void organizationTag() throws Exception {
	final String workDir = workingDir.toString();
	final String protoMapDir = workDir + "/data/bulktagging/protoMap";

	final String sourcePath = getClass()
		.getResource("/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource/")
		.getPath();
	final String pathMapResource = getClass()
		.getResource("/eu/dnetlib/dhp/bulktag/pathMap/")
		.getPath();

	// make the pathMap resource available under the working directory
	FileSystem
		.getLocal(new Configuration())
		.copyFromLocalFile(
			false,
			new org.apache.hadoop.fs.Path(pathMapResource),
			new org.apache.hadoop.fs.Path(protoMapDir));

	final String[] jobArgs = {
		"-isSparkSessionManaged", Boolean.FALSE.toString(),
		"-sourcePath", sourcePath,
		"-taggingConf", taggingConf,
		"-outputPath", workDir + "/",
		"-baseURL", "https://services.openaire.eu/openaire/community/",
		"-pathMap", protoMapDir + "/pathMap",
		"-nameNode", "local"
	};
	SparkBulkTagJob.main(jobArgs);

	// read back the organizations written by the job
	final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
	final JavaRDD<Organization> organizations = sc
		.textFile(workDir + "/organization")
		.map(item -> OBJECT_MAPPER.readValue(item, Organization.class));
	Assertions.assertEquals(4, organizations.count());

	spark
		.createDataset(organizations.rdd(), Encoders.bean(Organization.class))
		.createOrReplaceTempView("organization");

	// one row per (record, context, datainfo) restricted to the bulktagging provenance
	final String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
		+ "from organization "
		+ "lateral view explode(context) c as MyT "
		+ "lateral view explode(MyT.datainfo) d as MyD "
		+ "where MyD.inferenceprovenance = 'bulktagging'";
	final org.apache.spark.sql.Dataset<Row> tagged = spark.sql(query);
	tagged.show(false);

	Assertions.assertEquals(3, tagged.count());
	Assertions.assertEquals(3, tagged.filter("provenance = 'community:organization'").count());
	Assertions
		.assertEquals(3, tagged.filter("name = 'Bulktagging for Community - Organization'").count());

	// each of these communities is expected exactly once
	for (String community : new String[] { "netherlands", "beopen", "mes" }) {
		Assertions.assertEquals(1, tagged.filter("community = '" + community + "'").count());
	}
}
/**
 * Runs the bulk tagging job on the sample input and verifies the tagging applied to the
 * project records: the expected number of records is written, and each bulktagging
 * context carries the project provenance class id and class name.
 */
@Test
void projectTag() throws Exception {
	final String workDir = workingDir.toString();
	final String protoMapDir = workDir + "/data/bulktagging/protoMap";

	final String sourcePath = getClass()
		.getResource("/eu/dnetlib/dhp/bulktag/sample/publication/update_datasource/")
		.getPath();
	final String pathMapResource = getClass()
		.getResource("/eu/dnetlib/dhp/bulktag/pathMap/")
		.getPath();

	// make the pathMap resource available under the working directory
	FileSystem
		.getLocal(new Configuration())
		.copyFromLocalFile(
			false,
			new org.apache.hadoop.fs.Path(pathMapResource),
			new org.apache.hadoop.fs.Path(protoMapDir));

	final String[] jobArgs = {
		"-isSparkSessionManaged", Boolean.FALSE.toString(),
		"-sourcePath", sourcePath,
		"-taggingConf", taggingConf,
		"-outputPath", workDir + "/",
		"-baseURL", "https://services.openaire.eu/openaire/community/",
		"-pathMap", protoMapDir + "/pathMap",
		"-nameNode", "local"
	};
	SparkBulkTagJob.main(jobArgs);

	// read back the projects written by the job
	final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
	final JavaRDD<Project> projects = sc
		.textFile(workDir + "/project")
		.map(item -> OBJECT_MAPPER.readValue(item, Project.class));
	Assertions.assertEquals(4, projects.count());

	spark
		.createDataset(projects.rdd(), Encoders.bean(Project.class))
		.createOrReplaceTempView("project");

	// one row per (record, context, datainfo) restricted to the bulktagging provenance
	final String query = "select id, MyT.id community, MyD.provenanceaction.classid provenance, MyD.provenanceaction.classname name "
		+ "from project "
		+ "lateral view explode(context) c as MyT "
		+ "lateral view explode(MyT.datainfo) d as MyD "
		+ "where MyD.inferenceprovenance = 'bulktagging'";
	final org.apache.spark.sql.Dataset<Row> tagged = spark.sql(query);
	tagged.show(false);

	Assertions.assertEquals(4, tagged.count());
	Assertions.assertEquals(4, tagged.filter("provenance = 'community:project'").count());
	Assertions
		.assertEquals(4, tagged.filter("name = 'Bulktagging for Community - Project'").count());

	// these communities are expected once each, 'dh-ch' twice
	for (String community : new String[] { "enermaps", "clarin" }) {
		Assertions.assertEquals(1, tagged.filter("community = '" + community + "'").count());
	}
	Assertions.assertEquals(2, tagged.filter("community = 'dh-ch'").count());
}
@Test
void bulktagByZenodoCommunityTest() throws Exception {
final String sourcePath = getClass()

View File

@ -14,4 +14,7 @@ public class ProvisionConstants {
return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
}
public static final String PUBLIC_ALIAS_NAME = "public";
public static final String SHADOW_ALIAS_NAME = "shadow";
}

View File

@ -9,6 +9,7 @@ import org.apache.commons.io.IOUtils;
import org.apache.solr.client.solrj.SolrResponse;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -23,7 +24,7 @@ public class SolrAdminApplication implements Closeable {
private static final Logger log = LoggerFactory.getLogger(SolrAdminApplication.class);
enum Action {
DELETE_BY_QUERY, COMMIT
DELETE_BY_QUERY, COMMIT, UPDATE_ALIASES
}
private final CloudSolrClient solrClient;
@ -39,9 +40,6 @@ public class SolrAdminApplication implements Closeable {
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final String format = parser.get("format");
log.info("format: {}", format);
final Action action = Action.valueOf(parser.get("action"));
log.info("action: {}", action);
@ -59,11 +57,21 @@ public class SolrAdminApplication implements Closeable {
final String zkHost = isLookup.getZkHost();
log.info("zkHost: {}", zkHost);
final String collection = ProvisionConstants.getCollectionName(format);
log.info("collection: {}", collection);
final String publicFormat = parser.get("publicFormat");
log.info("publicFormat: {}", publicFormat);
final String shadowFormat = parser.get("shadowFormat");
log.info("shadowFormat: {}", shadowFormat);
// get collection names from metadata format profiles names
final String publicCollection = ProvisionConstants.getCollectionName(publicFormat);
log.info("publicCollection: {}", publicCollection);
final String shadowCollection = ProvisionConstants.getCollectionName(shadowFormat);
log.info("shadowCollection: {}", shadowCollection);
try (SolrAdminApplication app = new SolrAdminApplication(zkHost)) {
app.execute(action, collection, query, commit);
app.execute(action, query, commit, publicCollection, shadowCollection);
}
}
@ -72,22 +80,29 @@ public class SolrAdminApplication implements Closeable {
this.solrClient = new CloudSolrClient.Builder(zk.getHosts(), zk.getChroot()).build();
}
public SolrResponse commit(String collection) throws IOException, SolrServerException {
return execute(Action.COMMIT, collection, null, true);
public SolrResponse commit(String shadowCollection) throws IOException, SolrServerException {
return execute(Action.COMMIT, null, true, null, shadowCollection);
}
public SolrResponse execute(Action action, String collection, String query, boolean commit)
public SolrResponse execute(Action action, String query, boolean commit,
String publicCollection, String shadowCollection)
throws IOException, SolrServerException {
switch (action) {
case DELETE_BY_QUERY:
UpdateResponse rsp = solrClient.deleteByQuery(collection, query);
UpdateResponse rsp = solrClient.deleteByQuery(shadowCollection, query);
if (commit) {
solrClient.commit(collection);
return solrClient.commit(shadowCollection);
}
return rsp;
case COMMIT:
return solrClient.commit(collection);
return solrClient.commit(shadowCollection);
case UPDATE_ALIASES:
this.updateAliases(publicCollection, shadowCollection);
return null;
default:
throw new IllegalArgumentException("action not managed: " + action);
}
@ -98,4 +113,30 @@ public class SolrAdminApplication implements Closeable {
solrClient.close();
}
/**
 * Re-points the two fixed aliases: both are deleted first, then the public alias is created
 * for {@code publicCollection} and the shadow alias for {@code shadowCollection}.
 *
 * @param publicCollection collection the public alias is created for
 * @param shadowCollection collection the shadow alias is created for
 * @throws SolrServerException propagated from the alias requests
 * @throws IOException propagated from the alias requests
 */
private void updateAliases(String publicCollection, String shadowCollection)
	throws SolrServerException, IOException {
	final String publicAlias = ProvisionConstants.PUBLIC_ALIAS_NAME;
	final String shadowAlias = ProvisionConstants.SHADOW_ALIAS_NAME;

	// remove the existing aliases before re-creating them
	deleteAlias(publicAlias);
	deleteAlias(shadowAlias);

	// bind each alias to its collection
	createAlias(publicAlias, publicCollection);
	createAlias(shadowAlias, shadowCollection);
}
/**
 * Builds a delete-alias request for the given alias name and processes it with the Solr client.
 *
 * @param aliasName name of the alias to delete
 * @return the response produced by processing the request
 */
public SolrResponse deleteAlias(String aliasName) throws SolrServerException, IOException {
	final CollectionAdminRequest.DeleteAlias request = CollectionAdminRequest.deleteAlias(aliasName);

	log.info("deleting alias: {}", aliasName);

	final SolrResponse response = request.process(solrClient);
	return response;
}
/**
 * Builds a create-alias request binding the alias name to the collection and processes it
 * with the Solr client.
 *
 * @param aliasName name of the alias to create
 * @param collection collection the alias is created for
 * @return the response produced by processing the request
 */
public SolrResponse createAlias(String aliasName, String collection) throws IOException, SolrServerException {
	final CollectionAdminRequest.CreateAlias request = CollectionAdminRequest.createAlias(aliasName, collection);

	log.info("creating alias: {} for collection: {}", aliasName, collection);

	final SolrResponse response = request.process(solrClient);
	return response;
}
}

View File

@ -36,7 +36,7 @@ public class SolrRecordDumpJob extends AbstractSolrRecordTransformJob {
private final String inputPath;
private final String format;
private final String shadowFormat;
private final String outputPath;
@ -61,8 +61,8 @@ public class SolrRecordDumpJob extends AbstractSolrRecordTransformJob {
final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
final String format = parser.get("format");
log.info("format: {}", format);
final String shadowFormat = parser.get("shadowFormat");
log.info("shadowFormat: {}", shadowFormat);
final String outputPath = Optional
.ofNullable(parser.get("outputPath"))
@ -95,27 +95,24 @@ public class SolrRecordDumpJob extends AbstractSolrRecordTransformJob {
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
new SolrRecordDumpJob(spark, inputPath, format, outputPath).run(isLookup);
new SolrRecordDumpJob(spark, inputPath, shadowFormat, outputPath).run(isLookup);
});
}
public SolrRecordDumpJob(SparkSession spark, String inputPath, String format, String outputPath) {
public SolrRecordDumpJob(SparkSession spark, String inputPath, String shadowFormat, String outputPath) {
this.spark = spark;
this.inputPath = inputPath;
this.format = format;
this.shadowFormat = shadowFormat;
this.outputPath = outputPath;
}
public void run(ISLookupClient isLookup) throws ISLookUpException, TransformerException {
final String fields = isLookup.getLayoutSource(format);
final String fields = isLookup.getLayoutSource(shadowFormat);
log.info("fields: {}", fields);
final String xslt = isLookup.getLayoutTransformer();
final String dsId = isLookup.getDsId(format);
log.info("dsId: {}", dsId);
final String indexRecordXslt = getLayoutTransformer(format, fields, xslt);
final String indexRecordXslt = getLayoutTransformer(shadowFormat, fields, xslt);
log.info("indexRecordTransformer {}", indexRecordXslt);
final Encoder<TupleWrapper> encoder = Encoders.bean(TupleWrapper.class);

View File

@ -40,6 +40,8 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
private final String format;
private final String shadowCollection;
private final int batchSize;
private final SparkSession spark;
@ -63,8 +65,11 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
final String format = parser.get("format");
log.info("format: {}", format);
final String shadowFormat = parser.get("shadowFormat");
log.info("shadowFormat: {}", shadowFormat);
final String shadowCollection = ProvisionConstants.getCollectionName(shadowFormat);
log.info("shadowCollection: {}", shadowCollection);
final Integer batchSize = Optional
.ofNullable(parser.get("batchSize"))
@ -85,15 +90,17 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final ISLookupClient isLookup = new ISLookupClient(ISLookupClientFactory.getLookUpService(isLookupUrl));
new XmlIndexingJob(spark, inputPath, format, batchSize)
new XmlIndexingJob(spark, inputPath, shadowFormat, shadowCollection, batchSize)
.run(isLookup);
});
}
public XmlIndexingJob(SparkSession spark, String inputPath, String format, Integer batchSize) {
public XmlIndexingJob(SparkSession spark, String inputPath, String format, String shadowCollection,
Integer batchSize) {
this.spark = spark;
this.inputPath = inputPath;
this.format = format;
this.shadowCollection = shadowCollection;
this.batchSize = batchSize;
}
@ -103,12 +110,6 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
final String xslt = isLookup.getLayoutTransformer();
final String dsId = isLookup.getDsId(format);
log.info("dsId: {}", dsId);
final String collection = ProvisionConstants.getCollectionName(format);
log.info("collection: {}", collection);
final String zkHost = isLookup.getZkHost();
log.info("zkHost: {}", zkHost);
@ -130,7 +131,7 @@ public class XmlIndexingJob extends AbstractSolrRecordTransformJob {
.javaRDD()
.map(
t -> new StreamingInputDocumentFactory().parseDocument(t.getXml(), t.getJson()));
DHPSolrSupport.indexDocs(zkHost, collection, batchSize, docs.rdd());
DHPSolrSupport.indexDocs(zkHost, shadowCollection, batchSize, docs.rdd());
}
}

View File

@ -30,11 +30,14 @@ import eu.dnetlib.dhp.schema.solr.Context;
import eu.dnetlib.dhp.schema.solr.Country;
import eu.dnetlib.dhp.schema.solr.Datasource;
import eu.dnetlib.dhp.schema.solr.EoscIfGuidelines;
import eu.dnetlib.dhp.schema.solr.ExternalReference;
import eu.dnetlib.dhp.schema.solr.Instance;
import eu.dnetlib.dhp.schema.solr.Journal;
import eu.dnetlib.dhp.schema.solr.Measure;
import eu.dnetlib.dhp.schema.solr.OpenAccessColor;
import eu.dnetlib.dhp.schema.solr.OpenAccessRoute;
import eu.dnetlib.dhp.schema.solr.Organization;
import eu.dnetlib.dhp.schema.solr.Pid;
import eu.dnetlib.dhp.schema.solr.Project;
import eu.dnetlib.dhp.schema.solr.Result;
import eu.dnetlib.dhp.schema.solr.Subject;
@ -76,6 +79,7 @@ public class ProvisionModelSupport {
r.setCollectedfrom(asProvenance(e.getCollectedfrom()));
r.setContext(asContext(e.getContext(), contextMapper));
r.setPid(asPid(e.getPid()));
r.setMeasures(mapMeasures(e.getMeasures()));
if (e instanceof eu.dnetlib.dhp.schema.oaf.Result) {
r.setResult(mapResult((eu.dnetlib.dhp.schema.oaf.Result) e));
@ -106,6 +110,14 @@ public class ProvisionModelSupport {
final RelatedEntity re = rew.getTarget();
final RecordType relatedRecordType = RecordType.valueOf(re.getType());
final Relation relation = rew.getRelation();
final String relationProvenance = Optional
.ofNullable(relation.getDataInfo())
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(null))
.orElse(null);
rr
.setHeader(
RelatedRecordHeader
@ -113,7 +125,9 @@ public class ProvisionModelSupport {
relation.getRelType(),
relation.getRelClass(),
StringUtils.substringAfter(relation.getTarget(), IdentifierFactory.ID_PREFIX_SEPARATOR),
relatedRecordType));
relatedRecordType,
relationProvenance,
Optional.ofNullable(relation.getDataInfo()).map(DataInfo::getTrust).orElse(null)));
rr.setAcronym(re.getAcronym());
rr.setCode(re.getCode());
@ -131,11 +145,20 @@ public class ProvisionModelSupport {
rr.setOfficialname(re.getOfficialname());
rr.setOpenairecompatibility(mapCodeLabel(re.getOpenairecompatibility()));
rr.setPid(asPid(re.getPid()));
rr.setProjectTitle(rr.getProjectTitle());
rr.setWebsiteurl(re.getWebsiteurl());
rr.setProjectTitle(re.getProjectTitle());
rr.setPublisher(re.getPublisher());
rr.setResulttype(mapQualifier(re.getResulttype()));
rr.setTitle(Optional.ofNullable(re.getTitle()).map(StructuredProperty::getValue).orElse(null));
if (relation.getValidated() == null) {
relation.setValidated(false);
}
if (ModelConstants.OUTCOME.equals(relation.getSubRelType())
&& StringUtils.isNotBlank(relation.getValidationDate())) {
rr.setValidationDate(relation.getValidationDate());
}
return rr;
}
@ -266,6 +289,7 @@ public class ProvisionModelSupport {
ds.setOfficialname(mapField(d.getOfficialname()));
ds.setDescription(mapField(d.getDescription()));
ds.setJournal(mapJournal(d.getJournal()));
ds.setWebsiteurl(mapField(d.getWebsiteurl()));
ds.setLogourl(mapField(d.getLogourl()));
ds.setAccessinfopackage(mapFieldList(d.getAccessinfopackage()));
ds.setCertificates(mapField(d.getCertificates()));
@ -311,6 +335,7 @@ public class ProvisionModelSupport {
ds.setSubjects(asSubjectSP(d.getSubjects()));
ds.setSubmissionpolicyurl(d.getSubmissionpolicyurl());
ds.setThematic(d.getThematic());
ds.setContentpolicies(mapCodeLabel(d.getContentpolicies()));
ds.setVersioncontrol(d.getVersioncontrol());
ds.setVersioning(mapField(d.getVersioning()));
@ -326,6 +351,7 @@ public class ProvisionModelSupport {
rs.setOtherTitles(getOtherTitles(r.getTitle()));
rs.setDescription(mapFieldList(r.getDescription()));
rs.setSubject(asSubject(r.getSubject()));
rs.setLanguage(asLanguage(r.getLanguage()));
rs.setPublicationdate(mapField(r.getDateofacceptance()));
rs.setPublisher(mapField(r.getPublisher()));
rs.setEmbargoenddate(mapField(r.getEmbargoenddate()));
@ -341,17 +367,17 @@ public class ProvisionModelSupport {
rs.setCountry(asCountry(r.getCountry()));
rs.setEoscifguidelines(asEOSCIF(r.getEoscifguidelines()));
rs.setGreen(r.getIsGreen());
rs.setIsGreen(r.getIsGreen());
rs
.setOpenAccessColor(
Optional
.ofNullable(r.getOpenAccessColor())
.map(color -> OpenAccessColor.valueOf(color.toString()))
.orElse(null));
rs.setInDiamondJournal(r.getIsInDiamondJournal());
rs.setIsInDiamondJournal(r.getIsInDiamondJournal());
rs.setPubliclyFunded(r.getPubliclyFunded());
rs.setTransformativeAgreement(r.getTransformativeAgreement());
rs.setExternalReference(mapExternalReference(r.getExternalReference()));
rs.setInstance(mapInstances(r.getInstance()));
if (r instanceof Publication) {
@ -375,6 +401,13 @@ public class ProvisionModelSupport {
return rs;
}
/**
 * Maps a language qualifier to a {@code Language} built from its class id and class name.
 *
 * @param lang the qualifier to map, possibly null
 * @return the mapped language, or null when {@code lang} is null
 */
private static Language asLanguage(Qualifier lang) {
	if (lang == null) {
		return null;
	}
	return Language.newInstance(lang.getClassid(), lang.getClassname());
}
@Nullable
private static List<String> getOtherTitles(List<StructuredProperty> titleList) {
return Optional
@ -422,7 +455,7 @@ public class ProvisionModelSupport {
Instance i = new Instance();
i.setCollectedfrom(asProvenance(instance.getCollectedfrom()));
i.setHostedby(asProvenance(instance.getHostedby()));
i.setFulltext(i.getFulltext());
i.setFulltext(instance.getFulltext());
i.setPid(asPid(instance.getPid()));
i.setAlternateIdentifier(asPid(instance.getAlternateIdentifier()));
i.setAccessright(mapAccessRight(instance.getAccessright()));
@ -453,7 +486,8 @@ public class ProvisionModelSupport {
private static AccessRight mapAccessRight(eu.dnetlib.dhp.schema.oaf.AccessRight accessright) {
return AccessRight
.newInstance(
mapQualifier(accessright),
accessright.getClassid(),
accessright.getClassname(),
Optional
.ofNullable(accessright.getOpenAccessRoute())
.map(route -> OpenAccessRoute.valueOf(route.toString()))
@ -508,7 +542,46 @@ public class ProvisionModelSupport {
}
private static Provenance asProvenance(KeyValue keyValue) {
return Optional.ofNullable(keyValue).map(cf -> Provenance.newInstance(cf.getKey(), cf.getValue())).orElse(null);
return Optional
.ofNullable(keyValue)
.map(
kv -> Provenance
.newInstance(
StringUtils.substringAfter(kv.getKey(), IdentifierFactory.ID_PREFIX_SEPARATOR),
kv.getValue()))
.orElse(null);
}
/**
 * Maps each measure to its counterpart built from the measure id and the units converted
 * via {@code mapCodeLabelKV}.
 *
 * @param measures the measures to map, possibly null
 * @return the mapped measures, or null when {@code measures} is null
 */
private static List<Measure> mapMeasures(List<eu.dnetlib.dhp.schema.oaf.Measure> measures) {
	if (measures == null) {
		return null;
	}
	return measures
		.stream()
		.map(measure -> Measure.newInstance(measure.getId(), mapCodeLabelKV(measure.getUnit())))
		.collect(Collectors.toList());
}
/**
 * Maps each external reference to its counterpart, copying site name, label, alternate label,
 * url, qualifier (as code/label), reference identifier and query.
 *
 * @param externalReference the references to map, possibly null
 * @return the mapped references, or a new empty list when {@code externalReference} is null
 */
private static List<ExternalReference> mapExternalReference(
	List<eu.dnetlib.dhp.schema.oaf.ExternalReference> externalReference) {
	if (externalReference == null) {
		return Lists.newArrayList();
	}
	return externalReference
		.stream()
		.map(
			ref -> ExternalReference
				.newInstance(
					ref.getSitename(),
					ref.getLabel(),
					ref.getAlternateLabel(),
					ref.getUrl(),
					mapCodeLabel(ref.getQualifier()),
					ref.getRefidentifier(),
					ref.getQuery()))
		.collect(Collectors.toList());
}
private static List<Context> asContext(List<eu.dnetlib.dhp.schema.oaf.Context> ctxList,
@ -529,7 +602,7 @@ public class ProvisionModelSupport {
}
return Optional
.ofNullable(contexts)
.of(contexts)
.map(
ctx -> ctx
.stream()
@ -581,7 +654,14 @@ public class ProvisionModelSupport {
.map(
pids -> pids
.stream()
.map(p -> Pid.newInstance(p.getQualifier().getClassname(), p.getValue()))
.filter(p -> Objects.nonNull(p.getQualifier()))
.filter(p -> Objects.nonNull(p.getQualifier().getClassid()))
.map(
p -> Pid
.newInstance(
p.getValue(),
p.getQualifier().getClassid(),
p.getQualifier().getClassname()))
.collect(Collectors.toList()))
.orElse(null);
}
@ -607,7 +687,9 @@ public class ProvisionModelSupport {
.stream()
.filter(s -> Objects.nonNull(s.getQualifier()))
.filter(s -> Objects.nonNull(s.getQualifier().getClassname()))
.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassname()))
.map(
s -> Subject
.newInstance(s.getValue(), s.getQualifier().getClassid(), s.getQualifier().getClassname()))
.collect(Collectors.toList()))
.orElse(null);
}
@ -620,7 +702,9 @@ public class ProvisionModelSupport {
.stream()
.filter(s -> Objects.nonNull(s.getQualifier()))
.filter(s -> Objects.nonNull(s.getQualifier().getClassname()))
.map(s -> Subject.newInstance(s.getValue(), s.getQualifier().getClassname()))
.map(
s -> Subject
.newInstance(s.getValue(), s.getQualifier().getClassid(), s.getQualifier().getClassname()))
.collect(Collectors.toList()))
.orElse(null);
}
@ -689,7 +773,7 @@ public class ProvisionModelSupport {
private static CodeLabel mapCodeLabel(KeyValue kv) {
return Optional
.ofNullable(kv)
.map(q -> CodeLabel.newInstance(kv.getKey(), kv.getValue()))
.map(k -> CodeLabel.newInstance(k.getKey(), k.getValue()))
.orElse(null);
}

View File

@ -219,6 +219,13 @@ public class XmlRecordFactory implements Serializable {
if (entity.getMeasures() != null) {
metadata.addAll(measuresAsXml(entity.getMeasures()));
}
if (entity.getContext() != null) {
contexts.addAll(entity.getContext().stream().map(Context::getId).collect(Collectors.toList()));
/* FIXME: Workaround for CLARIN mining issue: #3670#note-29 */
if (contexts.contains("dh-ch::subcommunity::2")) {
contexts.add("clarin");
}
}
if (ModelSupport.isResult(type)) {
final Result r = (Result) entity;
@ -245,14 +252,6 @@ public class XmlRecordFactory implements Serializable {
.collect(Collectors.toList()));
}
if (r.getContext() != null) {
contexts.addAll(r.getContext().stream().map(c -> c.getId()).collect(Collectors.toList()));
/* FIXME: Workaround for CLARIN mining issue: #3670#note-29 */
if (contexts.contains("dh-ch::subcommunity::2")) {
contexts.add("clarin");
}
}
if (r.getTitle() != null) {
metadata
.addAll(
@ -1315,7 +1314,7 @@ public class XmlRecordFactory implements Serializable {
instance
.getCollectedfrom()
.stream()
.filter(cf -> kvNotBlank(cf))
.filter(XmlRecordFactory::kvNotBlank)
.map(cf -> XmlSerializationUtils.mapKeyValue("collectedfrom", cf))
.collect(Collectors.toList()));
}
@ -1326,7 +1325,7 @@ public class XmlRecordFactory implements Serializable {
instance
.getHostedby()
.stream()
.filter(hb -> kvNotBlank(hb))
.filter(XmlRecordFactory::kvNotBlank)
.map(hb -> XmlSerializationUtils.mapKeyValue("hostedby", hb))
.collect(Collectors.toList()));
}
@ -1336,7 +1335,7 @@ public class XmlRecordFactory implements Serializable {
instance
.getDateofacceptance()
.stream()
.filter(d -> isNotBlank(d))
.filter(StringUtils::isNotBlank)
.map(d -> XmlSerializationUtils.asXmlElement("dateofacceptance", d))
.collect(Collectors.toList()));
}
@ -1346,7 +1345,7 @@ public class XmlRecordFactory implements Serializable {
instance
.getInstancetype()
.stream()
.filter(t -> !StringUtils.isNotBlank(t.getClassid()))
.filter(t -> StringUtils.isNotBlank(t.getClassid()))
.map(t -> XmlSerializationUtils.mapQualifier("instancetype", t))
.collect(Collectors.toList()));
}
@ -1356,7 +1355,7 @@ public class XmlRecordFactory implements Serializable {
instance
.getDistributionlocation()
.stream()
.filter(d -> isNotBlank(d))
.filter(StringUtils::isNotBlank)
.map(d -> XmlSerializationUtils.asXmlElement("distributionlocation", d))
.collect(Collectors.toList()));
}
@ -1409,7 +1408,7 @@ public class XmlRecordFactory implements Serializable {
instance
.getLicense()
.stream()
.filter(d -> isNotBlank(d))
.filter(StringUtils::isNotBlank)
.map(d -> XmlSerializationUtils.asXmlElement("license", d))
.collect(Collectors.toList()));
}
@ -1540,11 +1539,16 @@ public class XmlRecordFactory implements Serializable {
.min(new RefereedComparator())
.orElse(XmlInstance.UNKNOWN_REVIEW_LEVEL));
Map<String, Qualifier> instanceTypes = Maps.newHashMap();
instances.forEach(p -> {
final Instance i = p.getRight();
instance.getCollectedfrom().add(i.getCollectedfrom());
instance.getHostedby().add(i.getHostedby());
instance.getInstancetype().add(i.getInstancetype());
if (Optional.ofNullable(i.getInstancetype()).map(Qualifier::getClassid).isPresent()) {
instanceTypes.putIfAbsent(i.getInstancetype().getClassid(), i.getInstancetype());
}
instance
.setProcessingchargeamount(
Optional.ofNullable(i.getProcessingchargeamount()).map(apc -> apc.getValue()).orElse(null));
@ -1571,6 +1575,8 @@ public class XmlRecordFactory implements Serializable {
.ifPresent(instance::setFulltext);
});
instance.getInstancetype().addAll(instanceTypes.values());
if (instance.getHostedby().size() > 1
&& instance.getHostedby().stream().anyMatch(hb -> ModelConstants.UNKNOWN_REPOSITORY.equals(hb))) {
instance.getHostedby().remove(ModelConstants.UNKNOWN_REPOSITORY);
@ -1596,9 +1602,7 @@ public class XmlRecordFactory implements Serializable {
private List<String> buildContexts(final String type, final Set<String> contexts) {
final List<String> res = Lists.newArrayList();
if (contextMapper != null
&& !contextMapper.isEmpty()
&& MainEntityType.result.toString().equals(type)) {
if (contextMapper != null && !contextMapper.isEmpty()) {
XMLTag document = XMLDoc.newDocument(true).addRoot("contextRoot");

View File

@ -13,7 +13,7 @@
},
{
"paramName": "f",
"paramLongName": "format",
"paramLongName": "shadowFormat",
"paramDescription": "MDFormat name found in the IS profile",
"paramRequired": true
},

View File

@ -13,8 +13,8 @@
},
{
"paramName": "f",
"paramLongName": "format",
"paramDescription": "MDFormat name found in the IS profile",
"paramLongName": "shadowFormat",
"paramDescription": "MDFormat name found in the IS profile bound to the shadow index collection to feed",
"paramRequired": true
},
{

View File

@ -5,12 +5,6 @@
"paramDescription": "the URL to the ISLookUp Service",
"paramRequired": true
},
{
"paramName": "f",
"paramLongName": "format",
"paramDescription": "metadata format profile name",
"paramRequired": true
},
{
"paramName": "a",
"paramLongName": "action",
@ -28,5 +22,18 @@
"paramLongName": "commit",
"paramDescription": "should the action be followed by a commit?",
"paramRequired": false
},
{
"paramName": "pf",
"paramLongName": "publicFormat",
"paramDescription": "the name of the public metadata format profile - used to create an alias",
"paramRequired": false
},
{
"paramName": "sf",
"paramLongName": "shadowFormat",
"paramDescription": "the name of the shadow metadata format profile - used to create an alias",
"paramRequired": false
}
]

View File

@ -35,7 +35,7 @@
<description>maximum number of relations allowed for each entity grouping by target</description>
</property>
<property>
<name>format</name>
<name>shadowFormat</name>
<description>metadata format name (DMF|TMF)</description>
</property>
<property>
@ -133,6 +133,7 @@
<case to="create_payloads">${wf:conf('resumeFrom') eq 'create_payloads'}</case>
<case to="drop_solr_collection">${wf:conf('resumeFrom') eq 'drop_solr_collection'}</case>
<case to="to_solr_index">${wf:conf('resumeFrom') eq 'to_solr_index'}</case>
<case to="update_solr_aliases">${wf:conf('resumeFrom') eq 'update_solr_aliases'}</case>
<default to="prepare_relations"/>
</switch>
</decision>
@ -641,8 +642,8 @@
</configuration>
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--format</arg><arg>${format}</arg>
<arg>--action</arg><arg>DELETE_BY_QUERY</arg>
<arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
<arg>--query</arg><arg>${solrDeletionQuery}</arg>
<arg>--commit</arg><arg>true</arg>
</java>
@ -672,7 +673,7 @@
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/xml_json</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--format</arg><arg>${format}</arg>
<arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
<arg>--batchSize</arg><arg>${batchSize}</arg>
</spark>
<ok to="commit_solr_collection"/>
@ -689,7 +690,7 @@
</configuration>
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--format</arg><arg>${format}</arg>
<arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
<arg>--action</arg><arg>COMMIT</arg>
</java>
<ok to="End"/>
@ -714,12 +715,31 @@
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/xml_json</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--format</arg><arg>${format}</arg>
<arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
<arg>--outputPath</arg><arg>${workingDir}/solr_documents</arg>
</spark>
<ok to="End"/>
<error to="Kill"/>
</action>
<!-- Action that updates the solr core aliases - out of order execution, only using the 'resumeFrom' param -->
<action name="update_solr_aliases">
<java>
<configuration>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>
<main-class>eu.dnetlib.dhp.oa.provision.SolrAdminApplication</main-class>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--action</arg><arg>UPDATE_ALIASES</arg>
<arg>--publicFormat</arg><arg>${publicFormat}</arg>
<arg>--shadowFormat</arg><arg>${shadowFormat}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -5,6 +5,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.io.IOException;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Arrays;
@ -16,6 +17,9 @@ import javax.xml.transform.TransformerException;
import org.apache.commons.io.IOUtils;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrInputDocument;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@ -34,7 +38,6 @@ import eu.dnetlib.dhp.utils.saxon.SaxonTransformerFactory;
/**
* This test can be used to produce a record that can be manually fed to Solr in XML format.
*
* The input is a JoinedEntity, i.e. a json representation of an OpenAIRE entity that embeds all the linked entities.
*/
public class IndexRecordTransformerTest {
@ -54,7 +57,7 @@ public class IndexRecordTransformerTest {
}
@Test
public void testPublicationRecordTransformation() throws IOException, TransformerException {
public void testPublicationRecordTransformation() throws IOException, TransformerException, DocumentException {
final XmlRecordFactory xmlRecordFactory = new XmlRecordFactory(contextMapper, false,
PayloadConverterJob.schemaLocation);
@ -71,11 +74,15 @@ public class IndexRecordTransformerTest {
new RelatedEntityWrapper(rel,
CreateRelatedEntitiesJob_phase1.asRelatedEntity(pj, Project.class))));
final String record = xmlRecordFactory.build(je);
final String xmlRecord = xmlRecordFactory.build(je);
assertNotNull(record);
assertNotNull(xmlRecord);
testRecordTransformation(record);
Document doc = new SAXReader().read(new StringReader(xmlRecord));
assertEquals("Article", doc.valueOf("//children/instance/instancetype/@classname"));
testRecordTransformation(xmlRecord);
}
@Test

View File

@ -4,16 +4,20 @@ package eu.dnetlib.dhp.oa.provision;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.apache.solr.client.solrj.request.SolrPing;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
import org.apache.solr.client.solrj.response.SolrPingResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
class SolrAdminApplicationTest extends SolrTest {
@Test
void testPing() throws Exception {
SolrPingResponse pingResponse = miniCluster.getSolrClient().ping();
final SolrPing ping = new SolrPing();
ping.getParams().set("collection", ProvisionConstants.SHADOW_ALIAS_NAME);
SolrPingResponse pingResponse = ping.process(miniCluster.getSolrClient());
log.info("pingResponse: '{}'", pingResponse.getStatus());
assertEquals(0, pingResponse.getStatus());
}
@ -24,7 +28,7 @@ class SolrAdminApplicationTest extends SolrTest {
SolrAdminApplication admin = new SolrAdminApplication(miniCluster.getSolrClient().getZkHost());
UpdateResponse rsp = (UpdateResponse) admin
.execute(SolrAdminApplication.Action.DELETE_BY_QUERY, DEFAULT_COLLECTION, "*:*", false);
.execute(SolrAdminApplication.Action.DELETE_BY_QUERY, "*:*", false, null, SHADOW_COLLECTION);
assertEquals(0, rsp.getStatus());
}
@ -34,9 +38,30 @@ class SolrAdminApplicationTest extends SolrTest {
SolrAdminApplication admin = new SolrAdminApplication(miniCluster.getSolrClient().getZkHost());
UpdateResponse rsp = (UpdateResponse) admin.commit(DEFAULT_COLLECTION);
UpdateResponse rsp = (UpdateResponse) admin.commit(SHADOW_COLLECTION);
assertEquals(0, rsp.getStatus());
}
/**
 * Asks the admin application to create the alias named {@code ProvisionConstants.PUBLIC_ALIAS_NAME},
 * passing {@code SHADOW_COLLECTION} as second argument (presumably the collection the alias points to),
 * and expects the response status to be 0.
 *
 * @throws Exception if the admin application cannot be created or the alias request fails
 */
@Test
void testAdminApplication_CREATE_ALIAS() throws Exception {
    final String zkHost = miniCluster.getSolrClient().getZkHost();
    final SolrAdminApplication solrAdmin = new SolrAdminApplication(zkHost);

    final CollectionAdminResponse aliasResponse = (CollectionAdminResponse) solrAdmin
        .createAlias(ProvisionConstants.PUBLIC_ALIAS_NAME, SHADOW_COLLECTION);

    assertEquals(0, aliasResponse.getStatus());
}
/**
 * Asks the admin application to delete the alias named {@code ProvisionConstants.PUBLIC_ALIAS_NAME}
 * and expects the response status to be 0.
 *
 * @throws Exception if the admin application cannot be created or the alias request fails
 */
@Test
void testAdminApplication_DELETE_ALIAS() throws Exception {
    final String zkHost = miniCluster.getSolrClient().getZkHost();
    final SolrAdminApplication solrAdmin = new SolrAdminApplication(zkHost);

    final CollectionAdminResponse aliasResponse = (CollectionAdminResponse) solrAdmin
        .deleteAlias(ProvisionConstants.PUBLIC_ALIAS_NAME);

    assertEquals(0, aliasResponse.getStatus());
}
}

View File

@ -1,21 +1,40 @@
package eu.dnetlib.dhp.oa.provision;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.nio.file.Path;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.embedded.JettyConfig;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.ConfigSetAdminRequest;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.cloud.MiniSolrCloudCluster;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.CoreAdminParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.junit.jupiter.api.io.TempDir;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.oa.provision.model.SerializableSolrInputDocument;
import eu.dnetlib.dhp.oa.provision.utils.ISLookupClient;
@ -23,7 +42,18 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ExtendWith(MockitoExtension.class)
public class SolrConfigExploreTest extends SolrExploreTest {
public class SolrConfigExploreTest {
protected static final Logger log = LoggerFactory.getLogger(SolrConfigExploreTest.class);
protected static final String SHADOW_FORMAT = "c1";
protected static final String SHADOW_COLLECTION = SHADOW_FORMAT + "-index-openaire";
protected static final String PUBLIC_FORMAT = "c2";
protected static final String PUBLIC_COLLECTION = PUBLIC_FORMAT + "-index-openaire";
protected static final String CONFIG_NAME = "testConfig";
protected static SolrAdminApplication admin;
protected static SparkSession spark;
@ -35,15 +65,17 @@ public class SolrConfigExploreTest extends SolrExploreTest {
@Mock
private ISLookupClient isLookupClient;
@TempDir
public static Path workingDir;
protected static MiniSolrCloudCluster miniCluster;
@BeforeEach
public void prepareMocks() throws ISLookUpException, IOException {
isLookupClient.setIsLookup(isLookUpService);
int solrPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort();
Mockito
.when(isLookupClient.getDsId(Mockito.anyString()))
.thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", solrPort));
Mockito
.when(isLookupClient.getLayoutSource(Mockito.anyString()))
@ -54,7 +86,7 @@ public class SolrConfigExploreTest extends SolrExploreTest {
}
@BeforeAll
public static void before() {
public static void setup() throws Exception {
SparkConf conf = new SparkConf();
conf.setAppName(XmlIndexingJobTest.class.getSimpleName());
@ -70,15 +102,75 @@ public class SolrConfigExploreTest extends SolrExploreTest {
spark = SparkSession
.builder()
.appName(XmlIndexingJobTest.class.getSimpleName())
.appName(SolrConfigExploreTest.class.getSimpleName())
.config(conf)
.getOrCreate();
// random unassigned HTTP port
final int jettyPort = 0;
final JettyConfig jettyConfig = JettyConfig.builder().setPort(jettyPort).build();
log.info(String.format("working directory: %s", workingDir.toString()));
System.setProperty("solr.log.dir", workingDir.resolve("logs").toString());
// create a MiniSolrCloudCluster instance
miniCluster = new MiniSolrCloudCluster(2, workingDir.resolve("solr"), jettyConfig);
// Upload Solr configuration directory to ZooKeeper
String solrZKConfigDir = "src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/exploreTestConfig";
File configDir = new File(solrZKConfigDir);
miniCluster.uploadConfigSet(configDir.toPath(), CONFIG_NAME);
// override settings in the solrconfig include
System.setProperty("solr.tests.maxBufferedDocs", "100000");
System.setProperty("solr.tests.maxIndexingThreads", "-1");
System.setProperty("solr.tests.ramBufferSizeMB", "100");
// use non-test classes so RandomizedRunner isn't necessary
System.setProperty("solr.tests.mergeScheduler", "org.apache.lucene.index.ConcurrentMergeScheduler");
System.setProperty("solr.directoryFactory", "solr.RAMDirectoryFactory");
System.setProperty("solr.lock.type", "single");
log.info(new ConfigSetAdminRequest.List().process(miniCluster.getSolrClient()).toString());
log
.info(
CollectionAdminRequest.ClusterStatus
.getClusterStatus()
.process(miniCluster.getSolrClient())
.toString());
NamedList<Object> res = createCollection(
miniCluster.getSolrClient(), SHADOW_COLLECTION, 4, 2, 20, CONFIG_NAME);
res.forEach(o -> log.info(o.toString()));
// miniCluster.getSolrClient().setDefaultCollection(SHADOW_COLLECTION);
res = createCollection(
miniCluster.getSolrClient(), PUBLIC_COLLECTION, 4, 2, 20, CONFIG_NAME);
res.forEach(o -> log.info(o.toString()));
admin = new SolrAdminApplication(miniCluster.getZkClient().getZkServerAddress());
CollectionAdminResponse rsp = (CollectionAdminResponse) admin
.createAlias(ProvisionConstants.PUBLIC_ALIAS_NAME, PUBLIC_COLLECTION);
assertEquals(0, rsp.getStatus());
rsp = (CollectionAdminResponse) admin.createAlias(ProvisionConstants.SHADOW_ALIAS_NAME, SHADOW_COLLECTION);
assertEquals(0, rsp.getStatus());
log
.info(
CollectionAdminRequest.ClusterStatus
.getClusterStatus()
.process(miniCluster.getSolrClient())
.toString());
}
@AfterAll
public static void tearDown() {
public static void tearDown() throws Exception {
spark.stop();
miniCluster.shutdown();
FileUtils.deleteDirectory(workingDir.toFile());
}
@Test
@ -86,8 +178,10 @@ public class SolrConfigExploreTest extends SolrExploreTest {
String inputPath = "src/test/resources/eu/dnetlib/dhp/oa/provision/xml";
new XmlIndexingJob(spark, inputPath, FORMAT, batchSize).run(isLookupClient);
Assertions.assertEquals(0, miniCluster.getSolrClient().commit().getStatus());
new XmlIndexingJob(spark, inputPath, SHADOW_FORMAT, ProvisionConstants.SHADOW_ALIAS_NAME, batchSize)
.run(isLookupClient);
Assertions
.assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus());
String[] queryStrings = {
"cancer",
@ -101,14 +195,14 @@ public class SolrConfigExploreTest extends SolrExploreTest {
query.add(CommonParams.Q, q);
query.set("debugQuery", "on");
log.info("Submit query to Solr with params: {}", query.toString());
QueryResponse rsp = miniCluster.getSolrClient().query(query);
log.info("Submit query to Solr with params: {}", query);
QueryResponse rsp = miniCluster.getSolrClient().query(ProvisionConstants.SHADOW_ALIAS_NAME, query);
// System.out.println(rsp.getHighlighting());
// System.out.println(rsp.getExplainMap());
for (SolrDocument doc : rsp.getResults()) {
System.out
.println(
log
.info(
doc.get("score") + "\t" +
doc.get("__indexrecordidentifier") + "\t" +
doc.get("resultidentifier") + "\t" +
@ -122,4 +216,18 @@ public class SolrConfigExploreTest extends SolrExploreTest {
}
}
}
/**
 * Sends a {@code CREATE} action to the {@code /admin/collections} path of the given client.
 *
 * @param client the Solr client used to submit the request
 * @param name value of the {@code name} request parameter
 * @param numShards value of the {@code numShards} request parameter
 * @param replicationFactor value of the {@code replicationFactor} request parameter
 * @param maxShardsPerNode value of the {@code maxShardsPerNode} request parameter
 * @param configName value of the {@code collection.configName} request parameter
 * @return the raw response returned by the client for the request
 * @throws Exception if the request cannot be executed
 */
protected static NamedList<Object> createCollection(CloudSolrClient client, String name, int numShards,
    int replicationFactor, int maxShardsPerNode, String configName) throws Exception {
    // parameters are added in the same order as before; each set(...) returns the params object itself
    final ModifiableSolrParams createParams = new ModifiableSolrParams()
        .set(CoreAdminParams.ACTION, CollectionParams.CollectionAction.CREATE.name())
        .set("name", name)
        .set("numShards", numShards)
        .set("replicationFactor", replicationFactor)
        .set("collection.configName", configName)
        .set("maxShardsPerNode", maxShardsPerNode);

    final QueryRequest createRequest = new QueryRequest(createParams);
    createRequest.setPath("/admin/collections");

    return client.request(createRequest);
}
}

View File

@ -2,24 +2,15 @@
package eu.dnetlib.dhp.oa.provision;
import java.io.IOException;
import java.io.StringReader;
import java.net.URI;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.params.CommonParams;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.dom4j.io.SAXReader;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
@ -50,9 +41,6 @@ public class SolrConfigTest extends SolrTest {
int solrPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort();
Mockito
.when(isLookupClient.getDsId(Mockito.anyString()))
.thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", solrPort));
Mockito
.when(isLookupClient.getLayoutSource(Mockito.anyString()))
@ -95,9 +83,10 @@ public class SolrConfigTest extends SolrTest {
String inputPath = "src/test/resources/eu/dnetlib/dhp/oa/provision/xml";
new XmlIndexingJob(spark, inputPath, FORMAT, batchSize)
new XmlIndexingJob(spark, inputPath, SHADOW_FORMAT, ProvisionConstants.SHADOW_ALIAS_NAME, batchSize)
.run(isLookupClient);
Assertions.assertEquals(0, miniCluster.getSolrClient().commit().getStatus());
Assertions
.assertEquals(0, miniCluster.getSolrClient().commit(ProvisionConstants.SHADOW_ALIAS_NAME).getStatus());
String[] queryStrings = {
"cancer",
@ -109,8 +98,8 @@ public class SolrConfigTest extends SolrTest {
SolrQuery query = new SolrQuery();
query.add(CommonParams.Q, q);
log.info("Submit query to Solr with params: {}", query.toString());
QueryResponse rsp = miniCluster.getSolrClient().query(query);
log.info("Submit query to Solr with params: {}", query);
QueryResponse rsp = miniCluster.getSolrClient().query(ProvisionConstants.SHADOW_ALIAS_NAME, query);
for (SolrDocument doc : rsp.getResults()) {
System.out

View File

@ -34,58 +34,6 @@ public abstract class SolrExploreTest {
@TempDir
public static Path workingDir;
/**
 * One-time fixture for the test class: starts a {@link MiniSolrCloudCluster} rooted in {@code workingDir},
 * uploads the "exploreTestConfig" configuration directory under the name {@code CONFIG_NAME}, creates
 * {@code DEFAULT_COLLECTION} and sets it as the default collection of the cluster's Solr client.
 * The config sets and the cluster status are logged along the way.
 *
 * @throws Exception if any of the cluster, config set or collection operations fails
 */
@BeforeAll
public static void setup() throws Exception {
// random unassigned HTTP port
final int jettyPort = 0;
final JettyConfig jettyConfig = JettyConfig.builder().setPort(jettyPort).build();
log.info(String.format("working directory: %s", workingDir.toString()));
System.setProperty("solr.log.dir", workingDir.resolve("logs").toString());
// create a MiniSolrCloudCluster instance
// NOTE(review): the first argument (2) is presumably the number of Jetty nodes to start - confirm
miniCluster = new MiniSolrCloudCluster(2, workingDir.resolve("solr"), jettyConfig);
// Upload Solr configuration directory to ZooKeeper
String solrZKConfigDir = "src/test/resources/eu/dnetlib/dhp/oa/provision/solr/conf/exploreTestConfig";
File configDir = new File(solrZKConfigDir);
miniCluster.uploadConfigSet(configDir.toPath(), CONFIG_NAME);
// override settings in the solrconfig include
// (these properties are set after the config set upload but before the collection is created)
System.setProperty("solr.tests.maxBufferedDocs", "100000");
System.setProperty("solr.tests.maxIndexingThreads", "-1");
System.setProperty("solr.tests.ramBufferSizeMB", "100");
// use non-test classes so RandomizedRunner isn't necessary
System.setProperty("solr.tests.mergeScheduler", "org.apache.lucene.index.ConcurrentMergeScheduler");
System.setProperty("solr.directoryFactory", "solr.RAMDirectoryFactory");
System.setProperty("solr.lock.type", "single");
// log the available config sets and the cluster status before creating the collection
log.info(new ConfigSetAdminRequest.List().process(miniCluster.getSolrClient()).toString());
log
.info(
CollectionAdminRequest.ClusterStatus
.getClusterStatus()
.process(miniCluster.getSolrClient())
.toString());
// NOTE(review): 4, 2, 20 are presumably numShards, replicationFactor and maxShardsPerNode - verify
// against the createCollection signature of this class
NamedList<Object> res = createCollection(
miniCluster.getSolrClient(), DEFAULT_COLLECTION, 4, 2, 20, CONFIG_NAME);
res.forEach(o -> log.info(o.toString()));
// requests issued through this client without an explicit collection will use DEFAULT_COLLECTION
miniCluster.getSolrClient().setDefaultCollection(DEFAULT_COLLECTION);
// log the cluster status again, now that the collection exists
log
.info(
CollectionAdminRequest.ClusterStatus
.getClusterStatus()
.process(miniCluster.getSolrClient())
.toString());
}
@AfterAll
public static void shutDown() throws Exception {
miniCluster.shutdown();

View File

@ -1,6 +1,8 @@
package eu.dnetlib.dhp.oa.provision;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.File;
import java.nio.file.Path;
@ -10,6 +12,7 @@ import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.ConfigSetAdminRequest;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
import org.apache.solr.cloud.MiniSolrCloudCluster;
import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.params.CoreAdminParams;
@ -21,14 +24,21 @@ import org.junit.jupiter.api.io.TempDir;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import sun.security.provider.SHA;
public abstract class SolrTest {
protected static final Logger log = LoggerFactory.getLogger(SolrTest.class);
protected static final String FORMAT = "test";
protected static final String DEFAULT_COLLECTION = FORMAT + "-index-openaire";
protected static final String SHADOW_FORMAT = "c1";
protected static final String SHADOW_COLLECTION = SHADOW_FORMAT + "-index-openaire";
protected static final String PUBLIC_FORMAT = "c2";
protected static final String PUBLIC_COLLECTION = PUBLIC_FORMAT + "-index-openaire";
protected static final String CONFIG_NAME = "testConfig";
protected static SolrAdminApplication admin;
protected static MiniSolrCloudCluster miniCluster;
@TempDir
@ -72,10 +82,21 @@ public abstract class SolrTest {
.toString());
NamedList<Object> res = createCollection(
miniCluster.getSolrClient(), DEFAULT_COLLECTION, 4, 2, 20, CONFIG_NAME);
miniCluster.getSolrClient(), SHADOW_COLLECTION, 4, 2, 20, CONFIG_NAME);
res.forEach(o -> log.info(o.toString()));
miniCluster.getSolrClient().setDefaultCollection(DEFAULT_COLLECTION);
// miniCluster.getSolrClient().setDefaultCollection(SHADOW_COLLECTION);
res = createCollection(
miniCluster.getSolrClient(), PUBLIC_COLLECTION, 4, 2, 20, CONFIG_NAME);
res.forEach(o -> log.info(o.toString()));
admin = new SolrAdminApplication(miniCluster.getZkClient().getZkServerAddress());
CollectionAdminResponse rsp = (CollectionAdminResponse) admin
.createAlias(ProvisionConstants.PUBLIC_ALIAS_NAME, PUBLIC_COLLECTION);
assertEquals(0, rsp.getStatus());
rsp = (CollectionAdminResponse) admin.createAlias(ProvisionConstants.SHADOW_ALIAS_NAME, SHADOW_COLLECTION);
assertEquals(0, rsp.getStatus());
log
.info(
@ -83,12 +104,12 @@ public abstract class SolrTest {
.getClusterStatus()
.process(miniCluster.getSolrClient())
.toString());
}
/**
 * Releases the resources shared by the tests: shuts down the mini Solr cluster, closes the admin
 * application and deletes the working directory.
 *
 * Each step runs even when a previous one throws, and the cluster / admin references are checked for
 * null, so that a setup that failed half-way is not masked by a NullPointerException raised here and
 * the working directory is always cleaned up.
 *
 * @throws Exception if any of the cleanup steps fails
 */
@AfterAll
public static void shutDown() throws Exception {
    try {
        if (miniCluster != null) {
            miniCluster.shutdown();
        }
    } finally {
        try {
            if (admin != null) {
                admin.close();
            }
        } finally {
            FileUtils.deleteDirectory(workingDir.toFile());
        }
    }
}

View File

@ -10,6 +10,7 @@ import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrResponse;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.params.CommonParams;
import org.apache.spark.SparkConf;
@ -50,9 +51,6 @@ public class XmlIndexingJobTest extends SolrTest {
int solrPort = URI.create("http://" + miniCluster.getZkClient().getZkServerAddress()).getPort();
Mockito
.when(isLookupClient.getDsId(Mockito.anyString()))
.thenReturn("313f0381-23b6-466f-a0b8-c72a9679ac4b_SW5kZXhEU1Jlc291cmNlcy9JbmRleERTUmVzb3VyY2VUeXBl");
Mockito.when(isLookupClient.getZkHost()).thenReturn(String.format("127.0.0.1:%s/solr", solrPort));
Mockito
.when(isLookupClient.getLayoutSource(Mockito.anyString()))
@ -103,46 +101,72 @@ public class XmlIndexingJobTest extends SolrTest {
long nRecord = records.count();
new XmlIndexingJob(spark, inputPath, FORMAT, batchSize).run(isLookupClient);
new XmlIndexingJob(spark, inputPath, SHADOW_FORMAT, ProvisionConstants.SHADOW_ALIAS_NAME, batchSize)
.run(isLookupClient);
assertEquals(0, miniCluster.getSolrClient().commit().getStatus());
assertEquals(0, miniCluster.getSolrClient().commit(SHADOW_COLLECTION).getStatus());
QueryResponse rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "*:*"));
QueryResponse rsp = miniCluster
.getSolrClient()
.query(
ProvisionConstants.SHADOW_ALIAS_NAME,
new SolrQuery().add(CommonParams.Q, "*:*"));
assertEquals(
nRecord, rsp.getResults().getNumFound(),
"the number of indexed records should be equal to the number of input records");
rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "isgreen:true"));
rsp = miniCluster
.getSolrClient()
.query(
ProvisionConstants.SHADOW_ALIAS_NAME,
new SolrQuery().add(CommonParams.Q, "isgreen:true"));
assertEquals(
0, rsp.getResults().getNumFound(),
4, rsp.getResults().getNumFound(),
"the number of indexed records having isgreen = true");
rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "openaccesscolor:bronze"));
rsp = miniCluster
.getSolrClient()
.query(
ProvisionConstants.SHADOW_ALIAS_NAME,
new SolrQuery().add(CommonParams.Q, "openaccesscolor:bronze"));
assertEquals(
0, rsp.getResults().getNumFound(),
2, rsp.getResults().getNumFound(),
"the number of indexed records having openaccesscolor = bronze");
rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "isindiamondjournal:true"));
rsp = miniCluster
.getSolrClient()
.query(
ProvisionConstants.SHADOW_ALIAS_NAME,
new SolrQuery().add(CommonParams.Q, "isindiamondjournal:true"));
assertEquals(
0, rsp.getResults().getNumFound(),
"the number of indexed records having isindiamondjournal = true");
rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "publiclyfunded:true"));
rsp = miniCluster
.getSolrClient()
.query(
ProvisionConstants.SHADOW_ALIAS_NAME,
new SolrQuery().add(CommonParams.Q, "publiclyfunded:true"));
assertEquals(
0, rsp.getResults().getNumFound(),
"the number of indexed records having publiclyfunded = true");
rsp = miniCluster.getSolrClient().query(new SolrQuery().add(CommonParams.Q, "peerreviewed:true"));
rsp = miniCluster
.getSolrClient()
.query(
ProvisionConstants.SHADOW_ALIAS_NAME,
new SolrQuery().add(CommonParams.Q, "peerreviewed:true"));
assertEquals(
0, rsp.getResults().getNumFound(),
35, rsp.getResults().getNumFound(),
"the number of indexed records having peerreviewed = true");
rsp = miniCluster
.getSolrClient()
.query(
ProvisionConstants.SHADOW_ALIAS_NAME,
new SolrQuery()
.add(CommonParams.Q, "objidentifier:\"iddesignpres::ae77e56e84ad058d9e7f19fa2f7325db\"")
.add(CommonParams.Q, "objidentifier:\"57a035e5b1ae::236d6d8c1e03368b5ae72acfeeb11bbc\"")
.add(CommonParams.FL, "__json"));
assertEquals(
1, rsp.getResults().getNumFound(),
@ -158,6 +182,22 @@ public class XmlIndexingJobTest extends SolrTest {
log.info((String) json.get());
admin
.execute(
SolrAdminApplication.Action.UPDATE_ALIASES, null, false,
SHADOW_COLLECTION, PUBLIC_COLLECTION);
rsp = miniCluster
.getSolrClient()
.query(
ProvisionConstants.PUBLIC_ALIAS_NAME,
new SolrQuery()
.add(CommonParams.Q, "objidentifier:\"57a035e5b1ae::236d6d8c1e03368b5ae72acfeeb11bbc\"")
.add(CommonParams.FL, "__json"));
assertEquals(
1, rsp.getResults().getNumFound(),
"the number of indexed records having the given identifier, found in the public collection");
}
}

View File

@ -1,8 +1,7 @@
package eu.dnetlib.dhp.oa.provision;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import java.io.StringReader;
@ -22,6 +21,7 @@ import com.google.common.collect.Lists;
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
import eu.dnetlib.dhp.oa.provision.utils.ContextDef;
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
import eu.dnetlib.dhp.schema.oaf.*;
@ -51,7 +51,7 @@ public class XmlRecordFactoryTest {
assertNotNull(doc);
// System.out.println(doc.asXML());
System.out.println(doc.asXML());
assertEquals("0000-0001-9613-6638", doc.valueOf("//creator[@rank = '1']/@orcid"));
assertEquals("0000-0001-9613-6639", doc.valueOf("//creator[@rank = '1']/@orcid_pending"));
@ -267,4 +267,39 @@ public class XmlRecordFactoryTest {
}
/**
 * Builds the XML record of the project stored in {@code project_aka.json} and checks that its context
 * is rendered as nested context / category / concept elements carrying the expected ids.
 *
 * @throws DocumentException if the generated XML cannot be parsed
 * @throws IOException if the JSON fixture cannot be read
 */
@Test
public void test_AKA_project() throws DocumentException, IOException {
    // the three levels registered for "dh-ch": context, category and concept
    final ContextMapper mapper = new ContextMapper();
    mapper.put("dh-ch", new ContextDef("dh-ch", "Digital Humanities and Cultural Heritage", "context", "community"));
    mapper.put("dh-ch::projects", new ContextDef("dh-ch::projects", "DH-CH Projects", "category", ""));
    mapper.put("dh-ch::projects::2", new ContextDef("dh-ch::projects::2", "ARIADNE", "concept", "community"));

    final XmlRecordFactory factory = new XmlRecordFactory(mapper, false, PayloadConverterJob.schemaLocation);

    final String projectJson = IOUtils.toString(getClass().getResourceAsStream("project_aka.json"));
    final Project project = OBJECT_MAPPER.readValue(projectJson, Project.class);

    // the fixture is expected to carry exactly one context, pointing at the concept level
    assertNotNull(project.getContext());
    assertEquals(1, project.getContext().size());
    assertEquals("dh-ch::projects::2", project.getContext().get(0).getId());

    final String recordXml = factory.build(new JoinedEntity(project));
    assertNotNull(recordXml);

    final Document parsed = new SAXReader().read(new StringReader(recordXml));
    assertNotNull(parsed);

    assertEquals("dh-ch", parsed.valueOf("//context/@id"));
    assertEquals("dh-ch::projects", parsed.valueOf("//context/category/@id"));
    assertEquals("dh-ch::projects::2", parsed.valueOf("//context/category/concept/@id"));
}
}

View File

@ -68,15 +68,12 @@
<FIELD copy="true" indexable="false" name="externalreflabel" result="false" stat="false" tokenizable="true" xpath="distinct-values(//*[local-name()='entity']/*//children/externalreference/label)"/>
<FIELD copy="true" indexable="true" name="resultidentifier" result="false" stat="false" type="string_ci" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/children/instance/webresource/*[local-name()='url'])"/>
<FIELD copy="true" indexable="false" name="resultsource" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/source)"/>
<FIELD indexable="true" multivalued="false" name="isgreen" result="false" stat="false" type="boolean" value="//*[local-name()='entity']/*[local-name()='result']/isgreen"/>
<FIELD indexable="true" multivalued="false" name="openaccesscolor" result="false" stat="false" tokenizable="false" value="//*[local-name()='entity']/*[local-name()='result']/openaccesscolor"/>
<FIELD indexable="true" multivalued="false" name="isindiamondjournal" result="false" stat="false" type="boolean" value="//*[local-name()='entity']/*[local-name()='result']/isindiamondjournal"/>
<FIELD indexable="true" multivalued="false" name="publiclyfunded" result="false" stat="false" type="boolean" value="//*[local-name()='entity']/*[local-name()='result']/publiclyfunded"/>
<FIELD indexable="true" multivalued="false" name="peerreviewed" result="false" stat="false" type="boolean" value="some $refereed in //*[local-name()='entity']/*[local-name()='result']/children/instance/*[local-name()='refereed']/@classid satisfies ($refereed = '0001')"/>
<FIELD indexable="true" multivalued="false" name="haslicense" result="false" stat="false" type="boolean" value="some $license in //*[local-name()='entity']/*[local-name()='result']/children/instance/*[local-name()='license']/text() satisfies (string-length($license) &gt; 0)"/>
<FIELD indexable="true" name="eoscifguidelines" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name() = 'result']/eoscifguidelines/@code)"/><!-- FOS and SDGs non tokenizable for faceted search-->
<FIELD indexable="true" name="fos" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/subject[@classid='FOS'])"/>
<FIELD indexable="true" name="foslabel" result="false" stat="false" tokenizable="false" value="concat(./text(), '||', replace(./text(), '^\d+\s', ''))" xpath="//*[local-name()='entity']/*[local-name()='result']/subject[@classid='FOS']"/>
@ -93,6 +90,7 @@
<!-- Fields computed from rel elements found anywhere below the entity's child element. The relorganization* entries only consider rels whose to element has type 'organization'. -->
<FIELD indexable="true" name="relorganizationid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='organization'])"/>
<FIELD copy="true" indexable="true" name="relorganizationname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalname)"/>
<FIELD copy="true" indexable="true" name="relorganizationshortname" result="false" stat="false" xpath="distinct-values(//*[local-name()='entity']/*//rel[./to/@type='organization']/legalshortname)"/>
<!-- relorganization: for each organization rel, the text of to and of legalname joined by '||'. -->
<FIELD indexable="true" name="relorganization" result="false" stat="false" tokenizable="false" value="distinct-values(concat(./to, '||', ./legalname))" xpath="//*[local-name()='entity']/*//rel[./to/@type='organization']"/>
<!-- relresultid: rel targets whose type is publication, dataset, software or otherresearchproduct. -->
<FIELD indexable="true" name="relresultid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to[@type='publication' or @type='dataset' or @type='software' or @type='otherresearchproduct'])"/>
<!-- NOTE(review): relresulttype and relclass read to/@type and to/@class of every rel, with no restriction to result-typed targets; confirm this is intended given the field name. -->
<FIELD indexable="true" name="relresulttype" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@type)"/>
<FIELD indexable="true" name="relclass" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*//rel/to/@class)"/>
@ -122,6 +120,7 @@
<!-- Context category ids and concept labels taken from result/context. -->
<FIELD indexable="true" name="categoryid" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/context/category/@id)"/>
<FIELD indexable="true" name="conceptname" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*[local-name()='result']/context/category//concept/@label)"/><!-- new index field for country info from different xpaths for any type of entity -->
<!-- country: union of (a) country/@classid directly under the entity's child element, (b) country/@classid of rels pointing to organizations, (c) funder/@jurisdiction anywhere below the entity. -->
<FIELD indexable="true" name="country" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/country/@classid | //*[local-name()='entity']/*//rel[./to/@type='organization']/country/@classid | //*[local-name()='entity']//funder/@jurisdiction)"/>
<!-- countrynojurisdiction: same expression as country minus the funder/@jurisdiction branch. -->
<FIELD indexable="true" name="countrynojurisdiction" result="false" stat="false" tokenizable="false" xpath="distinct-values(//*[local-name()='entity']/*/country/@classid | //*[local-name()='entity']/*//rel[./to/@type='organization']/country/@classid)"/>
<!-- oafentity: selects the whole entity element; it is the only entry in this group with indexable="false" and result="true". -->
<FIELD indexable="false" name="oafentity" result="true" stat="false" tokenizable="false" xpath="//*[local-name() = 'entity']"/><!-- impact indicators -->
<!-- influence / influence_class: score (converted with number()) and class (converted with string()) of the measure whose id is 'influence'. -->
<FIELD copy="false" indexable="true" multivalued="false" name="influence" result="false" stat="false" type="pfloat" xpath="//measure[@id='influence']/@score/number()"/>
<FIELD copy="false" indexable="true" multivalued="false" name="influence_class" result="false" stat="false" type="string" xpath="//measure[@id='influence']/@class/string()"/>

File diff suppressed because one or more lines are too long

View File

@ -194,228 +194,173 @@
<!-- Trie-based numeric field types with docValues enabled and precisionStep 8; tints and tlongs are the multiValued variants of their single-valued counterparts. -->
<!-- NOTE(review): Trie* field classes are deprecated in recent Solr releases in favour of point fields; confirm the target Solr version still provides them. -->
<fieldType name="tints" class="solr.TrieIntField" positionIncrementGap="0" docValues="true" multiValued="true" precisionStep="8"/>
<fieldType name="tlong" class="solr.TrieLongField" positionIncrementGap="0" docValues="true" precisionStep="8"/>
<fieldType name="tlongs" class="solr.TrieLongField" positionIncrementGap="0" docValues="true" multiValued="true" precisionStep="8"/>
<!-- Indexed fields -->
<!-- Underscore-prefixed bookkeeping fields. __all is the only multiValued text field here; __indexrecordidentifier is the only field marked required. -->
<field name="__all" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="__deleted" type="boolean" default="false" omitNorms="true" omitTermFreqAndPositions="true" indexed="true" stored="false"/>
<field name="__dsid" type="string" omitNorms="true" omitTermFreqAndPositions="true" indexed="true" stored="true"/>
<field name="__dsversion" type="pdate" omitNorms="true" omitTermFreqAndPositions="true" indexed="true" stored="true"/>
<field name="__indexrecordidentifier" type="string" multiValued="false" indexed="true" required="true" stored="true"/>
<!-- __result and __json are stored but not indexed. -->
<!-- NOTE(review): __result is declared twice (before and after __json) with identical attributes; this looks like both sides of a diff shown together. Confirm the real schema keeps a single declaration. -->
<field name="__result" type="string" docValues="false" multiValued="false" indexed="false" stored="true"/>
<field name="__json" type="string" docValues="false" multiValued="false" indexed="false" stored="true"/>
<field name="__result" type="string" docValues="false" multiValued="false" indexed="false" stored="true"/>
<field name="_root_" type="string" docValues="false" indexed="true" stored="false"/>
<field name="_version_" type="long" multiValued="false" indexed="true" stored="true"/>
<!-- NOTE(review): the names authorid through datasourceenglishname are declared twice in this span: first without a docValues attribute, then again with docValues="false". Presumably these are the old and new sides of a diff; confirm which set is current. -->
<!-- First set: no docValues attribute. -->
<field name="authorid" type="string_ci" multiValued="true" indexed="true" stored="false"/>
<field name="authoridtype" type="string_ci" multiValued="true" indexed="true" stored="false"/>
<field name="categoryid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="categoryname" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="collectedfrom" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="collectedfromdatasourceid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="collectedfromname" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="community" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="communityid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="communityname" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="conceptid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="conceptname" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="contextid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="contextname" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="contexttype" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="country" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="datasourcecompatibilityid" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="datasourcecompatibilityname" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="datasourceenglishname" type="text_common" multiValued="true" indexed="true" stored="false"/>
<!-- Second set: docValues="false" on every field. Compared with the first set it omits authoridtype, categoryname, collectedfrom, communityname, conceptid and contexttype, and adds citation_count, citation_count_class and countrynojurisdiction. -->
<field name="authorid" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="categoryid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="citation_count" type="pint" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="citation_count_class" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="collectedfromdatasourceid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="collectedfromname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="community" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="communityid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="conceptname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="contextid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="contextname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="country" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="countrynojurisdiction" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="datasourcecompatibilityid" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="datasourcecompatibilityname" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="datasourceenglishname" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="datasourcejurisdiction" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<!-- NOTE(review): each group of datasource fields below is declared twice, first without docValues and then with docValues="false"; presumably the old and new sides of a diff. Confirm which set is current. -->
<!-- datasourceod* / datasourceofficialname / datasourcesubject: declarations without docValues. -->
<field name="datasourceodcontenttypes" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="datasourceoddescription" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="datasourceodlanguages" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="datasourceodsubjects" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="datasourceofficialname" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="datasourcesubject" type="ngramtext" multiValued="true" indexed="true" stored="false"/>
<!-- Same six names again, now with docValues="false"; types are unchanged. -->
<field name="datasourceodcontenttypes" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="datasourceoddescription" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="datasourceodlanguages" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="datasourceodsubjects" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="datasourceofficialname" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="datasourcesubject" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="datasourcethematic" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<!-- datasourcetype* / dateofcollection / deletedbyinference: declarations without docValues; dateofcollection uses type pdate here. -->
<field name="datasourcetypename" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="datasourcetypeuiid" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="datasourcetypeuiname" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="dateofcollection" type="pdate" multiValued="false" indexed="true" stored="false"/>
<field name="deletedbyinference" type="string" multiValued="true" indexed="true" stored="false"/>
<!-- Same five names again with docValues="false". NOTE(review): dateofcollection switches from type pdate to type date here; confirm a fieldType named date is defined in the schema. -->
<field name="datasourcetypename" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="datasourcetypeuiid" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="datasourcetypeuiname" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="dateofcollection" type="date" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="deletedbyinference" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="eoscdatasourcetype" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="eoscifguidelines" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="eosctype" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<!-- Field declarations from externalrefclass through semrelid, none of which carries a docValues attribute. Every field in this span is stored="false"; oafentity is the only one with indexed="false". -->
<!-- NOTE(review): the original file declares many of these names a second time right after this span with docValues="false", so this span is presumably the old side of a diff. Confirm before relying on it. -->
<field name="externalrefclass" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="externalrefid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="externalreflabel" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="externalrefsite" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="funder" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="funderid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="funderjurisdiction" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="fundername" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="funderoriginalname" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="fundershortname" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel0_description" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel0_id" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel0_name" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel1_description" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel1_id" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel1_name" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel2_description" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel2_id" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel2_name" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="inferenceprovenance" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="inferred" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="instancetypename" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="oafentity" type="string" multiValued="true" indexed="false" stored="false"/>
<field name="oaftype" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="objidentifier" type="string" multiValued="true" indexed="true" stored="false"/>
<!-- Organization fields: names use ngramtext, the organizationec* flags use text_common. -->
<field name="organizationalternativenames" type="ngramtext" multiValued="true" indexed="true" stored="false"/>
<field name="organizationcountryname" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="organizationdupid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="organizationecenterprise" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="organizationecinternationalorganization" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="organizationecinternationalorganizationeurinterests" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="organizationeclegalbody" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="organizationeclegalperson" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="organizationecnonprofit" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="organizationecnutscode" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="organizationecresearchorganization" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="organizationecsmevalidated" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="organizationlegalname" type="ngramtext" multiValued="true" indexed="true" stored="false"/>
<field name="organizationlegalshortname" type="ngramtext" multiValued="true" indexed="true" stored="false"/>
<field name="originalid" type="string_ci" multiValued="true" indexed="true" stored="false"/>
<field name="pid" type="string_ci" multiValued="true" indexed="true" stored="false"/>
<field name="pidclassid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="pidclassname" type="string" multiValued="true" indexed="true" stored="false"/>
<!-- Project fields: projectstartdate and projectenddate use type pdate in this span. -->
<field name="projectacronym" type="ngramtext" multiValued="true" indexed="true" stored="false"/>
<field name="projectcallidentifier" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="projectcode" type="ngramtext" multiValued="true" indexed="true" stored="false"/>
<field name="projectcode_nt" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="projectcontracttypename" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="projectduration" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="projectecarticle29_3" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="projectecsc39" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="projectenddate" type="pdate" multiValued="false" indexed="true" stored="false"/>
<field name="projectendyear" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="projectkeywords" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="projectoamandatepublications" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="projectstartdate" type="pdate" multiValued="false" indexed="true" stored="false"/>
<field name="projectstartyear" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="projectsubject" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="projecttitle" type="ngramtext" multiValued="true" indexed="true" stored="false"/>
<field name="provenanceactionclassid" type="string" multiValued="true" indexed="true" stored="false"/>
<!-- rel* fields: all multiValued. -->
<field name="relclass" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relcollectedfromid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relcollectedfromname" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relcontracttypeid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relcontracttypename" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="reldatasourcecompatibilityid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relfunder" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relfunderid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relfunderjurisdiction" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relfundername" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relfundershortname" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel0_id" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel0_name" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel1_id" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel1_name" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel2_id" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel2_name" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relinferenceprovenance" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relinferred" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relorganizationcountryid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relorganizationcountryname" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="relorganizationid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relorganizationname" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="relorganizationshortname" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="relproject" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relprojectcode" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relprojectid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relprojectname" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relprojecttitle" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="relprovenanceactionclassid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relresultid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="relresulttype" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="reltrust" type="string" multiValued="true" indexed="true" stored="false"/>
<!-- result* fields: resultdateofacceptance and resultembargoenddate use type pdate; resultidentifier uses the case-sensitive string type in this span. -->
<field name="resultacceptanceyear" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="resultaccessright" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="resultauthor" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="resultauthor_nt" type="string_ci" multiValued="true" indexed="true" stored="false"/>
<field name="resultbestaccessright" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="resultdateofacceptance" type="pdate" multiValued="false" indexed="true" stored="false"/>
<field name="resultdescription" type="text_en" multiValued="true" indexed="true" stored="false"/>
<field name="resultdupid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="resultembargoenddate" type="pdate" multiValued="false" indexed="true" stored="false"/>
<field name="resultembargoendyear" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="resulthostingdatasource" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="resulthostingdatasourceid" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="resulthostingdatasourcename" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="resultidentifier" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="resultlanguagename" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="resultlicense" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="resultpublisher" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="resultsource" type="text_common" multiValued="true" indexed="true" stored="false"/>
<field name="resultsubject" type="text_en" multiValued="true" indexed="true" stored="false"/>
<field name="resultsubjectclass" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="resulttitle" type="text_en" multiValued="true" indexed="true" stored="false"/>
<field name="resulttypeid" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="resulttypename" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="semrelid" type="string" multiValued="true" indexed="true" stored="false"/>
<!-- Field declarations from externalreflabel through status. Every field in this span sets docValues="false" and stored="false"; oafentity is the only one with indexed="false". -->
<!-- NOTE(review): the original file declares many of these names once more just before this span without docValues, so this span is presumably the new side of a diff. Confirm before relying on it. -->
<field name="externalreflabel" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fos" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="foslabel" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="funder" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="funderid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundershortname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel0_description" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel0_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel0_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel1_description" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel1_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel1_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel2_description" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel2_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="fundinglevel2_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<!-- Single-valued boolean flags (haslicense, isgreen, isindiamondjournal, peerreviewed, publiclyfunded) and single-valued numeric indicators (impulse as pint; influence, popularity, popularity_alt as pfloat) with string *_class companions. -->
<field name="haslicense" type="boolean" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="impulse" type="pint" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="impulse_class" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="influence" type="pfloat" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="influence_class" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="instancetypename" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="isgreen" type="boolean" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="isindiamondjournal" type="boolean" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="oafentity" type="string" docValues="false" multiValued="true" indexed="false" stored="false"/>
<field name="oaftype" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="objidentifier" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="openaccesscolor" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="orcidtypevalue" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="organizationalternativenames" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="organizationdupid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="organizationlegalname" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="organizationlegalshortname" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="originalid" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="peerreviewed" type="boolean" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="pid" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="pidclassid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="popularity" type="pfloat" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="popularity_alt" type="pfloat" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="popularity_alt_class" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="popularity_class" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<!-- Project fields. NOTE(review): projectstartdate and projectenddate use type date in this span; confirm a fieldType named date is defined in the schema. -->
<field name="projectacronym" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="projectcallidentifier" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="projectcode" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="projectcode_nt" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="projectduration" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="projectecsc39" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="projectenddate" type="date" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="projectendyear" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="projectkeywords" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="projectoamandatepublications" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="projectstartdate" type="date" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="projectstartyear" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="projecttitle" type="ngramtext" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="projecttitle_alternative" type="text_en" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="provenanceactionclassid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="publiclyfunded" type="boolean" docValues="false" multiValued="false" indexed="true" stored="false"/>
<!-- rel* fields: all multiValued. -->
<field name="relclass" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relcontracttypename" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="reldatasourcecompatibilityid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfunder" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfunderid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfundershortname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel0_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel0_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel1_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel1_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel2_id" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relfundinglevel2_name" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relorganizationcountryid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relorganizationcountryname" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relorganizationid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relorganizationname" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relorganizationshortname" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relproject" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relprojectcode" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relprojectid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relprojectname" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relprojecttitle" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relresultid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="relresulttype" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<!-- result* fields: resultdateofacceptance and resultembargoenddate use type date; resultidentifier uses string_ci in this span. -->
<field name="resultacceptanceyear" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="resultauthor" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="resultbestaccessright" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="resultdateofacceptance" type="date" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="resultdescription" type="text_en" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="resultdupid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="resultembargoenddate" type="date" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="resultembargoendyear" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="resulthostingdatasource" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="resulthostingdatasourceid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="resultidentifier" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="resultlanguagename" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="resultpublisher" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="resultsource" type="text_common" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="resultsubject" type="text_en" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="resulttitle" type="text_en" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="resulttypeid" type="string" docValues="false" multiValued="false" indexed="true" stored="false"/>
<field name="sdg" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="semrelid" type="string" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="status" type="string_ci" docValues="false" multiValued="true" indexed="true" stored="false"/>
<field name="text" type="text_common" indexed="false" stored="false"/>
<field name="trust" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="versioning" type="string" multiValued="true" indexed="true" stored="false"/>
<field name="isgreen" type="boolean" multiValued="false" indexed="true" stored="false"/>
<field name="openaccesscolor" type="string" multiValued="false" indexed="true" stored="false"/>
<field name="isindiamondjournal" type="boolean" multiValued="false" indexed="true" stored="false"/>
<field name="publiclyfunded" type="boolean" multiValued="false" indexed="true" stored="false"/>
<field name="peerreviewed" type="boolean" multiValued="false" indexed="true" stored="false"/>
<!-- Copy field definitions follow: -->
<!-- Data source -->
<copyField source="datasourceenglishname" dest="__all"/>
<copyField source="datasourceoddescription" dest="__all"/>
<copyField source="datasourceodsubjects" dest="__all"/>
<copyField source="datasourceofficialname" dest="__all"/>
<copyField source="datasourcesubject" dest="__all"/>
<!-- Organization -->
<copyField source="externalreflabel" dest="__all"/>
<copyField source="fundinglevel0_description" dest="__all"/>
<copyField source="fundinglevel1_description" dest="__all"/>
<copyField source="fundinglevel2_description" dest="__all"/>
<copyField source="organizationalternativenames" dest="__all"/>
<copyField source="organizationecenterprise" dest="__all"/>
<copyField source="organizationecinternationalorganization" dest="__all"/>
<copyField source="organizationecinternationalorganizationeurinterests" dest="__all"/>
<copyField source="organizationeclegalbody" dest="__all"/>
<copyField source="organizationeclegalperson" dest="__all"/>
<copyField source="organizationecnonprofit" dest="__all"/>
<copyField source="organizationecnutscode" dest="__all"/>
<copyField source="organizationecresearchorganization" dest="__all"/>
<copyField source="organizationecsmevalidated" dest="__all"/>
<copyField source="organizationlegalname" dest="__all"/>
<copyField source="organizationlegalshortname" dest="__all"/>
<!-- Project -->
<copyField source="projectacronym" dest="__all"/>
<copyField source="projectcode" dest="__all"/>
<copyField source="projectkeywords" dest="__all"/>
<copyField source="projecttitle" dest="__all"/>
<!-- Result -->
<copyField source="resultpublisher" dest="__all"/>
<copyField source="resultsource" dest="__all"/>
<copyField source="resultidentifier" dest="__all"/>
<copyField source="resultauthor" dest="__all"/>
<copyField source="resulttitle" dest="__all"/>
<copyField source="resultdescription" dest="__all"/>
<copyField source="resultsubject" dest="__all"/>
<copyField source="resultacceptanceyear" dest="__all"/>
<!-- Other -->
<copyField source="externalreflabel" dest="__all"/>
<copyField source="fundinglevel0_description" dest="__all"/>
<copyField source="fundinglevel1_description" dest="__all"/>
<copyField source="fundinglevel2_description" dest="__all"/>
<copyField source="projecttitle_alternative" dest="__all"/>
<copyField source="relcontracttypename" dest="__all"/>
<copyField source="relorganizationcountryname" dest="__all"/>
<copyField source="relorganizationname" dest="__all"/>
<copyField source="relorganizationshortname" dest="__all"/>
<copyField source="relprojecttitle" dest="__all"/>
<copyField source="relprojectname" dest="__all"/>
<copyField source="relprojecttitle" dest="__all"/>
<copyField source="resultacceptanceyear" dest="__all"/>
<copyField source="resultauthor" dest="__all"/>
<copyField source="resultdescription" dest="__all"/>
<copyField source="resultidentifier" dest="__all"/>
<copyField source="resultpublisher" dest="__all"/>
<copyField source="resultsource" dest="__all"/>
<copyField source="resulttitle" dest="__all"/>
</schema>

View File

@ -63,7 +63,7 @@ function copydb() {
start_db_time=$(date +%s)
# Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
impala-shell -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n"
@ -120,7 +120,7 @@ function copydb() {
start_create_schema_time=$(date +%s)
# create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
impala-shell -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
@ -148,7 +148,7 @@ function copydb() {
exit 5
fi # This error is not FATAL, so we do not return from this function, in normal circumstances.
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
impala-shell -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
@ -182,7 +182,7 @@ function copydb() {
new_num_of_views_to_retry=0
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
impala-shell -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n"
@ -212,7 +212,7 @@ function copydb() {
previous_num_of_views_to_retry=$new_num_of_views_to_retry
done
entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
entities_on_impala=(`impala-shell -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n"
start_compute_stats_time=$(date +%s)
@ -222,9 +222,9 @@ function copydb() {
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
impala-shell -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
impala-shell -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n"

View File

@ -63,7 +63,7 @@ function copydb() {
start_db_time=$(date +%s)
# Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
impala-shell -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n"
@ -120,7 +120,7 @@ function copydb() {
start_create_schema_time=$(date +%s)
# create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
impala-shell -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
@ -148,7 +148,7 @@ function copydb() {
exit 5
fi # This error is not FATAL, so we do not return from this function, in normal circumstances.
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
impala-shell -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
@ -182,7 +182,7 @@ function copydb() {
new_num_of_views_to_retry=0
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
impala-shell -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n"
@ -212,7 +212,7 @@ function copydb() {
previous_num_of_views_to_retry=$new_num_of_views_to_retry
done
entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
entities_on_impala=(`impala-shell -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n"
start_compute_stats_time=$(date +%s)
@ -222,9 +222,9 @@ function copydb() {
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
impala-shell -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
impala-shell -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n"

View File

@ -63,7 +63,7 @@ function copydb() {
start_db_time=$(date +%s)
# Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
impala-shell -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n"
@ -120,7 +120,7 @@ function copydb() {
start_create_schema_time=$(date +%s)
# create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
impala-shell -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
@ -148,7 +148,7 @@ function copydb() {
exit 5
fi # This error is not FATAL, so we do not return from this function, in normal circumstances.
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
impala-shell -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
@ -182,7 +182,7 @@ function copydb() {
new_num_of_views_to_retry=0
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
impala-shell -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n"
@ -212,7 +212,7 @@ function copydb() {
previous_num_of_views_to_retry=$new_num_of_views_to_retry
done
entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
entities_on_impala=(`impala-shell -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n"
start_compute_stats_time=$(date +%s)
@ -222,9 +222,9 @@ function copydb() {
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
impala-shell -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
impala-shell -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n"

View File

@ -1,4 +1,4 @@
<workflow-app name="Graph Stats" xmlns="uri:oozie:workflow:0.5">
<workflow-app name="Promote Graph Stats" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>stats_db_name</name>

View File

@ -65,7 +65,7 @@ function copydb() {
start_db_time=$(date +%s)
# Delete the old DB from Impala cluster (if exists).
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
impala-shell -i ${IMPALA_HOSTNAME} -q "drop database if exists ${db} cascade" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN DROPPING THE OLD DATABASE FROM IMPALA CLUSTER! EXITING...\n\n"
@ -122,7 +122,7 @@ function copydb() {
start_create_schema_time=$(date +%s)
# create the new database (with the same name)
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create database ${db}"
impala-shell -i ${IMPALA_HOSTNAME} -q "create database ${db}"
# Because "Hive" and "Impala" do not have compatible schemas, we cannot use the "show create table <name>" output from hive to create the exact same table in impala.
# So, we have to find at least one parquet file (check if it's there) from the table in the ocean cluster for impala to use it to extract the table-schema itself from that file.
@ -150,7 +150,7 @@ function copydb() {
exit 5
fi # This error is not FATAL, so we do not return from this function, in normal circumstances.
else
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
impala-shell -i ${IMPALA_HOSTNAME} -q "create table ${db}.${i} like parquet '${CURRENT_PRQ_FILE}' stored as parquet;" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN CREATING TABLE '${i}'!\n\n"
@ -184,7 +184,7 @@ function copydb() {
new_num_of_views_to_retry=0
for create_view_statement in "${all_create_view_statements[@]}"; do # Here we use double quotes, as the elements are phrases, instead of single-words.
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
impala-shell -i ${IMPALA_HOSTNAME} -q "${create_view_statement}" |& tee error.log # impala-shell prints all logs in stderr, so we need to capture them and put them in a file, in order to perform "grep" on them later
specific_errors=`cat error.log | grep -E "FAILED: ParseException line 1:13 missing TABLE at 'view'|ERROR: AnalysisException: Could not resolve table reference:"`
if [ -n "$specific_errors" ]; then
echo -e "\nspecific_errors: ${specific_errors}\n"
@ -214,7 +214,7 @@ function copydb() {
previous_num_of_views_to_retry=$new_num_of_views_to_retry
done
entities_on_impala=(`impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
entities_on_impala=(`impala-shell -i ${IMPALA_HOSTNAME} --delimited -q "show tables in ${db}"`)
echo -e "\nThe schema of db '${db}', along with ${#entities_on_impala[@]} entities have been created, on Impala cluster, after: $(print_elapsed_time start_create_schema_time)\n"
start_compute_stats_time=$(date +%s)
@ -224,9 +224,9 @@ function copydb() {
create_view_statement=`hive -e "show create table ${db}.${i};" | grep "CREATE VIEW"` # This grep works here, as we do not want to match multiple-lines.
if [ -z "$create_view_statement" ]; then # If it's a table, then go load the data to it.
# Invalidate metadata of this DB's tables, in order for Impala to be aware of all parquet files put inside the tables' directories, previously, by "hadoop distcp".
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
impala-shell -i ${IMPALA_HOSTNAME} -q "INVALIDATE METADATA ${db}.${i}"
sleep 1
impala-shell --user ${HADOOP_USER_NAME} -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
impala-shell -i ${IMPALA_HOSTNAME} -q "compute stats ${db}.${i}" |& tee error.log
log_errors=`cat error.log | grep -E "WARN|ERROR|FAILED"`
if [ -n "$log_errors" ]; then
echo -e "\n\nERROR: THERE WAS A PROBLEM WHEN COMPUTING STATS FOR TABLE '${i}'!\n\n"
@ -271,8 +271,7 @@ copydb $MONITOR_DB'_institutions'
copydb $MONITOR_DB'_ris_tail'
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
for i in ${contexts}
do
for i in ${contexts}; do
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
copydb ${MONITOR_DB}'_'${tmp}
done

View File

@ -6,12 +6,16 @@ then
ln -sfn ${PYTHON_EGG_CACHE}${link_folder} ${link_folder}
fi
export HADOOP_USER_NAME=$3
IMPALA_HOSTNAME='impala-cluster-dn1.openaire.eu'
function createPDFsAggregated() {
db=$1
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "drop table if exists indi_is_result_accessible";
impala-shell -i ${IMPALA_HOSTNAME} -d ${db} -q "drop table if exists indi_is_result_accessible";
impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db} -q "create table indi_is_result_accessible stored as parquet as
impala-shell -i ${IMPALA_HOSTNAME} -d ${db} -q "create table indi_is_result_accessible stored as parquet as
select distinct p.id, coalesce(is_result_accessible, 0) as is_result_accessible from result p
left outer join
(select id, 1 as is_result_accessible from (select pl.* from result r
@ -20,7 +24,8 @@ impala-shell --user $HADOOP_USER_NAME -i impala-cluster-dn1.openaire.eu -d ${db}
union all
select pl.* from result r
join pdfaggregation_i.publication p on r.id=p.dedupid
join pdfaggregation_i.payload pl on pl.id=p.id) foo) tmp on p.id=tmp.id";
join pdfaggregation_i.payload pl on pl.id=p.id) foo)
tmp on p.id=tmp.id";
}
STATS_DB=$1
@ -35,8 +40,7 @@ createPDFsAggregated $MONITOR_DB'_institutions'
createPDFsAggregated $MONITOR_DB'_ris_tail'
contexts="knowmad::other dh-ch::other enermaps::other gotriple::other neanias-atmospheric::other rural-digital-europe::other covid-19::other aurora::other neanias-space::other north-america-studies::other north-american-studies::other eutopia::other"
for i in ${contexts}
do
for i in ${contexts}; do
tmp=`echo "$i" | sed 's/'-'/'_'/g' | sed 's/'::'/'_'/g'`
createPDFsAggregated ${MONITOR_DB}'_'${tmp}
done

View File

@ -51,49 +51,6 @@
<artifactId>hadoop-distcp</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-api</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-common</artifactId>
<exclusions>
<exclusion>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
</exclusion>
<exclusion>
<groupId>saxonica</groupId>
<artifactId>saxon</artifactId>
</exclusion>
<exclusion>
<groupId>saxonica</groupId>
<artifactId>saxon-dom</artifactId>
</exclusion>
<exclusion>
<groupId>jgrapht</groupId>
<artifactId>jgrapht</artifactId>
</exclusion>
<exclusion>
<groupId>net.sf.ehcache</groupId>
<artifactId>ehcache</artifactId>
</exclusion>
<exclusion>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.*</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>apache</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>

27
pom.xml
View File

@ -440,29 +440,6 @@
<scope>provided</scope>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-common</artifactId>
<version>${dnet-actionmanager-common.version}</version>
<exclusions>
<exclusion>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-api</artifactId>
<version>${dnet-actionmanager-api.version}</version>
<exclusions>
<exclusion>
<groupId>eu.dnetlib</groupId>
<artifactId>cnr-misc-utils</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>cnr-rmi-api</artifactId>
@ -960,7 +937,7 @@
<commons.logging.version>1.1.3</commons.logging.version>
<commons-validator.version>1.7</commons-validator.version>
<dateparser.version>1.0.7</dateparser.version>
<dhp-schemas.version>[6.1.3-SNAPSHOT]</dhp-schemas.version>
<dhp-schemas.version>[7.0.0]</dhp-schemas.version>
<dhp.cdh.version>cdh5.9.2</dhp.cdh.version>
<dhp.commons.lang.version>3.5</dhp.commons.lang.version>
<dhp.guava.version>11.0.2</dhp.guava.version>
@ -969,8 +946,6 @@
<dhp.oozie.version>4.1.0-${dhp.cdh.version}</dhp.oozie.version>
<dhp.site.skip>true</dhp.site.skip>
<dhp.spark.version>2.4.0.cloudera2</dhp.spark.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>
<google.gson.version>2.2.2</google.gson.version>
<log4j.version>1.2.17</log4j.version>