forked from D-Net/dnet-hadoop
merged from master
This commit is contained in:
commit
8d2bb24512
|
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.oa.merge;
|
||||||
import java.text.Normalizer;
|
import java.text.Normalizer;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
@ -32,27 +33,33 @@ public class AuthorMerger {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
|
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b, Double threshold) {
|
||||||
int pa = countAuthorsPids(a);
|
int pa = countAuthorsPids(a);
|
||||||
int pb = countAuthorsPids(b);
|
int pb = countAuthorsPids(b);
|
||||||
List<Author> base, enrich;
|
List<Author> base, enrich;
|
||||||
int sa = authorsSize(a);
|
int sa = authorsSize(a);
|
||||||
int sb = authorsSize(b);
|
int sb = authorsSize(b);
|
||||||
|
|
||||||
if (pa == pb) {
|
if (sa == sb) {
|
||||||
base = sa > sb ? a : b;
|
|
||||||
enrich = sa > sb ? b : a;
|
|
||||||
} else {
|
|
||||||
base = pa > pb ? a : b;
|
base = pa > pb ? a : b;
|
||||||
enrich = pa > pb ? b : a;
|
enrich = pa > pb ? b : a;
|
||||||
|
} else {
|
||||||
|
base = sa > sb ? a : b;
|
||||||
|
enrich = sa > sb ? b : a;
|
||||||
}
|
}
|
||||||
enrichPidFromList(base, enrich);
|
enrichPidFromList(base, enrich, threshold);
|
||||||
return base;
|
return base;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
|
public static List<Author> mergeAuthor(final List<Author> a, final List<Author> b) {
|
||||||
|
return mergeAuthor(a, b, THRESHOLD);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void enrichPidFromList(List<Author> base, List<Author> enrich, Double threshold) {
|
||||||
if (base == null || enrich == null)
|
if (base == null || enrich == null)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
// <pidComparableString, Author> (if an Author has more than 1 pid, it appears 2 times in the list)
|
||||||
final Map<String, Author> basePidAuthorMap = base
|
final Map<String, Author> basePidAuthorMap = base
|
||||||
.stream()
|
.stream()
|
||||||
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
|
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
|
||||||
|
@ -63,6 +70,7 @@ public class AuthorMerger {
|
||||||
.map(p -> new Tuple2<>(pidToComparableString(p), a)))
|
.map(p -> new Tuple2<>(pidToComparableString(p), a)))
|
||||||
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
|
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
|
||||||
|
|
||||||
|
// <pid, Author> (list of pid that are missing in the other list)
|
||||||
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
|
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
|
||||||
.stream()
|
.stream()
|
||||||
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
|
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
|
||||||
|
@ -83,10 +91,10 @@ public class AuthorMerger {
|
||||||
.max(Comparator.comparing(Tuple2::_1));
|
.max(Comparator.comparing(Tuple2::_1));
|
||||||
|
|
||||||
if (simAuthor.isPresent()) {
|
if (simAuthor.isPresent()) {
|
||||||
double th = THRESHOLD;
|
double th = threshold;
|
||||||
// increase the threshold if the surname is too short
|
// increase the threshold if the surname is too short
|
||||||
if (simAuthor.get()._2().getSurname() != null
|
if (simAuthor.get()._2().getSurname() != null
|
||||||
&& simAuthor.get()._2().getSurname().length() <= 3)
|
&& simAuthor.get()._2().getSurname().length() <= 3 && threshold > 0.0)
|
||||||
th = 0.99;
|
th = 0.99;
|
||||||
|
|
||||||
if (simAuthor.get()._1() > th) {
|
if (simAuthor.get()._1() > th) {
|
||||||
|
@ -156,7 +164,7 @@ public class AuthorMerger {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String normalize(final String s) {
|
private static String normalize(final String s) {
|
||||||
return nfd(s)
|
String[] normalized = nfd(s)
|
||||||
.toLowerCase()
|
.toLowerCase()
|
||||||
// do not compact the regexes in a single expression, would cause StackOverflowError
|
// do not compact the regexes in a single expression, would cause StackOverflowError
|
||||||
// in case
|
// in case
|
||||||
|
@ -166,7 +174,12 @@ public class AuthorMerger {
|
||||||
.replaceAll("(\\p{Punct})+", " ")
|
.replaceAll("(\\p{Punct})+", " ")
|
||||||
.replaceAll("(\\d)+", " ")
|
.replaceAll("(\\d)+", " ")
|
||||||
.replaceAll("(\\n)+", " ")
|
.replaceAll("(\\n)+", " ")
|
||||||
.trim();
|
.trim()
|
||||||
|
.split(" ");
|
||||||
|
|
||||||
|
Arrays.sort(normalized);
|
||||||
|
|
||||||
|
return String.join(" ", normalized);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static String nfd(final String s) {
|
private static String nfd(final String s) {
|
||||||
|
|
|
@ -0,0 +1,100 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.merge;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.FileReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Assertions;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
public class AuthorMergerTest {
|
||||||
|
|
||||||
|
private String publicationsBasePath;
|
||||||
|
|
||||||
|
private List<List<Author>> authors;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() throws Exception {
|
||||||
|
|
||||||
|
publicationsBasePath = Paths
|
||||||
|
.get(AuthorMergerTest.class.getResource("/eu/dnetlib/dhp/oa/merge").toURI())
|
||||||
|
.toFile()
|
||||||
|
.getAbsolutePath();
|
||||||
|
|
||||||
|
authors = readSample(publicationsBasePath + "/publications_with_authors.json", Publication.class)
|
||||||
|
.stream()
|
||||||
|
.map(p -> p._2().getAuthor())
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void mergeTest() { // used in the dedup: threshold set to 0.95
|
||||||
|
|
||||||
|
for (List<Author> authors1 : authors) {
|
||||||
|
System.out.println("List " + (authors.indexOf(authors1) + 1));
|
||||||
|
for (Author author : authors1) {
|
||||||
|
System.out.println(authorToString(author));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
List<Author> merge = AuthorMerger.merge(authors);
|
||||||
|
|
||||||
|
System.out.println("Merge ");
|
||||||
|
for (Author author : merge) {
|
||||||
|
System.out.println(authorToString(author));
|
||||||
|
}
|
||||||
|
|
||||||
|
Assertions.assertEquals(7, merge.size());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
|
||||||
|
List<Tuple2<String, T>> res = new ArrayList<>();
|
||||||
|
BufferedReader reader;
|
||||||
|
try {
|
||||||
|
reader = new BufferedReader(new FileReader(path));
|
||||||
|
String line = reader.readLine();
|
||||||
|
while (line != null) {
|
||||||
|
res
|
||||||
|
.add(
|
||||||
|
new Tuple2<>(
|
||||||
|
MapDocumentUtil.getJPathString("$.id", line),
|
||||||
|
new ObjectMapper().readValue(line, clazz)));
|
||||||
|
// read next line
|
||||||
|
line = reader.readLine();
|
||||||
|
}
|
||||||
|
reader.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String authorToString(Author a) {
|
||||||
|
|
||||||
|
String print = "Fullname = ";
|
||||||
|
print += a.getFullname() + " pid = [";
|
||||||
|
if (a.getPid() != null)
|
||||||
|
for (StructuredProperty sp : a.getPid()) {
|
||||||
|
print += sp.toComparableString() + " ";
|
||||||
|
}
|
||||||
|
print += "]";
|
||||||
|
return print;
|
||||||
|
}
|
||||||
|
}
|
File diff suppressed because one or more lines are too long
|
@ -108,6 +108,8 @@ public class ModelConstants {
|
||||||
public static final KeyValue UNKNOWN_REPOSITORY = keyValue(
|
public static final KeyValue UNKNOWN_REPOSITORY = keyValue(
|
||||||
"10|openaire____::55045bd2a65019fd8e6741a755395c8c", "Unknown Repository");
|
"10|openaire____::55045bd2a65019fd8e6741a755395c8c", "Unknown Repository");
|
||||||
|
|
||||||
|
public static final Qualifier UNKNOWN_COUNTRY = qualifier(UNKNOWN, "Unknown", DNET_COUNTRY_TYPE, DNET_COUNTRY_TYPE);
|
||||||
|
|
||||||
private static Qualifier qualifier(
|
private static Qualifier qualifier(
|
||||||
final String classid,
|
final String classid,
|
||||||
final String classname,
|
final String classname,
|
||||||
|
|
|
@ -3,6 +3,9 @@ package eu.dnetlib.dhp.schema.common;
|
||||||
|
|
||||||
import static com.google.common.base.Preconditions.checkArgument;
|
import static com.google.common.base.Preconditions.checkArgument;
|
||||||
|
|
||||||
|
import java.text.ParseException;
|
||||||
|
import java.text.SimpleDateFormat;
|
||||||
|
import java.util.Date;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
@ -473,4 +476,25 @@ public class ModelSupport {
|
||||||
private static <T extends Oaf> String idFnForOafEntity(T t) {
|
private static <T extends Oaf> String idFnForOafEntity(T t) {
|
||||||
return ((OafEntity) t).getId();
|
return ((OafEntity) t).getId();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static final String ISO8601FORMAT = "yyyy-MM-dd'T'HH:mm:ssZ";
|
||||||
|
|
||||||
|
public static String oldest(String dateA, String dateB) throws ParseException {
|
||||||
|
|
||||||
|
if (StringUtils.isBlank(dateA)) {
|
||||||
|
return dateB;
|
||||||
|
}
|
||||||
|
if (StringUtils.isBlank(dateB)) {
|
||||||
|
return dateA;
|
||||||
|
}
|
||||||
|
if (StringUtils.isNotBlank(dateA) && StringUtils.isNotBlank(dateB)) {
|
||||||
|
|
||||||
|
final Date a = new SimpleDateFormat(ISO8601FORMAT).parse(dateA);
|
||||||
|
final Date b = new SimpleDateFormat(ISO8601FORMAT).parse(dateB);
|
||||||
|
|
||||||
|
return a.before(b) ? dateA : dateB;
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf;
|
package eu.dnetlib.dhp.schema.oaf;
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import com.google.common.base.Objects;
|
import com.google.common.base.Objects;
|
||||||
|
@ -9,7 +8,7 @@ import com.google.common.base.Objects;
|
||||||
/**
|
/**
|
||||||
* Represent a measure, must be further described by a system available resource providing name and descriptions.
|
* Represent a measure, must be further described by a system available resource providing name and descriptions.
|
||||||
*/
|
*/
|
||||||
public class Measure implements Serializable {
|
public class Measure {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Unique measure identifier.
|
* Unique measure identifier.
|
||||||
|
@ -17,7 +16,7 @@ public class Measure implements Serializable {
|
||||||
private String id;
|
private String id;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* List of units associated with this measure. KeyValue provides a pair to store the label (key) and the value, plus
|
* List of units associated with this measure. KeyValue provides a pair to store the label (key) and the value, plus
|
||||||
* common provenance information.
|
* common provenance information.
|
||||||
*/
|
*/
|
||||||
private List<KeyValue> unit;
|
private List<KeyValue> unit;
|
||||||
|
|
|
@ -62,8 +62,6 @@ public abstract class Oaf implements Serializable {
|
||||||
.distinct() // relies on KeyValue.equals
|
.distinct() // relies on KeyValue.equals
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
|
|
||||||
mergeOAFDataInfo(o);
|
|
||||||
|
|
||||||
setLastupdatetimestamp(
|
setLastupdatetimestamp(
|
||||||
Math
|
Math
|
||||||
.max(
|
.max(
|
||||||
|
|
|
@ -351,8 +351,6 @@ public class Project extends OafEntity implements Serializable {
|
||||||
? p.getFundedamount()
|
? p.getFundedamount()
|
||||||
: fundedamount;
|
: fundedamount;
|
||||||
|
|
||||||
// programme = mergeLists(programme, p.getProgramme());
|
|
||||||
|
|
||||||
h2020classification = mergeLists(h2020classification, p.getH2020classification());
|
h2020classification = mergeLists(h2020classification, p.getH2020classification());
|
||||||
|
|
||||||
mergeOAFDataInfo(e);
|
mergeOAFDataInfo(e);
|
||||||
|
|
|
@ -1,8 +1,11 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf;
|
package eu.dnetlib.dhp.schema.oaf;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
|
|
||||||
import static com.google.common.base.Preconditions.checkArgument;
|
import static com.google.common.base.Preconditions.checkArgument;
|
||||||
|
|
||||||
|
import java.text.ParseException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
@ -106,7 +109,7 @@ public class Relation extends Oaf {
|
||||||
}
|
}
|
||||||
|
|
||||||
public Boolean getValidated() {
|
public Boolean getValidated() {
|
||||||
return validated;
|
return Objects.nonNull(validated) && validated;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setValidated(Boolean validated) {
|
public void setValidated(Boolean validated) {
|
||||||
|
@ -130,6 +133,13 @@ public class Relation extends Oaf {
|
||||||
Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal");
|
Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal");
|
||||||
checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal");
|
checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal");
|
||||||
|
|
||||||
|
setValidated(getValidated() || r.getValidated());
|
||||||
|
try {
|
||||||
|
setValidationDate(ModelSupport.oldest(getValidationDate(), r.getValidationDate()));
|
||||||
|
} catch (ParseException e) {
|
||||||
|
throw new IllegalArgumentException(String.format("invalid validation date format in relation [s:%s, t:%s]: %s", getSource(), getTarget(), getValidationDate()));
|
||||||
|
}
|
||||||
|
|
||||||
super.mergeFrom(r);
|
super.mergeFrom(r);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -243,7 +243,7 @@ public class Result extends OafEntity implements Serializable {
|
||||||
|
|
||||||
Result r = (Result) e;
|
Result r = (Result) e;
|
||||||
|
|
||||||
// TODO consider merging also Measures
|
measures = mergeLists(measures, r.getMeasures());
|
||||||
|
|
||||||
instance = mergeLists(instance, r.getInstance());
|
instance = mergeLists(instance, r.getInstance());
|
||||||
|
|
||||||
|
@ -323,13 +323,13 @@ public class Result extends OafEntity implements Serializable {
|
||||||
if (a.size() == b.size()) {
|
if (a.size() == b.size()) {
|
||||||
int msa = a
|
int msa = a
|
||||||
.stream()
|
.stream()
|
||||||
.filter(i -> i.getValue() != null)
|
.filter(i -> i != null && i.getValue() != null)
|
||||||
.map(i -> i.getValue().length())
|
.map(i -> i.getValue().length())
|
||||||
.max(Comparator.naturalOrder())
|
.max(Comparator.naturalOrder())
|
||||||
.orElse(0);
|
.orElse(0);
|
||||||
int msb = b
|
int msb = b
|
||||||
.stream()
|
.stream()
|
||||||
.filter(i -> i.getValue() != null)
|
.filter(i -> i != null && i.getValue() != null)
|
||||||
.map(i -> i.getValue().length())
|
.map(i -> i.getValue().length())
|
||||||
.max(Comparator.naturalOrder())
|
.max(Comparator.naturalOrder())
|
||||||
.orElse(0);
|
.orElse(0);
|
||||||
|
|
|
@ -63,6 +63,51 @@ public class MergeTest {
|
||||||
assertEquals(3, a.getSubject().size());
|
assertEquals(3, a.getSubject().size());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void mergeRelationTest() {
|
||||||
|
|
||||||
|
Relation a = createRel(null, null);
|
||||||
|
Relation b = createRel(null, null);
|
||||||
|
a.mergeFrom(b);
|
||||||
|
assertEquals(a, b);
|
||||||
|
|
||||||
|
a = createRel(true, null);
|
||||||
|
b = createRel(null, null);
|
||||||
|
a.mergeFrom(b);
|
||||||
|
assertEquals(true, a.getValidated());
|
||||||
|
|
||||||
|
a = createRel(true, null);
|
||||||
|
b = createRel(false, null);
|
||||||
|
a.mergeFrom(b);
|
||||||
|
assertEquals(true, a.getValidated());
|
||||||
|
|
||||||
|
a = createRel(true, null);
|
||||||
|
b = createRel(true, "2016-04-05T12:41:19.202Z");
|
||||||
|
a.mergeFrom(b);
|
||||||
|
assertEquals("2016-04-05T12:41:19.202Z", a.getValidationDate());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void mergeRelationTestParseException() {
|
||||||
|
assertThrows(IllegalArgumentException.class, () -> {
|
||||||
|
Relation a = createRel(true, "2016-04-05");
|
||||||
|
Relation b = createRel(true, "2016-04-05");
|
||||||
|
a.mergeFrom(b);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private Relation createRel(Boolean validated, String validationDate) {
|
||||||
|
Relation rel = new Relation();
|
||||||
|
rel.setSource("1");
|
||||||
|
rel.setTarget("2");
|
||||||
|
rel.setRelType("reltype");
|
||||||
|
rel.setSubRelType("subreltype");
|
||||||
|
rel.setRelClass("relclass");
|
||||||
|
rel.setValidated(validated);
|
||||||
|
rel.setValidationDate(validationDate);
|
||||||
|
return rel;
|
||||||
|
}
|
||||||
|
|
||||||
private KeyValue setKV(final String key, final String value) {
|
private KeyValue setKV(final String key, final String value) {
|
||||||
|
|
||||||
KeyValue k = new KeyValue();
|
KeyValue k = new KeyValue();
|
||||||
|
|
|
@ -125,16 +125,17 @@ public class SparkAtomicActionScoreJob implements Serializable {
|
||||||
return ret;
|
return ret;
|
||||||
}, Encoders.bean(BipScore.class))
|
}, Encoders.bean(BipScore.class))
|
||||||
.groupByKey((MapFunction<BipScore, String>) value -> value.getId(), Encoders.STRING())
|
.groupByKey((MapFunction<BipScore, String>) value -> value.getId(), Encoders.STRING())
|
||||||
.mapGroups((MapGroupsFunction<String, BipScore, I>) (k, it) -> {
|
.mapGroups((MapGroupsFunction<String, BipScore, Result>) (k, it) -> {
|
||||||
Result ret = inputClazz.newInstance();
|
Result ret = new Result();
|
||||||
|
ret.setDataInfo(getDataInfo());
|
||||||
BipScore first = it.next();
|
BipScore first = it.next();
|
||||||
ret.setId(first.getId());
|
ret.setId(first.getId());
|
||||||
|
|
||||||
ret.setMeasures(getMeasure(first));
|
ret.setMeasures(getMeasure(first));
|
||||||
it.forEachRemaining(value -> ret.getMeasures().addAll(getMeasure(value)));
|
it.forEachRemaining(value -> ret.getMeasures().addAll(getMeasure(value)));
|
||||||
|
|
||||||
return (I) ret;
|
return ret;
|
||||||
}, Encoders.bean(inputClazz))
|
}, Encoders.bean(Result.class))
|
||||||
.toJavaRDD()
|
.toJavaRDD()
|
||||||
.map(p -> new AtomicAction(inputClazz, p))
|
.map(p -> new AtomicAction(inputClazz, p))
|
||||||
.mapToPair(
|
.mapToPair(
|
||||||
|
|
|
@ -4,17 +4,17 @@
|
||||||
"paramLongName": "isSparkSessionManaged",
|
"paramLongName": "isSparkSessionManaged",
|
||||||
"paramDescription": "when true will stop SparkSession after job execution",
|
"paramDescription": "when true will stop SparkSession after job execution",
|
||||||
"paramRequired": false
|
"paramRequired": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "ip",
|
"paramName": "ip",
|
||||||
"paramLongName": "inputPath",
|
"paramLongName": "inputPath",
|
||||||
"paramDescription": "the URL from where to get the programme file",
|
"paramDescription": "the URL from where to get the programme file",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "o",
|
"paramName": "o",
|
||||||
"paramLongName": "outputPath",
|
"paramLongName": "outputPath",
|
||||||
"paramDescription": "the path of the new ActionSet",
|
"paramDescription": "the path of the new ActionSet",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
}
|
}
|
||||||
]
|
]
|
|
@ -4,19 +4,19 @@
|
||||||
"paramLongName": "isSparkSessionManaged",
|
"paramLongName": "isSparkSessionManaged",
|
||||||
"paramDescription": "when true will stop SparkSession after job execution",
|
"paramDescription": "when true will stop SparkSession after job execution",
|
||||||
"paramRequired": false
|
"paramRequired": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "ip",
|
"paramName": "ip",
|
||||||
"paramLongName": "inputPath",
|
"paramLongName": "inputPath",
|
||||||
"paramDescription": "the URL from where to get the programme file",
|
"paramDescription": "the URL from where to get the programme file",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "o",
|
"paramName": "o",
|
||||||
"paramLongName": "outputPath",
|
"paramLongName": "outputPath",
|
||||||
"paramDescription": "the path of the new ActionSet",
|
"paramDescription": "the path of the new ActionSet",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "rtn",
|
"paramName": "rtn",
|
||||||
"paramLongName": "resultTableName",
|
"paramLongName": "resultTableName",
|
||||||
|
|
|
@ -21,10 +21,10 @@
|
||||||
</kill>
|
</kill>
|
||||||
<action name="deleteoutputpath">
|
<action name="deleteoutputpath">
|
||||||
<fs>
|
<fs>
|
||||||
<delete path='${outputPath}'/>
|
<delete path="${outputPath}"/>
|
||||||
<mkdir path='${outputPath}'/>
|
<mkdir path="${outputPath}"/>
|
||||||
<delete path='${workingDir}'/>
|
<delete path="${workingDir}"/>
|
||||||
<mkdir path='${workingDir}'/>
|
<mkdir path="${workingDir}"/>
|
||||||
</fs>
|
</fs>
|
||||||
<ok to="atomicactions"/>
|
<ok to="atomicactions"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
|
@ -5,18 +5,12 @@ import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
|
||||||
import org.apache.spark.api.java.function.ForeachFunction;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.Row;
|
import org.apache.spark.sql.Row;
|
||||||
|
@ -31,9 +25,7 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
import eu.dnetlib.dhp.schema.action.AtomicAction;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
|
||||||
import scala.Tuple2;
|
|
||||||
|
|
||||||
public class SparkAtomicActionScoreJobTest {
|
public class SparkAtomicActionScoreJobTest {
|
||||||
|
|
||||||
|
|
|
@ -32,15 +32,15 @@ public class CheckDuplictedIdsJob {
|
||||||
IOUtils
|
IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
CheckDuplictedIdsJob.class
|
CheckDuplictedIdsJob.class
|
||||||
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/common_params.json")));
|
.getResourceAsStream("/eu/dnetlib/dhp/broker/oa/check_duplicates.json")));
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
final String eventsPath = parser.get("workingPath") + "/events";
|
final String eventsPath = parser.get("outputDir") + "/events";
|
||||||
log.info("eventsPath: {}", eventsPath);
|
log.info("eventsPath: {}", eventsPath);
|
||||||
|
|
||||||
final String countPath = parser.get("workingPath") + "/counts";
|
final String countPath = parser.get("outputDir") + "/counts";
|
||||||
log.info("countPath: {}", countPath);
|
log.info("countPath: {}", countPath);
|
||||||
|
|
||||||
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
|
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
|
||||||
|
@ -59,6 +59,7 @@ public class CheckDuplictedIdsJob {
|
||||||
.map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
|
.map(o -> ClusterUtils.incrementAccumulator(o, total), Encoders.tuple(Encoders.STRING(), Encoders.LONG()))
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
.json(countPath);
|
.json(countPath);
|
||||||
;
|
;
|
||||||
|
|
||||||
|
|
|
@ -44,10 +44,10 @@ public class GenerateEventsJob {
|
||||||
.orElse(Boolean.TRUE);
|
.orElse(Boolean.TRUE);
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingDir = parser.get("workingDir");
|
||||||
log.info("workingPath: {}", workingPath);
|
log.info("workingDir: {}", workingDir);
|
||||||
|
|
||||||
final String eventsPath = workingPath + "/events";
|
final String eventsPath = parser.get("outputDir") + "/events";
|
||||||
log.info("eventsPath: {}", eventsPath);
|
log.info("eventsPath: {}", eventsPath);
|
||||||
|
|
||||||
final Set<String> dsIdWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceIdWhitelist");
|
final Set<String> dsIdWhitelist = ClusterUtils.parseParamAsList(parser, "datasourceIdWhitelist");
|
||||||
|
@ -59,6 +59,9 @@ public class GenerateEventsJob {
|
||||||
final Set<String> dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist");
|
final Set<String> dsIdBlacklist = ClusterUtils.parseParamAsList(parser, "datasourceIdBlacklist");
|
||||||
log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ","));
|
log.info("datasourceIdBlacklist: {}", StringUtils.join(dsIdBlacklist, ","));
|
||||||
|
|
||||||
|
final Set<String> topicWhitelist = ClusterUtils.parseParamAsList(parser, "topicWhitelist");
|
||||||
|
log.info("topicWhitelist: {}", StringUtils.join(topicWhitelist, ","));
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||||
|
@ -70,12 +73,12 @@ public class GenerateEventsJob {
|
||||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_events");
|
final LongAccumulator total = spark.sparkContext().longAccumulator("total_events");
|
||||||
|
|
||||||
final Dataset<ResultGroup> groups = ClusterUtils
|
final Dataset<ResultGroup> groups = ClusterUtils
|
||||||
.readPath(spark, workingPath + "/duplicates", ResultGroup.class);
|
.readPath(spark, workingDir + "/duplicates", ResultGroup.class);
|
||||||
|
|
||||||
final Dataset<Event> dataset = groups
|
final Dataset<Event> dataset = groups
|
||||||
.map(
|
.map(
|
||||||
g -> EventFinder
|
g -> EventFinder
|
||||||
.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, accumulators),
|
.generateEvents(g, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist, topicWhitelist, accumulators),
|
||||||
Encoders
|
Encoders
|
||||||
.bean(EventGroup.class))
|
.bean(EventGroup.class))
|
||||||
.flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));
|
.flatMap(g -> g.getData().iterator(), Encoders.bean(Event.class));
|
||||||
|
|
|
@ -46,7 +46,7 @@ public class GenerateStatsJob {
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
final String eventsPath = parser.get("workingPath") + "/events";
|
final String eventsPath = parser.get("outputDir") + "/events";
|
||||||
log.info("eventsPath: {}", eventsPath);
|
log.info("eventsPath: {}", eventsPath);
|
||||||
|
|
||||||
final String dbUrl = parser.get("dbUrl");
|
final String dbUrl = parser.get("dbUrl");
|
||||||
|
|
|
@ -46,7 +46,7 @@ public class IndexEventSubsetJob {
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
final String eventsPath = parser.get("workingPath") + "/events";
|
final String eventsPath = parser.get("outputDir") + "/events";
|
||||||
log.info("eventsPath: {}", eventsPath);
|
log.info("eventsPath: {}", eventsPath);
|
||||||
|
|
||||||
final String index = parser.get("index");
|
final String index = parser.get("index");
|
||||||
|
@ -55,6 +55,18 @@ public class IndexEventSubsetJob {
|
||||||
final String indexHost = parser.get("esHost");
|
final String indexHost = parser.get("esHost");
|
||||||
log.info("indexHost: {}", indexHost);
|
log.info("indexHost: {}", indexHost);
|
||||||
|
|
||||||
|
final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
|
||||||
|
log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
|
||||||
|
|
||||||
|
final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
|
||||||
|
log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
|
||||||
|
|
||||||
|
final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
|
||||||
|
log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
|
||||||
|
|
||||||
|
final String esNodesWanOnly = parser.get("esNodesWanOnly");
|
||||||
|
log.info("esNodesWanOnly: {}", esNodesWanOnly);
|
||||||
|
|
||||||
final int maxEventsForTopic = NumberUtils.toInt(parser.get("maxEventsForTopic"));
|
final int maxEventsForTopic = NumberUtils.toInt(parser.get("maxEventsForTopic"));
|
||||||
log.info("maxEventsForTopic: {}", maxEventsForTopic);
|
log.info("maxEventsForTopic: {}", maxEventsForTopic);
|
||||||
|
|
||||||
|
@ -86,10 +98,10 @@ public class IndexEventSubsetJob {
|
||||||
esCfg.put("es.index.auto.create", "false");
|
esCfg.put("es.index.auto.create", "false");
|
||||||
esCfg.put("es.nodes", indexHost);
|
esCfg.put("es.nodes", indexHost);
|
||||||
esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
|
esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
|
||||||
esCfg.put("es.batch.write.retry.count", "8");
|
esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
|
||||||
esCfg.put("es.batch.write.retry.wait", "60s");
|
esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
|
||||||
esCfg.put("es.batch.size.entries", "200");
|
esCfg.put("es.batch.size.entries", esBatchSizeEntries);
|
||||||
esCfg.put("es.nodes.wan.only", "true");
|
esCfg.put("es.nodes.wan.only", esNodesWanOnly);
|
||||||
|
|
||||||
log.info("*** Start indexing");
|
log.info("*** Start indexing");
|
||||||
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
|
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
|
||||||
|
|
|
@ -54,7 +54,7 @@ public class IndexNotificationsJob {
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
final String eventsPath = parser.get("workingPath") + "/events";
|
final String eventsPath = parser.get("outputDir") + "/events";
|
||||||
log.info("eventsPath: {}", eventsPath);
|
log.info("eventsPath: {}", eventsPath);
|
||||||
|
|
||||||
final String index = parser.get("index");
|
final String index = parser.get("index");
|
||||||
|
@ -63,6 +63,18 @@ public class IndexNotificationsJob {
|
||||||
final String indexHost = parser.get("esHost");
|
final String indexHost = parser.get("esHost");
|
||||||
log.info("indexHost: {}", indexHost);
|
log.info("indexHost: {}", indexHost);
|
||||||
|
|
||||||
|
final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
|
||||||
|
log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
|
||||||
|
|
||||||
|
final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
|
||||||
|
log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
|
||||||
|
|
||||||
|
final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
|
||||||
|
log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
|
||||||
|
|
||||||
|
final String esNodesWanOnly = parser.get("esNodesWanOnly");
|
||||||
|
log.info("esNodesWanOnly: {}", esNodesWanOnly);
|
||||||
|
|
||||||
final String brokerApiBaseUrl = parser.get("brokerApiBaseUrl");
|
final String brokerApiBaseUrl = parser.get("brokerApiBaseUrl");
|
||||||
log.info("brokerApiBaseUrl: {}", brokerApiBaseUrl);
|
log.info("brokerApiBaseUrl: {}", brokerApiBaseUrl);
|
||||||
|
|
||||||
|
@ -92,10 +104,10 @@ public class IndexNotificationsJob {
|
||||||
esCfg.put("es.index.auto.create", "false");
|
esCfg.put("es.index.auto.create", "false");
|
||||||
esCfg.put("es.nodes", indexHost);
|
esCfg.put("es.nodes", indexHost);
|
||||||
esCfg.put("es.mapping.id", "notificationId"); // THE PRIMARY KEY
|
esCfg.put("es.mapping.id", "notificationId"); // THE PRIMARY KEY
|
||||||
esCfg.put("es.batch.write.retry.count", "8");
|
esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
|
||||||
esCfg.put("es.batch.write.retry.wait", "60s");
|
esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
|
||||||
esCfg.put("es.batch.size.entries", "200");
|
esCfg.put("es.batch.size.entries", esBatchSizeEntries);
|
||||||
esCfg.put("es.nodes.wan.only", "true");
|
esCfg.put("es.nodes.wan.only", esNodesWanOnly);
|
||||||
|
|
||||||
log.info("*** Start indexing");
|
log.info("*** Start indexing");
|
||||||
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
|
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
|
||||||
|
|
|
@ -36,7 +36,7 @@ public class IndexOnESJob {
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
final String eventsPath = parser.get("workingPath") + "/events";
|
final String eventsPath = parser.get("outputDir") + "/events";
|
||||||
log.info("eventsPath: {}", eventsPath);
|
log.info("eventsPath: {}", eventsPath);
|
||||||
|
|
||||||
final String index = parser.get("index");
|
final String index = parser.get("index");
|
||||||
|
@ -45,6 +45,18 @@ public class IndexOnESJob {
|
||||||
final String indexHost = parser.get("esHost");
|
final String indexHost = parser.get("esHost");
|
||||||
log.info("indexHost: {}", indexHost);
|
log.info("indexHost: {}", indexHost);
|
||||||
|
|
||||||
|
final String esBatchWriteRetryCount = parser.get("esBatchWriteRetryCount");
|
||||||
|
log.info("esBatchWriteRetryCount: {}", esBatchWriteRetryCount);
|
||||||
|
|
||||||
|
final String esBatchWriteRetryWait = parser.get("esBatchWriteRetryWait");
|
||||||
|
log.info("esBatchWriteRetryWait: {}", esBatchWriteRetryWait);
|
||||||
|
|
||||||
|
final String esBatchSizeEntries = parser.get("esBatchSizeEntries");
|
||||||
|
log.info("esBatchSizeEntries: {}", esBatchSizeEntries);
|
||||||
|
|
||||||
|
final String esNodesWanOnly = parser.get("esNodesWanOnly");
|
||||||
|
log.info("esNodesWanOnly: {}", esNodesWanOnly);
|
||||||
|
|
||||||
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
|
final SparkSession spark = SparkSession.builder().config(conf).getOrCreate();
|
||||||
|
|
||||||
final JavaRDD<String> inputRdd = ClusterUtils
|
final JavaRDD<String> inputRdd = ClusterUtils
|
||||||
|
@ -53,15 +65,13 @@ public class IndexOnESJob {
|
||||||
.javaRDD();
|
.javaRDD();
|
||||||
|
|
||||||
final Map<String, String> esCfg = new HashMap<>();
|
final Map<String, String> esCfg = new HashMap<>();
|
||||||
// esCfg.put("es.nodes", "10.19.65.51, 10.19.65.52, 10.19.65.53, 10.19.65.54");
|
|
||||||
|
|
||||||
esCfg.put("es.index.auto.create", "false");
|
esCfg.put("es.index.auto.create", "false");
|
||||||
esCfg.put("es.nodes", indexHost);
|
esCfg.put("es.nodes", indexHost);
|
||||||
esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
|
esCfg.put("es.mapping.id", "eventId"); // THE PRIMARY KEY
|
||||||
esCfg.put("es.batch.write.retry.count", "8");
|
esCfg.put("es.batch.write.retry.count", esBatchWriteRetryCount);
|
||||||
esCfg.put("es.batch.write.retry.wait", "60s");
|
esCfg.put("es.batch.write.retry.wait", esBatchWriteRetryWait);
|
||||||
esCfg.put("es.batch.size.entries", "200");
|
esCfg.put("es.batch.size.entries", esBatchSizeEntries);
|
||||||
esCfg.put("es.nodes.wan.only", "true");
|
esCfg.put("es.nodes.wan.only", esNodesWanOnly);
|
||||||
|
|
||||||
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
|
JavaEsSpark.saveJsonToEs(inputRdd, index, esCfg);
|
||||||
}
|
}
|
||||||
|
|
|
@ -42,10 +42,10 @@ public class JoinStep0Job {
|
||||||
final String graphPath = parser.get("graphPath");
|
final String graphPath = parser.get("graphPath");
|
||||||
log.info("graphPath: {}", graphPath);
|
log.info("graphPath: {}", graphPath);
|
||||||
|
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingDir = parser.get("workingDir");
|
||||||
log.info("workingPath: {}", workingPath);
|
log.info("workingDir: {}", workingDir);
|
||||||
|
|
||||||
final String joinedEntitiesPath = workingPath + "/joinedEntities_step0";
|
final String joinedEntitiesPath = workingDir + "/joinedEntities_step0";
|
||||||
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
@ -57,10 +57,10 @@ public class JoinStep0Job {
|
||||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||||
|
|
||||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||||
.readPath(spark, workingPath + "/simpleEntities", OaBrokerMainEntity.class);
|
.readPath(spark, workingDir + "/simpleEntities", OaBrokerMainEntity.class);
|
||||||
|
|
||||||
final Dataset<RelatedDatasource> typedRels = ClusterUtils
|
final Dataset<RelatedDatasource> typedRels = ClusterUtils
|
||||||
.readPath(spark, workingPath + "/relatedDatasources", RelatedDatasource.class);
|
.readPath(spark, workingDir + "/relatedDatasources", RelatedDatasource.class);
|
||||||
|
|
||||||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDatasource>, OaBrokerMainEntity> aggr = new RelatedDatasourceAggregator()
|
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDatasource>, OaBrokerMainEntity> aggr = new RelatedDatasourceAggregator()
|
||||||
.toColumn();
|
.toColumn();
|
||||||
|
|
|
@ -40,10 +40,10 @@ public class JoinStep1Job {
|
||||||
.orElse(Boolean.TRUE);
|
.orElse(Boolean.TRUE);
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingDir = parser.get("workingDir");
|
||||||
log.info("workingPath: {}", workingPath);
|
log.info("workingDir: {}", workingDir);
|
||||||
|
|
||||||
final String joinedEntitiesPath = workingPath + "/joinedEntities_step1";
|
final String joinedEntitiesPath = workingDir + "/joinedEntities_step1";
|
||||||
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
@ -55,10 +55,10 @@ public class JoinStep1Job {
|
||||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||||
|
|
||||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||||
.readPath(spark, workingPath + "/joinedEntities_step0", OaBrokerMainEntity.class);
|
.readPath(spark, workingDir + "/joinedEntities_step0", OaBrokerMainEntity.class);
|
||||||
|
|
||||||
final Dataset<RelatedProject> typedRels = ClusterUtils
|
final Dataset<RelatedProject> typedRels = ClusterUtils
|
||||||
.readPath(spark, workingPath + "/relatedProjects", RelatedProject.class);
|
.readPath(spark, workingDir + "/relatedProjects", RelatedProject.class);
|
||||||
|
|
||||||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedProject>, OaBrokerMainEntity> aggr = new RelatedProjectAggregator()
|
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedProject>, OaBrokerMainEntity> aggr = new RelatedProjectAggregator()
|
||||||
.toColumn();
|
.toColumn();
|
||||||
|
|
|
@ -39,10 +39,10 @@ public class JoinStep2Job {
|
||||||
.orElse(Boolean.TRUE);
|
.orElse(Boolean.TRUE);
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingDir = parser.get("workingDir");
|
||||||
log.info("workingPath: {}", workingPath);
|
log.info("workingDir: {}", workingDir);
|
||||||
|
|
||||||
final String joinedEntitiesPath = workingPath + "/joinedEntities_step2";
|
final String joinedEntitiesPath = workingDir + "/joinedEntities_step2";
|
||||||
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
@ -54,10 +54,10 @@ public class JoinStep2Job {
|
||||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||||
|
|
||||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||||
.readPath(spark, workingPath + "/joinedEntities_step1", OaBrokerMainEntity.class);
|
.readPath(spark, workingDir + "/joinedEntities_step1", OaBrokerMainEntity.class);
|
||||||
|
|
||||||
final Dataset<RelatedSoftware> typedRels = ClusterUtils
|
final Dataset<RelatedSoftware> typedRels = ClusterUtils
|
||||||
.readPath(spark, workingPath + "/relatedSoftwares", RelatedSoftware.class);
|
.readPath(spark, workingDir + "/relatedSoftwares", RelatedSoftware.class);
|
||||||
|
|
||||||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedSoftware>, OaBrokerMainEntity> aggr = new RelatedSoftwareAggregator()
|
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedSoftware>, OaBrokerMainEntity> aggr = new RelatedSoftwareAggregator()
|
||||||
.toColumn();
|
.toColumn();
|
||||||
|
|
|
@ -40,10 +40,10 @@ public class JoinStep3Job {
|
||||||
.orElse(Boolean.TRUE);
|
.orElse(Boolean.TRUE);
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingDir = parser.get("workingDir");
|
||||||
log.info("workingPath: {}", workingPath);
|
log.info("workingDir: {}", workingDir);
|
||||||
|
|
||||||
final String joinedEntitiesPath = workingPath + "/joinedEntities_step3";
|
final String joinedEntitiesPath = workingDir + "/joinedEntities_step3";
|
||||||
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
@ -55,10 +55,10 @@ public class JoinStep3Job {
|
||||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||||
|
|
||||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||||
.readPath(spark, workingPath + "/joinedEntities_step2", OaBrokerMainEntity.class);
|
.readPath(spark, workingDir + "/joinedEntities_step2", OaBrokerMainEntity.class);
|
||||||
|
|
||||||
final Dataset<RelatedDataset> typedRels = ClusterUtils
|
final Dataset<RelatedDataset> typedRels = ClusterUtils
|
||||||
.readPath(spark, workingPath + "/relatedDatasets", RelatedDataset.class);
|
.readPath(spark, workingDir + "/relatedDatasets", RelatedDataset.class);
|
||||||
|
|
||||||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDataset>, OaBrokerMainEntity> aggr = new RelatedDatasetAggregator()
|
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedDataset>, OaBrokerMainEntity> aggr = new RelatedDatasetAggregator()
|
||||||
.toColumn();
|
.toColumn();
|
||||||
|
|
|
@ -40,10 +40,10 @@ public class JoinStep4Job {
|
||||||
.orElse(Boolean.TRUE);
|
.orElse(Boolean.TRUE);
|
||||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
|
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingDir = parser.get("workingDir");
|
||||||
log.info("workingPath: {}", workingPath);
|
log.info("workingDir: {}", workingDir);
|
||||||
|
|
||||||
final String joinedEntitiesPath = workingPath + "/joinedEntities_step4";
|
final String joinedEntitiesPath = workingDir + "/joinedEntities_step4";
|
||||||
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
log.info("joinedEntitiesPath: {}", joinedEntitiesPath);
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
@ -55,10 +55,10 @@ public class JoinStep4Job {
|
||||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
final LongAccumulator total = spark.sparkContext().longAccumulator("total_entities");
|
||||||
|
|
||||||
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
final Dataset<OaBrokerMainEntity> sources = ClusterUtils
|
||||||
.readPath(spark, workingPath + "/joinedEntities_step3", OaBrokerMainEntity.class);
|
.readPath(spark, workingDir + "/joinedEntities_step3", OaBrokerMainEntity.class);
|
||||||
|
|
||||||
final Dataset<RelatedPublication> typedRels = ClusterUtils
|
final Dataset<RelatedPublication> typedRels = ClusterUtils
|
||||||
.readPath(spark, workingPath + "/relatedPublications", RelatedPublication.class);
|
.readPath(spark, workingDir + "/relatedPublications", RelatedPublication.class);
|
||||||
|
|
||||||
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedPublication>, OaBrokerMainEntity> aggr = new RelatedPublicationAggregator()
|
final TypedColumn<Tuple2<OaBrokerMainEntity, RelatedPublication>, OaBrokerMainEntity> aggr = new RelatedPublicationAggregator()
|
||||||
.toColumn();
|
.toColumn();
|
||||||
|
|
|
@ -36,7 +36,7 @@ import eu.dnetlib.dhp.broker.oa.util.ClusterUtils;
|
||||||
public class PartitionEventsByDsIdJob {
|
public class PartitionEventsByDsIdJob {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(PartitionEventsByDsIdJob.class);
|
private static final Logger log = LoggerFactory.getLogger(PartitionEventsByDsIdJob.class);
|
||||||
private static final String OPENDOAR_NSPREFIX = "10|opendoar____::";
|
private static final String OPENDOAR_NSPREFIX = "opendoar____::";
|
||||||
|
|
||||||
public static void main(final String[] args) throws Exception {
|
public static void main(final String[] args) throws Exception {
|
||||||
|
|
||||||
|
@ -55,10 +55,10 @@ public class PartitionEventsByDsIdJob {
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
final String eventsPath = parser.get("workingPath") + "/events";
|
final String eventsPath = parser.get("outputDir") + "/events";
|
||||||
log.info("eventsPath: {}", eventsPath);
|
log.info("eventsPath: {}", eventsPath);
|
||||||
|
|
||||||
final String partitionPath = parser.get("workingPath") + "/eventsByOpendoarId";
|
final String partitionPath = parser.get("outputDir") + "/eventsByOpendoarId";
|
||||||
log.info("partitionPath: {}", partitionPath);
|
log.info("partitionPath: {}", partitionPath);
|
||||||
|
|
||||||
final String opendoarIds = parser.get("opendoarIds");
|
final String opendoarIds = parser.get("opendoarIds");
|
||||||
|
@ -91,6 +91,7 @@ public class PartitionEventsByDsIdJob {
|
||||||
.write()
|
.write()
|
||||||
.partitionBy("group")
|
.partitionBy("group")
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
.json(partitionPath);
|
.json(partitionPath);
|
||||||
|
|
||||||
});
|
});
|
||||||
|
@ -122,6 +123,7 @@ public class PartitionEventsByDsIdJob {
|
||||||
|
|
||||||
final ShortEventMessageWithGroupId res = new ShortEventMessageWithGroupId();
|
final ShortEventMessageWithGroupId res = new ShortEventMessageWithGroupId();
|
||||||
|
|
||||||
|
res.setEventId(e.getEventId());
|
||||||
res.setOriginalId(payload.getResult().getOriginalId());
|
res.setOriginalId(payload.getResult().getOriginalId());
|
||||||
res.setTitle(payload.getResult().getTitles().stream().filter(StringUtils::isNotBlank).findFirst().orElse(null));
|
res.setTitle(payload.getResult().getTitles().stream().filter(StringUtils::isNotBlank).findFirst().orElse(null));
|
||||||
res.setTopic(e.getTopic());
|
res.setTopic(e.getTopic());
|
||||||
|
|
|
@ -45,10 +45,10 @@ public class PrepareGroupsJob {
|
||||||
final String graphPath = parser.get("graphPath");
|
final String graphPath = parser.get("graphPath");
|
||||||
log.info("graphPath: {}", graphPath);
|
log.info("graphPath: {}", graphPath);
|
||||||
|
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingDir = parser.get("workingDir");
|
||||||
log.info("workingPath: {}", workingPath);
|
log.info("workingDir: {}", workingDir);
|
||||||
|
|
||||||
final String groupsPath = workingPath + "/duplicates";
|
final String groupsPath = workingDir + "/duplicates";
|
||||||
log.info("groupsPath: {}", groupsPath);
|
log.info("groupsPath: {}", groupsPath);
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
@ -60,10 +60,10 @@ public class PrepareGroupsJob {
|
||||||
final LongAccumulator total = spark.sparkContext().longAccumulator("total_groups");
|
final LongAccumulator total = spark.sparkContext().longAccumulator("total_groups");
|
||||||
|
|
||||||
final Dataset<OaBrokerMainEntity> results = ClusterUtils
|
final Dataset<OaBrokerMainEntity> results = ClusterUtils
|
||||||
.readPath(spark, workingPath + "/joinedEntities_step4", OaBrokerMainEntity.class);
|
.readPath(spark, workingDir + "/joinedEntities_step4", OaBrokerMainEntity.class);
|
||||||
|
|
||||||
final Dataset<Relation> mergedRels = ClusterUtils
|
final Dataset<Relation> mergedRels = ClusterUtils
|
||||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
.loadRelations(graphPath, spark)
|
||||||
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
|
.filter(r -> r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS));
|
||||||
|
|
||||||
final TypedColumn<Tuple2<OaBrokerMainEntity, Relation>, ResultGroup> aggr = new ResultAggregator()
|
final TypedColumn<Tuple2<OaBrokerMainEntity, Relation>, ResultGroup> aggr = new ResultAggregator()
|
||||||
|
|
|
@ -42,10 +42,10 @@ public class PrepareRelatedDatasetsJob {
|
||||||
final String graphPath = parser.get("graphPath");
|
final String graphPath = parser.get("graphPath");
|
||||||
log.info("graphPath: {}", graphPath);
|
log.info("graphPath: {}", graphPath);
|
||||||
|
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingDir = parser.get("workingDir");
|
||||||
log.info("workingPath: {}", workingPath);
|
log.info("workingDir: {}", workingDir);
|
||||||
|
|
||||||
final String relsPath = workingPath + "/relatedDatasets";
|
final String relsPath = workingDir + "/relatedDatasets";
|
||||||
log.info("relsPath: {}", relsPath);
|
log.info("relsPath: {}", relsPath);
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
@ -62,7 +62,7 @@ public class PrepareRelatedDatasetsJob {
|
||||||
.map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class));
|
.map(ConversionUtils::oafDatasetToBrokerDataset, Encoders.bean(OaBrokerRelatedDataset.class));
|
||||||
|
|
||||||
final Dataset<Relation> rels = ClusterUtils
|
final Dataset<Relation> rels = ClusterUtils
|
||||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
.loadRelations(graphPath, spark)
|
||||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||||
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
|
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
|
||||||
.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
|
.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
|
||||||
|
@ -72,7 +72,8 @@ public class PrepareRelatedDatasetsJob {
|
||||||
final Dataset<RelatedDataset> dataset = rels
|
final Dataset<RelatedDataset> dataset = rels
|
||||||
.joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
|
.joinWith(datasets, datasets.col("openaireId").equalTo(rels.col("target")), "inner")
|
||||||
.map(t -> {
|
.map(t -> {
|
||||||
final RelatedDataset rel = new RelatedDataset(t._1.getSource(), t._2);
|
final RelatedDataset rel = new RelatedDataset(t._1.getSource(),
|
||||||
|
t._2);
|
||||||
rel.getRelDataset().setRelType(t._1.getRelClass());
|
rel.getRelDataset().setRelType(t._1.getRelClass());
|
||||||
return rel;
|
return rel;
|
||||||
}, Encoders.bean(RelatedDataset.class));
|
}, Encoders.bean(RelatedDataset.class));
|
||||||
|
|
|
@ -48,10 +48,10 @@ public class PrepareRelatedDatasourcesJob {
|
||||||
final String graphPath = parser.get("graphPath");
|
final String graphPath = parser.get("graphPath");
|
||||||
log.info("graphPath: {}", graphPath);
|
log.info("graphPath: {}", graphPath);
|
||||||
|
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingDir = parser.get("workingDir");
|
||||||
log.info("workingPath: {}", workingPath);
|
log.info("workingDir: {}", workingDir);
|
||||||
|
|
||||||
final String relsPath = workingPath + "/relatedDatasources";
|
final String relsPath = workingDir + "/relatedDatasources";
|
||||||
log.info("relsPath: {}", relsPath);
|
log.info("relsPath: {}", relsPath);
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
|
|
@ -44,10 +44,10 @@ public class PrepareRelatedProjectsJob {
|
||||||
final String graphPath = parser.get("graphPath");
|
final String graphPath = parser.get("graphPath");
|
||||||
log.info("graphPath: {}", graphPath);
|
log.info("graphPath: {}", graphPath);
|
||||||
|
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingDir = parser.get("workingDir");
|
||||||
log.info("workingPath: {}", workingPath);
|
log.info("workingDir: {}", workingDir);
|
||||||
|
|
||||||
final String relsPath = workingPath + "/relatedProjects";
|
final String relsPath = workingDir + "/relatedProjects";
|
||||||
log.info("relsPath: {}", relsPath);
|
log.info("relsPath: {}", relsPath);
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
@ -64,7 +64,7 @@ public class PrepareRelatedProjectsJob {
|
||||||
.map(ConversionUtils::oafProjectToBrokerProject, Encoders.bean(OaBrokerProject.class));
|
.map(ConversionUtils::oafProjectToBrokerProject, Encoders.bean(OaBrokerProject.class));
|
||||||
|
|
||||||
final Dataset<Relation> rels = ClusterUtils
|
final Dataset<Relation> rels = ClusterUtils
|
||||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
.loadRelations(graphPath, spark)
|
||||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||||
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
|
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_PROJECT))
|
||||||
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
|
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
|
||||||
|
|
|
@ -43,10 +43,10 @@ public class PrepareRelatedPublicationsJob {
|
||||||
final String graphPath = parser.get("graphPath");
|
final String graphPath = parser.get("graphPath");
|
||||||
log.info("graphPath: {}", graphPath);
|
log.info("graphPath: {}", graphPath);
|
||||||
|
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingDir = parser.get("workingDir");
|
||||||
log.info("workingPath: {}", workingPath);
|
log.info("workingDir: {}", workingDir);
|
||||||
|
|
||||||
final String relsPath = workingPath + "/relatedPublications";
|
final String relsPath = workingDir + "/relatedPublications";
|
||||||
log.info("relsPath: {}", relsPath);
|
log.info("relsPath: {}", relsPath);
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
@ -65,7 +65,7 @@ public class PrepareRelatedPublicationsJob {
|
||||||
Encoders.bean(OaBrokerRelatedPublication.class));
|
Encoders.bean(OaBrokerRelatedPublication.class));
|
||||||
|
|
||||||
final Dataset<Relation> rels = ClusterUtils
|
final Dataset<Relation> rels = ClusterUtils
|
||||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
.loadRelations(graphPath, spark)
|
||||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||||
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
|
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
|
||||||
.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
|
.filter(r -> ClusterUtils.isValidResultResultClass(r.getRelClass()))
|
||||||
|
@ -75,7 +75,8 @@ public class PrepareRelatedPublicationsJob {
|
||||||
final Dataset<RelatedPublication> dataset = rels
|
final Dataset<RelatedPublication> dataset = rels
|
||||||
.joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner")
|
.joinWith(pubs, pubs.col("openaireId").equalTo(rels.col("target")), "inner")
|
||||||
.map(t -> {
|
.map(t -> {
|
||||||
final RelatedPublication rel = new RelatedPublication(t._1.getSource(), t._2);
|
final RelatedPublication rel = new RelatedPublication(
|
||||||
|
t._1.getSource(), t._2);
|
||||||
rel.getRelPublication().setRelType(t._1.getRelClass());
|
rel.getRelPublication().setRelType(t._1.getRelClass());
|
||||||
return rel;
|
return rel;
|
||||||
}, Encoders.bean(RelatedPublication.class));
|
}, Encoders.bean(RelatedPublication.class));
|
||||||
|
|
|
@ -44,10 +44,10 @@ public class PrepareRelatedSoftwaresJob {
|
||||||
final String graphPath = parser.get("graphPath");
|
final String graphPath = parser.get("graphPath");
|
||||||
log.info("graphPath: {}", graphPath);
|
log.info("graphPath: {}", graphPath);
|
||||||
|
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingDir = parser.get("workingDir");
|
||||||
log.info("workingPath: {}", workingPath);
|
log.info("workingDir: {}", workingDir);
|
||||||
|
|
||||||
final String relsPath = workingPath + "/relatedSoftwares";
|
final String relsPath = workingDir + "/relatedSoftwares";
|
||||||
log.info("relsPath: {}", relsPath);
|
log.info("relsPath: {}", relsPath);
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
@ -64,7 +64,7 @@ public class PrepareRelatedSoftwaresJob {
|
||||||
.map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class));
|
.map(ConversionUtils::oafSoftwareToBrokerSoftware, Encoders.bean(OaBrokerRelatedSoftware.class));
|
||||||
|
|
||||||
final Dataset<Relation> rels = ClusterUtils
|
final Dataset<Relation> rels = ClusterUtils
|
||||||
.readPath(spark, graphPath + "/relation", Relation.class)
|
.loadRelations(graphPath, spark)
|
||||||
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
.filter(r -> r.getDataInfo().getDeletedbyinference())
|
||||||
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
|
.filter(r -> r.getRelType().equals(ModelConstants.RESULT_RESULT))
|
||||||
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
|
.filter(r -> !r.getRelClass().equals(BrokerConstants.IS_MERGED_IN_CLASS))
|
||||||
|
|
|
@ -44,10 +44,10 @@ public class PrepareSimpleEntititiesJob {
|
||||||
final String graphPath = parser.get("graphPath");
|
final String graphPath = parser.get("graphPath");
|
||||||
log.info("graphPath: {}", graphPath);
|
log.info("graphPath: {}", graphPath);
|
||||||
|
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingDir = parser.get("workingDir");
|
||||||
log.info("workingPath: {}", workingPath);
|
log.info("workingDir: {}", workingDir);
|
||||||
|
|
||||||
final String simpleEntitiesPath = workingPath + "/simpleEntities";
|
final String simpleEntitiesPath = workingDir + "/simpleEntities";
|
||||||
log.info("simpleEntitiesPath: {}", simpleEntitiesPath);
|
log.info("simpleEntitiesPath: {}", simpleEntitiesPath);
|
||||||
|
|
||||||
final SparkConf conf = new SparkConf();
|
final SparkConf conf = new SparkConf();
|
||||||
|
|
|
@ -16,7 +16,24 @@ public class EnrichMissingSubject extends UpdateMatcher<OaBrokerTypedValue> {
|
||||||
|
|
||||||
public EnrichMissingSubject() {
|
public EnrichMissingSubject() {
|
||||||
super(20,
|
super(20,
|
||||||
s -> Topic.fromPath("ENRICH/MISSING/SUBJECT/" + s.getType()),
|
s -> {
|
||||||
|
switch (s.getType().toLowerCase()) {
|
||||||
|
case "acm":
|
||||||
|
return Topic.ENRICH_MISSING_SUBJECT_ACM;
|
||||||
|
case "arxiv":
|
||||||
|
return Topic.ENRICH_MISSING_SUBJECT_ARXIV;
|
||||||
|
case "ddc":
|
||||||
|
return Topic.ENRICH_MISSING_SUBJECT_DDC;
|
||||||
|
case "jel":
|
||||||
|
return Topic.ENRICH_MISSING_SUBJECT_JEL;
|
||||||
|
case "mesh":
|
||||||
|
return Topic.ENRICH_MISSING_SUBJECT_MESHEUROPMC;
|
||||||
|
case "rvk":
|
||||||
|
return Topic.ENRICH_MISSING_SUBJECT_RVK;
|
||||||
|
default:
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
},
|
||||||
(p, s) -> p.getSubjects().add(s),
|
(p, s) -> p.getSubjects().add(s),
|
||||||
s -> subjectAsString(s));
|
s -> subjectAsString(s));
|
||||||
}
|
}
|
||||||
|
|
|
@ -16,7 +16,24 @@ public class EnrichMoreSubject extends UpdateMatcher<OaBrokerTypedValue> {
|
||||||
|
|
||||||
public EnrichMoreSubject() {
|
public EnrichMoreSubject() {
|
||||||
super(20,
|
super(20,
|
||||||
s -> Topic.fromPath("ENRICH/MORE/SUBJECT/" + s.getType()),
|
s -> {
|
||||||
|
switch (s.getType().toLowerCase()) {
|
||||||
|
case "acm":
|
||||||
|
return Topic.ENRICH_MORE_SUBJECT_ACM;
|
||||||
|
case "arxiv":
|
||||||
|
return Topic.ENRICH_MORE_SUBJECT_ARXIV;
|
||||||
|
case "ddc":
|
||||||
|
return Topic.ENRICH_MORE_SUBJECT_DDC;
|
||||||
|
case "jel":
|
||||||
|
return Topic.ENRICH_MORE_SUBJECT_JEL;
|
||||||
|
case "mesh":
|
||||||
|
return Topic.ENRICH_MORE_SUBJECT_MESHEUROPMC;
|
||||||
|
case "rvk":
|
||||||
|
return Topic.ENRICH_MORE_SUBJECT_RVK;
|
||||||
|
default:
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
},
|
||||||
(p, s) -> p.getSubjects().add(s),
|
(p, s) -> p.getSubjects().add(s),
|
||||||
s -> subjectAsString(s));
|
s -> subjectAsString(s));
|
||||||
}
|
}
|
||||||
|
|
|
@ -17,6 +17,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
|
||||||
public class ClusterUtils {
|
public class ClusterUtils {
|
||||||
|
|
||||||
|
@ -30,6 +31,16 @@ public class ClusterUtils {
|
||||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Dataset<Relation> loadRelations(final String graphPath, final SparkSession spark) {
|
||||||
|
return ClusterUtils
|
||||||
|
.readPath(spark, graphPath + "/relation", Relation.class)
|
||||||
|
.map(r -> {
|
||||||
|
r.setSource(ConversionUtils.cleanOpenaireId(r.getSource()));
|
||||||
|
r.setTarget(ConversionUtils.cleanOpenaireId(r.getTarget()));
|
||||||
|
return r;
|
||||||
|
}, Encoders.bean(Relation.class));
|
||||||
|
}
|
||||||
|
|
||||||
public static <R> Dataset<R> readPath(
|
public static <R> Dataset<R> readPath(
|
||||||
final SparkSession spark,
|
final SparkSession spark,
|
||||||
final String inputPath,
|
final String inputPath,
|
||||||
|
@ -67,6 +78,7 @@ public class ClusterUtils {
|
||||||
.map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz))
|
.map(o -> ClusterUtils.incrementAccumulator(o, acc), Encoders.bean(clazz))
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
|
.option("compression", "gzip")
|
||||||
.json(path);
|
.json(path);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -74,7 +74,7 @@ public class ConversionUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset();
|
final OaBrokerRelatedDataset res = new OaBrokerRelatedDataset();
|
||||||
res.setOpenaireId(d.getId());
|
res.setOpenaireId(cleanOpenaireId(d.getId()));
|
||||||
res.setOriginalId(first(d.getOriginalId()));
|
res.setOriginalId(first(d.getOriginalId()));
|
||||||
res.setTitle(structPropValue(d.getTitle()));
|
res.setTitle(structPropValue(d.getTitle()));
|
||||||
res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid));
|
res.setPids(mappedList(d.getPid(), ConversionUtils::oafPidToBrokerPid));
|
||||||
|
@ -89,7 +89,7 @@ public class ConversionUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication();
|
final OaBrokerRelatedPublication res = new OaBrokerRelatedPublication();
|
||||||
res.setOpenaireId(p.getId());
|
res.setOpenaireId(cleanOpenaireId(p.getId()));
|
||||||
res.setOriginalId(first(p.getOriginalId()));
|
res.setOriginalId(first(p.getOriginalId()));
|
||||||
res.setTitle(structPropValue(p.getTitle()));
|
res.setTitle(structPropValue(p.getTitle()));
|
||||||
res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid));
|
res.setPids(mappedList(p.getPid(), ConversionUtils::oafPidToBrokerPid));
|
||||||
|
@ -106,7 +106,7 @@ public class ConversionUtils {
|
||||||
|
|
||||||
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
final OaBrokerMainEntity res = new OaBrokerMainEntity();
|
||||||
|
|
||||||
res.setOpenaireId(result.getId());
|
res.setOpenaireId(cleanOpenaireId(result.getId()));
|
||||||
res.setOriginalId(first(result.getOriginalId()));
|
res.setOriginalId(first(result.getOriginalId()));
|
||||||
res.setTypology(classId(result.getResulttype()));
|
res.setTypology(classId(result.getResulttype()));
|
||||||
res.setTitles(structPropList(result.getTitle()));
|
res.setTitles(structPropList(result.getTitle()));
|
||||||
|
@ -129,6 +129,10 @@ public class ConversionUtils {
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static String cleanOpenaireId(final String id) {
|
||||||
|
return id.contains("|") ? StringUtils.substringAfter(id, "|") : id;
|
||||||
|
}
|
||||||
|
|
||||||
private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) {
|
private static OaBrokerAuthor oafAuthorToBrokerAuthor(final Author author) {
|
||||||
if (author == null) {
|
if (author == null) {
|
||||||
return null;
|
return null;
|
||||||
|
@ -188,7 +192,7 @@ public class ConversionUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
final OaBrokerProject res = new OaBrokerProject();
|
final OaBrokerProject res = new OaBrokerProject();
|
||||||
res.setOpenaireId(p.getId());
|
res.setOpenaireId(cleanOpenaireId(p.getId()));
|
||||||
res.setTitle(fieldValue(p.getTitle()));
|
res.setTitle(fieldValue(p.getTitle()));
|
||||||
res.setAcronym(fieldValue(p.getAcronym()));
|
res.setAcronym(fieldValue(p.getAcronym()));
|
||||||
res.setCode(fieldValue(p.getCode()));
|
res.setCode(fieldValue(p.getCode()));
|
||||||
|
@ -214,7 +218,7 @@ public class ConversionUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware();
|
final OaBrokerRelatedSoftware res = new OaBrokerRelatedSoftware();
|
||||||
res.setOpenaireId(sw.getId());
|
res.setOpenaireId(cleanOpenaireId(sw.getId()));
|
||||||
res.setName(structPropValue(sw.getTitle()));
|
res.setName(structPropValue(sw.getTitle()));
|
||||||
res.setDescription(fieldValue(sw.getDescription()));
|
res.setDescription(fieldValue(sw.getDescription()));
|
||||||
res.setRepository(fieldValue(sw.getCodeRepositoryUrl()));
|
res.setRepository(fieldValue(sw.getCodeRepositoryUrl()));
|
||||||
|
@ -230,7 +234,7 @@ public class ConversionUtils {
|
||||||
|
|
||||||
final OaBrokerRelatedDatasource res = new OaBrokerRelatedDatasource();
|
final OaBrokerRelatedDatasource res = new OaBrokerRelatedDatasource();
|
||||||
res.setName(StringUtils.defaultIfBlank(fieldValue(ds.getOfficialname()), fieldValue(ds.getEnglishname())));
|
res.setName(StringUtils.defaultIfBlank(fieldValue(ds.getOfficialname()), fieldValue(ds.getEnglishname())));
|
||||||
res.setOpenaireId(ds.getId());
|
res.setOpenaireId(cleanOpenaireId(ds.getId()));
|
||||||
res.setType(classId(ds.getDatasourcetype()));
|
res.setType(classId(ds.getDatasourcetype()));
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
|
@ -59,9 +59,18 @@ public class DatasourceRelationsAccumulator implements Serializable {
|
||||||
final DatasourceRelationsAccumulator res = new DatasourceRelationsAccumulator();
|
final DatasourceRelationsAccumulator res = new DatasourceRelationsAccumulator();
|
||||||
collectedFromSet
|
collectedFromSet
|
||||||
.stream()
|
.stream()
|
||||||
.map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.COLLECTED_FROM_REL))
|
.map(
|
||||||
|
s -> new Tuple3<>(ConversionUtils.cleanOpenaireId(r.getId()), ConversionUtils.cleanOpenaireId(s),
|
||||||
|
BrokerConstants.COLLECTED_FROM_REL))
|
||||||
.forEach(res::addTuple);
|
.forEach(res::addTuple);
|
||||||
hostedBySet.stream().map(s -> new Tuple3<>(r.getId(), s, BrokerConstants.HOSTED_BY_REL)).forEach(res::addTuple);
|
|
||||||
|
hostedBySet
|
||||||
|
.stream()
|
||||||
|
.map(
|
||||||
|
s -> new Tuple3<>(ConversionUtils.cleanOpenaireId(r.getId()), ConversionUtils.cleanOpenaireId(s),
|
||||||
|
BrokerConstants.HOSTED_BY_REL))
|
||||||
|
.forEach(res::addTuple);
|
||||||
|
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -76,6 +76,7 @@ public class EventFinder {
|
||||||
final Set<String> dsIdWhitelist,
|
final Set<String> dsIdWhitelist,
|
||||||
final Set<String> dsIdBlacklist,
|
final Set<String> dsIdBlacklist,
|
||||||
final Set<String> dsTypeWhitelist,
|
final Set<String> dsTypeWhitelist,
|
||||||
|
final Set<String> topicWhitelist,
|
||||||
final Map<String, LongAccumulator> accumulators) {
|
final Map<String, LongAccumulator> accumulators) {
|
||||||
|
|
||||||
final List<UpdateInfo<?>> list = new ArrayList<>();
|
final List<UpdateInfo<?>> list = new ArrayList<>();
|
||||||
|
@ -84,7 +85,13 @@ public class EventFinder {
|
||||||
for (final OaBrokerRelatedDatasource targetDs : target.getDatasources()) {
|
for (final OaBrokerRelatedDatasource targetDs : target.getDatasources()) {
|
||||||
if (verifyTarget(targetDs, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) {
|
if (verifyTarget(targetDs, dsIdWhitelist, dsIdBlacklist, dsTypeWhitelist)) {
|
||||||
for (final UpdateMatcher<?> matcher : matchers) {
|
for (final UpdateMatcher<?> matcher : matchers) {
|
||||||
list.addAll(matcher.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators));
|
for (final UpdateInfo<?> info : matcher
|
||||||
|
.searchUpdatesForRecord(target, targetDs, results.getData(), accumulators)) {
|
||||||
|
if (topicWhitelist == null || topicWhitelist.isEmpty()
|
||||||
|
|| topicWhitelist.contains(info.getTopic().getPath())) {
|
||||||
|
list.add(info);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,9 @@
|
||||||
|
[
|
||||||
|
|
||||||
|
{
|
||||||
|
"paramName": "o",
|
||||||
|
"paramLongName": "outputDir",
|
||||||
|
"paramDescription": "the path where the data are stored",
|
||||||
|
"paramRequired": true
|
||||||
|
}
|
||||||
|
]
|
|
@ -7,7 +7,7 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "o",
|
"paramName": "o",
|
||||||
"paramLongName": "workingPath",
|
"paramLongName": "workingDir",
|
||||||
"paramDescription": "the path where the temporary data will be stored",
|
"paramDescription": "the path where the temporary data will be stored",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
}
|
}
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
<description>the path where the graph is stored</description>
|
<description>the path where the graph is stored</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>workingPath</name>
|
<name>outputDir</name>
|
||||||
<description>the path where the the generated data will be stored</description>
|
<description>the path where the the generated data will be stored</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
|
@ -24,6 +24,11 @@
|
||||||
<value>-</value>
|
<value>-</value>
|
||||||
<description>a black list (comma separeted, - for empty list) of datasource ids</description>
|
<description>a black list (comma separeted, - for empty list) of datasource ids</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>topicWhitelist</name>
|
||||||
|
<value>*</value>
|
||||||
|
<description>a white list (comma separated, * for all) of topics</description>
|
||||||
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>esEventIndexName</name>
|
<name>esEventIndexName</name>
|
||||||
<description>the elasticsearch index name for events</description>
|
<description>the elasticsearch index name for events</description>
|
||||||
|
@ -36,6 +41,26 @@
|
||||||
<name>esIndexHost</name>
|
<name>esIndexHost</name>
|
||||||
<description>the elasticsearch host</description>
|
<description>the elasticsearch host</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esBatchWriteRetryCount</name>
|
||||||
|
<value>8</value>
|
||||||
|
<description>an ES configuration property</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esBatchWriteRetryWait</name>
|
||||||
|
<value>60s</value>
|
||||||
|
<description>an ES configuration property</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esBatchSizeEntries</name>
|
||||||
|
<value>200</value>
|
||||||
|
<description>an ES configuration property</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esNodesWanOnly</name>
|
||||||
|
<value>true</value>
|
||||||
|
<description>an ES configuration property</description>
|
||||||
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>maxIndexedEventsForDsAndTopic</name>
|
<name>maxIndexedEventsForDsAndTopic</name>
|
||||||
<description>the max number of events for each couple (ds/topic)</description>
|
<description>the max number of events for each couple (ds/topic)</description>
|
||||||
|
@ -111,15 +136,25 @@
|
||||||
</configuration>
|
</configuration>
|
||||||
</global>
|
</global>
|
||||||
|
|
||||||
<start to="ensure_working_path"/>
|
<start to="resume_from"/>
|
||||||
|
|
||||||
|
<decision name="resume_from">
|
||||||
|
<switch>
|
||||||
|
<case to="ensure_output_dir">${wf:conf('resumeFrom') eq 'ensure_output_dir'}</case>
|
||||||
|
<case to="index_event_subset">${wf:conf('resumeFrom') eq 'index_event_subset'}</case>
|
||||||
|
<case to="stats">${wf:conf('resumeFrom') eq 'stats'}</case>
|
||||||
|
<case to="index_notifications">${wf:conf('resumeFrom') eq 'index_notifications'}</case>
|
||||||
|
<default to="ensure_output_dir"/>
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
|
|
||||||
<action name="ensure_working_path">
|
<action name="ensure_output_dir">
|
||||||
<fs>
|
<fs>
|
||||||
<mkdir path='${workingPath}'/>
|
<mkdir path='${outputDir}'/>
|
||||||
</fs>
|
</fs>
|
||||||
<ok to="start_entities_and_rels"/>
|
<ok to="start_entities_and_rels"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -152,7 +187,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_entities_and_rels"/>
|
<ok to="wait_entities_and_rels"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -176,7 +211,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_entities_and_rels"/>
|
<ok to="wait_entities_and_rels"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -201,7 +236,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_entities_and_rels"/>
|
<ok to="wait_entities_and_rels"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -225,7 +260,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_entities_and_rels"/>
|
<ok to="wait_entities_and_rels"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -249,7 +284,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_entities_and_rels"/>
|
<ok to="wait_entities_and_rels"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -273,7 +308,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="wait_entities_and_rels"/>
|
<ok to="wait_entities_and_rels"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -299,7 +334,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="join_entities_step1"/>
|
<ok to="join_entities_step1"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -323,7 +358,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="join_entities_step2"/>
|
<ok to="join_entities_step2"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -347,7 +382,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="join_entities_step3"/>
|
<ok to="join_entities_step3"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -371,7 +406,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="join_entities_step4"/>
|
<ok to="join_entities_step4"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -395,7 +430,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="prepare_groups"/>
|
<ok to="prepare_groups"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -419,7 +454,7 @@
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
<arg>--graphPath</arg><arg>${graphInputPath}</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="generate_events"/>
|
<ok to="generate_events"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -442,10 +477,12 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingDir</arg><arg>${workingDir}</arg>
|
||||||
|
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||||
<arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg>
|
<arg>--datasourceIdWhitelist</arg><arg>${datasourceIdWhitelist}</arg>
|
||||||
<arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg>
|
<arg>--datasourceTypeWhitelist</arg><arg>${datasourceTypeWhitelist}</arg>
|
||||||
<arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg>
|
<arg>--datasourceIdBlacklist</arg><arg>${datasourceIdBlacklist}</arg>
|
||||||
|
<arg>--topicWhitelist</arg><arg>${topicWhitelist}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="index_event_subset"/>
|
<ok to="index_event_subset"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -468,38 +505,16 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||||
<arg>--index</arg><arg>${esEventIndexName}</arg>
|
<arg>--index</arg><arg>${esEventIndexName}</arg>
|
||||||
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
||||||
|
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
|
||||||
|
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
|
||||||
|
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
|
||||||
|
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
|
||||||
<arg>--maxEventsForTopic</arg><arg>${maxIndexedEventsForDsAndTopic}</arg>
|
<arg>--maxEventsForTopic</arg><arg>${maxIndexedEventsForDsAndTopic}</arg>
|
||||||
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="index_notifications"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="index_notifications">
|
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
|
||||||
<master>yarn</master>
|
|
||||||
<mode>cluster</mode>
|
|
||||||
<name>IndexNotificationsOnESJob</name>
|
|
||||||
<class>eu.dnetlib.dhp.broker.oa.IndexNotificationsJob</class>
|
|
||||||
<jar>dhp-broker-events-${projectVersion}.jar</jar>
|
|
||||||
<spark-opts>
|
|
||||||
--executor-memory=${sparkExecutorMemory}
|
|
||||||
--driver-memory=${sparkDriverMemory}
|
|
||||||
--conf spark.dynamicAllocation.maxExecutors="8"
|
|
||||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
|
||||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
|
||||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
|
||||||
</spark-opts>
|
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
|
||||||
<arg>--index</arg><arg>${esNotificationsIndexName}</arg>
|
|
||||||
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
|
||||||
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
|
||||||
</spark>
|
|
||||||
<ok to="stats"/>
|
<ok to="stats"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
@ -521,12 +536,42 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||||
<arg>--dbUrl</arg><arg>${brokerDbUrl}</arg>
|
<arg>--dbUrl</arg><arg>${brokerDbUrl}</arg>
|
||||||
<arg>--dbUser</arg><arg>${brokerDbUser}</arg>
|
<arg>--dbUser</arg><arg>${brokerDbUser}</arg>
|
||||||
<arg>--dbPassword</arg><arg>${brokerDbPassword}</arg>
|
<arg>--dbPassword</arg><arg>${brokerDbPassword}</arg>
|
||||||
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
|
<ok to="index_notifications"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="index_notifications">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>IndexNotificationsOnESJob</name>
|
||||||
|
<class>eu.dnetlib.dhp.broker.oa.IndexNotificationsJob</class>
|
||||||
|
<jar>dhp-broker-events-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.dynamicAllocation.maxExecutors="8"
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||||
|
<arg>--index</arg><arg>${esNotificationsIndexName}</arg>
|
||||||
|
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
||||||
|
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
|
||||||
|
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
|
||||||
|
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
|
||||||
|
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
|
||||||
|
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
||||||
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
|
@ -1,7 +1,13 @@
|
||||||
[
|
[
|
||||||
|
{
|
||||||
|
"paramName": "wp",
|
||||||
|
"paramLongName": "workingDir",
|
||||||
|
"paramDescription": "the path where the temporary data are stored",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"paramName": "o",
|
"paramName": "o",
|
||||||
"paramLongName": "workingPath",
|
"paramLongName": "outputDir",
|
||||||
"paramDescription": "the path where the generated events will be stored",
|
"paramDescription": "the path where the generated events will be stored",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
|
@ -22,5 +28,11 @@
|
||||||
"paramLongName": "datasourceIdBlacklist",
|
"paramLongName": "datasourceIdBlacklist",
|
||||||
"paramDescription": "a black list (comma separeted, - for empty list) of datasource ids",
|
"paramDescription": "a black list (comma separeted, - for empty list) of datasource ids",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "topicWhitelist",
|
||||||
|
"paramLongName": "topicWhitelist",
|
||||||
|
"paramDescription": "a white list (comma separeted, * for all) of topics",
|
||||||
|
"paramRequired": true
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
"paramName": "o",
|
"paramName": "o",
|
||||||
"paramLongName": "workingPath",
|
"paramLongName": "outputDir",
|
||||||
"paramDescription": "the workinh path",
|
"paramDescription": "the data path",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -16,5 +16,29 @@
|
||||||
"paramLongName": "esHost",
|
"paramLongName": "esHost",
|
||||||
"paramDescription": "the ES host",
|
"paramDescription": "the ES host",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "esBatchWriteRetryCount",
|
||||||
|
"paramLongName": "esBatchWriteRetryCount",
|
||||||
|
"paramDescription": "an ES configuration property",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "esBatchWriteRetryWait",
|
||||||
|
"paramLongName": "esBatchWriteRetryWait",
|
||||||
|
"paramDescription": "an ES configuration property",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "esBatchSizeEntries",
|
||||||
|
"paramLongName": "esBatchSizeEntries",
|
||||||
|
"paramDescription": "an ES configuration property",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "esNodesWanOnly",
|
||||||
|
"paramLongName": "esNodesWanOnly",
|
||||||
|
"paramDescription": "an ES configuration property",
|
||||||
|
"paramRequired": true
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
"paramName": "o",
|
"paramName": "o",
|
||||||
"paramLongName": "workingPath",
|
"paramLongName": "outputDir",
|
||||||
"paramDescription": "the workinh path",
|
"paramDescription": "the path where the generated data are stored",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -17,6 +17,30 @@
|
||||||
"paramDescription": "the ES host",
|
"paramDescription": "the ES host",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"paramName": "esBatchWriteRetryCount",
|
||||||
|
"paramLongName": "esBatchWriteRetryCount",
|
||||||
|
"paramDescription": "an ES configuration property",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "esBatchWriteRetryWait",
|
||||||
|
"paramLongName": "esBatchWriteRetryWait",
|
||||||
|
"paramDescription": "an ES configuration property",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "esBatchSizeEntries",
|
||||||
|
"paramLongName": "esBatchSizeEntries",
|
||||||
|
"paramDescription": "an ES configuration property",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "esNodesWanOnly",
|
||||||
|
"paramLongName": "esNodesWanOnly",
|
||||||
|
"paramDescription": "an ES configuration property",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"paramName": "n",
|
"paramName": "n",
|
||||||
"paramLongName": "maxEventsForTopic",
|
"paramLongName": "maxEventsForTopic",
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
"paramName": "o",
|
"paramName": "o",
|
||||||
"paramLongName": "workingPath",
|
"paramLongName": "outputDir",
|
||||||
"paramDescription": "the workinh path",
|
"paramDescription": "the dir that contains the events folder",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -17,6 +17,30 @@
|
||||||
"paramDescription": "the ES host",
|
"paramDescription": "the ES host",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"paramName": "esBatchWriteRetryCount",
|
||||||
|
"paramLongName": "esBatchWriteRetryCount",
|
||||||
|
"paramDescription": "an ES configuration property",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "esBatchWriteRetryWait",
|
||||||
|
"paramLongName": "esBatchWriteRetryWait",
|
||||||
|
"paramDescription": "an ES configuration property",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "esBatchSizeEntries",
|
||||||
|
"paramLongName": "esBatchSizeEntries",
|
||||||
|
"paramDescription": "an ES configuration property",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "esNodesWanOnly",
|
||||||
|
"paramLongName": "esNodesWanOnly",
|
||||||
|
"paramDescription": "an ES configuration property",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"paramName": "broker",
|
"paramName": "broker",
|
||||||
"paramLongName": "brokerApiBaseUrl",
|
"paramLongName": "brokerApiBaseUrl",
|
||||||
|
|
|
@ -6,8 +6,8 @@
|
||||||
<description>the path where the graph is stored</description>
|
<description>the path where the graph is stored</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>workingPath</name>
|
<name>outputDir</name>
|
||||||
<description>the path where the the generated data will be stored</description>
|
<description>the path where the the generated data are stored</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>datasourceIdWhitelist</name>
|
<name>datasourceIdWhitelist</name>
|
||||||
|
@ -36,6 +36,26 @@
|
||||||
<name>esIndexHost</name>
|
<name>esIndexHost</name>
|
||||||
<description>the elasticsearch host</description>
|
<description>the elasticsearch host</description>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esBatchWriteRetryCount</name>
|
||||||
|
<value>8</value>
|
||||||
|
<description>an ES configuration property</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esBatchWriteRetryWait</name>
|
||||||
|
<value>60s</value>
|
||||||
|
<description>an ES configuration property</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esBatchSizeEntries</name>
|
||||||
|
<value>200</value>
|
||||||
|
<description>an ES configuration property</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esNodesWanOnly</name>
|
||||||
|
<value>true</value>
|
||||||
|
<description>an ES configuration property</description>
|
||||||
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>maxIndexedEventsForDsAndTopic</name>
|
<name>maxIndexedEventsForDsAndTopic</name>
|
||||||
<description>the max number of events for each couple (ds/topic)</description>
|
<description>the max number of events for each couple (ds/topic)</description>
|
||||||
|
@ -122,9 +142,13 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||||
<arg>--index</arg><arg>${esNotificationsIndexName}</arg>
|
<arg>--index</arg><arg>${esNotificationsIndexName}</arg>
|
||||||
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
||||||
|
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
|
||||||
|
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
|
||||||
|
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
|
||||||
|
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
|
||||||
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
"paramName": "o",
|
"paramName": "o",
|
||||||
"paramLongName": "workingPath",
|
"paramLongName": "outputDir",
|
||||||
"paramDescription": "the path where the temporary data will be stored",
|
"paramDescription": "the path where the data will be stored",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
<description>the opendoar IDs whitelist (comma separated)</description>
|
<description>the opendoar IDs whitelist (comma separated)</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>workingPath</name>
|
<name>outputDir</name>
|
||||||
<description>the path where the the generated data will be stored</description>
|
<description>the path where the the generated data will be stored</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
|
@ -87,7 +87,7 @@
|
||||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||||
<arg>--opendoarIds</arg><arg>${opendoarIds}</arg>
|
<arg>--opendoarIds</arg><arg>${opendoarIds}</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
|
|
|
@ -0,0 +1,18 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
|
@ -0,0 +1,140 @@
|
||||||
|
<workflow-app name="reindex_events" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>outputDir</name>
|
||||||
|
<description>the path where the the generated data will be stored</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esEventIndexName</name>
|
||||||
|
<description>the elasticsearch index name for events</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esIndexHost</name>
|
||||||
|
<description>the elasticsearch host</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esBatchWriteRetryCount</name>
|
||||||
|
<value>8</value>
|
||||||
|
<description>an ES configuration property</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esBatchWriteRetryWait</name>
|
||||||
|
<value>60s</value>
|
||||||
|
<description>an ES configuration property</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esBatchSizeEntries</name>
|
||||||
|
<value>200</value>
|
||||||
|
<description>an ES configuration property</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esNodesWanOnly</name>
|
||||||
|
<value>true</value>
|
||||||
|
<description>an ES configuration property</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>maxIndexedEventsForDsAndTopic</name>
|
||||||
|
<description>the max number of events for each couple (ds/topic)</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>brokerApiBaseUrl</name>
|
||||||
|
<description>the url of the broker service api</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkDriverMemory</name>
|
||||||
|
<description>memory for driver process</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorMemory</name>
|
||||||
|
<description>memory for individual executor</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorCores</name>
|
||||||
|
<description>number of cores used by single executor</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozieActionShareLibForSpark2</name>
|
||||||
|
<description>oozie action sharelib for spark 2.*</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2ExtraListeners</name>
|
||||||
|
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||||
|
<description>spark 2.* extra listeners classname</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2SqlQueryExecutionListeners</name>
|
||||||
|
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||||
|
<description>spark 2.* sql query execution listeners classname</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2YarnHistoryServerAddress</name>
|
||||||
|
<description>spark 2.* yarn history server address</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2EventLogDir</name>
|
||||||
|
<description>spark 2.* event log dir location</description>
|
||||||
|
</property>
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>mapreduce.job.queuename</name>
|
||||||
|
<value>${queueName}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||||
|
<value>${oozieLauncherQueueName}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
|
<start to="index_event_subset"/>
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<action name="index_event_subset">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>IndexEventSubsetOnESJob</name>
|
||||||
|
<class>eu.dnetlib.dhp.broker.oa.IndexEventSubsetJob</class>
|
||||||
|
<jar>dhp-broker-events-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.dynamicAllocation.maxExecutors="8"
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||||
|
<arg>--index</arg><arg>${esEventIndexName}</arg>
|
||||||
|
<arg>--esHost</arg><arg>${esIndexHost}</arg>
|
||||||
|
<arg>--esBatchWriteRetryCount</arg><arg>${esBatchWriteRetryCount}</arg>
|
||||||
|
<arg>--esBatchWriteRetryWait</arg><arg>${esBatchWriteRetryWait}</arg>
|
||||||
|
<arg>--esBatchSizeEntries</arg><arg>${esBatchSizeEntries}</arg>
|
||||||
|
<arg>--esNodesWanOnly</arg><arg>${esNodesWanOnly}</arg>
|
||||||
|
<arg>--maxEventsForTopic</arg><arg>${maxIndexedEventsForDsAndTopic}</arg>
|
||||||
|
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
|
||||||
|
</workflow-app>
|
|
@ -0,0 +1,18 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
|
@ -0,0 +1,114 @@
|
||||||
|
<workflow-app name="create broker events" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>outputDir</name>
|
||||||
|
<description>the path where the the generated data will be stored</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>brokerApiBaseUrl</name>
|
||||||
|
<description>the url of the broker service api</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>brokerDbUrl</name>
|
||||||
|
<description>the url of the broker database</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>brokerDbUser</name>
|
||||||
|
<description>the user of the broker database</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>brokerDbPassword</name>
|
||||||
|
<description>the password of the broker database</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkDriverMemory</name>
|
||||||
|
<description>memory for driver process</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorMemory</name>
|
||||||
|
<description>memory for individual executor</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorCores</name>
|
||||||
|
<description>number of cores used by single executor</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozieActionShareLibForSpark2</name>
|
||||||
|
<description>oozie action sharelib for spark 2.*</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2ExtraListeners</name>
|
||||||
|
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||||
|
<description>spark 2.* extra listeners classname</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2SqlQueryExecutionListeners</name>
|
||||||
|
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||||
|
<description>spark 2.* sql query execution listeners classname</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2YarnHistoryServerAddress</name>
|
||||||
|
<description>spark 2.* yarn history server address</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2EventLogDir</name>
|
||||||
|
<description>spark 2.* event log dir location</description>
|
||||||
|
</property>
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>mapreduce.job.queuename</name>
|
||||||
|
<value>${queueName}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||||
|
<value>${oozieLauncherQueueName}</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
|
<start to="stats"/>
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<action name="stats">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>GenerateStatsJob</name>
|
||||||
|
<class>eu.dnetlib.dhp.broker.oa.GenerateStatsJob</class>
|
||||||
|
<jar>dhp-broker-events-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||||
|
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||||
|
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||||
|
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--outputDir</arg><arg>${outputDir}</arg>
|
||||||
|
<arg>--dbUrl</arg><arg>${brokerDbUrl}</arg>
|
||||||
|
<arg>--dbUser</arg><arg>${brokerDbUser}</arg>
|
||||||
|
<arg>--dbPassword</arg><arg>${brokerDbPassword}</arg>
|
||||||
|
<arg>--brokerApiBaseUrl</arg><arg>${brokerApiBaseUrl}</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
|
||||||
|
</workflow-app>
|
|
@ -1,8 +1,8 @@
|
||||||
[
|
[
|
||||||
{
|
{
|
||||||
"paramName": "wp",
|
"paramName": "o",
|
||||||
"paramLongName": "workingPath",
|
"paramLongName": "outputDir",
|
||||||
"paramDescription": "the working path",
|
"paramDescription": "the path where generated data are stored",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
|
@ -7,7 +7,6 @@ import java.util.List;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
|
||||||
import org.apache.spark.api.java.function.FlatMapFunction;
|
import org.apache.spark.api.java.function.FlatMapFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.PairFunction;
|
import org.apache.spark.api.java.function.PairFunction;
|
||||||
|
@ -16,8 +15,8 @@ import org.apache.spark.rdd.RDD;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.codehaus.jackson.map.ObjectMapper;
|
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.google.common.hash.Hashing;
|
import com.google.common.hash.Hashing;
|
||||||
|
|
||||||
import eu.dnetlib.dedup.graph.ConnectedComponent;
|
import eu.dnetlib.dedup.graph.ConnectedComponent;
|
||||||
|
|
|
@ -10,7 +10,8 @@ import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.codehaus.jackson.map.ObjectMapper;
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
|
|
|
@ -4,14 +4,13 @@ import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||||
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Organization, Publication, Qualifier, Relation, Result, StructuredProperty}
|
import eu.dnetlib.dhp.schema.oaf.{DataInfo, Dataset, Field, Instance, KeyValue, Oaf, Organization, Publication, Qualifier, Relation, Result, StructuredProperty}
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils
|
import eu.dnetlib.dhp.utils.DHPUtils
|
||||||
import org.apache.commons.lang3.StringUtils
|
import org.apache.commons.lang3.StringUtils
|
||||||
import org.codehaus.jackson.map.ObjectMapper
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import org.json4s
|
import org.json4s
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
import org.json4s.jackson.JsonMethods.parse
|
import org.json4s.jackson.JsonMethods.parse
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
import scala.io.Source
|
|
||||||
|
|
||||||
|
|
||||||
case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
|
case class HostedByItemType(id: String, officialname: String, issn: String, eissn: String, lissn: String, openAccess: Boolean) {}
|
||||||
|
@ -19,23 +18,18 @@ case class HostedByItemType(id: String, officialname: String, issn: String, eiss
|
||||||
case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:Option[String], OfficialPage:Option[String], DisplayName:Option[String]){}
|
case class DoiBoostAffiliation(PaperId:Long, AffiliationId:Long, GridId:Option[String], OfficialPage:Option[String], DisplayName:Option[String]){}
|
||||||
|
|
||||||
object DoiBoostMappingUtil {
|
object DoiBoostMappingUtil {
|
||||||
def getUnknownCountry(): Qualifier = {
|
|
||||||
createQualifier("UNKNOWN","UNKNOWN","dnet:countries","dnet:countries")
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def generateMAGAffiliationId(affId: String): String = {
|
def generateMAGAffiliationId(affId: String): String = {
|
||||||
s"20|microsoft___$SEPARATOR${DHPUtils.md5(affId)}"
|
s"20|microsoft___$SEPARATOR${DHPUtils.md5(affId)}"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
||||||
|
|
||||||
//STATIC STRING
|
//STATIC STRING
|
||||||
val MAG = "microsoft"
|
val MAG = "microsoft"
|
||||||
val MAG_NAME = "Microsoft Academic Graph"
|
val MAG_NAME = "Microsoft Academic Graph"
|
||||||
val ORCID = "ORCID"
|
val ORCID = "orcid"
|
||||||
|
val ORCID_PENDING = "orcid_pending"
|
||||||
val CROSSREF = "Crossref"
|
val CROSSREF = "Crossref"
|
||||||
val UNPAYWALL = "UnpayWall"
|
val UNPAYWALL = "UnpayWall"
|
||||||
val GRID_AC = "grid.ac"
|
val GRID_AC = "grid.ac"
|
||||||
|
|
|
@ -39,33 +39,38 @@ object SparkGenerateDOIBoostActionSet {
|
||||||
val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath")
|
val dbaffiliationRelationPath = parser.get("dbaffiliationRelationPath")
|
||||||
val dbOrganizationPath = parser.get("dbOrganizationPath")
|
val dbOrganizationPath = parser.get("dbOrganizationPath")
|
||||||
val workingDirPath = parser.get("targetPath")
|
val workingDirPath = parser.get("targetPath")
|
||||||
|
val sequenceFilePath = parser.get("sFilePath")
|
||||||
|
|
||||||
spark.read.load(dbDatasetPath).as[OafDataset]
|
val asDataset = spark.read.load(dbDatasetPath).as[OafDataset]
|
||||||
.map(d =>DoiBoostMappingUtil.fixResult(d))
|
.map(d =>DoiBoostMappingUtil.fixResult(d))
|
||||||
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$workingDirPath/actionSet")
|
// .write.mode(SaveMode.Overwrite).save(s"$workingDirPath/actionSet")
|
||||||
|
|
||||||
spark.read.load(dbPublicationPath).as[Publication]
|
val asPublication =spark.read.load(dbPublicationPath).as[Publication]
|
||||||
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||||
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||||
|
|
||||||
spark.read.load(dbOrganizationPath).as[Organization]
|
val asOrganization = spark.read.load(dbOrganizationPath).as[Organization]
|
||||||
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||||
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||||
|
|
||||||
|
|
||||||
spark.read.load(crossRefRelation).as[Relation]
|
val asCRelation = spark.read.load(crossRefRelation).as[Relation]
|
||||||
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||||
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||||
|
|
||||||
spark.read.load(dbaffiliationRelationPath).as[Relation]
|
val asRelAffiliation = spark.read.load(dbaffiliationRelationPath).as[Relation]
|
||||||
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
.map(d=>DoiBoostMappingUtil.toActionSet(d))(Encoders.tuple(Encoders.STRING, Encoders.STRING))
|
||||||
.write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
// .write.mode(SaveMode.Append).save(s"$workingDirPath/actionSet")
|
||||||
|
|
||||||
|
|
||||||
val d: Dataset[(String, String)] =spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]
|
|
||||||
|
|
||||||
d.rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingDirPath/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
|
|
||||||
|
val d: Dataset[(String, String)] = asDataset.union(asPublication).union(asOrganization).union(asCRelation).union(asRelAffiliation)
|
||||||
|
|
||||||
|
// spark.read.load(s"$workingDirPath/actionSet").as[(String,String)]
|
||||||
|
|
||||||
|
d.rdd.repartition(6000).map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$sequenceFilePath", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@ package eu.dnetlib.doiboost
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.oa.merge.AuthorMerger
|
import eu.dnetlib.dhp.oa.merge.AuthorMerger
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Organization, Publication, Relation, Dataset => OafDataset}
|
import eu.dnetlib.dhp.schema.oaf.{Organization, Publication, Relation, Dataset => OafDataset}
|
||||||
import eu.dnetlib.doiboost.mag.ConversionUtil
|
import eu.dnetlib.doiboost.mag.ConversionUtil
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
|
@ -30,7 +31,7 @@ object SparkGenerateDoiBoost {
|
||||||
import spark.implicits._
|
import spark.implicits._
|
||||||
|
|
||||||
val hostedByMapPath = parser.get("hostedByMapPath")
|
val hostedByMapPath = parser.get("hostedByMapPath")
|
||||||
val workingDirPath = parser.get("workingDirPath")
|
val workingDirPath = parser.get("workingPath")
|
||||||
|
|
||||||
|
|
||||||
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
||||||
|
@ -132,7 +133,7 @@ object SparkGenerateDoiBoost {
|
||||||
o.setLegalname(DoiBoostMappingUtil.asField(affiliation.DisplayName.get))
|
o.setLegalname(DoiBoostMappingUtil.asField(affiliation.DisplayName.get))
|
||||||
if (affiliation.OfficialPage.isDefined)
|
if (affiliation.OfficialPage.isDefined)
|
||||||
o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get))
|
o.setWebsiteurl(DoiBoostMappingUtil.asField(affiliation.OfficialPage.get))
|
||||||
o.setCountry(DoiBoostMappingUtil.getUnknownCountry())
|
o.setCountry(ModelConstants.UNKNOWN_COUNTRY)
|
||||||
o
|
o
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
|
|
@ -206,7 +206,7 @@ case object Crossref2Oaf {
|
||||||
a.setSurname(family)
|
a.setSurname(family)
|
||||||
a.setFullname(s"$given $family")
|
a.setFullname(s"$given $family")
|
||||||
if (StringUtils.isNotBlank(orcid))
|
if (StringUtils.isNotBlank(orcid))
|
||||||
a.setPid(List(createSP(orcid, ORCID, PID_TYPES, generateDataInfo())).asJava)
|
a.setPid(List(createSP(orcid, ORCID_PENDING, PID_TYPES, generateDataInfo())).asJava)
|
||||||
|
|
||||||
a
|
a
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,18 +2,16 @@
|
||||||
package eu.dnetlib.doiboost.crossref;
|
package eu.dnetlib.doiboost.crossref;
|
||||||
|
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.util.Optional;
|
||||||
import java.util.zip.Inflater;
|
import java.util.zip.Inflater;
|
||||||
|
|
||||||
import org.apache.commons.codec.binary.Base64;
|
import org.apache.commons.codec.binary.Base64;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.hadoop.io.IntWritable;
|
import org.apache.hadoop.io.IntWritable;
|
||||||
import org.apache.hadoop.io.SequenceFile;
|
import org.apache.hadoop.io.SequenceFile;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
|
||||||
|
@ -30,34 +28,45 @@ public class CrossrefImporter {
|
||||||
|
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
||||||
final String hdfsuri = parser.get("namenode");
|
final String namenode = parser.get("namenode");
|
||||||
System.out.println("HDFS URI" + hdfsuri);
|
System.out.println("namenode: " + namenode);
|
||||||
Path hdfswritepath = new Path(parser.get("targetPath"));
|
|
||||||
System.out.println("TargetPath: " + hdfsuri);
|
|
||||||
|
|
||||||
final Long timestamp = StringUtils.isNotBlank(parser.get("timestamp"))
|
Path targetPath = new Path(parser.get("targetPath"));
|
||||||
? Long.parseLong(parser.get("timestamp"))
|
System.out.println("targetPath: " + targetPath);
|
||||||
: -1;
|
|
||||||
|
|
||||||
if (timestamp > 0)
|
final Long timestamp = Optional
|
||||||
System.out.println("Timestamp added " + timestamp);
|
.ofNullable(parser.get("timestamp"))
|
||||||
|
.map(s -> {
|
||||||
|
try {
|
||||||
|
return Long.parseLong(s);
|
||||||
|
} catch (NumberFormatException e) {
|
||||||
|
return -1L;
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.orElse(-1L);
|
||||||
|
System.out.println("timestamp: " + timestamp);
|
||||||
|
|
||||||
|
final String esServer = parser.get("esServer");
|
||||||
|
System.out.println("esServer: " + esServer);
|
||||||
|
|
||||||
|
final String esIndex = parser.get("esIndex");
|
||||||
|
System.out.println("esIndex: " + esIndex);
|
||||||
|
|
||||||
// ====== Init HDFS File System Object
|
// ====== Init HDFS File System Object
|
||||||
Configuration conf = new Configuration();
|
Configuration conf = new Configuration();
|
||||||
// Set FileSystem URI
|
// Set FileSystem URI
|
||||||
conf.set("fs.defaultFS", hdfsuri);
|
conf.set("fs.defaultFS", namenode);
|
||||||
// Because of Maven
|
// Because of Maven
|
||||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||||
|
|
||||||
ESClient client = timestamp > 0
|
// "ip-90-147-167-25.ct1.garrservices.it", "crossref"
|
||||||
? new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref", timestamp)
|
final ESClient client = new ESClient(esServer, esIndex, timestamp);
|
||||||
: new ESClient("ip-90-147-167-25.ct1.garrservices.it", "crossref");
|
|
||||||
|
|
||||||
try (SequenceFile.Writer writer = SequenceFile
|
try (SequenceFile.Writer writer = SequenceFile
|
||||||
.createWriter(
|
.createWriter(
|
||||||
conf,
|
conf,
|
||||||
SequenceFile.Writer.file(hdfswritepath),
|
SequenceFile.Writer.file(targetPath),
|
||||||
SequenceFile.Writer.keyClass(IntWritable.class),
|
SequenceFile.Writer.keyClass(IntWritable.class),
|
||||||
SequenceFile.Writer.valueClass(Text.class))) {
|
SequenceFile.Writer.valueClass(Text.class))) {
|
||||||
|
|
||||||
|
@ -74,8 +83,7 @@ public class CrossrefImporter {
|
||||||
end = System.currentTimeMillis();
|
end = System.currentTimeMillis();
|
||||||
final float time = (end - start) / 1000.0F;
|
final float time = (end - start) / 1000.0F;
|
||||||
System.out
|
System.out
|
||||||
.println(
|
.println(String.format("Imported %s records last 100000 imported in %s seconds", i, time));
|
||||||
String.format("Imported %d records last 100000 imported in %f seconds", i, time));
|
|
||||||
start = System.currentTimeMillis();
|
start = System.currentTimeMillis();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,11 +1,11 @@
|
||||||
|
|
||||||
package eu.dnetlib.doiboost.crossref;
|
package eu.dnetlib.doiboost.crossref;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.http.HttpHeaders;
|
||||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||||
import org.apache.http.client.methods.HttpPost;
|
import org.apache.http.client.methods.HttpPost;
|
||||||
import org.apache.http.entity.StringEntity;
|
import org.apache.http.entity.StringEntity;
|
||||||
|
@ -17,13 +17,17 @@ import org.slf4j.LoggerFactory;
|
||||||
import com.jayway.jsonpath.JsonPath;
|
import com.jayway.jsonpath.JsonPath;
|
||||||
|
|
||||||
public class ESClient implements Iterator<String> {
|
public class ESClient implements Iterator<String> {
|
||||||
private static final Logger logger = LoggerFactory.getLogger(ESClient.class);
|
|
||||||
|
|
||||||
static final String blobPath = "$.hits[*].hits[*]._source.blob";
|
private static final String BLOB_PATH = "$.hits.hits[*]._source.blob";
|
||||||
static final String scrollIdPath = "$._scroll_id";
|
private static final String SCROLL_ID_PATH = "$._scroll_id";
|
||||||
static final String JSON_NO_TS = "{\"size\":1000}";
|
private static final String JSON_NO_TS = "{\"size\":1000}";
|
||||||
static final String JSON_WITH_TS = "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}";
|
private static final String JSON_WITH_TS = "{\"size\":1000, \"query\":{\"range\":{\"timestamp\":{\"gte\":%d}}}}";
|
||||||
static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}";
|
private static final String JSON_SCROLL = "{\"scroll_id\":\"%s\",\"scroll\" : \"1m\"}";
|
||||||
|
|
||||||
|
public static final String APPLICATION_JSON = "application/json";
|
||||||
|
|
||||||
|
public static final String ES_SEARCH_URL = "http://%s:9200/%s/_search?scroll=1m";
|
||||||
|
public static final String ES_SCROLL_URL = "http://%s:9200/_search/scroll";
|
||||||
|
|
||||||
private final String scrollId;
|
private final String scrollId;
|
||||||
|
|
||||||
|
@ -31,47 +35,30 @@ public class ESClient implements Iterator<String> {
|
||||||
|
|
||||||
private final String esHost;
|
private final String esHost;
|
||||||
|
|
||||||
public ESClient(final String esHost, final String esIndex) throws IOException {
|
public ESClient(final String esHost, final String esIndex, final long timestamp) {
|
||||||
|
|
||||||
this.esHost = esHost;
|
this.esHost = esHost;
|
||||||
final String body = getResponse(
|
|
||||||
String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex), JSON_NO_TS);
|
|
||||||
scrollId = getJPathString(scrollIdPath, body);
|
|
||||||
buffer = getBlobs(body);
|
|
||||||
}
|
|
||||||
|
|
||||||
public ESClient(final String esHost, final String esIndex, final long timestamp)
|
final String body = timestamp > 0
|
||||||
throws IOException {
|
? getResponse(String.format(ES_SEARCH_URL, esHost, esIndex), String.format(JSON_WITH_TS, timestamp))
|
||||||
this.esHost = esHost;
|
: getResponse(String.format(ES_SEARCH_URL, esHost, esIndex), JSON_NO_TS);
|
||||||
final String body = getResponse(
|
scrollId = getJPathString(SCROLL_ID_PATH, body);
|
||||||
String.format("http://%s:9200/%s/_search?scroll=1m", esHost, esIndex),
|
|
||||||
String.format(JSON_WITH_TS, timestamp));
|
|
||||||
scrollId = getJPathString(scrollIdPath, body);
|
|
||||||
buffer = getBlobs(body);
|
buffer = getBlobs(body);
|
||||||
}
|
}
|
||||||
|
|
||||||
private String getResponse(final String url, final String json) {
|
private String getResponse(final String url, final String json) {
|
||||||
CloseableHttpClient client = HttpClients.createDefault();
|
try (CloseableHttpClient client = HttpClients.createDefault()) {
|
||||||
try {
|
|
||||||
|
|
||||||
HttpPost httpPost = new HttpPost(url);
|
HttpPost httpPost = new HttpPost(url);
|
||||||
if (json != null) {
|
if (json != null) {
|
||||||
StringEntity entity = new StringEntity(json);
|
StringEntity entity = new StringEntity(json);
|
||||||
httpPost.setEntity(entity);
|
httpPost.setEntity(entity);
|
||||||
httpPost.setHeader("Accept", "application/json");
|
httpPost.setHeader(HttpHeaders.ACCEPT, APPLICATION_JSON);
|
||||||
httpPost.setHeader("Content-type", "application/json");
|
httpPost.setHeader(HttpHeaders.CONTENT_TYPE, APPLICATION_JSON);
|
||||||
}
|
}
|
||||||
CloseableHttpResponse response = client.execute(httpPost);
|
try (CloseableHttpResponse response = client.execute(httpPost)) {
|
||||||
|
|
||||||
return IOUtils.toString(response.getEntity().getContent());
|
return IOUtils.toString(response.getEntity().getContent());
|
||||||
|
}
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
throw new RuntimeException("Error on executing request ", e);
|
throw new RuntimeException("Error on executing request ", e);
|
||||||
} finally {
|
|
||||||
try {
|
|
||||||
client.close();
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new RuntimeException("Unable to close client ", e);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -87,7 +74,7 @@ public class ESClient implements Iterator<String> {
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<String> getBlobs(final String body) {
|
private List<String> getBlobs(final String body) {
|
||||||
final List<String> res = JsonPath.read(body, "$.hits.hits[*]._source.blob");
|
final List<String> res = JsonPath.read(body, BLOB_PATH);
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -102,11 +89,11 @@ public class ESClient implements Iterator<String> {
|
||||||
if (buffer.isEmpty()) {
|
if (buffer.isEmpty()) {
|
||||||
|
|
||||||
final String json_param = String.format(JSON_SCROLL, scrollId);
|
final String json_param = String.format(JSON_SCROLL, scrollId);
|
||||||
final String body = getResponse(String.format("http://%s:9200/_search/scroll", esHost), json_param);
|
final String body = getResponse(String.format(ES_SCROLL_URL, esHost), json_param);
|
||||||
try {
|
try {
|
||||||
buffer = getBlobs(body);
|
buffer = getBlobs(body);
|
||||||
} catch (Throwable e) {
|
} catch (Throwable e) {
|
||||||
logger.error("Error on get next page: body:" + body);
|
System.out.println("Error on get next page: body:" + body);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nextItem;
|
return nextItem;
|
||||||
|
|
|
@ -11,7 +11,7 @@ import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
object SparkPreProcessMAG {
|
object SparkProcessMAG {
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
|
|
||||||
val logger: Logger = LoggerFactory.getLogger(getClass)
|
val logger: Logger = LoggerFactory.getLogger(getClass)
|
|
@ -1,11 +1,11 @@
|
||||||
package eu.dnetlib.doiboost.orcid
|
package eu.dnetlib.doiboost.orcid
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
|
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Publication}
|
||||||
import eu.dnetlib.dhp.schema.orcid.OrcidDOI
|
import eu.dnetlib.dhp.schema.orcid.OrcidDOI
|
||||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil
|
||||||
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{ORCID, PID_TYPES, createSP, generateDataInfo, generateIdentifier}
|
import eu.dnetlib.doiboost.DoiBoostMappingUtil.{ORCID, PID_TYPES, createSP, generateDataInfo, generateIdentifier}
|
||||||
import org.apache.commons.lang.StringUtils
|
import org.apache.commons.lang.StringUtils
|
||||||
import org.codehaus.jackson.map.ObjectMapper
|
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
@ -18,7 +18,7 @@ case class ORCIDItem(oid:String,name:String,surname:String,creditName:String,err
|
||||||
case class ORCIDElement(doi:String, authors:List[ORCIDItem]) {}
|
case class ORCIDElement(doi:String, authors:List[ORCIDItem]) {}
|
||||||
object ORCIDToOAF {
|
object ORCIDToOAF {
|
||||||
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
|
val logger: Logger = LoggerFactory.getLogger(ORCIDToOAF.getClass)
|
||||||
val mapper = new ObjectMapper
|
val mapper = new ObjectMapper()
|
||||||
|
|
||||||
def isJsonValid(inputStr: String): Boolean = {
|
def isJsonValid(inputStr: String): Boolean = {
|
||||||
import java.io.IOException
|
import java.io.IOException
|
||||||
|
|
|
@ -3,10 +3,8 @@ package eu.dnetlib.doiboost.orcid;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.List;
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
@ -18,11 +16,9 @@ import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
import org.apache.http.impl.client.HttpClients;
|
import org.apache.http.impl.client.HttpClients;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.api.java.function.Function;
|
import org.apache.spark.api.java.function.Function;
|
||||||
import org.apache.spark.util.LongAccumulator;
|
import org.apache.spark.util.LongAccumulator;
|
||||||
import org.mortbay.log.Log;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -36,7 +32,7 @@ public class SparkDownloadOrcidAuthors {
|
||||||
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
|
static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";
|
||||||
static final String lastUpdate = "2020-09-29 00:00:00";
|
static final String lastUpdate = "2020-09-29 00:00:00";
|
||||||
|
|
||||||
public static void main(String[] args) throws IOException, Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
|
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
IOUtils
|
IOUtils
|
||||||
|
@ -51,12 +47,12 @@ public class SparkDownloadOrcidAuthors {
|
||||||
.orElse(Boolean.TRUE);
|
.orElse(Boolean.TRUE);
|
||||||
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
logger.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingPath = parser.get("workingPath");
|
||||||
logger.info("workingPath: ", workingPath);
|
logger.info("workingPath: {}", workingPath);
|
||||||
final String outputPath = parser.get("outputPath");
|
final String outputPath = parser.get("outputPath");
|
||||||
logger.info("outputPath: ", outputPath);
|
logger.info("outputPath: {}", outputPath);
|
||||||
final String token = parser.get("token");
|
final String token = parser.get("token");
|
||||||
final String lambdaFileName = parser.get("lambdaFileName");
|
final String lambdaFileName = parser.get("lambdaFileName");
|
||||||
logger.info("lambdaFileName: ", lambdaFileName);
|
logger.info("lambdaFileName: {}", lambdaFileName);
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
runWithSparkSession(
|
runWithSparkSession(
|
||||||
|
@ -171,8 +167,8 @@ public class SparkDownloadOrcidAuthors {
|
||||||
}
|
}
|
||||||
|
|
||||||
private static boolean isModified(String orcidId, String modifiedDate) {
|
private static boolean isModified(String orcidId, String modifiedDate) {
|
||||||
Date modifiedDateDt = null;
|
Date modifiedDateDt;
|
||||||
Date lastUpdateDt = null;
|
Date lastUpdateDt;
|
||||||
try {
|
try {
|
||||||
if (modifiedDate.length() != 19) {
|
if (modifiedDate.length() != 19) {
|
||||||
modifiedDate = modifiedDate.substring(0, 19);
|
modifiedDate = modifiedDate.substring(0, 19);
|
||||||
|
|
|
@ -5,5 +5,6 @@
|
||||||
{"paramName": "cr", "paramLongName":"crossRefRelation", "paramDescription": "the UnpayWall Publication Path", "paramRequired": true},
|
{"paramName": "cr", "paramLongName":"crossRefRelation", "paramDescription": "the UnpayWall Publication Path", "paramRequired": true},
|
||||||
{"paramName": "da", "paramLongName":"dbaffiliationRelationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
|
{"paramName": "da", "paramLongName":"dbaffiliationRelationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
|
||||||
{"paramName": "do", "paramLongName":"dbOrganizationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
|
{"paramName": "do", "paramLongName":"dbOrganizationPath", "paramDescription": "the MAG Publication Path", "paramRequired": true},
|
||||||
{"paramName": "w", "paramLongName":"targetPath", "paramDescription": "the Working Path", "paramRequired": true}
|
{"paramName": "w", "paramLongName":"targetPath", "paramDescription": "the Working Path", "paramRequired": true},
|
||||||
|
{"paramName": "sp", "paramLongName":"sFilePath", "paramDescription": "the Sequence file Path", "paramRequired": true}
|
||||||
]
|
]
|
||||||
|
|
|
@ -3,5 +3,5 @@
|
||||||
{"paramName": "hb", "paramLongName":"hostedByMapPath", "paramDescription": "the hosted By Map Path", "paramRequired": true},
|
{"paramName": "hb", "paramLongName":"hostedByMapPath", "paramDescription": "the hosted By Map Path", "paramRequired": true},
|
||||||
{"paramName": "ap", "paramLongName":"affiliationPath", "paramDescription": "the Affliation Path", "paramRequired": true},
|
{"paramName": "ap", "paramLongName":"affiliationPath", "paramDescription": "the Affliation Path", "paramRequired": true},
|
||||||
{"paramName": "pa", "paramLongName":"paperAffiliationPath", "paramDescription": "the paperAffiliation Path", "paramRequired": true},
|
{"paramName": "pa", "paramLongName":"paperAffiliationPath", "paramDescription": "the paperAffiliation Path", "paramRequired": true},
|
||||||
{"paramName": "w", "paramLongName":"workingDirPath", "paramDescription": "the Working Path", "paramRequired": true}
|
{"paramName": "w", "paramLongName":"workingPath", "paramDescription": "the Working Path", "paramRequired": true}
|
||||||
]
|
]
|
||||||
|
|
|
@ -1,5 +1,7 @@
|
||||||
[
|
[
|
||||||
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the sequencial file to write", "paramRequired": true},
|
{"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path of the sequencial file to write", "paramRequired": true},
|
||||||
{"paramName":"n", "paramLongName":"namenode", "paramDescription": "the hive metastore uris", "paramRequired": true},
|
{"paramName":"n", "paramLongName":"namenode", "paramDescription": "the hive metastore uris", "paramRequired": true},
|
||||||
{"paramName":"ts", "paramLongName":"timestamp", "paramDescription": "timestamp", "paramRequired": false}
|
{"paramName":"ts", "paramLongName":"timestamp", "paramDescription": "timestamp", "paramRequired": false},
|
||||||
|
{"paramName":"ess", "paramLongName":"esServer", "paramDescription": "elasticsearch server url", "paramRequired": true},
|
||||||
|
{"paramName":"esi", "paramLongName":"esIndex", "paramDescription": "elasticsearch index name", "paramRequired": true}
|
||||||
]
|
]
|
|
@ -15,6 +15,10 @@
|
||||||
<name>oozie.action.sharelib.for.spark</name>
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
<value>spark2</value>
|
<value>spark2</value>
|
||||||
</property>
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>hive_metastore_uris</name>
|
<name>hive_metastore_uris</name>
|
||||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||||
|
@ -23,36 +27,16 @@
|
||||||
<name>spark2YarnHistoryServerAddress</name>
|
<name>spark2YarnHistoryServerAddress</name>
|
||||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
|
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
|
||||||
<name>spark2ExtraListeners</name>
|
|
||||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>spark2SqlQueryExecutionListeners</name>
|
|
||||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
|
||||||
<value>true</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sparkExecutorNumber</name>
|
|
||||||
<value>4</value>
|
|
||||||
</property>
|
|
||||||
<property>
|
<property>
|
||||||
<name>spark2EventLogDir</name>
|
<name>spark2EventLogDir</name>
|
||||||
<value>/user/spark/spark2ApplicationHistory</value>
|
<value>/user/spark/spark2ApplicationHistory</value>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>sparkDriverMemory</name>
|
<name>spark2ExtraListeners</name>
|
||||||
<value>15G</value>
|
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
<property>
|
||||||
<name>sparkExecutorMemory</name>
|
<name>spark2SqlQueryExecutionListeners</name>
|
||||||
<value>6G</value>
|
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sparkExecutorCores</name>
|
|
||||||
<value>1</value>
|
|
||||||
</property>
|
</property>
|
||||||
</configuration>
|
</configuration>
|
|
@ -0,0 +1,335 @@
|
||||||
|
<workflow-app name="Generate DOIBoost ActionSet" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>sparkDriverMemory</name>
|
||||||
|
<description>memory for driver process</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorMemory</name>
|
||||||
|
<description>memory for individual executor</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorIntersectionMemory</name>
|
||||||
|
<description>memory for individual executor</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>sparkExecutorCores</name>
|
||||||
|
<description>number of cores used by single executor</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
|
||||||
|
<!-- Itersection Parameters -->
|
||||||
|
<property>
|
||||||
|
<name>workingPath</name>
|
||||||
|
<description>the working Path</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>hostedByMapPath</name>
|
||||||
|
<description>the hostedByMap Path</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>outputPath</name>
|
||||||
|
<description>the Path of the sequence file action set</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
|
||||||
|
<!-- Crossref Parameters -->
|
||||||
|
<property>
|
||||||
|
<name>inputPathCrossref</name>
|
||||||
|
<description>the Crossref input path</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>crossrefTimestamp</name>
|
||||||
|
<description>Timestamp for the Crossref incremental Harvesting</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esServer</name>
|
||||||
|
<description>elasticsearch server url for the Crossref Harvesting</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>esIndex</name>
|
||||||
|
<description>elasticsearch index name for the Crossref Harvesting</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<!-- MAG Parameters -->
|
||||||
|
<property>
|
||||||
|
<name>inputPathMAG</name>
|
||||||
|
<description>the MAG working path</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
|
||||||
|
<!-- UnpayWall Parameters -->
|
||||||
|
<property>
|
||||||
|
<name>inputPathUnpayWall</name>
|
||||||
|
<description>the UnpayWall working path</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<!-- ORCID Parameters -->
|
||||||
|
<property>
|
||||||
|
<name>inputPathOrcid</name>
|
||||||
|
<description>the ORCID working path</description>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<global>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>${oozieActionShareLibForSpark2}</value>
|
||||||
|
</property>
|
||||||
|
</configuration>
|
||||||
|
</global>
|
||||||
|
|
||||||
|
<start to="resume_from"/>
|
||||||
|
|
||||||
|
<decision name="resume_from">
|
||||||
|
<switch>
|
||||||
|
<case to="ConvertCrossrefToOAF">${wf:conf('resumeFrom') eq 'ConvertCrossrefToOAF'}</case>
|
||||||
|
<case to="ResetMagWorkingPath">${wf:conf('resumeFrom') eq 'ResetMagWorkingPath'}</case>
|
||||||
|
<case to="ProcessMAG">${wf:conf('resumeFrom') eq 'PreprocessMag'}</case>
|
||||||
|
<case to="ProcessUW">${wf:conf('resumeFrom') eq 'PreprocessUW'}</case>
|
||||||
|
<case to="ProcessORCID">${wf:conf('resumeFrom') eq 'PreprocessORCID'}</case>
|
||||||
|
<case to="CreateDOIBoost">${wf:conf('resumeFrom') eq 'CreateDOIBoost'}</case>
|
||||||
|
<case to="GenerateActionSet">${wf:conf('resumeFrom') eq 'GenerateActionSet'}</case>
|
||||||
|
<default to="ImportCrossRef"/>
|
||||||
|
</switch>
|
||||||
|
</decision>
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<action name="ImportCrossRef">
|
||||||
|
<java>
|
||||||
|
<main-class>eu.dnetlib.doiboost.crossref.CrossrefImporter</main-class>
|
||||||
|
<arg>--targetPath</arg><arg>${inputPathCrossref}/index_update</arg>
|
||||||
|
<arg>--namenode</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>--esServer</arg><arg>${esServer}</arg>
|
||||||
|
<arg>--esIndex</arg><arg>${esIndex}</arg>
|
||||||
|
<arg>--timestamp</arg><arg>${crossrefTimestamp}</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="GenerateCrossrefDataset"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
<!-- CROSSREF SECTION -->
|
||||||
|
|
||||||
|
<action name="GenerateCrossrefDataset">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>GenerateCrossrefDataset</name>
|
||||||
|
<class>eu.dnetlib.doiboost.crossref.CrossrefDataset</class>
|
||||||
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
${sparkExtraOPT}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--workingPath</arg><arg>${inputPathCrossref}</arg>
|
||||||
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="RenameDataset"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="RenameDataset">
|
||||||
|
<fs>
|
||||||
|
<delete path="${inputPathCrossref}/crossref_ds"/>
|
||||||
|
<move source="${inputPathCrossref}/crossref_ds_updated"
|
||||||
|
target="${inputPathCrossref}/crossref_ds"/>
|
||||||
|
</fs>
|
||||||
|
<ok to="ConvertCrossrefToOAF"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
<action name="ConvertCrossrefToOAF">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>ConvertCrossrefToOAF</name>
|
||||||
|
<class>eu.dnetlib.doiboost.crossref.SparkMapDumpIntoOAF</class>
|
||||||
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
${sparkExtraOPT}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--sourcePath</arg><arg>${inputPathCrossref}/crossref_ds</arg>
|
||||||
|
<arg>--targetPath</arg><arg>${workingPath}</arg>
|
||||||
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="ResetMagWorkingPath"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
<!-- MAG SECTION -->
|
||||||
|
<action name="ResetMagWorkingPath">
|
||||||
|
<fs>
|
||||||
|
<delete path="${inputPathMAG}/dataset"/>
|
||||||
|
<delete path="${inputPathMAG}/process"/>
|
||||||
|
<delete path="${inputPathMAG}/dataset"/>
|
||||||
|
</fs>
|
||||||
|
<ok to="ConvertMagToDataset"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="ConvertMagToDataset">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Convert Mag to Dataset</name>
|
||||||
|
<class>eu.dnetlib.doiboost.mag.SparkImportMagIntoDataset</class>
|
||||||
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
${sparkExtraOPT}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--sourcePath</arg><arg>${inputPathMAG}/input</arg>
|
||||||
|
<arg>--targetPath</arg><arg>${inputPathMAG}/dataset</arg>
|
||||||
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="ProcessMAG"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="ProcessMAG">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Convert Mag to OAF Dataset</name>
|
||||||
|
<class>eu.dnetlib.doiboost.mag.SparkProcessMAG</class>
|
||||||
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
${sparkExtraOPT}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--sourcePath</arg><arg>${inputPathMAG}/dataset</arg>
|
||||||
|
<arg>--workingPath</arg><arg>${inputPathMAG}/process</arg>
|
||||||
|
<arg>--targetPath</arg><arg>${workingPath}</arg>
|
||||||
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="ProcessUW"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<!-- UnpayWall SECTION -->
|
||||||
|
|
||||||
|
<action name="ProcessUW">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Convert UnpayWall to Dataset</name>
|
||||||
|
<class>eu.dnetlib.doiboost.uw.SparkMapUnpayWallToOAF</class>
|
||||||
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
${sparkExtraOPT}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--sourcePath</arg><arg>${inputPathUnpayWall}/uw_extracted</arg>
|
||||||
|
<arg>--targetPath</arg><arg>${workingPath}</arg>
|
||||||
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="ProcessORCID"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<!-- ORCID SECTION -->
|
||||||
|
<action name="ProcessORCID">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Convert ORCID to Dataset</name>
|
||||||
|
<class>eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF</class>
|
||||||
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
${sparkExtraOPT}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--sourcePath</arg><arg>${inputPathOrcid}</arg>
|
||||||
|
<arg>--targetPath</arg><arg>${workingPath}</arg>
|
||||||
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="CreateDOIBoost"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<!-- INTERSECTION SECTION-->
|
||||||
|
<action name="CreateDOIBoost">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Create DOIBoost Infospace</name>
|
||||||
|
<class>eu.dnetlib.doiboost.SparkGenerateDoiBoost</class>
|
||||||
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorIntersectionMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
${sparkExtraOPT}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--hostedByMapPath</arg><arg>${hostedByMapPath}</arg>
|
||||||
|
<arg>--affiliationPath</arg><arg>${inputPathMAG}/process/Affiliations</arg>
|
||||||
|
<arg>--paperAffiliationPath</arg><arg>${inputPathMAG}/process/PaperAuthorAffiliations</arg>
|
||||||
|
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||||
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="GenerateActionSet"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
<action name="GenerateActionSet">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Generate DOIBoost ActionSet</name>
|
||||||
|
<class>eu.dnetlib.doiboost.SparkGenerateDOIBoostActionSet</class>
|
||||||
|
<jar>dhp-doiboost-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory=${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=3840
|
||||||
|
${sparkExtraOPT}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>--dbPublicationPath</arg><arg>${workingPath}/doiBoostPublicationFiltered</arg>
|
||||||
|
<arg>--dbDatasetPath</arg><arg>${workingPath}/crossrefDataset</arg>
|
||||||
|
<arg>--crossRefRelation</arg><arg>${workingPath}/crossrefRelation</arg>
|
||||||
|
<arg>--dbaffiliationRelationPath</arg><arg>${workingPath}/doiBoostPublicationAffiliation</arg>
|
||||||
|
<arg>--dbOrganizationPath</arg><arg>${workingPath}/doiBoostOrganization</arg>
|
||||||
|
<arg>--targetPath</arg><arg>${workingPath}/actionDataSet</arg>
|
||||||
|
<arg>--sFilePath</arg><arg>${outputPath}</arg>
|
||||||
|
<arg>--master</arg><arg>yarn-cluster</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
</workflow-app>
|
|
@ -1,9 +1,9 @@
|
||||||
package eu.dnetlib.doiboost.orcid
|
package eu.dnetlib.doiboost.orcid
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication
|
import eu.dnetlib.dhp.schema.oaf.Publication
|
||||||
import eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF.getClass
|
import eu.dnetlib.doiboost.orcid.SparkConvertORCIDToOAF.getClass
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||||
import org.codehaus.jackson.map.ObjectMapper
|
|
||||||
import org.junit.jupiter.api.Assertions._
|
import org.junit.jupiter.api.Assertions._
|
||||||
import org.junit.jupiter.api.Test
|
import org.junit.jupiter.api.Test
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
|
@ -104,7 +104,7 @@ public class PrepareResultOrcidAssociationStep1 {
|
||||||
+ " LATERAL VIEW EXPLODE (author) a AS MyT "
|
+ " LATERAL VIEW EXPLODE (author) a AS MyT "
|
||||||
+ " LATERAL VIEW EXPLODE (MyT.pid) p AS MyP "
|
+ " LATERAL VIEW EXPLODE (MyT.pid) p AS MyP "
|
||||||
+ " WHERE lower(MyP.qualifier.classid) = '" + ModelConstants.ORCID + "' or "
|
+ " WHERE lower(MyP.qualifier.classid) = '" + ModelConstants.ORCID + "' or "
|
||||||
+ " lower(MyP.qalifier.classid) = '" + ModelConstants.ORCID_PENDING + "') tmp "
|
+ " lower(MyP.qualifier.classid) = '" + ModelConstants.ORCID_PENDING + "') tmp "
|
||||||
+ " GROUP BY id) r_t "
|
+ " GROUP BY id) r_t "
|
||||||
+ " JOIN ("
|
+ " JOIN ("
|
||||||
+ " SELECT source, target "
|
+ " SELECT source, target "
|
||||||
|
|
|
@ -108,7 +108,7 @@ public class SparkResultToCommunityFromOrganizationJob {
|
||||||
.stream()
|
.stream()
|
||||||
.map(con -> con.getId())
|
.map(con -> con.getId())
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
Result res = new Result();
|
R res = (R) ret.getClass().newInstance();
|
||||||
res.setId(ret.getId());
|
res.setId(ret.getId());
|
||||||
List<Context> propagatedContexts = new ArrayList<>();
|
List<Context> propagatedContexts = new ArrayList<>();
|
||||||
for (String cId : communitySet) {
|
for (String cId : communitySet) {
|
||||||
|
|
|
@ -130,7 +130,7 @@ public class SparkResultToCommunityThroughSemRelJob {
|
||||||
})
|
})
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
Result r = new Result();
|
R r = (R) ret.getClass().newInstance();
|
||||||
r.setId(ret.getId());
|
r.setId(ret.getId());
|
||||||
r.setContext(contextList);
|
r.setContext(contextList);
|
||||||
ret.mergeFrom(r);
|
ret.mergeFrom(r);
|
||||||
|
|
|
@ -24,7 +24,6 @@ import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.orcidtoresultfromsemrel.OrcidPropagationJobTest;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||||
|
|
||||||
public class ResultToCommunityJobTest {
|
public class ResultToCommunityJobTest {
|
||||||
|
@ -66,7 +65,7 @@ public class ResultToCommunityJobTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void test1() throws Exception {
|
public void testSparkResultToCommunityThroughSemRelJob() throws Exception {
|
||||||
SparkResultToCommunityThroughSemRelJob
|
SparkResultToCommunityThroughSemRelJob
|
||||||
.main(
|
.main(
|
||||||
new String[] {
|
new String[] {
|
||||||
|
|
|
@ -23,7 +23,15 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM;
|
||||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.asString;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.dataInfo;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.journal;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listFields;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listKeyValues;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.qualifier;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
|
||||||
|
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -462,44 +470,48 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
||||||
|
|
||||||
return Arrays.asList(r);
|
return Arrays.asList(r);
|
||||||
} else {
|
} else {
|
||||||
|
final String validationDate = rs.getString("curation_date");
|
||||||
|
|
||||||
final String sourceId = createOpenaireId(rs.getString(SOURCE_TYPE), rs.getString("source_id"), false);
|
final String sourceId = createOpenaireId(rs.getString(SOURCE_TYPE), rs.getString("source_id"), false);
|
||||||
final String targetId = createOpenaireId(rs.getString(TARGET_TYPE), rs.getString("target_id"), false);
|
final String targetId = createOpenaireId(rs.getString(TARGET_TYPE), rs.getString("target_id"), false);
|
||||||
|
|
||||||
final Relation r1 = new Relation();
|
final Relation r1 = new Relation();
|
||||||
final Relation r2 = new Relation();
|
final Relation r2 = new Relation();
|
||||||
|
|
||||||
if (rs.getString(SOURCE_TYPE).equals("project")) {
|
r1.setValidated(true);
|
||||||
|
r1.setValidationDate(validationDate);
|
||||||
r1.setCollectedfrom(collectedFrom);
|
r1.setCollectedfrom(collectedFrom);
|
||||||
r1.setRelType(RESULT_PROJECT);
|
|
||||||
r1.setSubRelType(OUTCOME);
|
|
||||||
r1.setRelClass(PRODUCES);
|
|
||||||
|
|
||||||
r2.setCollectedfrom(collectedFrom);
|
|
||||||
r2.setRelType(RESULT_PROJECT);
|
|
||||||
r2.setSubRelType(OUTCOME);
|
|
||||||
r2.setRelClass(IS_PRODUCED_BY);
|
|
||||||
} else {
|
|
||||||
r1.setCollectedfrom(collectedFrom);
|
|
||||||
r1.setRelType(RESULT_RESULT);
|
|
||||||
r1.setSubRelType(RELATIONSHIP);
|
|
||||||
r1.setRelClass(IS_RELATED_TO);
|
|
||||||
|
|
||||||
r2.setCollectedfrom(collectedFrom);
|
|
||||||
r2.setRelType(RESULT_RESULT);
|
|
||||||
r2.setSubRelType(RELATIONSHIP);
|
|
||||||
r2.setRelClass(IS_RELATED_TO);
|
|
||||||
}
|
|
||||||
|
|
||||||
r1.setSource(sourceId);
|
r1.setSource(sourceId);
|
||||||
r1.setTarget(targetId);
|
r1.setTarget(targetId);
|
||||||
r1.setDataInfo(info);
|
r1.setDataInfo(info);
|
||||||
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
r1.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||||
|
|
||||||
|
r2.setValidationDate(validationDate);
|
||||||
|
r2.setValidated(true);
|
||||||
|
r2.setCollectedfrom(collectedFrom);
|
||||||
r2.setSource(targetId);
|
r2.setSource(targetId);
|
||||||
r2.setTarget(sourceId);
|
r2.setTarget(sourceId);
|
||||||
r2.setDataInfo(info);
|
r2.setDataInfo(info);
|
||||||
r2.setLastupdatetimestamp(lastUpdateTimestamp);
|
r2.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||||
|
|
||||||
|
if (rs.getString(SOURCE_TYPE).equals("project")) {
|
||||||
|
r1.setRelType(RESULT_PROJECT);
|
||||||
|
r1.setSubRelType(OUTCOME);
|
||||||
|
r1.setRelClass(PRODUCES);
|
||||||
|
|
||||||
|
r2.setRelType(RESULT_PROJECT);
|
||||||
|
r2.setSubRelType(OUTCOME);
|
||||||
|
r2.setRelClass(IS_PRODUCED_BY);
|
||||||
|
} else {
|
||||||
|
r1.setRelType(RESULT_RESULT);
|
||||||
|
r1.setSubRelType(RELATIONSHIP);
|
||||||
|
r1.setRelClass(IS_RELATED_TO);
|
||||||
|
|
||||||
|
r2.setRelType(RESULT_RESULT);
|
||||||
|
r2.setSubRelType(RELATIONSHIP);
|
||||||
|
r2.setRelClass(IS_RELATED_TO);
|
||||||
|
}
|
||||||
|
|
||||||
return Arrays.asList(r1, r2);
|
return Arrays.asList(r1, r2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -36,17 +36,19 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
final Node n = (Node) o;
|
final Node n = (Node) o;
|
||||||
final Author author = new Author();
|
final Author author = new Author();
|
||||||
final String fullname = n.valueOf("./datacite:creatorName");
|
final String fullname = n.valueOf("./datacite:creatorName");
|
||||||
|
final String name = n.valueOf("./datacite:givenName");
|
||||||
|
final String surname = n.valueOf("./datacite:familyName");
|
||||||
|
if (StringUtils.isNotBlank(fullname) || StringUtils.isNotBlank(name) || StringUtils.isNotBlank(surname)) {
|
||||||
author.setFullname(fullname);
|
author.setFullname(fullname);
|
||||||
|
|
||||||
final PacePerson pp = new PacePerson(fullname, false);
|
final PacePerson pp = new PacePerson(fullname, false);
|
||||||
final String name = n.valueOf("./datacite:givenName");
|
|
||||||
if (StringUtils.isBlank(name) & pp.isAccurate()) {
|
if (StringUtils.isBlank(name) & pp.isAccurate()) {
|
||||||
author.setName(pp.getNormalisedFirstName());
|
author.setName(pp.getNormalisedFirstName());
|
||||||
} else {
|
} else {
|
||||||
author.setName(name);
|
author.setName(name);
|
||||||
}
|
}
|
||||||
|
|
||||||
final String surname = n.valueOf("./datacite:familyName");
|
|
||||||
if (StringUtils.isBlank(surname) & pp.isAccurate()) {
|
if (StringUtils.isBlank(surname) & pp.isAccurate()) {
|
||||||
author.setSurname(pp.getNormalisedSurname());
|
author.setSurname(pp.getNormalisedSurname());
|
||||||
} else {
|
} else {
|
||||||
|
@ -62,6 +64,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
author.setRank(pos++);
|
author.setRank(pos++);
|
||||||
res.add(author);
|
res.add(author);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -11,7 +11,8 @@ import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.hadoop.io.SequenceFile;
|
import org.apache.hadoop.io.SequenceFile;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.codehaus.jackson.map.ObjectMapper;
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
|
|
||||||
|
|
|
@ -1 +1 @@
|
||||||
SELECT source_type, source_id, target_type, target_id, semantics FROM claim WHERE approved=TRUE;
|
SELECT source_type, source_id, target_type, target_id, semantics, curation_date::text FROM claim WHERE approved=TRUE;
|
|
@ -24,8 +24,14 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
|
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
|
||||||
@ExtendWith(MockitoExtension.class)
|
@ExtendWith(MockitoExtension.class)
|
||||||
|
@ -65,7 +71,7 @@ public class MappersTest {
|
||||||
|
|
||||||
assertValidId(p.getId());
|
assertValidId(p.getId());
|
||||||
|
|
||||||
assertTrue(p.getOriginalId().size() == 2);
|
assertTrue(p.getOriginalId().size() == 1);
|
||||||
assertEquals("10.3897/oneeco.2.e13718", p.getOriginalId().get(0));
|
assertEquals("10.3897/oneeco.2.e13718", p.getOriginalId().get(0));
|
||||||
|
|
||||||
assertValidId(p.getCollectedfrom().get(0).getKey());
|
assertValidId(p.getCollectedfrom().get(0).getKey());
|
||||||
|
@ -119,8 +125,26 @@ public class MappersTest {
|
||||||
|
|
||||||
assertNotNull(p.getBestaccessright());
|
assertNotNull(p.getBestaccessright());
|
||||||
assertEquals("OPEN", p.getBestaccessright().getClassid());
|
assertEquals("OPEN", p.getBestaccessright().getClassid());
|
||||||
verifyRelations(p, r1, r2);
|
assertValidId(r1.getSource());
|
||||||
|
assertValidId(r1.getTarget());
|
||||||
|
assertValidId(r2.getSource());
|
||||||
|
assertValidId(r2.getTarget());
|
||||||
|
assertValidId(r1.getCollectedfrom().get(0).getKey());
|
||||||
|
assertValidId(r2.getCollectedfrom().get(0).getKey());
|
||||||
|
assertNotNull(r1.getDataInfo());
|
||||||
|
assertNotNull(r2.getDataInfo());
|
||||||
|
assertNotNull(r1.getDataInfo().getTrust());
|
||||||
|
assertNotNull(r2.getDataInfo().getTrust());
|
||||||
|
assertEquals(r1.getSource(), r2.getTarget());
|
||||||
|
assertEquals(r2.getSource(), r1.getTarget());
|
||||||
|
assertTrue(StringUtils.isNotBlank(r1.getRelClass()));
|
||||||
|
assertTrue(StringUtils.isNotBlank(r2.getRelClass()));
|
||||||
|
assertTrue(StringUtils.isNotBlank(r1.getRelType()));
|
||||||
|
assertTrue(StringUtils.isNotBlank(r2.getRelType()));
|
||||||
|
assertTrue(r1.getValidated());
|
||||||
|
assertTrue(r2.getValidated());
|
||||||
|
assertEquals(r1.getValidationDate(), "2020-01-01");
|
||||||
|
assertEquals(r2.getValidationDate(), "2020-01-01");
|
||||||
// System.out.println(new ObjectMapper().writeValueAsString(p));
|
// System.out.println(new ObjectMapper().writeValueAsString(p));
|
||||||
// System.out.println(new ObjectMapper().writeValueAsString(r1));
|
// System.out.println(new ObjectMapper().writeValueAsString(r1));
|
||||||
// System.out.println(new ObjectMapper().writeValueAsString(r2));
|
// System.out.println(new ObjectMapper().writeValueAsString(r2));
|
||||||
|
@ -158,7 +182,7 @@ public class MappersTest {
|
||||||
final Relation r2 = (Relation) list.get(2);
|
final Relation r2 = (Relation) list.get(2);
|
||||||
|
|
||||||
assertValidId(d.getId());
|
assertValidId(d.getId());
|
||||||
assertTrue(d.getOriginalId().size() == 2);
|
assertTrue(d.getOriginalId().size() == 1);
|
||||||
assertEquals("oai:zenodo.org:3234526", d.getOriginalId().get(0));
|
assertEquals("oai:zenodo.org:3234526", d.getOriginalId().get(0));
|
||||||
assertValidId(d.getCollectedfrom().get(0).getKey());
|
assertValidId(d.getCollectedfrom().get(0).getKey());
|
||||||
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
|
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
|
||||||
|
@ -211,19 +235,10 @@ public class MappersTest {
|
||||||
});
|
});
|
||||||
assertEquals("0001", d.getInstance().get(0).getRefereed().getClassid());
|
assertEquals("0001", d.getInstance().get(0).getRefereed().getClassid());
|
||||||
|
|
||||||
verifyRelations(d, r1, r2);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void verifyRelations(OafEntity e, Relation r1, Relation r2) {
|
|
||||||
assertEquals(e.getId(), r1.getSource());
|
|
||||||
assertEquals(e.getId(), r2.getTarget());
|
|
||||||
|
|
||||||
assertValidId(r1.getSource());
|
assertValidId(r1.getSource());
|
||||||
assertValidId(r1.getTarget());
|
assertValidId(r1.getTarget());
|
||||||
assertValidId(r2.getSource());
|
assertValidId(r2.getSource());
|
||||||
assertValidId(r2.getTarget());
|
assertValidId(r2.getTarget());
|
||||||
assertValidId(r1.getCollectedfrom().get(0).getKey());
|
|
||||||
assertValidId(r2.getCollectedfrom().get(0).getKey());
|
|
||||||
assertNotNull(r1.getDataInfo());
|
assertNotNull(r1.getDataInfo());
|
||||||
assertNotNull(r2.getDataInfo());
|
assertNotNull(r2.getDataInfo());
|
||||||
assertNotNull(r1.getDataInfo().getTrust());
|
assertNotNull(r1.getDataInfo().getTrust());
|
||||||
|
@ -234,6 +249,10 @@ public class MappersTest {
|
||||||
assertTrue(StringUtils.isNotBlank(r2.getRelClass()));
|
assertTrue(StringUtils.isNotBlank(r2.getRelClass()));
|
||||||
assertTrue(StringUtils.isNotBlank(r1.getRelType()));
|
assertTrue(StringUtils.isNotBlank(r1.getRelType()));
|
||||||
assertTrue(StringUtils.isNotBlank(r2.getRelType()));
|
assertTrue(StringUtils.isNotBlank(r2.getRelType()));
|
||||||
|
assertTrue(r1.getValidated());
|
||||||
|
assertTrue(r2.getValidated());
|
||||||
|
assertEquals(r1.getValidationDate(), "2020-01-01");
|
||||||
|
assertEquals(r2.getValidationDate(), "2020-01-01");
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -343,6 +362,37 @@ public class MappersTest {
|
||||||
assertValidId(p.getId());
|
assertValidId(p.getId());
|
||||||
assertValidId(p.getCollectedfrom().get(0).getKey());
|
assertValidId(p.getCollectedfrom().get(0).getKey());
|
||||||
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
||||||
|
assertEquals(1, p.getAuthor().size());
|
||||||
|
assertEquals("OPEN", p.getBestaccessright().getClassid());
|
||||||
|
assertTrue(StringUtils.isNotBlank(p.getPid().get(0).getValue()));
|
||||||
|
assertTrue(StringUtils.isNotBlank(p.getPid().get(0).getQualifier().getClassid()));
|
||||||
|
assertEquals("dataset", p.getResulttype().getClassname());
|
||||||
|
assertEquals(1, p.getInstance().size());
|
||||||
|
assertEquals("OPEN", p.getInstance().get(0).getAccessright().getClassid());
|
||||||
|
assertValidId(p.getInstance().get(0).getCollectedfrom().getKey());
|
||||||
|
assertValidId(p.getInstance().get(0).getHostedby().getKey());
|
||||||
|
assertEquals(
|
||||||
|
"http://creativecommons.org/licenses/by/3.0/de/legalcode", p.getInstance().get(0).getLicense().getValue());
|
||||||
|
assertEquals(1, p.getInstance().get(0).getUrl().size());
|
||||||
|
// System.out.println(p.getInstance().get(0).getUrl().get(0));
|
||||||
|
// System.out.println(p.getInstance().get(0).getHostedby().getValue());
|
||||||
|
System.out.println(p.getPid().get(0).getValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
void testBologna() throws IOException {
|
||||||
|
final String xml = IOUtils.toString(getClass().getResourceAsStream("oaf-bologna.xml"));
|
||||||
|
final List<Oaf> list = new OafToOafMapper(vocs, false, true).processMdRecord(xml);
|
||||||
|
|
||||||
|
System.out.println("***************");
|
||||||
|
System.out.println(new ObjectMapper().writeValueAsString(list));
|
||||||
|
System.out.println("***************");
|
||||||
|
|
||||||
|
final Publication p = (Publication) list.get(0);
|
||||||
|
assertValidId(p.getId());
|
||||||
|
assertValidId(p.getCollectedfrom().get(0).getKey());
|
||||||
|
System.out.println(p.getTitle().get(0).getValue());
|
||||||
|
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
||||||
System.out.println(p.getTitle().get(0).getValue());
|
System.out.println(p.getTitle().get(0).getValue());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -35,7 +35,7 @@ public class MigrateDbEntitiesApplicationTest {
|
||||||
|
|
||||||
private MigrateDbEntitiesApplication app;
|
private MigrateDbEntitiesApplication app;
|
||||||
|
|
||||||
@Mock
|
@Mock(lenient = true)
|
||||||
private ResultSet rs;
|
private ResultSet rs;
|
||||||
|
|
||||||
@Mock
|
@Mock
|
||||||
|
|
|
@ -1047,6 +1047,7 @@ dnet:pid_types @=@ dnet:pid_types @=@ urn @=@ urn
|
||||||
dnet:pid_types @=@ dnet:pid_types @=@ who @=@ WHO Identifier
|
dnet:pid_types @=@ dnet:pid_types @=@ who @=@ WHO Identifier
|
||||||
dnet:pid_types @=@ dnet:pid_types @=@ drks @=@ DRKS Identifier
|
dnet:pid_types @=@ dnet:pid_types @=@ drks @=@ DRKS Identifier
|
||||||
dnet:pid_types @=@ dnet:pid_types @=@ handle @=@ Handle
|
dnet:pid_types @=@ dnet:pid_types @=@ handle @=@ Handle
|
||||||
|
dnet:pid_types @=@ dnet:pid_types @=@ data.europa.eu @=@ EU Persistent URL
|
||||||
dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ACM @=@ An ACM classification term that can be associated to your publications
|
dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ACM @=@ An ACM classification term that can be associated to your publications
|
||||||
dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ARXIV @=@ An ARXIV classification term that can be associated to your publications
|
dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/ARXIV @=@ An ARXIV classification term that can be associated to your publications
|
||||||
dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/DDC @=@ A Dewey Decimal classification term (DDC) that can be associated to your publications
|
dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/SUBJECT/DDC @=@ A Dewey Decimal classification term (DDC) that can be associated to your publications
|
||||||
|
|
|
@ -0,0 +1,368 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<oai:record xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||||
|
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||||
|
xmlns:oaf="http://namespace.openaire.eu/oaf" xmlns:oai="http://www.openarchives.org/OAI/2.0/">
|
||||||
|
<oai:header>
|
||||||
|
<dri:objIdentifier>r3c4b2081b22::0007d64b38bb2b399120f9993f95d911</dri:objIdentifier>
|
||||||
|
<dri:recordIdentifier>60a0a5b0-b63c-473f-b8bc-207ea037eb3b</dri:recordIdentifier>
|
||||||
|
<dri:dateOfCollection>2021-01-28T17:24:33.095+01:00</dri:dateOfCollection>
|
||||||
|
<oaf:datasourceprefix>r3c4b2081b22</oaf:datasourceprefix>
|
||||||
|
<dr:dateOfTransformation>2021-02-03T16:57:03.099+01:00</dr:dateOfTransformation>
|
||||||
|
</oai:header>
|
||||||
|
<oai:metadata>
|
||||||
|
<datacite:resource
|
||||||
|
xmlns:datacite="http://datacite.org/schema/kernel-4"
|
||||||
|
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd">
|
||||||
|
<datacite:identifier identifierType="URL">http://data.europa.eu/88u/dataset/60a0a5b0-b63c-473f-b8bc-207ea037eb3b</datacite:identifier>
|
||||||
|
<datacite:creators/>
|
||||||
|
<datacite:titles>
|
||||||
|
<datacite:title xml:lang="en">GMIS - Favourable feeding habitat of adult Atlantic bluefin tuna (ABFT) Monthly 1998-2017 (frequency of occurence, %)</datacite:title>
|
||||||
|
</datacite:titles>
|
||||||
|
<datacite:publisher>JRC</datacite:publisher>
|
||||||
|
<datacite:publicationYear>2019</datacite:publicationYear>
|
||||||
|
<datacite:dates>
|
||||||
|
<datacite:date dateType="Issued">2019-07-09</datacite:date>
|
||||||
|
</datacite:dates>
|
||||||
|
<datacite:resourceType resourceTypeGeneral="Dataset">Dataset</datacite:resourceType>
|
||||||
|
<datacite:descriptions>
|
||||||
|
<datacite:description descriptionType="Abstract" xml:lang="en">The favourable feeding habitat of the Atlantic bluefin tuna is daily identified linking their ecological traits with environmental variables from satellite remote sensing and physical ocean models. The feeding habitat is mostly related to the occurrence of productive oceanic features (such as eddies) that are detected by satellite sensors of ocean colour (chlorophyll-a fronts). The physical variables used are sea surface temperature and sea surface height anomaly. More information: https://fishreg.jrc.ec.europa.eu/fish-habitat, Peer-reviewed publication: http://www.sciencedirect.com/science/article/pii/S0079661116000070</datacite:description>
|
||||||
|
</datacite:descriptions>
|
||||||
|
<datacite:subjects>
|
||||||
|
<datacite:subject>protected area</datacite:subject>
|
||||||
|
<datacite:subject>environmental monitoring</datacite:subject>
|
||||||
|
<datacite:subject>oceanography</datacite:subject>
|
||||||
|
<datacite:subject>ocean</datacite:subject>
|
||||||
|
<datacite:subject>Environmental Monitoring Facilities</datacite:subject>
|
||||||
|
<datacite:subject>Environment</datacite:subject>
|
||||||
|
<datacite:subject>Protected Sites</datacite:subject>
|
||||||
|
<datacite:subject>Oceanographic Geographical Features</datacite:subject>
|
||||||
|
<datacite:subject>environmental data</datacite:subject>
|
||||||
|
<datacite:subject classid="eu-data-theme"
|
||||||
|
classname="EU Data Theme"
|
||||||
|
schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">http://publications.europa.eu/resource/authority/data-theme/TECH</datacite:subject>
|
||||||
|
<datacite:subject classid="eu-data-theme"
|
||||||
|
classname="EU Data Theme"
|
||||||
|
schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">http://publications.europa.eu/resource/authority/data-theme/ENVI</datacite:subject>
|
||||||
|
<datacite:subject classid="eurovoc" classname="EuroVoc"
|
||||||
|
schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">http://eurovoc.europa.eu/2114</datacite:subject>
|
||||||
|
<datacite:subject classid="eurovoc" classname="EuroVoc"
|
||||||
|
schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">http://eurovoc.europa.eu/2107</datacite:subject>
|
||||||
|
<datacite:subject classid="eurovoc" classname="EuroVoc"
|
||||||
|
schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">http://eurovoc.europa.eu/4801</datacite:subject>
|
||||||
|
<datacite:subject classid="eurovoc" classname="EuroVoc"
|
||||||
|
schemeid="dnet:subject_classification_typologies" schemename="dnet:subject_classification_typologies">http://eurovoc.europa.eu/3140</datacite:subject>
|
||||||
|
</datacite:subjects>
|
||||||
|
<datacite:formats>
|
||||||
|
<datacite:format>http://publications.europa.eu/resource/authority/file-type/OP_DATPRO</datacite:format>
|
||||||
|
</datacite:formats>
|
||||||
|
<datacite:geoLocations>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/ESH</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/ESP</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/EST</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/ETH</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/DNK</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/DZA</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/EGY</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/ERI</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/FIN</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/FRA</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/1A0</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/AUT</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/BGR</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/MDA</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/MCO</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/LUX</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/LTU</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/MAR</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/LVA</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/LBN</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/JOR</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/LIE</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/LBY</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/TUN</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/TUR</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/SVK</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/SVN</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/SRB</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/SSD</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/TCD</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/TGO</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/SWE</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/SYR</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/AND</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/BFA</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/GRC</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/GNB</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/GGY</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/GEO</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/GBR</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/FRO</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/GMB</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/GIN</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/GIB</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/GHA</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/MKD</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/MLI</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/MLT</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/MNE</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/MRT</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/NER</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/NGA</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/NLD</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/NOR</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/POL</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/ALB</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/BEN</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/VAT</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/UKR</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/CAF</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/BLR</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/CIV</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/CHE</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/CPV</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/CMR</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/CZE</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/CYP</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/DJI</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/DEU</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/ITA</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/JEY</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/ISL</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/ISR</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/IRL</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/IRQ</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/HUN</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/IMN</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/GRL</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/HRV</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/ALA</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/BEL</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/BIH</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/SLE</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/SEN</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/SDN</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/SAU</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/RUS</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/ROU</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/PSE</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/PRT</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/SOM</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace>http://publications.europa.eu/resource/authority/country/SMR</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
</datacite:geoLocations>
|
||||||
|
</datacite:resource>
|
||||||
|
<oaf:identifier identifierType="data.europa.eu">http://data.europa.eu/88u/dataset/60a0a5b0-b63c-473f-b8bc-207ea037eb3b</oaf:identifier>
|
||||||
|
<oaf:country>EU</oaf:country>
|
||||||
|
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||||
|
<dr:CobjCategory type="dataset">0021</dr:CobjCategory>
|
||||||
|
<oaf:dateAccepted>2019-07-09</oaf:dateAccepted>
|
||||||
|
<oaf:hostedBy id="re3data_____::r3d100011728" name="European Union Open Data Portal"/>
|
||||||
|
<oaf:collectedFrom id="re3data_____::r3d100011728" name="European Union Open Data Portal"/>
|
||||||
|
<oaf:license>CC_BY_4_0</oaf:license>
|
||||||
|
<oaf:language>ENG</oaf:language>
|
||||||
|
</oai:metadata>
|
||||||
|
<about xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||||
|
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||||
|
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||||
|
<originDescription altered="true" harvestDate="2021-01-28T17:24:33.095+01:00">
|
||||||
|
<baseURL>https%3A%2F%2Fdata.europa.eu%2Feuodp%2Fdata%2Fapiodp%2Faction%2Fpackage_search</baseURL>
|
||||||
|
<identifier/>
|
||||||
|
<datestamp/>
|
||||||
|
<metadataNamespace/>
|
||||||
|
</originDescription>
|
||||||
|
</provenance>
|
||||||
|
<oaf:datainfo>
|
||||||
|
<oaf:inferred>false</oaf:inferred>
|
||||||
|
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||||
|
<oaf:trust>0.9</oaf:trust>
|
||||||
|
<oaf:inferenceprovenance/>
|
||||||
|
<oaf:provenanceaction classid="sysimport:crosswalk"
|
||||||
|
classname="sysimport:crosswalk"
|
||||||
|
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||||
|
</oaf:datainfo>
|
||||||
|
</about>
|
||||||
|
</oai:record>
|
||||||
|
|
|
@ -46,6 +46,7 @@
|
||||||
<oaf:collectedFrom id="openaire____::crossref" name="Crossref"/>
|
<oaf:collectedFrom id="openaire____::crossref" name="Crossref"/>
|
||||||
<oaf:identifier identifierType="doi">10.1080/23744235.2020.1774644</oaf:identifier>
|
<oaf:identifier identifierType="doi">10.1080/23744235.2020.1774644</oaf:identifier>
|
||||||
<oaf:journal eissn="2374-4243" ep="3" iss="" issn="2374-4235" sp="1" vol="">Infectious Diseases</oaf:journal>
|
<oaf:journal eissn="2374-4243" ep="3" iss="" issn="2374-4235" sp="1" vol="">Infectious Diseases</oaf:journal>
|
||||||
|
<oaf:projectid validationDate="2020-12-07T11:15:59.627Z">corda__h2020::814530</oaf:projectid>
|
||||||
</metadata>
|
</metadata>
|
||||||
<about xmlns:oai="http://www.openarchives.org/OAI/2.0/">
|
<about xmlns:oai="http://www.openarchives.org/OAI/2.0/">
|
||||||
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||||
|
|
|
@ -0,0 +1,117 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<record xmlns:datacite="http://datacite.org/schema/kernel-3"
|
||||||
|
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||||
|
xmlns:oaf="http://namespace.openaire.eu/oaf" xmlns:oai="http://www.openarchives.org/OAI/2.0/">
|
||||||
|
<oai:header xmlns="http://namespace.openaire.eu/"
|
||||||
|
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||||
|
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||||
|
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||||
|
<dri:objIdentifier>r3f52792889d::00002412cb25f2f3047712d00ab2c8eb</dri:objIdentifier>
|
||||||
|
<dri:recordIdentifier>hdl:11858/00-1734-0000-0003-EE73-2</dri:recordIdentifier>
|
||||||
|
<dri:dateOfCollection>2020-12-16T10:04:03.148Z</dri:dateOfCollection>
|
||||||
|
<oaf:datasourceprefix>r3f52792889d</oaf:datasourceprefix>
|
||||||
|
<identifier xmlns="http://www.openarchives.org/OAI/2.0/">textgrid:rn8z.0</identifier>
|
||||||
|
<datestamp xmlns="http://www.openarchives.org/OAI/2.0/">2012-01-29T20:54:12Z</datestamp>
|
||||||
|
<dr:dateOfTransformation>2020-12-16T16:02:37.562Z</dr:dateOfTransformation>
|
||||||
|
</oai:header>
|
||||||
|
<metadata>
|
||||||
|
<datacite:resource xmlns="http://www.openarchives.org/OAI/2.0/"
|
||||||
|
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||||
|
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||||
|
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||||
|
<datacite:identifier identifierType="Handle">hdl:11858/00-1734-0000-0003-EE73-2</datacite:identifier>
|
||||||
|
<datacite:creators>
|
||||||
|
<datacite:creator>
|
||||||
|
<datacite:creatorName></datacite:creatorName>
|
||||||
|
<datacite:nameIdentifier nameIdentifierScheme="pnd" schemeURI="https://ref.de.dariah.eu/pndsearch/pndquery.xql?id="></datacite:nameIdentifier>
|
||||||
|
</datacite:creator>
|
||||||
|
</datacite:creators>
|
||||||
|
<datacite:titles>
|
||||||
|
<datacite:title titleType="Other">Auf dem Trocknen</datacite:title>
|
||||||
|
<datacite:title titleType="Other">Detlev von Liliencron: Gute Nacht. Hinterlassene Gedichte, Berlin: Schuster & Loeffler, 1909.</datacite:title>
|
||||||
|
</datacite:titles>
|
||||||
|
<datacite:publisher>TextGrid</datacite:publisher>
|
||||||
|
<datacite:publicationYear>2012</datacite:publicationYear>
|
||||||
|
<datacite:contributors>
|
||||||
|
<datacite:contributor contributorType="DataManager">
|
||||||
|
<datacite:contributorName>tvitt@textgrid.de</datacite:contributorName>
|
||||||
|
</datacite:contributor>
|
||||||
|
<datacite:contributor contributorType="Other">
|
||||||
|
<datacite:contributorName>Digitale Bibliothek</datacite:contributorName>
|
||||||
|
<datacite:nameIdentifier nameIdentifierScheme="textgrid" schemeURI="http://www.textgridlab.org/schema/textgrid-metadata_2010.xsd">TGPR-372fe6dc-57f2-6cd4-01b5-2c4bbefcfd3c</datacite:nameIdentifier>
|
||||||
|
</datacite:contributor>
|
||||||
|
</datacite:contributors>
|
||||||
|
<datacite:dates>
|
||||||
|
<datacite:date dateType="Created">2012-01-29T20:54:12Z</datacite:date>
|
||||||
|
<datacite:date dateType="Issued">2012-01-29T20:54:12Z</datacite:date>
|
||||||
|
<datacite:date dateType="Updated">2012-01-29T20:54:12Z</datacite:date>
|
||||||
|
</datacite:dates>
|
||||||
|
<datacite:resourceType resourceTypeGeneral="Dataset"/>
|
||||||
|
<alternateIdentifiers xmlns="http://datacite.org/schema/kernel-3">
|
||||||
|
<datacite:alternateIdentifier alternateIdentifierType="URI" xmlns="http://www.openarchives.org/OAI/2.0/">textgrid:rn8z.0</datacite:alternateIdentifier>
|
||||||
|
<alternateIdentifier alternateIdentifierType="URL">http://hdl.handle.net/hdl:11858/00-1734-0000-0003-EE73-2</alternateIdentifier>
|
||||||
|
</alternateIdentifiers>
|
||||||
|
<datacite:relatedIdentifiers>
|
||||||
|
<datacite:relatedIdentifier relatedIdentifierType="Handle" relationType="IsPartOf">hdl:11858/00-1734-0000-0003-EE72-4</datacite:relatedIdentifier>
|
||||||
|
</datacite:relatedIdentifiers>
|
||||||
|
<datacite:sizes>
|
||||||
|
<datacite:size>527 Bytes</datacite:size>
|
||||||
|
</datacite:sizes>
|
||||||
|
<datacite:formats>
|
||||||
|
<datacite:format>text/tg.edition+tg.aggregation+xml</datacite:format>
|
||||||
|
</datacite:formats>
|
||||||
|
<datacite:version>0</datacite:version>
|
||||||
|
<datacite:rightsList>
|
||||||
|
<datacite:rights rightsURI="http://creativecommons.org/licenses/by/3.0/de/legalcode"> Der annotierte Datenbestand der Digitalen Bibliothek inklusive
|
||||||
|
Metadaten sowie davon einzeln zugängliche Teile sind eine Abwandlung
|
||||||
|
des Datenbestandes von www.editura.de durch TextGrid und werden
|
||||||
|
unter der Lizenz Creative Commons Namensnennung 3.0 Deutschland
|
||||||
|
Lizenz (by-Nennung TextGrid) veröffentlicht. Die Lizenz bezieht sich
|
||||||
|
nicht auf die der Annotation zu Grunde liegenden allgemeinfreien
|
||||||
|
Texte (Siehe auch Punkt 2 der Lizenzbestimmungen).</datacite:rights>
|
||||||
|
<datacite:rights rightsURI="info:eu-repo/semantics/openAccess"/>
|
||||||
|
</datacite:rightsList>
|
||||||
|
<datacite:descriptions>
|
||||||
|
<datacite:description descriptionType="Abstract"/>
|
||||||
|
</datacite:descriptions>
|
||||||
|
<datacite:geoLocations>
|
||||||
|
<datacite:geoLocation>
|
||||||
|
<datacite:geoLocationPlace
|
||||||
|
xmlns:xs="http://www.w3.org/2001/XMLSchema" xsi:type="xs:string">Berlin</datacite:geoLocationPlace>
|
||||||
|
</datacite:geoLocation>
|
||||||
|
</datacite:geoLocations>
|
||||||
|
</datacite:resource>
|
||||||
|
<oaf:identifier identifierType="handle">hdl:11858/00-1734-0000-0003-EE73-2</oaf:identifier>
|
||||||
|
<dr:CobjCategory type="dataset">0021</dr:CobjCategory>
|
||||||
|
<oaf:refereed>0002</oaf:refereed>
|
||||||
|
<oaf:dateAccepted>2012-01-29</oaf:dateAccepted>
|
||||||
|
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||||
|
<oaf:license>http://creativecommons.org/licenses/by/3.0/de/legalcode</oaf:license>
|
||||||
|
<oaf:language>und</oaf:language>
|
||||||
|
<oaf:country>DE</oaf:country>
|
||||||
|
<oaf:hostedBy id="re3data_____::r3d100011365" name="TextGrid Repository"/>
|
||||||
|
<oaf:collectedFrom id="re3data_____::r3d100011365" name="TextGrid Repository"/>
|
||||||
|
</metadata>
|
||||||
|
<about xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||||
|
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||||
|
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||||
|
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||||
|
<originDescription altered="true" harvestDate="2020-12-16T10:04:03.148Z">
|
||||||
|
<baseURL>https%3A%2F%2Fdev.textgridlab.org%2F1.0%2Ftgoaipmh%2Foai</baseURL>
|
||||||
|
<identifier>textgrid:rn8z.0</identifier>
|
||||||
|
<datestamp>2012-01-29T20:54:12Z</datestamp>
|
||||||
|
<metadataNamespace/>
|
||||||
|
</originDescription>
|
||||||
|
</provenance>
|
||||||
|
<oaf:datainfo>
|
||||||
|
<oaf:inferred>false</oaf:inferred>
|
||||||
|
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||||
|
<oaf:trust>0.9</oaf:trust>
|
||||||
|
<oaf:inferenceprovenance/>
|
||||||
|
<oaf:provenanceaction classid="sysimport:crosswalk:datasetarchive"
|
||||||
|
classname="sysimport:crosswalk:datasetarchive"
|
||||||
|
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||||
|
</oaf:datainfo>
|
||||||
|
</about>
|
||||||
|
</record>
|
||||||
|
|
|
@ -6,29 +6,29 @@
|
||||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||||
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||||
<dri:objIdentifier>r3f52792889d::000051aa1f61d77d2c0b340091f8024e</dri:objIdentifier>
|
<dri:objIdentifier>r3f52792889d::00002412cb25f2f3047712d00ab2c8eb</dri:objIdentifier>
|
||||||
<dri:recordIdentifier>textgrid:q9cv.0</dri:recordIdentifier>
|
<dri:recordIdentifier>hdl:11858/00-1734-0000-0003-EE73-2</dri:recordIdentifier>
|
||||||
<dri:dateOfCollection>2020-11-17T09:34:11.128+01:00</dri:dateOfCollection>
|
<dri:dateOfCollection>2020-12-16T10:04:03.148Z</dri:dateOfCollection>
|
||||||
<oaf:datasourceprefix>r3f52792889d</oaf:datasourceprefix>
|
<oaf:datasourceprefix>r3f52792889d</oaf:datasourceprefix>
|
||||||
<identifier xmlns="http://www.openarchives.org/OAI/2.0/">textgrid:q9cv.0</identifier>
|
<identifier xmlns="http://www.openarchives.org/OAI/2.0/">textgrid:rn8z.0</identifier>
|
||||||
<datestamp xmlns="http://www.openarchives.org/OAI/2.0/">2012-01-21T13:35:20Z</datestamp>
|
<datestamp xmlns="http://www.openarchives.org/OAI/2.0/">2012-01-29T20:54:12Z</datestamp>
|
||||||
<dr:dateOfTransformation>2020-11-17T19:08:56.703+01:00</dr:dateOfTransformation>
|
<dr:dateOfTransformation>2020-12-16T16:02:37.562Z</dr:dateOfTransformation>
|
||||||
</oai:header>
|
</oai:header>
|
||||||
<metadata>
|
<metadata>
|
||||||
<datacite:resource xmlns="http://www.openarchives.org/OAI/2.0/"
|
<datacite:resource xmlns="http://www.openarchives.org/OAI/2.0/"
|
||||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||||
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||||
<datacite:identifier identifierType="Handle">hdl:11858/00-1734-0000-0003-7664-F</datacite:identifier>
|
<datacite:identifier identifierType="Handle">hdl:11858/00-1734-0000-0003-EE73-2</datacite:identifier>
|
||||||
<datacite:creators>
|
<datacite:creators>
|
||||||
<datacite:creator>
|
<datacite:creator>
|
||||||
<datacite:creatorName>Hoffmann von Fallersleben, August Heinrich</datacite:creatorName>
|
<datacite:creatorName>Liliencron, Detlev von</datacite:creatorName>
|
||||||
<datacite:nameIdentifier nameIdentifierScheme="pnd" schemeURI="https://de.dariah.eu/pnd-service">118552589</datacite:nameIdentifier>
|
<datacite:nameIdentifier nameIdentifierScheme="pnd" schemeURI="https://ref.de.dariah.eu/pndsearch/pndquery.xql?id=">118572954</datacite:nameIdentifier>
|
||||||
</datacite:creator>
|
</datacite:creator>
|
||||||
</datacite:creators>
|
</datacite:creators>
|
||||||
<datacite:titles>
|
<datacite:titles>
|
||||||
<datacite:title titleType="Other">Mailied</datacite:title>
|
<datacite:title titleType="Other">Auf dem Trocknen</datacite:title>
|
||||||
<datacite:title titleType="Other">August Heinrich Hoffmann von Fallersleben: Unpolitische Lieder von Hoffmann von Fallersleben, 1. + 2. Theil, 1. Theil, Hamburg: Hoffmann und Campe, 1841.</datacite:title>
|
<datacite:title titleType="Other">Detlev von Liliencron: Gute Nacht. Hinterlassene Gedichte, Berlin: Schuster & Loeffler, 1909.</datacite:title>
|
||||||
</datacite:titles>
|
</datacite:titles>
|
||||||
<datacite:publisher>TextGrid</datacite:publisher>
|
<datacite:publisher>TextGrid</datacite:publisher>
|
||||||
<datacite:publicationYear>2012</datacite:publicationYear>
|
<datacite:publicationYear>2012</datacite:publicationYear>
|
||||||
|
@ -38,21 +38,21 @@
|
||||||
</datacite:contributor>
|
</datacite:contributor>
|
||||||
<datacite:contributor contributorType="Other">
|
<datacite:contributor contributorType="Other">
|
||||||
<datacite:contributorName>Digitale Bibliothek</datacite:contributorName>
|
<datacite:contributorName>Digitale Bibliothek</datacite:contributorName>
|
||||||
<datacite:nameIdentifier nameIdentifierScheme="textgrid">TGPR-372fe6dc-57f2-6cd4-01b5-2c4bbefcfd3c</datacite:nameIdentifier>
|
<datacite:nameIdentifier nameIdentifierScheme="textgrid" schemeURI="http://www.textgridlab.org/schema/textgrid-metadata_2010.xsd">TGPR-372fe6dc-57f2-6cd4-01b5-2c4bbefcfd3c</datacite:nameIdentifier>
|
||||||
</datacite:contributor>
|
</datacite:contributor>
|
||||||
</datacite:contributors>
|
</datacite:contributors>
|
||||||
<datacite:dates>
|
<datacite:dates>
|
||||||
<datacite:date dateType="Created">2012-01-21T13:35:20Z</datacite:date>
|
<datacite:date dateType="Created">2012-01-29T20:54:12Z</datacite:date>
|
||||||
<datacite:date dateType="Issued">2012-01-21T13:35:20Z</datacite:date>
|
<datacite:date dateType="Issued">2012-01-29T20:54:12Z</datacite:date>
|
||||||
<datacite:date dateType="Updated">2012-01-21T13:35:20Z</datacite:date>
|
<datacite:date dateType="Updated">2012-01-29T20:54:12Z</datacite:date>
|
||||||
</datacite:dates>
|
</datacite:dates>
|
||||||
<datacite:resourceType resourceTypeGeneral="Dataset"/>
|
<datacite:resourceType resourceTypeGeneral="Dataset"/>
|
||||||
<alternateIdentifiers xmlns="http://datacite.org/schema/kernel-3">
|
<alternateIdentifiers xmlns="http://datacite.org/schema/kernel-3">
|
||||||
<datacite:alternateIdentifier alternateIdentifierType="URI" xmlns="http://www.openarchives.org/OAI/2.0/">textgrid:q9cv.0</datacite:alternateIdentifier>
|
<datacite:alternateIdentifier alternateIdentifierType="URI" xmlns="http://www.openarchives.org/OAI/2.0/">textgrid:rn8z.0</datacite:alternateIdentifier>
|
||||||
<alternateIdentifier alternateIdentifierType="URL">http://hdl.handle.net/hdl:11858/00-1734-0000-0003-7664-F</alternateIdentifier>
|
<alternateIdentifier alternateIdentifierType="URL">http://hdl.handle.net/hdl:11858/00-1734-0000-0003-EE73-2</alternateIdentifier>
|
||||||
</alternateIdentifiers>
|
</alternateIdentifiers>
|
||||||
<datacite:relatedIdentifiers>
|
<datacite:relatedIdentifiers>
|
||||||
<datacite:relatedIdentifier relatedIdentifierType="Handle" relationType="IsPartOf">hdl:11858/00-1734-0000-0003-7666-B</datacite:relatedIdentifier>
|
<datacite:relatedIdentifier relatedIdentifierType="Handle" relationType="IsPartOf">hdl:11858/00-1734-0000-0003-EE72-4</datacite:relatedIdentifier>
|
||||||
</datacite:relatedIdentifiers>
|
</datacite:relatedIdentifiers>
|
||||||
<datacite:sizes>
|
<datacite:sizes>
|
||||||
<datacite:size>527 Bytes</datacite:size>
|
<datacite:size>527 Bytes</datacite:size>
|
||||||
|
@ -77,17 +77,18 @@
|
||||||
<datacite:geoLocations>
|
<datacite:geoLocations>
|
||||||
<datacite:geoLocation>
|
<datacite:geoLocation>
|
||||||
<datacite:geoLocationPlace
|
<datacite:geoLocationPlace
|
||||||
xmlns:xs="http://www.w3.org/2001/XMLSchema" xsi:type="xs:string">Hamburg</datacite:geoLocationPlace>
|
xmlns:xs="http://www.w3.org/2001/XMLSchema" xsi:type="xs:string">Berlin</datacite:geoLocationPlace>
|
||||||
</datacite:geoLocation>
|
</datacite:geoLocation>
|
||||||
</datacite:geoLocations>
|
</datacite:geoLocations>
|
||||||
</datacite:resource>
|
</datacite:resource>
|
||||||
<oaf:identifier identifierType="handle">hdl:11858/00-1734-0000-0003-7664-F</oaf:identifier>
|
<oaf:identifier identifierType="handle">hdl:11858/00-1734-0000-0003-EE73-2</oaf:identifier>
|
||||||
<dr:CobjCategory type="dataset">0021</dr:CobjCategory>
|
<dr:CobjCategory type="dataset">0021</dr:CobjCategory>
|
||||||
<oaf:refereed>0002</oaf:refereed>
|
<oaf:refereed>0002</oaf:refereed>
|
||||||
<oaf:dateAccepted>2012-01-21</oaf:dateAccepted>
|
<oaf:dateAccepted>2012-01-29</oaf:dateAccepted>
|
||||||
<oaf:accessrights>OPEN</oaf:accessrights>
|
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||||
<oaf:license>http://creativecommons.org/licenses/by/3.0/de/legalcode</oaf:license>
|
<oaf:license>http://creativecommons.org/licenses/by/3.0/de/legalcode</oaf:license>
|
||||||
<oaf:language>und</oaf:language>
|
<oaf:language>und</oaf:language>
|
||||||
|
<oaf:country>DE</oaf:country>
|
||||||
<oaf:hostedBy id="re3data_____::r3d100011365" name="TextGrid Repository"/>
|
<oaf:hostedBy id="re3data_____::r3d100011365" name="TextGrid Repository"/>
|
||||||
<oaf:collectedFrom id="re3data_____::r3d100011365" name="TextGrid Repository"/>
|
<oaf:collectedFrom id="re3data_____::r3d100011365" name="TextGrid Repository"/>
|
||||||
</metadata>
|
</metadata>
|
||||||
|
@ -95,11 +96,11 @@
|
||||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||||
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||||
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||||
<originDescription altered="true" harvestDate="2020-11-17T09:34:11.128+01:00">
|
<originDescription altered="true" harvestDate="2020-12-16T10:04:03.148Z">
|
||||||
<baseURL>https%3A%2F%2Fdev.textgridlab.org%2F1.0%2Ftgoaipmh%2Foai</baseURL>
|
<baseURL>https%3A%2F%2Fdev.textgridlab.org%2F1.0%2Ftgoaipmh%2Foai</baseURL>
|
||||||
<identifier>textgrid:q9cv.0</identifier>
|
<identifier>textgrid:rn8z.0</identifier>
|
||||||
<datestamp>2012-01-21T13:35:20Z</datestamp>
|
<datestamp>2012-01-29T20:54:12Z</datestamp>
|
||||||
<metadataNamespace>http://schema.datacite.org/oai/oai-1.0/</metadataNamespace>
|
<metadataNamespace/>
|
||||||
</originDescription>
|
</originDescription>
|
||||||
</provenance>
|
</provenance>
|
||||||
<oaf:datainfo>
|
<oaf:datainfo>
|
||||||
|
@ -107,9 +108,10 @@
|
||||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||||
<oaf:trust>0.9</oaf:trust>
|
<oaf:trust>0.9</oaf:trust>
|
||||||
<oaf:inferenceprovenance/>
|
<oaf:inferenceprovenance/>
|
||||||
<oaf:provenanceaction classid="sysimport:crosswalk"
|
<oaf:provenanceaction classid="sysimport:crosswalk:datasetarchive"
|
||||||
classname="sysimport:crosswalk"
|
classname="sysimport:crosswalk:datasetarchive"
|
||||||
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||||
</oaf:datainfo>
|
</oaf:datainfo>
|
||||||
</about>
|
</about>
|
||||||
</record>
|
</record>
|
||||||
|
|
||||||
|
|
|
@ -1,15 +1,15 @@
|
||||||
package eu.dnetlib.dhp.export
|
package eu.dnetlib.dhp.export
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
|
|
||||||
import java.time.LocalDateTime
|
import java.time.LocalDateTime
|
||||||
import java.time.format.DateTimeFormatter
|
import java.time.format.DateTimeFormatter
|
||||||
|
|
||||||
import eu.dnetlib.dhp.common.PacePerson
|
import eu.dnetlib.dhp.common.PacePerson
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Author, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
|
import eu.dnetlib.dhp.schema.oaf.{Author, Dataset, ExternalReference, Field, Instance, KeyValue, Oaf, Publication, Qualifier, Relation, Result, StructuredProperty}
|
||||||
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
|
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils
|
import eu.dnetlib.dhp.utils.DHPUtils
|
||||||
import org.apache.commons.lang3.StringUtils
|
import org.apache.commons.lang3.StringUtils
|
||||||
import org.codehaus.jackson.map.ObjectMapper
|
|
||||||
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils._
|
import eu.dnetlib.dhp.schema.scholexplorer.OafUtils._
|
||||||
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
|
|
@ -1,27 +1,21 @@
|
||||||
package eu.dnetlib.dhp.`export`
|
package eu.dnetlib.dhp.`export`
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Instance, Publication, Relation, Dataset => OafDataset}
|
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset}
|
||||||
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
|
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.hadoop.io.Text
|
import org.apache.hadoop.io.Text
|
||||||
import org.apache.hadoop.io.compress.GzipCodec
|
import org.apache.hadoop.io.compress.GzipCodec
|
||||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
import org.apache.hadoop.mapred.SequenceFileOutputFormat
|
||||||
import org.apache.spark.rdd.RDD
|
|
||||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.apache.spark.sql.functions._
|
import org.apache.spark.sql.functions._
|
||||||
import org.apache.spark.sql.expressions.Window
|
import org.apache.spark.sql.expressions.Window
|
||||||
import org.apache.spark.{SparkConf, SparkContext}
|
import org.apache.spark.SparkConf
|
||||||
import org.codehaus.jackson.map.ObjectMapper
|
|
||||||
|
|
||||||
import scala.collection.mutable.ArrayBuffer
|
import scala.collection.mutable.ArrayBuffer
|
||||||
import scala.collection.JavaConverters._
|
|
||||||
|
|
||||||
object SparkExportContentForOpenAire {
|
object SparkExportContentForOpenAire {
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
def main(args: Array[String]): Unit = {
|
||||||
val conf: SparkConf = new SparkConf()
|
val conf: SparkConf = new SparkConf()
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkExportContentForOpenAire.getClass.getResourceAsStream("input_export_content_parameters.json")))
|
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkExportContentForOpenAire.getClass.getResourceAsStream("input_export_content_parameters.json")))
|
||||||
|
@ -178,11 +172,4 @@ object SparkExportContentForOpenAire {
|
||||||
fRels.union(fpubs).union(fdats).rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingPath/export/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
|
fRels.union(fpubs).union(fdats).rdd.map(s => (new Text(s._1), new Text(s._2))).saveAsHadoopFile(s"$workingPath/export/rawset", classOf[Text], classOf[Text], classOf[SequenceFileOutputFormat[Text,Text]], classOf[GzipCodec])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,7 +10,8 @@ import org.apache.http.client.methods.HttpPut;
|
||||||
import org.apache.http.entity.StringEntity;
|
import org.apache.http.entity.StringEntity;
|
||||||
import org.apache.http.impl.client.CloseableHttpClient;
|
import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
import org.apache.http.impl.client.HttpClients;
|
import org.apache.http.impl.client.HttpClients;
|
||||||
import org.codehaus.jackson.map.ObjectMapper;
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
package eu.dnetlib.dhp.provision
|
package eu.dnetlib.dhp.provision
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.provision.scholix.Scholix
|
import eu.dnetlib.dhp.provision.scholix.Scholix
|
||||||
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
|
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
|
||||||
|
@ -7,7 +8,6 @@ import org.apache.commons.io.IOUtils
|
||||||
import org.apache.hadoop.io.compress.GzipCodec
|
import org.apache.hadoop.io.compress.GzipCodec
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
|
||||||
import org.codehaus.jackson.map.ObjectMapper
|
|
||||||
|
|
||||||
object SparkConvertDatasetToJson {
|
object SparkConvertDatasetToJson {
|
||||||
|
|
||||||
|
|
|
@ -54,6 +54,13 @@
|
||||||
<artifactId>spark-solr</artifactId>
|
<artifactId>spark-solr</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<!-- the solr-test-framework requires the old junit:junit test framework -->
|
||||||
|
<dependency>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
<version>4.12</version>
|
||||||
|
<scope>test</scope>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.solr</groupId>
|
<groupId>org.apache.solr</groupId>
|
||||||
<artifactId>solr-test-framework</artifactId>
|
<artifactId>solr-test-framework</artifactId>
|
||||||
|
@ -140,6 +147,12 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.apache.zookeeper</groupId>
|
<groupId>org.apache.zookeeper</groupId>
|
||||||
<artifactId>zookeeper</artifactId>
|
<artifactId>zookeeper</artifactId>
|
||||||
|
<exclusions>
|
||||||
|
<exclusion>
|
||||||
|
<groupId>junit</groupId>
|
||||||
|
<artifactId>junit</artifactId>
|
||||||
|
</exclusion>
|
||||||
|
</exclusions>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -27,6 +27,7 @@ import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
|
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
|
import eu.dnetlib.dhp.oa.provision.model.ProvisionModelSupport;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
|
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
@ -272,11 +273,7 @@ public class CreateRelatedEntitiesJob_phase2 {
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.map(Qualifier::getClassid)
|
.map(Qualifier::getClassid)
|
||||||
.filter(StringUtils::isNotBlank)
|
.filter(StringUtils::isNotBlank)
|
||||||
.anyMatch(c -> "orcid".equals(c.toLowerCase()));
|
.anyMatch(c -> c.toLowerCase().contains(ModelConstants.ORCID));
|
||||||
}
|
|
||||||
|
|
||||||
private static FilterFunction<JoinedEntity> filterEmptyEntityFn() {
|
|
||||||
return (FilterFunction<JoinedEntity>) v -> Objects.nonNull(v.getEntity());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void removeOutputDir(SparkSession spark, String path) {
|
private static void removeOutputDir(SparkSession spark, String path) {
|
||||||
|
|
|
@ -0,0 +1,52 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.provision.utils;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
|
public class AuthorPidTypeComparator implements Comparator<StructuredProperty> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(StructuredProperty left, StructuredProperty right) {
|
||||||
|
|
||||||
|
String lClass = Optional
|
||||||
|
.ofNullable(left)
|
||||||
|
.map(StructuredProperty::getQualifier)
|
||||||
|
.map(Qualifier::getClassid)
|
||||||
|
.orElse(null);
|
||||||
|
|
||||||
|
String rClass = Optional
|
||||||
|
.ofNullable(right)
|
||||||
|
.map(StructuredProperty::getQualifier)
|
||||||
|
.map(Qualifier::getClassid)
|
||||||
|
.orElse(null);
|
||||||
|
|
||||||
|
if (lClass == null && rClass == null)
|
||||||
|
return 0;
|
||||||
|
if (lClass == null)
|
||||||
|
return 1;
|
||||||
|
if (rClass == null)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
if (lClass.equals(rClass))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (lClass.equals(ModelConstants.ORCID))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals(ModelConstants.ORCID))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals(ModelConstants.ORCID_PENDING))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals(ModelConstants.ORCID_PENDING))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -7,13 +7,16 @@ import java.util.Set;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
|
||||||
public class GraphMappingUtils {
|
public class GraphMappingUtils {
|
||||||
|
|
||||||
public static final String SEPARATOR = "_";
|
public static final String SEPARATOR = "_";
|
||||||
|
|
||||||
public static Set<String> authorPidTypes = Sets.newHashSet("orcid", "magidentifier");
|
public static Set<String> authorPidTypes = Sets
|
||||||
|
.newHashSet(
|
||||||
|
ModelConstants.ORCID, ModelConstants.ORCID_PENDING, "magidentifier");
|
||||||
|
|
||||||
public static String removePrefix(final String s) {
|
public static String removePrefix(final String s) {
|
||||||
if (s.contains("|"))
|
if (s.contains("|"))
|
||||||
|
|
|
@ -73,7 +73,9 @@ public class TemplateFactory {
|
||||||
final Collection<String> fields,
|
final Collection<String> fields,
|
||||||
final String semanticclass,
|
final String semanticclass,
|
||||||
final String semantischeme,
|
final String semantischeme,
|
||||||
final DataInfo info) {
|
final DataInfo info,
|
||||||
|
final boolean validated,
|
||||||
|
final String validationDate) {
|
||||||
return getTemplate(resources.getRel())
|
return getTemplate(resources.getRel())
|
||||||
.add("type", type)
|
.add("type", type)
|
||||||
.add("objIdentifier", escapeXml(removePrefix(objIdentifier)))
|
.add("objIdentifier", escapeXml(removePrefix(objIdentifier)))
|
||||||
|
@ -86,6 +88,8 @@ public class TemplateFactory {
|
||||||
.add(
|
.add(
|
||||||
"provenanceaction",
|
"provenanceaction",
|
||||||
info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : "")
|
info.getProvenanceaction() != null ? info.getProvenanceaction().getClassid() : "")
|
||||||
|
.add("validated", validated)
|
||||||
|
.add("validationdate", validationDate)
|
||||||
.render();
|
.render();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -254,6 +254,18 @@ public class XmlRecordFactory implements Serializable {
|
||||||
p -> p,
|
p -> p,
|
||||||
(p1, p2) -> p1))
|
(p1, p2) -> p1))
|
||||||
.values()
|
.values()
|
||||||
|
.stream()
|
||||||
|
.collect(
|
||||||
|
Collectors
|
||||||
|
.groupingBy(
|
||||||
|
p -> p.getValue(),
|
||||||
|
Collectors
|
||||||
|
.mapping(
|
||||||
|
p -> p,
|
||||||
|
Collectors.minBy(new AuthorPidTypeComparator()))))
|
||||||
|
.values()
|
||||||
|
.stream()
|
||||||
|
.map(op -> op.get())
|
||||||
.forEach(
|
.forEach(
|
||||||
sp -> {
|
sp -> {
|
||||||
String pidType = getAuthorPidType(sp.getQualifier().getClassid());
|
String pidType = getAuthorPidType(sp.getQualifier().getClassid());
|
||||||
|
@ -1082,9 +1094,12 @@ public class XmlRecordFactory implements Serializable {
|
||||||
String.format("missing scheme for: <%s - %s>", type.toString(), targetType));
|
String.format("missing scheme for: <%s - %s>", type.toString(), targetType));
|
||||||
}
|
}
|
||||||
final HashSet<String> fields = Sets.newHashSet(mapFields(link, contexts));
|
final HashSet<String> fields = Sets.newHashSet(mapFields(link, contexts));
|
||||||
|
if (rel.getValidated() == null)
|
||||||
|
rel.setValidated(false);
|
||||||
return templateFactory
|
return templateFactory
|
||||||
.getRel(
|
.getRel(
|
||||||
targetType, rel.getTarget(), fields, rel.getRelClass(), scheme, rel.getDataInfo());
|
targetType, rel.getTarget(), fields, rel.getRelClass(), scheme, rel.getDataInfo(), rel.getValidated(),
|
||||||
|
rel.getValidationDate());
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<String> listChildren(
|
private List<String> listChildren(
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
<rel inferred="$inferred$" trust="$trust$" inferenceprovenance="$inferenceprovenance$" provenanceaction="$provenanceaction$">
|
<rel inferred="$inferred$" trust="$trust$" inferenceprovenance="$inferenceprovenance$" provenanceaction="$provenanceaction$">
|
||||||
<to class="$class$" scheme="$scheme$" type="$type$">$objIdentifier$</to>
|
$if(validated)$<validated date="$validationdate$"/>$else$$endif$
|
||||||
|
<to class="$class$" scheme="$scheme$" type="$type$">$objIdentifier$</to>
|
||||||
$metadata:{ it | $it$ }$
|
$metadata:{ it | $it$ }$
|
||||||
</rel>
|
</rel>
|
|
@ -6,15 +6,13 @@ import org.apache.solr.client.solrj.response.UpdateResponse;
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import junit.framework.Assert;
|
|
||||||
|
|
||||||
public class SolrAdminApplicationTest extends SolrTest {
|
public class SolrAdminApplicationTest extends SolrTest {
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testPing() throws Exception {
|
public void testPing() throws Exception {
|
||||||
SolrPingResponse pingResponse = miniCluster.getSolrClient().ping();
|
SolrPingResponse pingResponse = miniCluster.getSolrClient().ping();
|
||||||
log.info("pingResponse: '{}'", pingResponse.getStatus());
|
log.info("pingResponse: '{}'", pingResponse.getStatus());
|
||||||
Assert.assertTrue(pingResponse.getStatus() == 0);
|
Assertions.assertTrue(pingResponse.getStatus() == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue